5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
95 # if defined(BUGGY_MSC6)
96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
97 # pragma optimize("a",off)
98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
99 # pragma optimize("w",on )
100 # endif /* BUGGY_MSC6 */
104 #define STATIC static
107 typedef struct RExC_state_t {
108 U32 flags; /* are we folding, multilining? */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
140 HV *paren_names; /* Paren names */
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
147 char *starttry; /* -Dr: where regtry was called. */
148 #define RExC_starttry (pRExC_state->starttry)
151 const char *lastparse;
153 AV *paren_name_list; /* idx -> name */
154 #define RExC_lastparse (pRExC_state->lastparse)
155 #define RExC_lastnum (pRExC_state->lastnum)
156 #define RExC_paren_name_list (pRExC_state->paren_name_list)
160 #define RExC_flags (pRExC_state->flags)
161 #define RExC_precomp (pRExC_state->precomp)
162 #define RExC_rx_sv (pRExC_state->rx_sv)
163 #define RExC_rx (pRExC_state->rx)
164 #define RExC_rxi (pRExC_state->rxi)
165 #define RExC_start (pRExC_state->start)
166 #define RExC_end (pRExC_state->end)
167 #define RExC_parse (pRExC_state->parse)
168 #define RExC_whilem_seen (pRExC_state->whilem_seen)
169 #ifdef RE_TRACK_PATTERN_OFFSETS
170 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
172 #define RExC_emit (pRExC_state->emit)
173 #define RExC_emit_start (pRExC_state->emit_start)
174 #define RExC_emit_bound (pRExC_state->emit_bound)
175 #define RExC_naughty (pRExC_state->naughty)
176 #define RExC_sawback (pRExC_state->sawback)
177 #define RExC_seen (pRExC_state->seen)
178 #define RExC_size (pRExC_state->size)
179 #define RExC_npar (pRExC_state->npar)
180 #define RExC_nestroot (pRExC_state->nestroot)
181 #define RExC_extralen (pRExC_state->extralen)
182 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
183 #define RExC_seen_evals (pRExC_state->seen_evals)
184 #define RExC_utf8 (pRExC_state->utf8)
185 #define RExC_uni_semantics (pRExC_state->uni_semantics)
186 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
187 #define RExC_open_parens (pRExC_state->open_parens)
188 #define RExC_close_parens (pRExC_state->close_parens)
189 #define RExC_opend (pRExC_state->opend)
190 #define RExC_paren_names (pRExC_state->paren_names)
191 #define RExC_recurse (pRExC_state->recurse)
192 #define RExC_recurse_count (pRExC_state->recurse_count)
193 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
194 #define RExC_contains_locale (pRExC_state->contains_locale)
197 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
198 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
199 ((*s) == '{' && regcurly(s)))
202 #undef SPSTART /* dratted cpp namespace... */
205 * Flags to be passed up and down.
207 #define WORST 0 /* Worst case. */
208 #define HASWIDTH 0x01 /* Known to match non-null strings. */
210 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
211 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
213 #define SPSTART 0x04 /* Starts with * or +. */
214 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
215 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
217 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
219 /* whether trie related optimizations are enabled */
220 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
221 #define TRIE_STUDY_OPT
222 #define FULL_TRIE_STUDY
228 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
229 #define PBITVAL(paren) (1 << ((paren) & 7))
230 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
231 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
232 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
234 /* If not already in utf8, do a longjmp back to the beginning */
235 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
236 #define REQUIRE_UTF8 STMT_START { \
237 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
240 /* About scan_data_t.
242 During optimisation we recurse through the regexp program performing
243 various inplace (keyhole style) optimisations. In addition study_chunk
244 and scan_commit populate this data structure with information about
245 what strings MUST appear in the pattern. We look for the longest
246 string that must appear at a fixed location, and we look for the
247 longest string that may appear at a floating location. So for instance
252 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
253 strings (because they follow a .* construct). study_chunk will identify
254 both FOO and BAR as being the longest fixed and floating strings respectively.
256 The strings can be composites, for instance
260 will result in a composite fixed substring 'foo'.
262 For each string some basic information is maintained:
264 - offset or min_offset
265 This is the position the string must appear at, or not before.
266 It also implicitly (when combined with minlenp) tells us how many
267 characters must match before the string we are searching for.
268 Likewise when combined with minlenp and the length of the string it
269 tells us how many characters must appear after the string we have
273 Only used for floating strings. This is the rightmost point that
274 the string can appear at. If set to I32 max it indicates that the
275 string can occur infinitely far to the right.
278 A pointer to the minimum length of the pattern that the string
279 was found inside. This is important as in the case of positive
280 lookahead or positive lookbehind we can have multiple patterns
285 The minimum length of the pattern overall is 3, the minimum length
286 of the lookahead part is 3, but the minimum length of the part that
287 will actually match is 1. So 'FOO's minimum length is 3, but the
288 minimum length for the F is 1. This is important as the minimum length
289 is used to determine offsets in front of and behind the string being
290 looked for. Since strings can be composites this is the length of the
291 pattern at the time it was committed with a scan_commit. Note that
292 the length is calculated by study_chunk, so that the minimum lengths
293 are not known until the full pattern has been compiled, thus the
294 pointer to the value.
298 In the case of lookbehind the string being searched for can be
299 offset past the start point of the final matching string.
300 If this value was just blithely removed from the min_offset it would
301 invalidate some of the calculations for how many chars must match
302 before or after (as they are derived from min_offset and minlen and
303 the length of the string being searched for).
304 When the final pattern is compiled and the data is moved from the
305 scan_data_t structure into the regexp structure the information
306 about lookbehind is factored in, with the information that would
307 have been lost precalculated in the end_shift field for the
310 The fields pos_min and pos_delta are used to store the minimum offset
311 and the delta to the maximum offset at the current point in the pattern.
315 typedef struct scan_data_t {
316 /*I32 len_min; unused */
317 /*I32 len_delta; unused */
321 I32 last_end; /* min value, <0 unless valid. */
324 SV **longest; /* Either &l_fixed, or &l_float. */
325 SV *longest_fixed; /* longest fixed string found in pattern */
326 I32 offset_fixed; /* offset where it starts */
327 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
328 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
329 SV *longest_float; /* longest floating string found in pattern */
330 I32 offset_float_min; /* earliest point in string it can appear */
331 I32 offset_float_max; /* latest point in string it can appear */
332 I32 *minlen_float; /* pointer to the minlen relevant to the string */
333 I32 lookbehind_float; /* is the position of the string modified by LB */
337 struct regnode_charclass_class *start_class;
341 * Forward declarations for pregcomp()'s friends.
344 static const scan_data_t zero_scan_data =
345 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
347 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
348 #define SF_BEFORE_SEOL 0x0001
349 #define SF_BEFORE_MEOL 0x0002
350 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
351 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
354 # define SF_FIX_SHIFT_EOL (0+2)
355 # define SF_FL_SHIFT_EOL (0+4)
357 # define SF_FIX_SHIFT_EOL (+2)
358 # define SF_FL_SHIFT_EOL (+4)
361 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
362 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
364 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
365 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
366 #define SF_IS_INF 0x0040
367 #define SF_HAS_PAR 0x0080
368 #define SF_IN_PAR 0x0100
369 #define SF_HAS_EVAL 0x0200
370 #define SCF_DO_SUBSTR 0x0400
371 #define SCF_DO_STCLASS_AND 0x0800
372 #define SCF_DO_STCLASS_OR 0x1000
373 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
374 #define SCF_WHILEM_VISITED_POS 0x2000
376 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
377 #define SCF_SEEN_ACCEPT 0x8000
379 #define UTF cBOOL(RExC_utf8)
380 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
381 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
382 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
383 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
384 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
385 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
386 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
388 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
390 #define OOB_UNICODE 12345678
391 #define OOB_NAMEDCLASS -1
393 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
394 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
397 /* length of regex to show in messages that don't mark a position within */
398 #define RegexLengthToShowInErrorMessages 127
401 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
402 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
403 * op/pragma/warn/regcomp.
405 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
406 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
408 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
411 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
412 * arg. Show regex, up to a maximum length. If it's too long, chop and add
415 #define _FAIL(code) STMT_START { \
416 const char *ellipses = ""; \
417 IV len = RExC_end - RExC_precomp; \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
421 if (len > RegexLengthToShowInErrorMessages) { \
422 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
423 len = RegexLengthToShowInErrorMessages - 10; \
429 #define FAIL(msg) _FAIL( \
430 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
431 msg, (int)len, RExC_precomp, ellipses))
433 #define FAIL2(msg,arg) _FAIL( \
434 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
435 arg, (int)len, RExC_precomp, ellipses))
438 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
440 #define Simple_vFAIL(m) STMT_START { \
441 const IV offset = RExC_parse - RExC_precomp; \
442 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
443 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
447 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
449 #define vFAIL(m) STMT_START { \
451 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
456 * Like Simple_vFAIL(), but accepts two arguments.
458 #define Simple_vFAIL2(m,a1) STMT_START { \
459 const IV offset = RExC_parse - RExC_precomp; \
460 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
461 (int)offset, RExC_precomp, RExC_precomp + offset); \
465 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
467 #define vFAIL2(m,a1) STMT_START { \
469 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
470 Simple_vFAIL2(m, a1); \
475 * Like Simple_vFAIL(), but accepts three arguments.
477 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
478 const IV offset = RExC_parse - RExC_precomp; \
479 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
480 (int)offset, RExC_precomp, RExC_precomp + offset); \
484 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
486 #define vFAIL3(m,a1,a2) STMT_START { \
488 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
489 Simple_vFAIL3(m, a1, a2); \
493 * Like Simple_vFAIL(), but accepts four arguments.
495 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
496 const IV offset = RExC_parse - RExC_precomp; \
497 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
498 (int)offset, RExC_precomp, RExC_precomp + offset); \
501 #define ckWARNreg(loc,m) STMT_START { \
502 const IV offset = loc - RExC_precomp; \
503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 (int)offset, RExC_precomp, RExC_precomp + offset); \
507 #define ckWARNregdep(loc,m) STMT_START { \
508 const IV offset = loc - RExC_precomp; \
509 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
511 (int)offset, RExC_precomp, RExC_precomp + offset); \
514 #define ckWARN2regdep(loc,m, a1) STMT_START { \
515 const IV offset = loc - RExC_precomp; \
516 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
518 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
521 #define ckWARN2reg(loc, m, a1) STMT_START { \
522 const IV offset = loc - RExC_precomp; \
523 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
524 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
527 #define vWARN3(loc, m, a1, a2) STMT_START { \
528 const IV offset = loc - RExC_precomp; \
529 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
530 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
533 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
534 const IV offset = loc - RExC_precomp; \
535 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
536 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
539 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
540 const IV offset = loc - RExC_precomp; \
541 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
542 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
545 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
546 const IV offset = loc - RExC_precomp; \
547 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
548 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
551 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
552 const IV offset = loc - RExC_precomp; \
553 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
554 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
558 /* Allow for side effects in s */
559 #define REGC(c,s) STMT_START { \
560 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
563 /* Macros for recording node offsets. 20001227 mjd@plover.com
564 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
565 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
566 * Element 0 holds the number n.
567 * Position is 1 indexed.
569 #ifndef RE_TRACK_PATTERN_OFFSETS
570 #define Set_Node_Offset_To_R(node,byte)
571 #define Set_Node_Offset(node,byte)
572 #define Set_Cur_Node_Offset
573 #define Set_Node_Length_To_R(node,len)
574 #define Set_Node_Length(node,len)
575 #define Set_Node_Cur_Length(node)
576 #define Node_Offset(n)
577 #define Node_Length(n)
578 #define Set_Node_Offset_Length(node,offset,len)
579 #define ProgLen(ri) ri->u.proglen
580 #define SetProgLen(ri,x) ri->u.proglen = x
582 #define ProgLen(ri) ri->u.offsets[0]
583 #define SetProgLen(ri,x) ri->u.offsets[0] = x
584 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
586 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
587 __LINE__, (int)(node), (int)(byte))); \
589 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
591 RExC_offsets[2*(node)-1] = (byte); \
596 #define Set_Node_Offset(node,byte) \
597 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
598 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
600 #define Set_Node_Length_To_R(node,len) STMT_START { \
602 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
603 __LINE__, (int)(node), (int)(len))); \
605 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
607 RExC_offsets[2*(node)] = (len); \
612 #define Set_Node_Length(node,len) \
613 Set_Node_Length_To_R((node)-RExC_emit_start, len)
614 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
615 #define Set_Node_Cur_Length(node) \
616 Set_Node_Length(node, RExC_parse - parse_start)
618 /* Get offsets and lengths */
619 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
620 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
622 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
623 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
624 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
628 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
629 #define EXPERIMENTAL_INPLACESCAN
630 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
632 #define DEBUG_STUDYDATA(str,data,depth) \
633 DEBUG_OPTIMISE_MORE_r(if(data){ \
634 PerlIO_printf(Perl_debug_log, \
635 "%*s" str "Pos:%"IVdf"/%"IVdf \
636 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
637 (int)(depth)*2, "", \
638 (IV)((data)->pos_min), \
639 (IV)((data)->pos_delta), \
640 (UV)((data)->flags), \
641 (IV)((data)->whilem_c), \
642 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
643 is_inf ? "INF " : "" \
645 if ((data)->last_found) \
646 PerlIO_printf(Perl_debug_log, \
647 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
648 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
649 SvPVX_const((data)->last_found), \
650 (IV)((data)->last_end), \
651 (IV)((data)->last_start_min), \
652 (IV)((data)->last_start_max), \
653 ((data)->longest && \
654 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
655 SvPVX_const((data)->longest_fixed), \
656 (IV)((data)->offset_fixed), \
657 ((data)->longest && \
658 (data)->longest==&((data)->longest_float)) ? "*" : "", \
659 SvPVX_const((data)->longest_float), \
660 (IV)((data)->offset_float_min), \
661 (IV)((data)->offset_float_max) \
663 PerlIO_printf(Perl_debug_log,"\n"); \
666 static void clear_re(pTHX_ void *r);
668 /* Mark that we cannot extend a found fixed substring at this point.
669 Update the longest found anchored substring and the longest found
670 floating substrings if needed. */
673 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
675 const STRLEN l = CHR_SVLEN(data->last_found);
676 const STRLEN old_l = CHR_SVLEN(*data->longest);
677 GET_RE_DEBUG_FLAGS_DECL;
679 PERL_ARGS_ASSERT_SCAN_COMMIT;
681 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
682 SvSetMagicSV(*data->longest, data->last_found);
683 if (*data->longest == data->longest_fixed) {
684 data->offset_fixed = l ? data->last_start_min : data->pos_min;
685 if (data->flags & SF_BEFORE_EOL)
687 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
689 data->flags &= ~SF_FIX_BEFORE_EOL;
690 data->minlen_fixed=minlenp;
691 data->lookbehind_fixed=0;
693 else { /* *data->longest == data->longest_float */
694 data->offset_float_min = l ? data->last_start_min : data->pos_min;
695 data->offset_float_max = (l
696 ? data->last_start_max
697 : data->pos_min + data->pos_delta);
698 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
699 data->offset_float_max = I32_MAX;
700 if (data->flags & SF_BEFORE_EOL)
702 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
704 data->flags &= ~SF_FL_BEFORE_EOL;
705 data->minlen_float=minlenp;
706 data->lookbehind_float=0;
709 SvCUR_set(data->last_found, 0);
711 SV * const sv = data->last_found;
712 if (SvUTF8(sv) && SvMAGICAL(sv)) {
713 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
719 data->flags &= ~SF_BEFORE_EOL;
720 DEBUG_STUDYDATA("commit: ",data,0);
723 /* Can match anything (initialization) */
725 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
727 PERL_ARGS_ASSERT_CL_ANYTHING;
729 ANYOF_BITMAP_SETALL(cl);
730 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
731 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL
732 /* Even though no bitmap is in use here, we need to set
733 * the flag below so an AND with a node that does have one
734 * doesn't lose that one. The flag should get cleared if
735 * the other one doesn't; and the code in regexec.c is
736 * structured so this being set when not needed does no
737 * harm. It seemed a little cleaner to set it here than do
738 * a special case in cl_and() */
739 |ANYOF_NONBITMAP_NON_UTF8;
741 /* If any portion of the regex is to operate under locale rules,
742 * initialization includes it. The reason this isn't done for all regexes
743 * is that the optimizer was written under the assumption that locale was
744 * all-or-nothing. Given the complexity and lack of documentation in the
745 * optimizer, and that there are inadequate test cases for locale, so many
746 * parts of it may not work properly, it is safest to avoid locale unless
748 if (RExC_contains_locale) {
749 ANYOF_CLASS_SETALL(cl); /* /l uses class */
750 cl->flags |= ANYOF_LOCALE;
753 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
757 /* Can match anything (initialization) */
759 S_cl_is_anything(const struct regnode_charclass_class *cl)
763 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
765 for (value = 0; value <= ANYOF_MAX; value += 2)
766 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
768 if (!(cl->flags & ANYOF_UNICODE_ALL))
770 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
775 /* Can match anything (initialization) */
777 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
779 PERL_ARGS_ASSERT_CL_INIT;
781 Zero(cl, 1, struct regnode_charclass_class);
783 cl_anything(pRExC_state, cl);
784 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
787 /* These two functions currently do the exact same thing */
788 #define cl_init_zero S_cl_init
790 /* 'AND' a given class with another one. Can create false positives. 'cl'
791 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
792 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
794 S_cl_and(struct regnode_charclass_class *cl,
795 const struct regnode_charclass_class *and_with)
797 PERL_ARGS_ASSERT_CL_AND;
799 assert(and_with->type == ANYOF);
801 /* I (khw) am not sure all these restrictions are necessary XXX */
802 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
803 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
804 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
805 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
806 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
809 if (and_with->flags & ANYOF_INVERT)
810 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
811 cl->bitmap[i] &= ~and_with->bitmap[i];
813 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
814 cl->bitmap[i] &= and_with->bitmap[i];
815 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
817 if (and_with->flags & ANYOF_INVERT) {
819 /* Here, the and'ed node is inverted. Get the AND of the flags that
820 * aren't affected by the inversion. Those that are affected are
821 * handled individually below */
822 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
823 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
824 cl->flags |= affected_flags;
826 /* We currently don't know how to deal with things that aren't in the
827 * bitmap, but we know that the intersection is no greater than what
828 * is already in cl, so let there be false positives that get sorted
829 * out after the synthetic start class succeeds, and the node is
830 * matched for real. */
832 /* The inversion of these two flags indicate that the resulting
833 * intersection doesn't have them */
834 if (and_with->flags & ANYOF_UNICODE_ALL) {
835 cl->flags &= ~ANYOF_UNICODE_ALL;
837 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
838 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
841 else { /* and'd node is not inverted */
842 if (! ANYOF_NONBITMAP(and_with)) {
844 /* Here 'and_with' doesn't match anything outside the bitmap
845 * (except possibly ANYOF_UNICODE_ALL), which means the
846 * intersection can't either, except for ANYOF_UNICODE_ALL, in
847 * which case we don't know what the intersection is, but it's no
848 * greater than what cl already has, so can just leave it alone,
849 * with possible false positives */
850 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
851 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
852 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
855 else if (! ANYOF_NONBITMAP(cl)) {
857 /* Here, 'and_with' does match something outside the bitmap, and cl
858 * doesn't have a list of things to match outside the bitmap. If
859 * cl can match all code points above 255, the intersection will
860 * be those above-255 code points that 'and_with' matches. There
861 * may be false positives from code points in 'and_with' that are
862 * outside the bitmap but below 256, but those get sorted out
863 * after the synthetic start class succeeds). If cl can't match
864 * all Unicode code points, it means here that it can't match *
865 * anything outside the bitmap, so we leave the bitmap empty */
866 if (cl->flags & ANYOF_UNICODE_ALL) {
867 ARG_SET(cl, ARG(and_with));
871 /* Here, both 'and_with' and cl match something outside the
872 * bitmap. Currently we do not do the intersection, so just match
873 * whatever cl had at the beginning. */
877 /* Take the intersection of the two sets of flags */
878 cl->flags &= and_with->flags;
882 /* 'OR' a given class with another one. Can create false positives. 'cl'
883 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
884 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
886 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
888 PERL_ARGS_ASSERT_CL_OR;
890 if (or_with->flags & ANYOF_INVERT) {
892 /* Here, the or'd node is to be inverted. This means we take the
893 * complement of everything not in the bitmap, but currently we don't
894 * know what that is, so give up and match anything */
895 if (ANYOF_NONBITMAP(or_with)) {
896 cl_anything(pRExC_state, cl);
899 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
900 * <= (B1 | !B2) | (CL1 | !CL2)
901 * which is wasteful if CL2 is small, but we ignore CL2:
902 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
903 * XXXX Can we handle case-fold? Unclear:
904 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
905 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
907 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
908 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
909 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
912 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
913 cl->bitmap[i] |= ~or_with->bitmap[i];
914 } /* XXXX: logic is complicated otherwise */
916 cl_anything(pRExC_state, cl);
919 /* And, we can just take the union of the flags that aren't affected
920 * by the inversion */
921 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
923 /* For the remaining flags:
924 ANYOF_UNICODE_ALL and inverted means to not match anything above
925 255, which means that the union with cl should just be
926 what cl has in it, so can ignore this flag
927 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
928 is 127-255 to match them, but then invert that, so the
929 union with cl should just be what cl has in it, so can
932 } else { /* 'or_with' is not inverted */
933 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
934 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
935 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
936 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
939 /* OR char bitmap and class bitmap separately */
940 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
941 cl->bitmap[i] |= or_with->bitmap[i];
942 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
943 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
944 cl->classflags[i] |= or_with->classflags[i];
945 cl->flags |= ANYOF_CLASS;
948 else { /* XXXX: logic is complicated, leave it along for a moment. */
949 cl_anything(pRExC_state, cl);
952 if (ANYOF_NONBITMAP(or_with)) {
954 /* Use the added node's outside-the-bit-map match if there isn't a
955 * conflict. If there is a conflict (both nodes match something
956 * outside the bitmap, but what they match outside is not the same
957 * pointer, and hence not easily compared until XXX we extend
958 * inversion lists this far), give up and allow the start class to
959 * match everything outside the bitmap. If that stuff is all above
960 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
961 if (! ANYOF_NONBITMAP(cl)) {
962 ARG_SET(cl, ARG(or_with));
964 else if (ARG(cl) != ARG(or_with)) {
966 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
967 cl_anything(pRExC_state, cl);
970 cl->flags |= ANYOF_UNICODE_ALL;
975 cl->flags |= or_with->flags;
980 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
981 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
982 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
983 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
988 dump_trie(trie,widecharmap,revcharmap)
989 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
990 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
992 These routines dump out a trie in a somewhat readable format.
993 The _interim_ variants are used for debugging the interim
994 tables that are used to generate the final compressed
995 representation which is what dump_trie expects.
997 Part of the reason for their existence is to provide a form
998 of documentation as to how the different representations function.
1003 Dumps the final compressed table form of the trie to Perl_debug_log.
1004 Used for debugging make_trie().
1008 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1009 AV *revcharmap, U32 depth)
1012 SV *sv=sv_newmortal();
1013 int colwidth= widecharmap ? 6 : 4;
1015 GET_RE_DEBUG_FLAGS_DECL;
1017 PERL_ARGS_ASSERT_DUMP_TRIE;
1019 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1020 (int)depth * 2 + 2,"",
1021 "Match","Base","Ofs" );
1023 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1024 SV ** const tmp = av_fetch( revcharmap, state, 0);
1026 PerlIO_printf( Perl_debug_log, "%*s",
1028 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1029 PL_colors[0], PL_colors[1],
1030 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1031 PERL_PV_ESCAPE_FIRSTCHAR
1036 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1037 (int)depth * 2 + 2,"");
1039 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1040 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1041 PerlIO_printf( Perl_debug_log, "\n");
1043 for( state = 1 ; state < trie->statecount ; state++ ) {
1044 const U32 base = trie->states[ state ].trans.base;
1046 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1048 if ( trie->states[ state ].wordnum ) {
1049 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1051 PerlIO_printf( Perl_debug_log, "%6s", "" );
1054 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1059 while( ( base + ofs < trie->uniquecharcount ) ||
1060 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1061 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1064 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1066 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1067 if ( ( base + ofs >= trie->uniquecharcount ) &&
1068 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1069 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1071 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1073 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1075 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1079 PerlIO_printf( Perl_debug_log, "]");
1082 PerlIO_printf( Perl_debug_log, "\n" );
1084 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1085 for (word=1; word <= trie->wordcount; word++) {
1086 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1087 (int)word, (int)(trie->wordinfo[word].prev),
1088 (int)(trie->wordinfo[word].len));
1090 PerlIO_printf(Perl_debug_log, "\n" );
1093 Dumps a fully constructed but uncompressed trie in list form.
1094 List tries normally only are used for construction when the number of
1095 possible chars (trie->uniquecharcount) is very high.
1096 Used for debugging make_trie().
1099 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1100 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1104 SV *sv=sv_newmortal();
1105 int colwidth= widecharmap ? 6 : 4;
1106 GET_RE_DEBUG_FLAGS_DECL;
1108 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1110 /* print out the table precompression. */
1111 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1112 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1113 "------:-----+-----------------\n" );
1115 for( state=1 ; state < next_alloc ; state ++ ) {
1118 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1119 (int)depth * 2 + 2,"", (UV)state );
1120 if ( ! trie->states[ state ].wordnum ) {
1121 PerlIO_printf( Perl_debug_log, "%5s| ","");
1123 PerlIO_printf( Perl_debug_log, "W%4x| ",
1124 trie->states[ state ].wordnum
1127 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1128 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1130 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1132 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1133 PL_colors[0], PL_colors[1],
1134 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1135 PERL_PV_ESCAPE_FIRSTCHAR
1137 TRIE_LIST_ITEM(state,charid).forid,
1138 (UV)TRIE_LIST_ITEM(state,charid).newstate
1141 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1142 (int)((depth * 2) + 14), "");
1145 PerlIO_printf( Perl_debug_log, "\n");
1150 Dumps a fully constructed but uncompressed trie in table form.
1151 This is the normal DFA style state transition table, with a few
1152 twists to facilitate compression later.
1153 Used for debugging make_trie().
1156 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1157 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1162 SV *sv=sv_newmortal();
1163 int colwidth= widecharmap ? 6 : 4;
1164 GET_RE_DEBUG_FLAGS_DECL;
1166 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1169 print out the table precompression so that we can do a visual check
1170 that they are identical.
1173 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1175 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1176 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1178 PerlIO_printf( Perl_debug_log, "%*s",
1180 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1181 PL_colors[0], PL_colors[1],
1182 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1183 PERL_PV_ESCAPE_FIRSTCHAR
1189 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1191 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1192 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1195 PerlIO_printf( Perl_debug_log, "\n" );
1197 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1199 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1200 (int)depth * 2 + 2,"",
1201 (UV)TRIE_NODENUM( state ) );
1203 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1204 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1206 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1208 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1210 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1211 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1213 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1214 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1222 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1223 startbranch: the first branch in the whole branch sequence
1224 first : start branch of sequence of branch-exact nodes.
1225 May be the same as startbranch
1226 last : Thing following the last branch.
1227 May be the same as tail.
1228 tail : item following the branch sequence
1229 count : words in the sequence
1230 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1231 depth : indent depth
1233 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1235 A trie is an N'ary tree where the branches are determined by digital
1236 decomposition of the key. IE, at the root node you look up the 1st character and
1237 follow that branch repeat until you find the end of the branches. Nodes can be
1238 marked as "accepting" meaning they represent a complete word. Eg:
1242 would convert into the following structure. Numbers represent states, letters
1243 following numbers represent valid transitions on the letter from that state, if
1244 the number is in square brackets it represents an accepting state, otherwise it
1245 will be in parenthesis.
1247 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1251 (1) +-i->(6)-+-s->[7]
1253 +-s->(3)-+-h->(4)-+-e->[5]
1255 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1257 This shows that when matching against the string 'hers' we will begin at state 1
1258 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1259 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1260 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1261 single traverse. We store a mapping from accepting to state to which word was
1262 matched, and then when we have multiple possibilities we try to complete the
1263 rest of the regex in the order in which they occured in the alternation.
1265 The only prior NFA like behaviour that would be changed by the TRIE support is
1266 the silent ignoring of duplicate alternations which are of the form:
1268 / (DUPE|DUPE) X? (?{ ... }) Y /x
1270 Thus EVAL blocks following a trie may be called a different number of times with
1271 and without the optimisation. With the optimisations dupes will be silently
1272 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1273 the following demonstrates:
1275 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1277 which prints out 'word' three times, but
1279 'words'=~/(word|word|word)(?{ print $1 })S/
1281 which doesnt print it out at all. This is due to other optimisations kicking in.
1283 Example of what happens on a structural level:
1285 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1287 1: CURLYM[1] {1,32767}(18)
1298 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1299 and should turn into:
1301 1: CURLYM[1] {1,32767}(18)
1303 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1311 Cases where tail != last would be like /(?foo|bar)baz/:
1321 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1322 and would end up looking like:
1325 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1332 d = uvuni_to_utf8_flags(d, uv, 0);
1334 is the recommended Unicode-aware way of saying
1339 #define TRIE_STORE_REVCHAR \
1342 SV *zlopp = newSV(2); \
1343 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1344 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1345 SvCUR_set(zlopp, kapow - flrbbbbb); \
1348 av_push(revcharmap, zlopp); \
1350 char ooooff = (char)uvc; \
1351 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1355 #define TRIE_READ_CHAR STMT_START { \
1359 if ( foldlen > 0 ) { \
1360 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1365 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1366 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1367 foldlen -= UNISKIP( uvc ); \
1368 scan = foldbuf + UNISKIP( uvc ); \
1371 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1381 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1382 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1383 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1384 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1386 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1387 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1388 TRIE_LIST_CUR( state )++; \
1391 #define TRIE_LIST_NEW(state) STMT_START { \
1392 Newxz( trie->states[ state ].trans.list, \
1393 4, reg_trie_trans_le ); \
1394 TRIE_LIST_CUR( state ) = 1; \
1395 TRIE_LIST_LEN( state ) = 4; \
1398 #define TRIE_HANDLE_WORD(state) STMT_START { \
1399 U16 dupe= trie->states[ state ].wordnum; \
1400 regnode * const noper_next = regnext( noper ); \
1403 /* store the word for dumping */ \
1405 if (OP(noper) != NOTHING) \
1406 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1408 tmp = newSVpvn_utf8( "", 0, UTF ); \
1409 av_push( trie_words, tmp ); \
1413 trie->wordinfo[curword].prev = 0; \
1414 trie->wordinfo[curword].len = wordlen; \
1415 trie->wordinfo[curword].accept = state; \
1417 if ( noper_next < tail ) { \
1419 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1420 trie->jump[curword] = (U16)(noper_next - convert); \
1422 jumper = noper_next; \
1424 nextbranch= regnext(cur); \
1428 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1429 /* chain, so that when the bits of chain are later */\
1430 /* linked together, the dups appear in the chain */\
1431 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1432 trie->wordinfo[dupe].prev = curword; \
1434 /* we haven't inserted this word yet. */ \
1435 trie->states[ state ].wordnum = curword; \
1440 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1441 ( ( base + charid >= ucharcount \
1442 && base + charid < ubound \
1443 && state == trie->trans[ base - ucharcount + charid ].check \
1444 && trie->trans[ base - ucharcount + charid ].next ) \
1445 ? trie->trans[ base - ucharcount + charid ].next \
1446 : ( state==1 ? special : 0 ) \
1450 #define MADE_JUMP_TRIE 2
1451 #define MADE_EXACT_TRIE 4
1454 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1457 /* first pass, loop through and scan words */
1458 reg_trie_data *trie;
1459 HV *widecharmap = NULL;
1460 AV *revcharmap = newAV();
1462 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1467 regnode *jumper = NULL;
1468 regnode *nextbranch = NULL;
1469 regnode *convert = NULL;
1470 U32 *prev_states; /* temp array mapping each state to previous one */
1471 /* we just use folder as a flag in utf8 */
1472 const U8 * folder = NULL;
1475 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1476 AV *trie_words = NULL;
1477 /* along with revcharmap, this only used during construction but both are
1478 * useful during debugging so we store them in the struct when debugging.
1481 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1482 STRLEN trie_charcount=0;
1484 SV *re_trie_maxbuff;
1485 GET_RE_DEBUG_FLAGS_DECL;
1487 PERL_ARGS_ASSERT_MAKE_TRIE;
1489 PERL_UNUSED_ARG(depth);
1494 case EXACTFU: folder = PL_fold_latin1; break;
1495 case EXACTF: folder = PL_fold; break;
1496 case EXACTFL: folder = PL_fold_locale; break;
1499 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1501 trie->startstate = 1;
1502 trie->wordcount = word_count;
1503 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1504 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1505 if (!(UTF && folder))
1506 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1507 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1508 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1511 trie_words = newAV();
1514 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1515 if (!SvIOK(re_trie_maxbuff)) {
1516 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1519 PerlIO_printf( Perl_debug_log,
1520 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1521 (int)depth * 2 + 2, "",
1522 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1523 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1527 /* Find the node we are going to overwrite */
1528 if ( first == startbranch && OP( last ) != BRANCH ) {
1529 /* whole branch chain */
1532 /* branch sub-chain */
1533 convert = NEXTOPER( first );
1536 /* -- First loop and Setup --
1538 We first traverse the branches and scan each word to determine if it
1539 contains widechars, and how many unique chars there are, this is
1540 important as we have to build a table with at least as many columns as we
1543 We use an array of integers to represent the character codes 0..255
1544 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1545 native representation of the character value as the key and IV's for the
1548 *TODO* If we keep track of how many times each character is used we can
1549 remap the columns so that the table compression later on is more
1550 efficient in terms of memory by ensuring the most common value is in the
1551 middle and the least common are on the outside. IMO this would be better
1552 than a most to least common mapping as theres a decent chance the most
1553 common letter will share a node with the least common, meaning the node
1554 will not be compressible. With a middle is most common approach the worst
1555 case is when we have the least common nodes twice.
1559 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1560 regnode * const noper = NEXTOPER( cur );
1561 const U8 *uc = (U8*)STRING( noper );
1562 const U8 * const e = uc + STR_LEN( noper );
1564 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1565 const U8 *scan = (U8*)NULL;
1566 U32 wordlen = 0; /* required init */
1568 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1570 if (OP(noper) == NOTHING) {
1574 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1575 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1576 regardless of encoding */
1578 for ( ; uc < e ; uc += len ) {
1579 TRIE_CHARCOUNT(trie)++;
1583 if ( !trie->charmap[ uvc ] ) {
1584 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1586 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1590 /* store the codepoint in the bitmap, and its folded
1592 TRIE_BITMAP_SET(trie,uvc);
1594 /* store the folded codepoint */
1595 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1598 /* store first byte of utf8 representation of
1599 variant codepoints */
1600 if (! UNI_IS_INVARIANT(uvc)) {
1601 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1604 set_bit = 0; /* We've done our bit :-) */
1609 widecharmap = newHV();
1611 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1614 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1616 if ( !SvTRUE( *svpp ) ) {
1617 sv_setiv( *svpp, ++trie->uniquecharcount );
1622 if( cur == first ) {
1625 } else if (chars < trie->minlen) {
1627 } else if (chars > trie->maxlen) {
1631 } /* end first pass */
1632 DEBUG_TRIE_COMPILE_r(
1633 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1634 (int)depth * 2 + 2,"",
1635 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1636 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1637 (int)trie->minlen, (int)trie->maxlen )
1641 We now know what we are dealing with in terms of unique chars and
1642 string sizes so we can calculate how much memory a naive
1643 representation using a flat table will take. If it's over a reasonable
1644 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1645 conservative but potentially much slower representation using an array
1648 At the end we convert both representations into the same compressed
1649 form that will be used in regexec.c for matching with. The latter
1650 is a form that cannot be used to construct with but has memory
1651 properties similar to the list form and access properties similar
1652 to the table form making it both suitable for fast searches and
1653 small enough that its feasable to store for the duration of a program.
1655 See the comment in the code where the compressed table is produced
1656 inplace from the flat tabe representation for an explanation of how
1657 the compression works.
1662 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1665 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1667 Second Pass -- Array Of Lists Representation
1669 Each state will be represented by a list of charid:state records
1670 (reg_trie_trans_le) the first such element holds the CUR and LEN
1671 points of the allocated array. (See defines above).
1673 We build the initial structure using the lists, and then convert
1674 it into the compressed table form which allows faster lookups
1675 (but cant be modified once converted).
1678 STRLEN transcount = 1;
1680 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1681 "%*sCompiling trie using list compiler\n",
1682 (int)depth * 2 + 2, ""));
1684 trie->states = (reg_trie_state *)
1685 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1686 sizeof(reg_trie_state) );
1690 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1692 regnode * const noper = NEXTOPER( cur );
1693 U8 *uc = (U8*)STRING( noper );
1694 const U8 * const e = uc + STR_LEN( noper );
1695 U32 state = 1; /* required init */
1696 U16 charid = 0; /* sanity init */
1697 U8 *scan = (U8*)NULL; /* sanity init */
1698 STRLEN foldlen = 0; /* required init */
1699 U32 wordlen = 0; /* required init */
1700 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1702 if (OP(noper) != NOTHING) {
1703 for ( ; uc < e ; uc += len ) {
1708 charid = trie->charmap[ uvc ];
1710 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1714 charid=(U16)SvIV( *svpp );
1717 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1724 if ( !trie->states[ state ].trans.list ) {
1725 TRIE_LIST_NEW( state );
1727 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1728 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1729 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1734 newstate = next_alloc++;
1735 prev_states[newstate] = state;
1736 TRIE_LIST_PUSH( state, charid, newstate );
1741 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1745 TRIE_HANDLE_WORD(state);
1747 } /* end second pass */
1749 /* next alloc is the NEXT state to be allocated */
1750 trie->statecount = next_alloc;
1751 trie->states = (reg_trie_state *)
1752 PerlMemShared_realloc( trie->states,
1754 * sizeof(reg_trie_state) );
1756 /* and now dump it out before we compress it */
1757 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1758 revcharmap, next_alloc,
1762 trie->trans = (reg_trie_trans *)
1763 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1770 for( state=1 ; state < next_alloc ; state ++ ) {
1774 DEBUG_TRIE_COMPILE_MORE_r(
1775 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1779 if (trie->states[state].trans.list) {
1780 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1784 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1785 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1786 if ( forid < minid ) {
1788 } else if ( forid > maxid ) {
1792 if ( transcount < tp + maxid - minid + 1) {
1794 trie->trans = (reg_trie_trans *)
1795 PerlMemShared_realloc( trie->trans,
1797 * sizeof(reg_trie_trans) );
1798 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1800 base = trie->uniquecharcount + tp - minid;
1801 if ( maxid == minid ) {
1803 for ( ; zp < tp ; zp++ ) {
1804 if ( ! trie->trans[ zp ].next ) {
1805 base = trie->uniquecharcount + zp - minid;
1806 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1807 trie->trans[ zp ].check = state;
1813 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1814 trie->trans[ tp ].check = state;
1819 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1820 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1821 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1822 trie->trans[ tid ].check = state;
1824 tp += ( maxid - minid + 1 );
1826 Safefree(trie->states[ state ].trans.list);
1829 DEBUG_TRIE_COMPILE_MORE_r(
1830 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1833 trie->states[ state ].trans.base=base;
1835 trie->lasttrans = tp + 1;
1839 Second Pass -- Flat Table Representation.
1841 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1842 We know that we will need Charcount+1 trans at most to store the data
1843 (one row per char at worst case) So we preallocate both structures
1844 assuming worst case.
1846 We then construct the trie using only the .next slots of the entry
1849 We use the .check field of the first entry of the node temporarily to
1850 make compression both faster and easier by keeping track of how many non
1851 zero fields are in the node.
1853 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1856 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1857 number representing the first entry of the node, and state as a
1858 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1859 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1860 are 2 entrys per node. eg:
1868 The table is internally in the right hand, idx form. However as we also
1869 have to deal with the states array which is indexed by nodenum we have to
1870 use TRIE_NODENUM() to convert.
1873 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1874 "%*sCompiling trie using table compiler\n",
1875 (int)depth * 2 + 2, ""));
1877 trie->trans = (reg_trie_trans *)
1878 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1879 * trie->uniquecharcount + 1,
1880 sizeof(reg_trie_trans) );
1881 trie->states = (reg_trie_state *)
1882 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1883 sizeof(reg_trie_state) );
1884 next_alloc = trie->uniquecharcount + 1;
1887 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1889 regnode * const noper = NEXTOPER( cur );
1890 const U8 *uc = (U8*)STRING( noper );
1891 const U8 * const e = uc + STR_LEN( noper );
1893 U32 state = 1; /* required init */
1895 U16 charid = 0; /* sanity init */
1896 U32 accept_state = 0; /* sanity init */
1897 U8 *scan = (U8*)NULL; /* sanity init */
1899 STRLEN foldlen = 0; /* required init */
1900 U32 wordlen = 0; /* required init */
1901 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1903 if ( OP(noper) != NOTHING ) {
1904 for ( ; uc < e ; uc += len ) {
1909 charid = trie->charmap[ uvc ];
1911 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1912 charid = svpp ? (U16)SvIV(*svpp) : 0;
1916 if ( !trie->trans[ state + charid ].next ) {
1917 trie->trans[ state + charid ].next = next_alloc;
1918 trie->trans[ state ].check++;
1919 prev_states[TRIE_NODENUM(next_alloc)]
1920 = TRIE_NODENUM(state);
1921 next_alloc += trie->uniquecharcount;
1923 state = trie->trans[ state + charid ].next;
1925 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1927 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1930 accept_state = TRIE_NODENUM( state );
1931 TRIE_HANDLE_WORD(accept_state);
1933 } /* end second pass */
1935 /* and now dump it out before we compress it */
1936 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1938 next_alloc, depth+1));
1942 * Inplace compress the table.*
1944 For sparse data sets the table constructed by the trie algorithm will
1945 be mostly 0/FAIL transitions or to put it another way mostly empty.
1946 (Note that leaf nodes will not contain any transitions.)
1948 This algorithm compresses the tables by eliminating most such
1949 transitions, at the cost of a modest bit of extra work during lookup:
1951 - Each states[] entry contains a .base field which indicates the
1952 index in the state[] array wheres its transition data is stored.
1954 - If .base is 0 there are no valid transitions from that node.
1956 - If .base is nonzero then charid is added to it to find an entry in
1959 -If trans[states[state].base+charid].check!=state then the
1960 transition is taken to be a 0/Fail transition. Thus if there are fail
1961 transitions at the front of the node then the .base offset will point
1962 somewhere inside the previous nodes data (or maybe even into a node
1963 even earlier), but the .check field determines if the transition is
1967 The following process inplace converts the table to the compressed
1968 table: We first do not compress the root node 1,and mark all its
1969 .check pointers as 1 and set its .base pointer as 1 as well. This
1970 allows us to do a DFA construction from the compressed table later,
1971 and ensures that any .base pointers we calculate later are greater
1974 - We set 'pos' to indicate the first entry of the second node.
1976 - We then iterate over the columns of the node, finding the first and
1977 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1978 and set the .check pointers accordingly, and advance pos
1979 appropriately and repreat for the next node. Note that when we copy
1980 the next pointers we have to convert them from the original
1981 NODEIDX form to NODENUM form as the former is not valid post
1984 - If a node has no transitions used we mark its base as 0 and do not
1985 advance the pos pointer.
1987 - If a node only has one transition we use a second pointer into the
1988 structure to fill in allocated fail transitions from other states.
1989 This pointer is independent of the main pointer and scans forward
1990 looking for null transitions that are allocated to a state. When it
1991 finds one it writes the single transition into the "hole". If the
1992 pointer doesnt find one the single transition is appended as normal.
1994 - Once compressed we can Renew/realloc the structures to release the
1997 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1998 specifically Fig 3.47 and the associated pseudocode.
2002 const U32 laststate = TRIE_NODENUM( next_alloc );
2005 trie->statecount = laststate;
2007 for ( state = 1 ; state < laststate ; state++ ) {
2009 const U32 stateidx = TRIE_NODEIDX( state );
2010 const U32 o_used = trie->trans[ stateidx ].check;
2011 U32 used = trie->trans[ stateidx ].check;
2012 trie->trans[ stateidx ].check = 0;
2014 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2015 if ( flag || trie->trans[ stateidx + charid ].next ) {
2016 if ( trie->trans[ stateidx + charid ].next ) {
2018 for ( ; zp < pos ; zp++ ) {
2019 if ( ! trie->trans[ zp ].next ) {
2023 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2024 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2025 trie->trans[ zp ].check = state;
2026 if ( ++zp > pos ) pos = zp;
2033 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2035 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2036 trie->trans[ pos ].check = state;
2041 trie->lasttrans = pos + 1;
2042 trie->states = (reg_trie_state *)
2043 PerlMemShared_realloc( trie->states, laststate
2044 * sizeof(reg_trie_state) );
2045 DEBUG_TRIE_COMPILE_MORE_r(
2046 PerlIO_printf( Perl_debug_log,
2047 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2048 (int)depth * 2 + 2,"",
2049 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2052 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2055 } /* end table compress */
2057 DEBUG_TRIE_COMPILE_MORE_r(
2058 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2059 (int)depth * 2 + 2, "",
2060 (UV)trie->statecount,
2061 (UV)trie->lasttrans)
2063 /* resize the trans array to remove unused space */
2064 trie->trans = (reg_trie_trans *)
2065 PerlMemShared_realloc( trie->trans, trie->lasttrans
2066 * sizeof(reg_trie_trans) );
2068 { /* Modify the program and insert the new TRIE node */
2069 U8 nodetype =(U8)(flags & 0xFF);
2073 regnode *optimize = NULL;
2074 #ifdef RE_TRACK_PATTERN_OFFSETS
2077 U32 mjd_nodelen = 0;
2078 #endif /* RE_TRACK_PATTERN_OFFSETS */
2079 #endif /* DEBUGGING */
2081 This means we convert either the first branch or the first Exact,
2082 depending on whether the thing following (in 'last') is a branch
2083 or not and whther first is the startbranch (ie is it a sub part of
2084 the alternation or is it the whole thing.)
2085 Assuming its a sub part we convert the EXACT otherwise we convert
2086 the whole branch sequence, including the first.
2088 /* Find the node we are going to overwrite */
2089 if ( first != startbranch || OP( last ) == BRANCH ) {
2090 /* branch sub-chain */
2091 NEXT_OFF( first ) = (U16)(last - first);
2092 #ifdef RE_TRACK_PATTERN_OFFSETS
2094 mjd_offset= Node_Offset((convert));
2095 mjd_nodelen= Node_Length((convert));
2098 /* whole branch chain */
2100 #ifdef RE_TRACK_PATTERN_OFFSETS
2103 const regnode *nop = NEXTOPER( convert );
2104 mjd_offset= Node_Offset((nop));
2105 mjd_nodelen= Node_Length((nop));
2109 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2110 (int)depth * 2 + 2, "",
2111 (UV)mjd_offset, (UV)mjd_nodelen)
2114 /* But first we check to see if there is a common prefix we can
2115 split out as an EXACT and put in front of the TRIE node. */
2116 trie->startstate= 1;
2117 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2119 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2123 const U32 base = trie->states[ state ].trans.base;
2125 if ( trie->states[state].wordnum )
2128 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2129 if ( ( base + ofs >= trie->uniquecharcount ) &&
2130 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2131 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2133 if ( ++count > 1 ) {
2134 SV **tmp = av_fetch( revcharmap, ofs, 0);
2135 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2136 if ( state == 1 ) break;
2138 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2140 PerlIO_printf(Perl_debug_log,
2141 "%*sNew Start State=%"UVuf" Class: [",
2142 (int)depth * 2 + 2, "",
2145 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2146 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2148 TRIE_BITMAP_SET(trie,*ch);
2150 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2152 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2156 TRIE_BITMAP_SET(trie,*ch);
2158 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2159 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2165 SV **tmp = av_fetch( revcharmap, idx, 0);
2167 char *ch = SvPV( *tmp, len );
2169 SV *sv=sv_newmortal();
2170 PerlIO_printf( Perl_debug_log,
2171 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2172 (int)depth * 2 + 2, "",
2174 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2175 PL_colors[0], PL_colors[1],
2176 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2177 PERL_PV_ESCAPE_FIRSTCHAR
2182 OP( convert ) = nodetype;
2183 str=STRING(convert);
2186 STR_LEN(convert) += len;
2192 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2197 trie->prefixlen = (state-1);
2199 regnode *n = convert+NODE_SZ_STR(convert);
2200 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2201 trie->startstate = state;
2202 trie->minlen -= (state - 1);
2203 trie->maxlen -= (state - 1);
2205 /* At least the UNICOS C compiler choked on this
2206 * being argument to DEBUG_r(), so let's just have
2209 #ifdef PERL_EXT_RE_BUILD
2215 regnode *fix = convert;
2216 U32 word = trie->wordcount;
2218 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2219 while( ++fix < n ) {
2220 Set_Node_Offset_Length(fix, 0, 0);
2223 SV ** const tmp = av_fetch( trie_words, word, 0 );
2225 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2226 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2228 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2236 NEXT_OFF(convert) = (U16)(tail - convert);
2237 DEBUG_r(optimize= n);
2243 if ( trie->maxlen ) {
2244 NEXT_OFF( convert ) = (U16)(tail - convert);
2245 ARG_SET( convert, data_slot );
2246 /* Store the offset to the first unabsorbed branch in
2247 jump[0], which is otherwise unused by the jump logic.
2248 We use this when dumping a trie and during optimisation. */
2250 trie->jump[0] = (U16)(nextbranch - convert);
2252 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2253 * and there is a bitmap
2254 * and the first "jump target" node we found leaves enough room
2255 * then convert the TRIE node into a TRIEC node, with the bitmap
2256 * embedded inline in the opcode - this is hypothetically faster.
2258 if ( !trie->states[trie->startstate].wordnum
2260 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2262 OP( convert ) = TRIEC;
2263 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2264 PerlMemShared_free(trie->bitmap);
2267 OP( convert ) = TRIE;
2269 /* store the type in the flags */
2270 convert->flags = nodetype;
2274 + regarglen[ OP( convert ) ];
2276 /* XXX We really should free up the resource in trie now,
2277 as we won't use them - (which resources?) dmq */
2279 /* needed for dumping*/
2280 DEBUG_r(if (optimize) {
2281 regnode *opt = convert;
2283 while ( ++opt < optimize) {
2284 Set_Node_Offset_Length(opt,0,0);
2287 Try to clean up some of the debris left after the
2290 while( optimize < jumper ) {
2291 mjd_nodelen += Node_Length((optimize));
2292 OP( optimize ) = OPTIMIZED;
2293 Set_Node_Offset_Length(optimize,0,0);
2296 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2298 } /* end node insert */
2299 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2301 /* Finish populating the prev field of the wordinfo array. Walk back
2302 * from each accept state until we find another accept state, and if
2303 * so, point the first word's .prev field at the second word. If the
2304 * second already has a .prev field set, stop now. This will be the
2305 * case either if we've already processed that word's accept state,
2306 * or that state had multiple words, and the overspill words were
2307 * already linked up earlier.
2314 for (word=1; word <= trie->wordcount; word++) {
2316 if (trie->wordinfo[word].prev)
2318 state = trie->wordinfo[word].accept;
2320 state = prev_states[state];
2323 prev = trie->states[state].wordnum;
2327 trie->wordinfo[word].prev = prev;
2329 Safefree(prev_states);
2333 /* and now dump out the compressed format */
2334 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2336 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2338 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2339 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2341 SvREFCNT_dec(revcharmap);
2345 : trie->startstate>1
2351 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2353 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2355 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2356 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2359 We find the fail state for each state in the trie, this state is the longest proper
2360 suffix of the current state's 'word' that is also a proper prefix of another word in our
2361 trie. State 1 represents the word '' and is thus the default fail state. This allows
2362 the DFA not to have to restart after its tried and failed a word at a given point, it
2363 simply continues as though it had been matching the other word in the first place.
2365 'abcdgu'=~/abcdefg|cdgu/
2366 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2367 fail, which would bring us to the state representing 'd' in the second word where we would
2368 try 'g' and succeed, proceeding to match 'cdgu'.
2370 /* add a fail transition */
2371 const U32 trie_offset = ARG(source);
2372 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2374 const U32 ucharcount = trie->uniquecharcount;
2375 const U32 numstates = trie->statecount;
2376 const U32 ubound = trie->lasttrans + ucharcount;
2380 U32 base = trie->states[ 1 ].trans.base;
2383 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2384 GET_RE_DEBUG_FLAGS_DECL;
2386 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2388 PERL_UNUSED_ARG(depth);
2392 ARG_SET( stclass, data_slot );
2393 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2394 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2395 aho->trie=trie_offset;
2396 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2397 Copy( trie->states, aho->states, numstates, reg_trie_state );
2398 Newxz( q, numstates, U32);
2399 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2402 /* initialize fail[0..1] to be 1 so that we always have
2403 a valid final fail state */
2404 fail[ 0 ] = fail[ 1 ] = 1;
2406 for ( charid = 0; charid < ucharcount ; charid++ ) {
2407 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2409 q[ q_write ] = newstate;
2410 /* set to point at the root */
2411 fail[ q[ q_write++ ] ]=1;
2414 while ( q_read < q_write) {
2415 const U32 cur = q[ q_read++ % numstates ];
2416 base = trie->states[ cur ].trans.base;
2418 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2419 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2421 U32 fail_state = cur;
2424 fail_state = fail[ fail_state ];
2425 fail_base = aho->states[ fail_state ].trans.base;
2426 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2428 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2429 fail[ ch_state ] = fail_state;
2430 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2432 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2434 q[ q_write++ % numstates] = ch_state;
2438 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2439 when we fail in state 1, this allows us to use the
2440 charclass scan to find a valid start char. This is based on the principle
2441 that theres a good chance the string being searched contains lots of stuff
2442 that cant be a start char.
2444 fail[ 0 ] = fail[ 1 ] = 0;
2445 DEBUG_TRIE_COMPILE_r({
2446 PerlIO_printf(Perl_debug_log,
2447 "%*sStclass Failtable (%"UVuf" states): 0",
2448 (int)(depth * 2), "", (UV)numstates
2450 for( q_read=1; q_read<numstates; q_read++ ) {
2451 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2453 PerlIO_printf(Perl_debug_log, "\n");
2456 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2461 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2462 * These need to be revisited when a newer toolchain becomes available.
2464 #if defined(__sparc64__) && defined(__GNUC__)
2465 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2466 # undef SPARC64_GCC_WORKAROUND
2467 # define SPARC64_GCC_WORKAROUND 1
2471 #define DEBUG_PEEP(str,scan,depth) \
2472 DEBUG_OPTIMISE_r({if (scan){ \
2473 SV * const mysv=sv_newmortal(); \
2474 regnode *Next = regnext(scan); \
2475 regprop(RExC_rx, mysv, scan); \
2476 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2477 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2478 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2485 #define JOIN_EXACT(scan,min,flags) \
2486 if (PL_regkind[OP(scan)] == EXACT) \
2487 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2490 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2491 /* Merge several consecutive EXACTish nodes into one. */
2492 regnode *n = regnext(scan);
2494 regnode *next = scan + NODE_SZ_STR(scan);
2498 regnode *stop = scan;
2499 GET_RE_DEBUG_FLAGS_DECL;
2501 PERL_UNUSED_ARG(depth);
2504 PERL_ARGS_ASSERT_JOIN_EXACT;
2505 #ifndef EXPERIMENTAL_INPLACESCAN
2506 PERL_UNUSED_ARG(flags);
2507 PERL_UNUSED_ARG(val);
2509 DEBUG_PEEP("join",scan,depth);
2511 /* Skip NOTHING, merge EXACT*. */
2513 ( PL_regkind[OP(n)] == NOTHING ||
2514 (stringok && (OP(n) == OP(scan))))
2516 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2518 if (OP(n) == TAIL || n > next)
2520 if (PL_regkind[OP(n)] == NOTHING) {
2521 DEBUG_PEEP("skip:",n,depth);
2522 NEXT_OFF(scan) += NEXT_OFF(n);
2523 next = n + NODE_STEP_REGNODE;
2530 else if (stringok) {
2531 const unsigned int oldl = STR_LEN(scan);
2532 regnode * const nnext = regnext(n);
2534 DEBUG_PEEP("merg",n,depth);
2537 if (oldl + STR_LEN(n) > U8_MAX)
2539 NEXT_OFF(scan) += NEXT_OFF(n);
2540 STR_LEN(scan) += STR_LEN(n);
2541 next = n + NODE_SZ_STR(n);
2542 /* Now we can overwrite *n : */
2543 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2551 #ifdef EXPERIMENTAL_INPLACESCAN
2552 if (flags && !NEXT_OFF(n)) {
2553 DEBUG_PEEP("atch", val, depth);
2554 if (reg_off_by_arg[OP(n)]) {
2555 ARG_SET(n, val - n);
2558 NEXT_OFF(n) = val - n;
2564 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2565 #define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2566 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2567 #define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2570 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2571 && ( STR_LEN(scan) >= 6 ) )
2574 Two problematic code points in Unicode casefolding of EXACT nodes:
2576 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2577 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2583 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2584 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2586 This means that in case-insensitive matching (or "loose matching",
2587 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2588 length of the above casefolded versions) can match a target string
2589 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2590 This would rather mess up the minimum length computation.
2592 What we'll do is to look for the tail four bytes, and then peek
2593 at the preceding two bytes to see whether we need to decrease
2594 the minimum length by four (six minus two).
2596 Thanks to the design of UTF-8, there cannot be false matches:
2597 A sequence of valid UTF-8 bytes cannot be a subsequence of
2598 another valid sequence of UTF-8 bytes.
2601 char * const s0 = STRING(scan), *s, *t;
2602 char * const s1 = s0 + STR_LEN(scan) - 1;
2603 char * const s2 = s1 - 4;
2604 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2605 const char t0[] = "\xaf\x49\xaf\x42";
2607 const char t0[] = "\xcc\x88\xcc\x81";
2609 const char * const t1 = t0 + 3;
2612 s < s2 && (t = ninstr(s, s1, t0, t1));
2615 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2616 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2618 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2619 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2627 n = scan + NODE_SZ_STR(scan);
2629 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2636 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2640 /* REx optimizer. Converts nodes into quicker variants "in place".
2641 Finds fixed substrings. */
2643 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2644 to the position after last scanned or to NULL. */
2646 #define INIT_AND_WITHP \
2647 assert(!and_withp); \
2648 Newx(and_withp,1,struct regnode_charclass_class); \
2649 SAVEFREEPV(and_withp)
2651 /* this is a chain of data about sub patterns we are processing that
2652 need to be handled separately/specially in study_chunk. Its so
2653 we can simulate recursion without losing state. */
2655 typedef struct scan_frame {
2656 regnode *last; /* last node to process in this frame */
2657 regnode *next; /* next node to process when last is reached */
2658 struct scan_frame *prev; /*previous frame*/
2659 I32 stop; /* what stopparen do we use */
2663 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2665 #define CASE_SYNST_FNC(nAmE) \
2667 if (flags & SCF_DO_STCLASS_AND) { \
2668 for (value = 0; value < 256; value++) \
2669 if (!is_ ## nAmE ## _cp(value)) \
2670 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2673 for (value = 0; value < 256; value++) \
2674 if (is_ ## nAmE ## _cp(value)) \
2675 ANYOF_BITMAP_SET(data->start_class, value); \
2679 if (flags & SCF_DO_STCLASS_AND) { \
2680 for (value = 0; value < 256; value++) \
2681 if (is_ ## nAmE ## _cp(value)) \
2682 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2685 for (value = 0; value < 256; value++) \
2686 if (!is_ ## nAmE ## _cp(value)) \
2687 ANYOF_BITMAP_SET(data->start_class, value); \
2694 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2695 I32 *minlenp, I32 *deltap,
2700 struct regnode_charclass_class *and_withp,
2701 U32 flags, U32 depth)
2702 /* scanp: Start here (read-write). */
2703 /* deltap: Write maxlen-minlen here. */
2704 /* last: Stop before this one. */
2705 /* data: string data about the pattern */
2706 /* stopparen: treat close N as END */
2707 /* recursed: which subroutines have we recursed into */
2708 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2711 I32 min = 0, pars = 0, code;
2712 regnode *scan = *scanp, *next;
2714 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2715 int is_inf_internal = 0; /* The studied chunk is infinite */
2716 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2717 scan_data_t data_fake;
2718 SV *re_trie_maxbuff = NULL;
2719 regnode *first_non_open = scan;
2720 I32 stopmin = I32_MAX;
2721 scan_frame *frame = NULL;
2722 GET_RE_DEBUG_FLAGS_DECL;
2724 PERL_ARGS_ASSERT_STUDY_CHUNK;
2727 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2731 while (first_non_open && OP(first_non_open) == OPEN)
2732 first_non_open=regnext(first_non_open);
2737 while ( scan && OP(scan) != END && scan < last ){
2738 /* Peephole optimizer: */
2739 DEBUG_STUDYDATA("Peep:", data,depth);
2740 DEBUG_PEEP("Peep",scan,depth);
2741 JOIN_EXACT(scan,&min,0);
2743 /* Follow the next-chain of the current node and optimize
2744 away all the NOTHINGs from it. */
2745 if (OP(scan) != CURLYX) {
2746 const int max = (reg_off_by_arg[OP(scan)]
2748 /* I32 may be smaller than U16 on CRAYs! */
2749 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2750 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2754 /* Skip NOTHING and LONGJMP. */
2755 while ((n = regnext(n))
2756 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2757 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2758 && off + noff < max)
2760 if (reg_off_by_arg[OP(scan)])
2763 NEXT_OFF(scan) = off;
2768 /* The principal pseudo-switch. Cannot be a switch, since we
2769 look into several different things. */
2770 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2771 || OP(scan) == IFTHEN) {
2772 next = regnext(scan);
2774 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2776 if (OP(next) == code || code == IFTHEN) {
2777 /* NOTE - There is similar code to this block below for handling
2778 TRIE nodes on a re-study. If you change stuff here check there
2780 I32 max1 = 0, min1 = I32_MAX, num = 0;
2781 struct regnode_charclass_class accum;
2782 regnode * const startbranch=scan;
2784 if (flags & SCF_DO_SUBSTR)
2785 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2786 if (flags & SCF_DO_STCLASS)
2787 cl_init_zero(pRExC_state, &accum);
2789 while (OP(scan) == code) {
2790 I32 deltanext, minnext, f = 0, fake;
2791 struct regnode_charclass_class this_class;
2794 data_fake.flags = 0;
2796 data_fake.whilem_c = data->whilem_c;
2797 data_fake.last_closep = data->last_closep;
2800 data_fake.last_closep = &fake;
2802 data_fake.pos_delta = delta;
2803 next = regnext(scan);
2804 scan = NEXTOPER(scan);
2806 scan = NEXTOPER(scan);
2807 if (flags & SCF_DO_STCLASS) {
2808 cl_init(pRExC_state, &this_class);
2809 data_fake.start_class = &this_class;
2810 f = SCF_DO_STCLASS_AND;
2812 if (flags & SCF_WHILEM_VISITED_POS)
2813 f |= SCF_WHILEM_VISITED_POS;
2815 /* we suppose the run is continuous, last=next...*/
2816 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2818 stopparen, recursed, NULL, f,depth+1);
2821 if (max1 < minnext + deltanext)
2822 max1 = minnext + deltanext;
2823 if (deltanext == I32_MAX)
2824 is_inf = is_inf_internal = 1;
2826 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2828 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2829 if ( stopmin > minnext)
2830 stopmin = min + min1;
2831 flags &= ~SCF_DO_SUBSTR;
2833 data->flags |= SCF_SEEN_ACCEPT;
2836 if (data_fake.flags & SF_HAS_EVAL)
2837 data->flags |= SF_HAS_EVAL;
2838 data->whilem_c = data_fake.whilem_c;
2840 if (flags & SCF_DO_STCLASS)
2841 cl_or(pRExC_state, &accum, &this_class);
2843 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2845 if (flags & SCF_DO_SUBSTR) {
2846 data->pos_min += min1;
2847 data->pos_delta += max1 - min1;
2848 if (max1 != min1 || is_inf)
2849 data->longest = &(data->longest_float);
2852 delta += max1 - min1;
2853 if (flags & SCF_DO_STCLASS_OR) {
2854 cl_or(pRExC_state, data->start_class, &accum);
2856 cl_and(data->start_class, and_withp);
2857 flags &= ~SCF_DO_STCLASS;
2860 else if (flags & SCF_DO_STCLASS_AND) {
2862 cl_and(data->start_class, &accum);
2863 flags &= ~SCF_DO_STCLASS;
2866 /* Switch to OR mode: cache the old value of
2867 * data->start_class */
2869 StructCopy(data->start_class, and_withp,
2870 struct regnode_charclass_class);
2871 flags &= ~SCF_DO_STCLASS_AND;
2872 StructCopy(&accum, data->start_class,
2873 struct regnode_charclass_class);
2874 flags |= SCF_DO_STCLASS_OR;
2875 data->start_class->flags |= ANYOF_EOS;
2879 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2882 Assuming this was/is a branch we are dealing with: 'scan' now
2883 points at the item that follows the branch sequence, whatever
2884 it is. We now start at the beginning of the sequence and look
2891 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2893 If we can find such a subsequence we need to turn the first
2894 element into a trie and then add the subsequent branch exact
2895 strings to the trie.
2899 1. patterns where the whole set of branches can be converted.
2901 2. patterns where only a subset can be converted.
2903 In case 1 we can replace the whole set with a single regop
2904 for the trie. In case 2 we need to keep the start and end
2907 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2908 becomes BRANCH TRIE; BRANCH X;
2910 There is an additional case, that being where there is a
2911 common prefix, which gets split out into an EXACT like node
2912 preceding the TRIE node.
2914 If x(1..n)==tail then we can do a simple trie, if not we make
2915 a "jump" trie, such that when we match the appropriate word
2916 we "jump" to the appropriate tail node. Essentially we turn
2917 a nested if into a case structure of sorts.
2922 if (!re_trie_maxbuff) {
2923 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2924 if (!SvIOK(re_trie_maxbuff))
2925 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2927 if ( SvIV(re_trie_maxbuff)>=0 ) {
2929 regnode *first = (regnode *)NULL;
2930 regnode *last = (regnode *)NULL;
2931 regnode *tail = scan;
2936 SV * const mysv = sv_newmortal(); /* for dumping */
2938 /* var tail is used because there may be a TAIL
2939 regop in the way. Ie, the exacts will point to the
2940 thing following the TAIL, but the last branch will
2941 point at the TAIL. So we advance tail. If we
2942 have nested (?:) we may have to move through several
2946 while ( OP( tail ) == TAIL ) {
2947 /* this is the TAIL generated by (?:) */
2948 tail = regnext( tail );
2953 regprop(RExC_rx, mysv, tail );
2954 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2955 (int)depth * 2 + 2, "",
2956 "Looking for TRIE'able sequences. Tail node is: ",
2957 SvPV_nolen_const( mysv )
2963 step through the branches, cur represents each
2964 branch, noper is the first thing to be matched
2965 as part of that branch and noper_next is the
2966 regnext() of that node. if noper is an EXACT
2967 and noper_next is the same as scan (our current
2968 position in the regex) then the EXACT branch is
2969 a possible optimization target. Once we have
2970 two or more consecutive such branches we can
2971 create a trie of the EXACT's contents and stich
2972 it in place. If the sequence represents all of
2973 the branches we eliminate the whole thing and
2974 replace it with a single TRIE. If it is a
2975 subsequence then we need to stitch it in. This
2976 means the first branch has to remain, and needs
2977 to be repointed at the item on the branch chain
2978 following the last branch optimized. This could
2979 be either a BRANCH, in which case the
2980 subsequence is internal, or it could be the
2981 item following the branch sequence in which
2982 case the subsequence is at the end.
2986 /* dont use tail as the end marker for this traverse */
2987 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2988 regnode * const noper = NEXTOPER( cur );
2989 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2990 regnode * const noper_next = regnext( noper );
2994 regprop(RExC_rx, mysv, cur);
2995 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2996 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2998 regprop(RExC_rx, mysv, noper);
2999 PerlIO_printf( Perl_debug_log, " -> %s",
3000 SvPV_nolen_const(mysv));
3003 regprop(RExC_rx, mysv, noper_next );
3004 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3005 SvPV_nolen_const(mysv));
3007 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3008 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3010 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3011 : PL_regkind[ OP( noper ) ] == EXACT )
3012 || OP(noper) == NOTHING )
3014 && noper_next == tail
3019 if ( !first || optype == NOTHING ) {
3020 if (!first) first = cur;
3021 optype = OP( noper );
3027 Currently we do not believe that the trie logic can
3028 handle case insensitive matching properly when the
3029 pattern is not unicode (thus forcing unicode semantics).
3031 If/when this is fixed the following define can be swapped
3032 in below to fully enable trie logic.
3034 XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
3037 #define TRIE_TYPE_IS_SAFE 1
3040 #define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
3042 if ( last && TRIE_TYPE_IS_SAFE ) {
3043 make_trie( pRExC_state,
3044 startbranch, first, cur, tail, count,
3047 if ( PL_regkind[ OP( noper ) ] == EXACT
3049 && noper_next == tail
3054 optype = OP( noper );
3064 regprop(RExC_rx, mysv, cur);
3065 PerlIO_printf( Perl_debug_log,
3066 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3067 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3071 if ( last && TRIE_TYPE_IS_SAFE ) {
3072 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3073 #ifdef TRIE_STUDY_OPT
3074 if ( ((made == MADE_EXACT_TRIE &&
3075 startbranch == first)
3076 || ( first_non_open == first )) &&
3078 flags |= SCF_TRIE_RESTUDY;
3079 if ( startbranch == first
3082 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3092 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3093 scan = NEXTOPER(NEXTOPER(scan));
3094 } else /* single branch is optimized. */
3095 scan = NEXTOPER(scan);
3097 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3098 scan_frame *newframe = NULL;
3103 if (OP(scan) != SUSPEND) {
3104 /* set the pointer */
3105 if (OP(scan) == GOSUB) {
3107 RExC_recurse[ARG2L(scan)] = scan;
3108 start = RExC_open_parens[paren-1];
3109 end = RExC_close_parens[paren-1];
3112 start = RExC_rxi->program + 1;
3116 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3117 SAVEFREEPV(recursed);
3119 if (!PAREN_TEST(recursed,paren+1)) {
3120 PAREN_SET(recursed,paren+1);
3121 Newx(newframe,1,scan_frame);
3123 if (flags & SCF_DO_SUBSTR) {
3124 SCAN_COMMIT(pRExC_state,data,minlenp);
3125 data->longest = &(data->longest_float);
3127 is_inf = is_inf_internal = 1;
3128 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3129 cl_anything(pRExC_state, data->start_class);
3130 flags &= ~SCF_DO_STCLASS;
3133 Newx(newframe,1,scan_frame);
3136 end = regnext(scan);
3141 SAVEFREEPV(newframe);
3142 newframe->next = regnext(scan);
3143 newframe->last = last;
3144 newframe->stop = stopparen;
3145 newframe->prev = frame;
3155 else if (OP(scan) == EXACT) {
3156 I32 l = STR_LEN(scan);
3159 const U8 * const s = (U8*)STRING(scan);
3160 l = utf8_length(s, s + l);
3161 uc = utf8_to_uvchr(s, NULL);
3163 uc = *((U8*)STRING(scan));
3166 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3167 /* The code below prefers earlier match for fixed
3168 offset, later match for variable offset. */
3169 if (data->last_end == -1) { /* Update the start info. */
3170 data->last_start_min = data->pos_min;
3171 data->last_start_max = is_inf
3172 ? I32_MAX : data->pos_min + data->pos_delta;
3174 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3176 SvUTF8_on(data->last_found);
3178 SV * const sv = data->last_found;
3179 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3180 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3181 if (mg && mg->mg_len >= 0)
3182 mg->mg_len += utf8_length((U8*)STRING(scan),
3183 (U8*)STRING(scan)+STR_LEN(scan));
3185 data->last_end = data->pos_min + l;
3186 data->pos_min += l; /* As in the first entry. */
3187 data->flags &= ~SF_BEFORE_EOL;
3189 if (flags & SCF_DO_STCLASS_AND) {
3190 /* Check whether it is compatible with what we know already! */
3194 /* If compatible, we or it in below. It is compatible if is
3195 * in the bitmp and either 1) its bit or its fold is set, or 2)
3196 * it's for a locale. Even if there isn't unicode semantics
3197 * here, at runtime there may be because of matching against a
3198 * utf8 string, so accept a possible false positive for
3199 * latin1-range folds */
3201 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3202 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3203 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3204 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3209 ANYOF_CLASS_ZERO(data->start_class);
3210 ANYOF_BITMAP_ZERO(data->start_class);
3212 ANYOF_BITMAP_SET(data->start_class, uc);
3213 else if (uc >= 0x100) {
3216 /* Some Unicode code points fold to the Latin1 range; as
3217 * XXX temporary code, instead of figuring out if this is
3218 * one, just assume it is and set all the start class bits
3219 * that could be some such above 255 code point's fold
3220 * which will generate fals positives. As the code
3221 * elsewhere that does compute the fold settles down, it
3222 * can be extracted out and re-used here */
3223 for (i = 0; i < 256; i++){
3224 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3225 ANYOF_BITMAP_SET(data->start_class, i);
3229 data->start_class->flags &= ~ANYOF_EOS;
3231 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3233 else if (flags & SCF_DO_STCLASS_OR) {
3234 /* false positive possible if the class is case-folded */
3236 ANYOF_BITMAP_SET(data->start_class, uc);
3238 data->start_class->flags |= ANYOF_UNICODE_ALL;
3239 data->start_class->flags &= ~ANYOF_EOS;
3240 cl_and(data->start_class, and_withp);
3242 flags &= ~SCF_DO_STCLASS;
3244 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3245 I32 l = STR_LEN(scan);
3246 UV uc = *((U8*)STRING(scan));
3248 /* Search for fixed substrings supports EXACT only. */
3249 if (flags & SCF_DO_SUBSTR) {
3251 SCAN_COMMIT(pRExC_state, data, minlenp);
3254 const U8 * const s = (U8 *)STRING(scan);
3255 l = utf8_length(s, s + l);
3256 uc = utf8_to_uvchr(s, NULL);
3259 if (flags & SCF_DO_SUBSTR)
3261 if (flags & SCF_DO_STCLASS_AND) {
3262 /* Check whether it is compatible with what we know already! */
3265 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3266 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3267 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3271 ANYOF_CLASS_ZERO(data->start_class);
3272 ANYOF_BITMAP_ZERO(data->start_class);
3274 ANYOF_BITMAP_SET(data->start_class, uc);
3275 data->start_class->flags &= ~ANYOF_EOS;
3276 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3277 if (OP(scan) == EXACTFL) {
3278 /* XXX This set is probably no longer necessary, and
3279 * probably wrong as LOCALE now is on in the initial
3281 data->start_class->flags |= ANYOF_LOCALE;
3285 /* Also set the other member of the fold pair. In case
3286 * that unicode semantics is called for at runtime, use
3287 * the full latin1 fold. (Can't do this for locale,
3288 * because not known until runtime */
3289 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3292 else if (uc >= 0x100) {
3294 for (i = 0; i < 256; i++){
3295 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3296 ANYOF_BITMAP_SET(data->start_class, i);
3301 else if (flags & SCF_DO_STCLASS_OR) {
3302 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3303 /* false positive possible if the class is case-folded.
3304 Assume that the locale settings are the same... */
3306 ANYOF_BITMAP_SET(data->start_class, uc);
3307 if (OP(scan) != EXACTFL) {
3309 /* And set the other member of the fold pair, but
3310 * can't do that in locale because not known until
3312 ANYOF_BITMAP_SET(data->start_class,
3313 PL_fold_latin1[uc]);
3316 data->start_class->flags &= ~ANYOF_EOS;
3318 cl_and(data->start_class, and_withp);
3320 flags &= ~SCF_DO_STCLASS;
3322 else if (REGNODE_VARIES(OP(scan))) {
3323 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3324 I32 f = flags, pos_before = 0;
3325 regnode * const oscan = scan;
3326 struct regnode_charclass_class this_class;
3327 struct regnode_charclass_class *oclass = NULL;
3328 I32 next_is_eval = 0;
3330 switch (PL_regkind[OP(scan)]) {
3331 case WHILEM: /* End of (?:...)* . */
3332 scan = NEXTOPER(scan);
3335 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3336 next = NEXTOPER(scan);
3337 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3339 maxcount = REG_INFTY;
3340 next = regnext(scan);
3341 scan = NEXTOPER(scan);
3345 if (flags & SCF_DO_SUBSTR)
3350 if (flags & SCF_DO_STCLASS) {
3352 maxcount = REG_INFTY;
3353 next = regnext(scan);
3354 scan = NEXTOPER(scan);
3357 is_inf = is_inf_internal = 1;
3358 scan = regnext(scan);
3359 if (flags & SCF_DO_SUBSTR) {
3360 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3361 data->longest = &(data->longest_float);
3363 goto optimize_curly_tail;
3365 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3366 && (scan->flags == stopparen))
3371 mincount = ARG1(scan);
3372 maxcount = ARG2(scan);
3374 next = regnext(scan);
3375 if (OP(scan) == CURLYX) {
3376 I32 lp = (data ? *(data->last_closep) : 0);
3377 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3379 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3380 next_is_eval = (OP(scan) == EVAL);
3382 if (flags & SCF_DO_SUBSTR) {
3383 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3384 pos_before = data->pos_min;
3388 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3390 data->flags |= SF_IS_INF;
3392 if (flags & SCF_DO_STCLASS) {
3393 cl_init(pRExC_state, &this_class);
3394 oclass = data->start_class;
3395 data->start_class = &this_class;
3396 f |= SCF_DO_STCLASS_AND;
3397 f &= ~SCF_DO_STCLASS_OR;
3399 /* Exclude from super-linear cache processing any {n,m}
3400 regops for which the combination of input pos and regex
3401 pos is not enough information to determine if a match
3404 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3405 regex pos at the \s*, the prospects for a match depend not
3406 only on the input position but also on how many (bar\s*)
3407 repeats into the {4,8} we are. */
3408 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3409 f &= ~SCF_WHILEM_VISITED_POS;
3411 /* This will finish on WHILEM, setting scan, or on NULL: */
3412 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3413 last, data, stopparen, recursed, NULL,
3415 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3417 if (flags & SCF_DO_STCLASS)
3418 data->start_class = oclass;
3419 if (mincount == 0 || minnext == 0) {
3420 if (flags & SCF_DO_STCLASS_OR) {
3421 cl_or(pRExC_state, data->start_class, &this_class);
3423 else if (flags & SCF_DO_STCLASS_AND) {
3424 /* Switch to OR mode: cache the old value of
3425 * data->start_class */
3427 StructCopy(data->start_class, and_withp,
3428 struct regnode_charclass_class);
3429 flags &= ~SCF_DO_STCLASS_AND;
3430 StructCopy(&this_class, data->start_class,
3431 struct regnode_charclass_class);
3432 flags |= SCF_DO_STCLASS_OR;
3433 data->start_class->flags |= ANYOF_EOS;
3435 } else { /* Non-zero len */
3436 if (flags & SCF_DO_STCLASS_OR) {
3437 cl_or(pRExC_state, data->start_class, &this_class);
3438 cl_and(data->start_class, and_withp);
3440 else if (flags & SCF_DO_STCLASS_AND)
3441 cl_and(data->start_class, &this_class);
3442 flags &= ~SCF_DO_STCLASS;
3444 if (!scan) /* It was not CURLYX, but CURLY. */
3446 if ( /* ? quantifier ok, except for (?{ ... }) */
3447 (next_is_eval || !(mincount == 0 && maxcount == 1))
3448 && (minnext == 0) && (deltanext == 0)
3449 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3450 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3452 ckWARNreg(RExC_parse,
3453 "Quantifier unexpected on zero-length expression");
3456 min += minnext * mincount;
3457 is_inf_internal |= ((maxcount == REG_INFTY
3458 && (minnext + deltanext) > 0)
3459 || deltanext == I32_MAX);
3460 is_inf |= is_inf_internal;
3461 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3463 /* Try powerful optimization CURLYX => CURLYN. */
3464 if ( OP(oscan) == CURLYX && data
3465 && data->flags & SF_IN_PAR
3466 && !(data->flags & SF_HAS_EVAL)
3467 && !deltanext && minnext == 1 ) {
3468 /* Try to optimize to CURLYN. */
3469 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3470 regnode * const nxt1 = nxt;
3477 if (!REGNODE_SIMPLE(OP(nxt))
3478 && !(PL_regkind[OP(nxt)] == EXACT
3479 && STR_LEN(nxt) == 1))
3485 if (OP(nxt) != CLOSE)
3487 if (RExC_open_parens) {
3488 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3489 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3491 /* Now we know that nxt2 is the only contents: */
3492 oscan->flags = (U8)ARG(nxt);
3494 OP(nxt1) = NOTHING; /* was OPEN. */
3497 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3498 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3499 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3500 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3501 OP(nxt + 1) = OPTIMIZED; /* was count. */
3502 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3507 /* Try optimization CURLYX => CURLYM. */
3508 if ( OP(oscan) == CURLYX && data
3509 && !(data->flags & SF_HAS_PAR)
3510 && !(data->flags & SF_HAS_EVAL)
3511 && !deltanext /* atom is fixed width */
3512 && minnext != 0 /* CURLYM can't handle zero width */
3514 /* XXXX How to optimize if data == 0? */
3515 /* Optimize to a simpler form. */
3516 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3520 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3521 && (OP(nxt2) != WHILEM))
3523 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3524 /* Need to optimize away parenths. */
3525 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3526 /* Set the parenth number. */
3527 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3529 oscan->flags = (U8)ARG(nxt);
3530 if (RExC_open_parens) {
3531 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3532 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3534 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3535 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3538 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3539 OP(nxt + 1) = OPTIMIZED; /* was count. */
3540 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3541 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3544 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3545 regnode *nnxt = regnext(nxt1);
3547 if (reg_off_by_arg[OP(nxt1)])
3548 ARG_SET(nxt1, nxt2 - nxt1);
3549 else if (nxt2 - nxt1 < U16_MAX)
3550 NEXT_OFF(nxt1) = nxt2 - nxt1;
3552 OP(nxt) = NOTHING; /* Cannot beautify */
3557 /* Optimize again: */
3558 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3559 NULL, stopparen, recursed, NULL, 0,depth+1);
3564 else if ((OP(oscan) == CURLYX)
3565 && (flags & SCF_WHILEM_VISITED_POS)
3566 /* See the comment on a similar expression above.
3567 However, this time it's not a subexpression
3568 we care about, but the expression itself. */
3569 && (maxcount == REG_INFTY)
3570 && data && ++data->whilem_c < 16) {
3571 /* This stays as CURLYX, we can put the count/of pair. */
3572 /* Find WHILEM (as in regexec.c) */
3573 regnode *nxt = oscan + NEXT_OFF(oscan);
3575 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3577 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3578 | (RExC_whilem_seen << 4)); /* On WHILEM */
3580 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3582 if (flags & SCF_DO_SUBSTR) {
3583 SV *last_str = NULL;
3584 int counted = mincount != 0;
3586 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3587 #if defined(SPARC64_GCC_WORKAROUND)
3590 const char *s = NULL;
3593 if (pos_before >= data->last_start_min)
3596 b = data->last_start_min;
3599 s = SvPV_const(data->last_found, l);
3600 old = b - data->last_start_min;
3603 I32 b = pos_before >= data->last_start_min
3604 ? pos_before : data->last_start_min;
3606 const char * const s = SvPV_const(data->last_found, l);
3607 I32 old = b - data->last_start_min;
3611 old = utf8_hop((U8*)s, old) - (U8*)s;
3613 /* Get the added string: */
3614 last_str = newSVpvn_utf8(s + old, l, UTF);
3615 if (deltanext == 0 && pos_before == b) {
3616 /* What was added is a constant string */
3618 SvGROW(last_str, (mincount * l) + 1);
3619 repeatcpy(SvPVX(last_str) + l,
3620 SvPVX_const(last_str), l, mincount - 1);
3621 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3622 /* Add additional parts. */
3623 SvCUR_set(data->last_found,
3624 SvCUR(data->last_found) - l);
3625 sv_catsv(data->last_found, last_str);
3627 SV * sv = data->last_found;
3629 SvUTF8(sv) && SvMAGICAL(sv) ?
3630 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3631 if (mg && mg->mg_len >= 0)
3632 mg->mg_len += CHR_SVLEN(last_str) - l;
3634 data->last_end += l * (mincount - 1);
3637 /* start offset must point into the last copy */
3638 data->last_start_min += minnext * (mincount - 1);
3639 data->last_start_max += is_inf ? I32_MAX
3640 : (maxcount - 1) * (minnext + data->pos_delta);
3643 /* It is counted once already... */
3644 data->pos_min += minnext * (mincount - counted);
3645 data->pos_delta += - counted * deltanext +
3646 (minnext + deltanext) * maxcount - minnext * mincount;
3647 if (mincount != maxcount) {
3648 /* Cannot extend fixed substrings found inside
3650 SCAN_COMMIT(pRExC_state,data,minlenp);
3651 if (mincount && last_str) {
3652 SV * const sv = data->last_found;
3653 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3654 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3658 sv_setsv(sv, last_str);
3659 data->last_end = data->pos_min;
3660 data->last_start_min =
3661 data->pos_min - CHR_SVLEN(last_str);
3662 data->last_start_max = is_inf
3664 : data->pos_min + data->pos_delta
3665 - CHR_SVLEN(last_str);
3667 data->longest = &(data->longest_float);
3669 SvREFCNT_dec(last_str);
3671 if (data && (fl & SF_HAS_EVAL))
3672 data->flags |= SF_HAS_EVAL;
3673 optimize_curly_tail:
3674 if (OP(oscan) != CURLYX) {
3675 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3677 NEXT_OFF(oscan) += NEXT_OFF(next);
3680 default: /* REF, ANYOFV, and CLUMP only? */
3681 if (flags & SCF_DO_SUBSTR) {
3682 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3683 data->longest = &(data->longest_float);
3685 is_inf = is_inf_internal = 1;
3686 if (flags & SCF_DO_STCLASS_OR)
3687 cl_anything(pRExC_state, data->start_class);
3688 flags &= ~SCF_DO_STCLASS;
3692 else if (OP(scan) == LNBREAK) {
3693 if (flags & SCF_DO_STCLASS) {
3695 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3696 if (flags & SCF_DO_STCLASS_AND) {
3697 for (value = 0; value < 256; value++)
3698 if (!is_VERTWS_cp(value))
3699 ANYOF_BITMAP_CLEAR(data->start_class, value);
3702 for (value = 0; value < 256; value++)
3703 if (is_VERTWS_cp(value))
3704 ANYOF_BITMAP_SET(data->start_class, value);
3706 if (flags & SCF_DO_STCLASS_OR)
3707 cl_and(data->start_class, and_withp);
3708 flags &= ~SCF_DO_STCLASS;
3712 if (flags & SCF_DO_SUBSTR) {
3713 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3715 data->pos_delta += 1;
3716 data->longest = &(data->longest_float);
3719 else if (OP(scan) == FOLDCHAR) {
3720 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3721 flags &= ~SCF_DO_STCLASS;
3724 if (flags & SCF_DO_SUBSTR) {
3725 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3727 data->pos_delta += d;
3728 data->longest = &(data->longest_float);
3731 else if (REGNODE_SIMPLE(OP(scan))) {
3734 if (flags & SCF_DO_SUBSTR) {
3735 SCAN_COMMIT(pRExC_state,data,minlenp);
3739 if (flags & SCF_DO_STCLASS) {
3740 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3742 /* Some of the logic below assumes that switching
3743 locale on will only add false positives. */
3744 switch (PL_regkind[OP(scan)]) {
3748 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3749 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3750 cl_anything(pRExC_state, data->start_class);
3753 if (OP(scan) == SANY)
3755 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3756 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3757 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3758 cl_anything(pRExC_state, data->start_class);
3760 if (flags & SCF_DO_STCLASS_AND || !value)
3761 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3764 if (flags & SCF_DO_STCLASS_AND)
3765 cl_and(data->start_class,
3766 (struct regnode_charclass_class*)scan);
3768 cl_or(pRExC_state, data->start_class,
3769 (struct regnode_charclass_class*)scan);
3772 if (flags & SCF_DO_STCLASS_AND) {
3773 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3774 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3775 if (OP(scan) == ALNUMU) {
3776 for (value = 0; value < 256; value++) {
3777 if (!isWORDCHAR_L1(value)) {
3778 ANYOF_BITMAP_CLEAR(data->start_class, value);
3782 for (value = 0; value < 256; value++) {
3783 if (!isALNUM(value)) {
3784 ANYOF_BITMAP_CLEAR(data->start_class, value);
3791 if (data->start_class->flags & ANYOF_LOCALE)
3792 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3794 /* Even if under locale, set the bits for non-locale
3795 * in case it isn't a true locale-node. This will
3796 * create false positives if it truly is locale */
3797 if (OP(scan) == ALNUMU) {
3798 for (value = 0; value < 256; value++) {
3799 if (isWORDCHAR_L1(value)) {
3800 ANYOF_BITMAP_SET(data->start_class, value);
3804 for (value = 0; value < 256; value++) {
3805 if (isALNUM(value)) {
3806 ANYOF_BITMAP_SET(data->start_class, value);
3813 if (flags & SCF_DO_STCLASS_AND) {
3814 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3815 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3816 if (OP(scan) == NALNUMU) {
3817 for (value = 0; value < 256; value++) {
3818 if (isWORDCHAR_L1(value)) {
3819 ANYOF_BITMAP_CLEAR(data->start_class, value);
3823 for (value = 0; value < 256; value++) {
3824 if (isALNUM(value)) {
3825 ANYOF_BITMAP_CLEAR(data->start_class, value);
3832 if (data->start_class->flags & ANYOF_LOCALE)
3833 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3835 /* Even if under locale, set the bits for non-locale in
3836 * case it isn't a true locale-node. This will create
3837 * false positives if it truly is locale */
3838 if (OP(scan) == NALNUMU) {
3839 for (value = 0; value < 256; value++) {
3840 if (! isWORDCHAR_L1(value)) {
3841 ANYOF_BITMAP_SET(data->start_class, value);
3845 for (value = 0; value < 256; value++) {
3846 if (! isALNUM(value)) {
3847 ANYOF_BITMAP_SET(data->start_class, value);
3854 if (flags & SCF_DO_STCLASS_AND) {
3855 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3856 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3857 if (OP(scan) == SPACEU) {
3858 for (value = 0; value < 256; value++) {
3859 if (!isSPACE_L1(value)) {
3860 ANYOF_BITMAP_CLEAR(data->start_class, value);
3864 for (value = 0; value < 256; value++) {
3865 if (!isSPACE(value)) {
3866 ANYOF_BITMAP_CLEAR(data->start_class, value);
3873 if (data->start_class->flags & ANYOF_LOCALE) {
3874 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3876 if (OP(scan) == SPACEU) {
3877 for (value = 0; value < 256; value++) {
3878 if (isSPACE_L1(value)) {
3879 ANYOF_BITMAP_SET(data->start_class, value);
3883 for (value = 0; value < 256; value++) {
3884 if (isSPACE(value)) {
3885 ANYOF_BITMAP_SET(data->start_class, value);
3892 if (flags & SCF_DO_STCLASS_AND) {
3893 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3894 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3895 if (OP(scan) == NSPACEU) {
3896 for (value = 0; value < 256; value++) {
3897 if (isSPACE_L1(value)) {
3898 ANYOF_BITMAP_CLEAR(data->start_class, value);
3902 for (value = 0; value < 256; value++) {
3903 if (isSPACE(value)) {
3904 ANYOF_BITMAP_CLEAR(data->start_class, value);
3911 if (data->start_class->flags & ANYOF_LOCALE)
3912 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3913 if (OP(scan) == NSPACEU) {
3914 for (value = 0; value < 256; value++) {
3915 if (!isSPACE_L1(value)) {
3916 ANYOF_BITMAP_SET(data->start_class, value);
3921 for (value = 0; value < 256; value++) {
3922 if (!isSPACE(value)) {
3923 ANYOF_BITMAP_SET(data->start_class, value);
3930 if (flags & SCF_DO_STCLASS_AND) {
3931 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3932 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3933 for (value = 0; value < 256; value++)
3934 if (!isDIGIT(value))
3935 ANYOF_BITMAP_CLEAR(data->start_class, value);
3939 if (data->start_class->flags & ANYOF_LOCALE)
3940 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3941 for (value = 0; value < 256; value++)
3943 ANYOF_BITMAP_SET(data->start_class, value);
3947 if (flags & SCF_DO_STCLASS_AND) {
3948 if (!(data->start_class->flags & ANYOF_LOCALE))
3949 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3950 for (value = 0; value < 256; value++)
3952 ANYOF_BITMAP_CLEAR(data->start_class, value);
3955 if (data->start_class->flags & ANYOF_LOCALE)
3956 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3957 for (value = 0; value < 256; value++)
3958 if (!isDIGIT(value))
3959 ANYOF_BITMAP_SET(data->start_class, value);
3962 CASE_SYNST_FNC(VERTWS);
3963 CASE_SYNST_FNC(HORIZWS);
3966 if (flags & SCF_DO_STCLASS_OR)
3967 cl_and(data->start_class, and_withp);
3968 flags &= ~SCF_DO_STCLASS;
3971 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3972 data->flags |= (OP(scan) == MEOL
3976 else if ( PL_regkind[OP(scan)] == BRANCHJ
3977 /* Lookbehind, or need to calculate parens/evals/stclass: */
3978 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3979 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3980 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3981 || OP(scan) == UNLESSM )
3983 /* Negative Lookahead/lookbehind
3984 In this case we can't do fixed string optimisation.
3987 I32 deltanext, minnext, fake = 0;
3989 struct regnode_charclass_class intrnl;
3992 data_fake.flags = 0;
3994 data_fake.whilem_c = data->whilem_c;
3995 data_fake.last_closep = data->last_closep;
3998 data_fake.last_closep = &fake;
3999 data_fake.pos_delta = delta;
4000 if ( flags & SCF_DO_STCLASS && !scan->flags
4001 && OP(scan) == IFMATCH ) { /* Lookahead */
4002 cl_init(pRExC_state, &intrnl);
4003 data_fake.start_class = &intrnl;
4004 f |= SCF_DO_STCLASS_AND;
4006 if (flags & SCF_WHILEM_VISITED_POS)
4007 f |= SCF_WHILEM_VISITED_POS;
4008 next = regnext(scan);
4009 nscan = NEXTOPER(NEXTOPER(scan));
4010 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4011 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4014 FAIL("Variable length lookbehind not implemented");
4016 else if (minnext > (I32)U8_MAX) {
4017 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4019 scan->flags = (U8)minnext;
4022 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4024 if (data_fake.flags & SF_HAS_EVAL)
4025 data->flags |= SF_HAS_EVAL;
4026 data->whilem_c = data_fake.whilem_c;
4028 if (f & SCF_DO_STCLASS_AND) {
4029 if (flags & SCF_DO_STCLASS_OR) {
4030 /* OR before, AND after: ideally we would recurse with
4031 * data_fake to get the AND applied by study of the
4032 * remainder of the pattern, and then derecurse;
4033 * *** HACK *** for now just treat as "no information".
4034 * See [perl #56690].
4036 cl_init(pRExC_state, data->start_class);
4038 /* AND before and after: combine and continue */
4039 const int was = (data->start_class->flags & ANYOF_EOS);
4041 cl_and(data->start_class, &intrnl);
4043 data->start_class->flags |= ANYOF_EOS;
4047 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4049 /* Positive Lookahead/lookbehind
4050 In this case we can do fixed string optimisation,
4051 but we must be careful about it. Note in the case of
4052 lookbehind the positions will be offset by the minimum
4053 length of the pattern, something we won't know about
4054 until after the recurse.
4056 I32 deltanext, fake = 0;
4058 struct regnode_charclass_class intrnl;
4060 /* We use SAVEFREEPV so that when the full compile
4061 is finished perl will clean up the allocated
4062 minlens when it's all done. This way we don't
4063 have to worry about freeing them when we know
4064 they wont be used, which would be a pain.
4067 Newx( minnextp, 1, I32 );
4068 SAVEFREEPV(minnextp);
4071 StructCopy(data, &data_fake, scan_data_t);
4072 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4075 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4076 data_fake.last_found=newSVsv(data->last_found);
4080 data_fake.last_closep = &fake;
4081 data_fake.flags = 0;
4082 data_fake.pos_delta = delta;
4084 data_fake.flags |= SF_IS_INF;
4085 if ( flags & SCF_DO_STCLASS && !scan->flags
4086 && OP(scan) == IFMATCH ) { /* Lookahead */
4087 cl_init(pRExC_state, &intrnl);
4088 data_fake.start_class = &intrnl;
4089 f |= SCF_DO_STCLASS_AND;
4091 if (flags & SCF_WHILEM_VISITED_POS)
4092 f |= SCF_WHILEM_VISITED_POS;
4093 next = regnext(scan);
4094 nscan = NEXTOPER(NEXTOPER(scan));
4096 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4097 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4100 FAIL("Variable length lookbehind not implemented");
4102 else if (*minnextp > (I32)U8_MAX) {
4103 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4105 scan->flags = (U8)*minnextp;
4110 if (f & SCF_DO_STCLASS_AND) {
4111 const int was = (data->start_class->flags & ANYOF_EOS);
4113 cl_and(data->start_class, &intrnl);
4115 data->start_class->flags |= ANYOF_EOS;
4118 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4120 if (data_fake.flags & SF_HAS_EVAL)
4121 data->flags |= SF_HAS_EVAL;
4122 data->whilem_c = data_fake.whilem_c;
4123 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4124 if (RExC_rx->minlen<*minnextp)
4125 RExC_rx->minlen=*minnextp;
4126 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4127 SvREFCNT_dec(data_fake.last_found);
4129 if ( data_fake.minlen_fixed != minlenp )
4131 data->offset_fixed= data_fake.offset_fixed;
4132 data->minlen_fixed= data_fake.minlen_fixed;
4133 data->lookbehind_fixed+= scan->flags;
4135 if ( data_fake.minlen_float != minlenp )
4137 data->minlen_float= data_fake.minlen_float;
4138 data->offset_float_min=data_fake.offset_float_min;
4139 data->offset_float_max=data_fake.offset_float_max;
4140 data->lookbehind_float+= scan->flags;
4149 else if (OP(scan) == OPEN) {
4150 if (stopparen != (I32)ARG(scan))
4153 else if (OP(scan) == CLOSE) {
4154 if (stopparen == (I32)ARG(scan)) {
4157 if ((I32)ARG(scan) == is_par) {
4158 next = regnext(scan);
4160 if ( next && (OP(next) != WHILEM) && next < last)
4161 is_par = 0; /* Disable optimization */
4164 *(data->last_closep) = ARG(scan);
4166 else if (OP(scan) == EVAL) {
4168 data->flags |= SF_HAS_EVAL;
4170 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4171 if (flags & SCF_DO_SUBSTR) {
4172 SCAN_COMMIT(pRExC_state,data,minlenp);
4173 flags &= ~SCF_DO_SUBSTR;
4175 if (data && OP(scan)==ACCEPT) {
4176 data->flags |= SCF_SEEN_ACCEPT;
4181 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4183 if (flags & SCF_DO_SUBSTR) {
4184 SCAN_COMMIT(pRExC_state,data,minlenp);
4185 data->longest = &(data->longest_float);
4187 is_inf = is_inf_internal = 1;
4188 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4189 cl_anything(pRExC_state, data->start_class);
4190 flags &= ~SCF_DO_STCLASS;
4192 else if (OP(scan) == GPOS) {
4193 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4194 !(delta || is_inf || (data && data->pos_delta)))
4196 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4197 RExC_rx->extflags |= RXf_ANCH_GPOS;
4198 if (RExC_rx->gofs < (U32)min)
4199 RExC_rx->gofs = min;
4201 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4205 #ifdef TRIE_STUDY_OPT
4206 #ifdef FULL_TRIE_STUDY
4207 else if (PL_regkind[OP(scan)] == TRIE) {
4208 /* NOTE - There is similar code to this block above for handling
4209 BRANCH nodes on the initial study. If you change stuff here
4211 regnode *trie_node= scan;
4212 regnode *tail= regnext(scan);
4213 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4214 I32 max1 = 0, min1 = I32_MAX;
4215 struct regnode_charclass_class accum;
4217 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4218 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4219 if (flags & SCF_DO_STCLASS)
4220 cl_init_zero(pRExC_state, &accum);
4226 const regnode *nextbranch= NULL;
4229 for ( word=1 ; word <= trie->wordcount ; word++)
4231 I32 deltanext=0, minnext=0, f = 0, fake;
4232 struct regnode_charclass_class this_class;
4234 data_fake.flags = 0;
4236 data_fake.whilem_c = data->whilem_c;
4237 data_fake.last_closep = data->last_closep;
4240 data_fake.last_closep = &fake;
4241 data_fake.pos_delta = delta;
4242 if (flags & SCF_DO_STCLASS) {
4243 cl_init(pRExC_state, &this_class);
4244 data_fake.start_class = &this_class;
4245 f = SCF_DO_STCLASS_AND;
4247 if (flags & SCF_WHILEM_VISITED_POS)
4248 f |= SCF_WHILEM_VISITED_POS;
4250 if (trie->jump[word]) {
4252 nextbranch = trie_node + trie->jump[0];
4253 scan= trie_node + trie->jump[word];
4254 /* We go from the jump point to the branch that follows
4255 it. Note this means we need the vestigal unused branches
4256 even though they arent otherwise used.
4258 minnext = study_chunk(pRExC_state, &scan, minlenp,
4259 &deltanext, (regnode *)nextbranch, &data_fake,
4260 stopparen, recursed, NULL, f,depth+1);
4262 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4263 nextbranch= regnext((regnode*)nextbranch);
4265 if (min1 > (I32)(minnext + trie->minlen))
4266 min1 = minnext + trie->minlen;
4267 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4268 max1 = minnext + deltanext + trie->maxlen;
4269 if (deltanext == I32_MAX)
4270 is_inf = is_inf_internal = 1;
4272 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4274 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4275 if ( stopmin > min + min1)
4276 stopmin = min + min1;
4277 flags &= ~SCF_DO_SUBSTR;
4279 data->flags |= SCF_SEEN_ACCEPT;
4282 if (data_fake.flags & SF_HAS_EVAL)
4283 data->flags |= SF_HAS_EVAL;
4284 data->whilem_c = data_fake.whilem_c;
4286 if (flags & SCF_DO_STCLASS)
4287 cl_or(pRExC_state, &accum, &this_class);
4290 if (flags & SCF_DO_SUBSTR) {
4291 data->pos_min += min1;
4292 data->pos_delta += max1 - min1;
4293 if (max1 != min1 || is_inf)
4294 data->longest = &(data->longest_float);
4297 delta += max1 - min1;
4298 if (flags & SCF_DO_STCLASS_OR) {
4299 cl_or(pRExC_state, data->start_class, &accum);
4301 cl_and(data->start_class, and_withp);
4302 flags &= ~SCF_DO_STCLASS;
4305 else if (flags & SCF_DO_STCLASS_AND) {
4307 cl_and(data->start_class, &accum);
4308 flags &= ~SCF_DO_STCLASS;
4311 /* Switch to OR mode: cache the old value of
4312 * data->start_class */
4314 StructCopy(data->start_class, and_withp,
4315 struct regnode_charclass_class);
4316 flags &= ~SCF_DO_STCLASS_AND;
4317 StructCopy(&accum, data->start_class,
4318 struct regnode_charclass_class);
4319 flags |= SCF_DO_STCLASS_OR;
4320 data->start_class->flags |= ANYOF_EOS;
4327 else if (PL_regkind[OP(scan)] == TRIE) {
4328 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4331 min += trie->minlen;
4332 delta += (trie->maxlen - trie->minlen);
4333 flags &= ~SCF_DO_STCLASS; /* xxx */
4334 if (flags & SCF_DO_SUBSTR) {
4335 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4336 data->pos_min += trie->minlen;
4337 data->pos_delta += (trie->maxlen - trie->minlen);
4338 if (trie->maxlen != trie->minlen)
4339 data->longest = &(data->longest_float);
4341 if (trie->jump) /* no more substrings -- for now /grr*/
4342 flags &= ~SCF_DO_SUBSTR;
4344 #endif /* old or new */
4345 #endif /* TRIE_STUDY_OPT */
4347 /* Else: zero-length, ignore. */
4348 scan = regnext(scan);
4353 stopparen = frame->stop;
4354 frame = frame->prev;
4355 goto fake_study_recurse;
4360 DEBUG_STUDYDATA("pre-fin:",data,depth);
4363 *deltap = is_inf_internal ? I32_MAX : delta;
4364 if (flags & SCF_DO_SUBSTR && is_inf)
4365 data->pos_delta = I32_MAX - data->pos_min;
4366 if (is_par > (I32)U8_MAX)
4368 if (is_par && pars==1 && data) {
4369 data->flags |= SF_IN_PAR;
4370 data->flags &= ~SF_HAS_PAR;
4372 else if (pars && data) {
4373 data->flags |= SF_HAS_PAR;
4374 data->flags &= ~SF_IN_PAR;
4376 if (flags & SCF_DO_STCLASS_OR)
4377 cl_and(data->start_class, and_withp);
4378 if (flags & SCF_TRIE_RESTUDY)
4379 data->flags |= SCF_TRIE_RESTUDY;
4381 DEBUG_STUDYDATA("post-fin:",data,depth);
4383 return min < stopmin ? min : stopmin;
4387 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4389 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4391 PERL_ARGS_ASSERT_ADD_DATA;
4393 Renewc(RExC_rxi->data,
4394 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4395 char, struct reg_data);
4397 Renew(RExC_rxi->data->what, count + n, U8);
4399 Newx(RExC_rxi->data->what, n, U8);
4400 RExC_rxi->data->count = count + n;
4401 Copy(s, RExC_rxi->data->what + count, n, U8);
4405 /*XXX: todo make this not included in a non debugging perl */
4406 #ifndef PERL_IN_XSUB_RE
4408 Perl_reginitcolors(pTHX)
4411 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4413 char *t = savepv(s);
4417 t = strchr(t, '\t');
4423 PL_colors[i] = t = (char *)"";
4428 PL_colors[i++] = (char *)"";
4435 #ifdef TRIE_STUDY_OPT
4436 #define CHECK_RESTUDY_GOTO \
4438 (data.flags & SCF_TRIE_RESTUDY) \
4442 #define CHECK_RESTUDY_GOTO
4446 - pregcomp - compile a regular expression into internal code
4448 * We can't allocate space until we know how big the compiled form will be,
4449 * but we can't compile it (and thus know how big it is) until we've got a
4450 * place to put the code. So we cheat: we compile it twice, once with code
4451 * generation turned off and size counting turned on, and once "for real".
4452 * This also means that we don't allocate space until we are sure that the
4453 * thing really will compile successfully, and we never have to move the
4454 * code and thus invalidate pointers into it. (Note that it has to be in
4455 * one piece because free() must be able to free it all.) [NB: not true in perl]
4457 * Beware that the optimization-preparation code in here knows about some
4458 * of the structure of the compiled regexp. [I'll say.]
4463 #ifndef PERL_IN_XSUB_RE
4464 #define RE_ENGINE_PTR &PL_core_reg_engine
4466 extern const struct regexp_engine my_reg_engine;
4467 #define RE_ENGINE_PTR &my_reg_engine
4470 #ifndef PERL_IN_XSUB_RE
4472 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4475 HV * const table = GvHV(PL_hintgv);
4477 PERL_ARGS_ASSERT_PREGCOMP;
4479 /* Dispatch a request to compile a regexp to correct
4482 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4483 GET_RE_DEBUG_FLAGS_DECL;
4484 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4485 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4487 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4490 return CALLREGCOMP_ENG(eng, pattern, flags);
4493 return Perl_re_compile(aTHX_ pattern, flags);
4498 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4503 register regexp_internal *ri;
4512 /* these are all flags - maybe they should be turned
4513 * into a single int with different bit masks */
4514 I32 sawlookahead = 0;
4517 bool used_setjump = FALSE;
4518 regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4523 RExC_state_t RExC_state;
4524 RExC_state_t * const pRExC_state = &RExC_state;
4525 #ifdef TRIE_STUDY_OPT
4527 RExC_state_t copyRExC_state;
4529 GET_RE_DEBUG_FLAGS_DECL;
4531 PERL_ARGS_ASSERT_RE_COMPILE;
4533 DEBUG_r(if (!PL_colorset) reginitcolors());
4535 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4536 RExC_uni_semantics = 0;
4537 RExC_contains_locale = 0;
4539 /****************** LONG JUMP TARGET HERE***********************/
4540 /* Longjmp back to here if have to switch in midstream to utf8 */
4541 if (! RExC_orig_utf8) {
4542 JMPENV_PUSH(jump_ret);
4543 used_setjump = TRUE;
4546 if (jump_ret == 0) { /* First time through */
4547 exp = SvPV(pattern, plen);
4549 /* ignore the utf8ness if the pattern is 0 length */
4551 RExC_utf8 = RExC_orig_utf8 = 0;
4555 SV *dsv= sv_newmortal();
4556 RE_PV_QUOTED_DECL(s, RExC_utf8,
4557 dsv, exp, plen, 60);
4558 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4559 PL_colors[4],PL_colors[5],s);
4562 else { /* longjumped back */
4565 /* If the cause for the longjmp was other than changing to utf8, pop
4566 * our own setjmp, and longjmp to the correct handler */
4567 if (jump_ret != UTF8_LONGJMP) {
4569 JMPENV_JUMP(jump_ret);
4574 /* It's possible to write a regexp in ascii that represents Unicode
4575 codepoints outside of the byte range, such as via \x{100}. If we
4576 detect such a sequence we have to convert the entire pattern to utf8
4577 and then recompile, as our sizing calculation will have been based
4578 on 1 byte == 1 character, but we will need to use utf8 to encode
4579 at least some part of the pattern, and therefore must convert the whole
4582 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4583 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4584 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4586 RExC_orig_utf8 = RExC_utf8 = 1;
4590 #ifdef TRIE_STUDY_OPT
4594 pm_flags = orig_pm_flags;
4596 if (initial_charset == REGEX_LOCALE_CHARSET) {
4597 RExC_contains_locale = 1;
4599 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4601 /* Set to use unicode semantics if the pattern is in utf8 and has the
4602 * 'depends' charset specified, as it means unicode when utf8 */
4603 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4607 RExC_flags = pm_flags;
4611 RExC_in_lookbehind = 0;
4612 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4613 RExC_seen_evals = 0;
4616 /* First pass: determine size, legality. */
4624 RExC_emit = &PL_regdummy;
4625 RExC_whilem_seen = 0;
4626 RExC_open_parens = NULL;
4627 RExC_close_parens = NULL;
4629 RExC_paren_names = NULL;
4631 RExC_paren_name_list = NULL;
4633 RExC_recurse = NULL;
4634 RExC_recurse_count = 0;
4636 #if 0 /* REGC() is (currently) a NOP at the first pass.
4637 * Clever compilers notice this and complain. --jhi */
4638 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4640 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4641 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4642 RExC_precomp = NULL;
4646 /* Here, finished first pass. Get rid of any added setjmp */
4652 PerlIO_printf(Perl_debug_log,
4653 "Required size %"IVdf" nodes\n"
4654 "Starting second pass (creation)\n",
4657 RExC_lastparse=NULL;
4660 /* The first pass could have found things that force Unicode semantics */
4661 if ((RExC_utf8 || RExC_uni_semantics)
4662 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4664 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4667 /* Small enough for pointer-storage convention?
4668 If extralen==0, this means that we will not need long jumps. */
4669 if (RExC_size >= 0x10000L && RExC_extralen)
4670 RExC_size += RExC_extralen;
4673 if (RExC_whilem_seen > 15)
4674 RExC_whilem_seen = 15;
4676 /* Allocate space and zero-initialize. Note, the two step process
4677 of zeroing when in debug mode, thus anything assigned has to
4678 happen after that */
4679 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4680 r = (struct regexp*)SvANY(rx);
4681 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4682 char, regexp_internal);
4683 if ( r == NULL || ri == NULL )
4684 FAIL("Regexp out of space");
4686 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4687 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4689 /* bulk initialize base fields with 0. */
4690 Zero(ri, sizeof(regexp_internal), char);
4693 /* non-zero initialization begins here */
4695 r->engine= RE_ENGINE_PTR;
4696 r->extflags = pm_flags;
4698 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4699 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4701 /* The caret is output if there are any defaults: if not all the STD
4702 * flags are set, or if no character set specifier is needed */
4704 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4706 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4707 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4708 >> RXf_PMf_STD_PMMOD_SHIFT);
4709 const char *fptr = STD_PAT_MODS; /*"msix"*/
4711 /* Allocate for the worst case, which is all the std flags are turned
4712 * on. If more precision is desired, we could do a population count of
4713 * the flags set. This could be done with a small lookup table, or by
4714 * shifting, masking and adding, or even, when available, assembly
4715 * language for a machine-language population count.
4716 * We never output a minus, as all those are defaults, so are
4717 * covered by the caret */
4718 const STRLEN wraplen = plen + has_p + has_runon
4719 + has_default /* If needs a caret */
4721 /* If needs a character set specifier */
4722 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4723 + (sizeof(STD_PAT_MODS) - 1)
4724 + (sizeof("(?:)") - 1);
4726 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4728 SvFLAGS(rx) |= SvUTF8(pattern);
4731 /* If a default, cover it using the caret */
4733 *p++= DEFAULT_PAT_MOD;
4737 const char* const name = get_regex_charset_name(r->extflags, &len);
4738 Copy(name, p, len, char);
4742 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4745 while((ch = *fptr++)) {
4753 Copy(RExC_precomp, p, plen, char);
4754 assert ((RX_WRAPPED(rx) - p) < 16);
4755 r->pre_prefix = p - RX_WRAPPED(rx);
4761 SvCUR_set(rx, p - SvPVX_const(rx));
4765 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4767 if (RExC_seen & REG_SEEN_RECURSE) {
4768 Newxz(RExC_open_parens, RExC_npar,regnode *);
4769 SAVEFREEPV(RExC_open_parens);
4770 Newxz(RExC_close_parens,RExC_npar,regnode *);
4771 SAVEFREEPV(RExC_close_parens);
4774 /* Useful during FAIL. */
4775 #ifdef RE_TRACK_PATTERN_OFFSETS
4776 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4777 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4778 "%s %"UVuf" bytes for offset annotations.\n",
4779 ri->u.offsets ? "Got" : "Couldn't get",
4780 (UV)((2*RExC_size+1) * sizeof(U32))));
4782 SetProgLen(ri,RExC_size);
4786 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4788 /* Second pass: emit code. */
4789 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4794 RExC_emit_start = ri->program;
4795 RExC_emit = ri->program;
4796 RExC_emit_bound = ri->program + RExC_size + 1;
4798 /* Store the count of eval-groups for security checks: */
4799 RExC_rx->seen_evals = RExC_seen_evals;
4800 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4801 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4805 /* XXXX To minimize changes to RE engine we always allocate
4806 3-units-long substrs field. */
4807 Newx(r->substrs, 1, struct reg_substr_data);
4808 if (RExC_recurse_count) {
4809 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4810 SAVEFREEPV(RExC_recurse);
4814 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4815 Zero(r->substrs, 1, struct reg_substr_data);
4817 #ifdef TRIE_STUDY_OPT
4819 StructCopy(&zero_scan_data, &data, scan_data_t);
4820 copyRExC_state = RExC_state;
4823 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4825 RExC_state = copyRExC_state;
4826 if (seen & REG_TOP_LEVEL_BRANCHES)
4827 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4829 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4830 if (data.last_found) {
4831 SvREFCNT_dec(data.longest_fixed);
4832 SvREFCNT_dec(data.longest_float);
4833 SvREFCNT_dec(data.last_found);
4835 StructCopy(&zero_scan_data, &data, scan_data_t);
4838 StructCopy(&zero_scan_data, &data, scan_data_t);
4841 /* Dig out information for optimizations. */
4842 r->extflags = RExC_flags; /* was pm_op */
4843 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4846 SvUTF8_on(rx); /* Unicode in it? */
4847 ri->regstclass = NULL;
4848 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4849 r->intflags |= PREGf_NAUGHTY;
4850 scan = ri->program + 1; /* First BRANCH. */
4852 /* testing for BRANCH here tells us whether there is "must appear"
4853 data in the pattern. If there is then we can use it for optimisations */
4854 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4856 STRLEN longest_float_length, longest_fixed_length;
4857 struct regnode_charclass_class ch_class; /* pointed to by data */
4859 I32 last_close = 0; /* pointed to by data */
4860 regnode *first= scan;
4861 regnode *first_next= regnext(first);
4863 * Skip introductions and multiplicators >= 1
4864 * so that we can extract the 'meat' of the pattern that must
4865 * match in the large if() sequence following.
4866 * NOTE that EXACT is NOT covered here, as it is normally
4867 * picked up by the optimiser separately.
4869 * This is unfortunate as the optimiser isnt handling lookahead
4870 * properly currently.
4873 while ((OP(first) == OPEN && (sawopen = 1)) ||
4874 /* An OR of *one* alternative - should not happen now. */
4875 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4876 /* for now we can't handle lookbehind IFMATCH*/
4877 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4878 (OP(first) == PLUS) ||
4879 (OP(first) == MINMOD) ||
4880 /* An {n,m} with n>0 */
4881 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4882 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4885 * the only op that could be a regnode is PLUS, all the rest
4886 * will be regnode_1 or regnode_2.
4889 if (OP(first) == PLUS)
4892 first += regarglen[OP(first)];
4894 first = NEXTOPER(first);
4895 first_next= regnext(first);
4898 /* Starting-point info. */
4900 DEBUG_PEEP("first:",first,0);
4901 /* Ignore EXACT as we deal with it later. */
4902 if (PL_regkind[OP(first)] == EXACT) {
4903 if (OP(first) == EXACT)
4904 NOOP; /* Empty, get anchored substr later. */
4906 ri->regstclass = first;
4909 else if (PL_regkind[OP(first)] == TRIE &&
4910 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4913 /* this can happen only on restudy */
4914 if ( OP(first) == TRIE ) {
4915 struct regnode_1 *trieop = (struct regnode_1 *)
4916 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4917 StructCopy(first,trieop,struct regnode_1);
4918 trie_op=(regnode *)trieop;
4920 struct regnode_charclass *trieop = (struct regnode_charclass *)
4921 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4922 StructCopy(first,trieop,struct regnode_charclass);
4923 trie_op=(regnode *)trieop;
4926 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4927 ri->regstclass = trie_op;
4930 else if (REGNODE_SIMPLE(OP(first)))
4931 ri->regstclass = first;
4932 else if (PL_regkind[OP(first)] == BOUND ||
4933 PL_regkind[OP(first)] == NBOUND)
4934 ri->regstclass = first;
4935 else if (PL_regkind[OP(first)] == BOL) {
4936 r->extflags |= (OP(first) == MBOL
4938 : (OP(first) == SBOL
4941 first = NEXTOPER(first);
4944 else if (OP(first) == GPOS) {
4945 r->extflags |= RXf_ANCH_GPOS;
4946 first = NEXTOPER(first);
4949 else if ((!sawopen || !RExC_sawback) &&
4950 (OP(first) == STAR &&
4951 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4952 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4954 /* turn .* into ^.* with an implied $*=1 */
4956 (OP(NEXTOPER(first)) == REG_ANY)
4959 r->extflags |= type;
4960 r->intflags |= PREGf_IMPLICIT;
4961 first = NEXTOPER(first);
4964 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4965 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4966 /* x+ must match at the 1st pos of run of x's */
4967 r->intflags |= PREGf_SKIP;
4969 /* Scan is after the zeroth branch, first is atomic matcher. */
4970 #ifdef TRIE_STUDY_OPT
4973 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4974 (IV)(first - scan + 1))
4978 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4979 (IV)(first - scan + 1))
4985 * If there's something expensive in the r.e., find the
4986 * longest literal string that must appear and make it the
4987 * regmust. Resolve ties in favor of later strings, since
4988 * the regstart check works with the beginning of the r.e.
4989 * and avoiding duplication strengthens checking. Not a
4990 * strong reason, but sufficient in the absence of others.
4991 * [Now we resolve ties in favor of the earlier string if
4992 * it happens that c_offset_min has been invalidated, since the
4993 * earlier string may buy us something the later one won't.]
4996 data.longest_fixed = newSVpvs("");
4997 data.longest_float = newSVpvs("");
4998 data.last_found = newSVpvs("");
4999 data.longest = &(data.longest_fixed);
5001 if (!ri->regstclass) {
5002 cl_init(pRExC_state, &ch_class);
5003 data.start_class = &ch_class;
5004 stclass_flag = SCF_DO_STCLASS_AND;
5005 } else /* XXXX Check for BOUND? */
5007 data.last_closep = &last_close;
5009 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5010 &data, -1, NULL, NULL,
5011 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5017 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5018 && data.last_start_min == 0 && data.last_end > 0
5019 && !RExC_seen_zerolen
5020 && !(RExC_seen & REG_SEEN_VERBARG)
5021 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5022 r->extflags |= RXf_CHECK_ALL;
5023 scan_commit(pRExC_state, &data,&minlen,0);
5024 SvREFCNT_dec(data.last_found);
5026 /* Note that code very similar to this but for anchored string
5027 follows immediately below, changes may need to be made to both.
5030 longest_float_length = CHR_SVLEN(data.longest_float);
5031 if (longest_float_length
5032 || (data.flags & SF_FL_BEFORE_EOL
5033 && (!(data.flags & SF_FL_BEFORE_MEOL)
5034 || (RExC_flags & RXf_PMf_MULTILINE))))
5038 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
5039 && data.offset_fixed == data.offset_float_min
5040 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
5041 goto remove_float; /* As in (a)+. */
5043 /* copy the information about the longest float from the reg_scan_data
5044 over to the program. */
5045 if (SvUTF8(data.longest_float)) {
5046 r->float_utf8 = data.longest_float;
5047 r->float_substr = NULL;
5049 r->float_substr = data.longest_float;
5050 r->float_utf8 = NULL;
5052 /* float_end_shift is how many chars that must be matched that
5053 follow this item. We calculate it ahead of time as once the
5054 lookbehind offset is added in we lose the ability to correctly
5056 ml = data.minlen_float ? *(data.minlen_float)
5057 : (I32)longest_float_length;
5058 r->float_end_shift = ml - data.offset_float_min
5059 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5060 + data.lookbehind_float;
5061 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5062 r->float_max_offset = data.offset_float_max;
5063 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5064 r->float_max_offset -= data.lookbehind_float;
5066 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5067 && (!(data.flags & SF_FL_BEFORE_MEOL)
5068 || (RExC_flags & RXf_PMf_MULTILINE)));
5069 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5073 r->float_substr = r->float_utf8 = NULL;
5074 SvREFCNT_dec(data.longest_float);
5075 longest_float_length = 0;
5078 /* Note that code very similar to this but for floating string
5079 is immediately above, changes may need to be made to both.
5082 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5083 if (longest_fixed_length
5084 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5085 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5086 || (RExC_flags & RXf_PMf_MULTILINE))))
5090 /* copy the information about the longest fixed
5091 from the reg_scan_data over to the program. */
5092 if (SvUTF8(data.longest_fixed)) {
5093 r->anchored_utf8 = data.longest_fixed;
5094 r->anchored_substr = NULL;
5096 r->anchored_substr = data.longest_fixed;
5097 r->anchored_utf8 = NULL;
5099 /* fixed_end_shift is how many chars that must be matched that
5100 follow this item. We calculate it ahead of time as once the
5101 lookbehind offset is added in we lose the ability to correctly
5103 ml = data.minlen_fixed ? *(data.minlen_fixed)
5104 : (I32)longest_fixed_length;
5105 r->anchored_end_shift = ml - data.offset_fixed
5106 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5107 + data.lookbehind_fixed;
5108 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5110 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5111 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5112 || (RExC_flags & RXf_PMf_MULTILINE)));
5113 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5116 r->anchored_substr = r->anchored_utf8 = NULL;
5117 SvREFCNT_dec(data.longest_fixed);
5118 longest_fixed_length = 0;
5121 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5122 ri->regstclass = NULL;
5124 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5126 && !(data.start_class->flags & ANYOF_EOS)
5127 && !cl_is_anything(data.start_class))
5129 const U32 n = add_data(pRExC_state, 1, "f");
5130 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5132 Newx(RExC_rxi->data->data[n], 1,
5133 struct regnode_charclass_class);
5134 StructCopy(data.start_class,
5135 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5136 struct regnode_charclass_class);
5137 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5138 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5139 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5140 regprop(r, sv, (regnode*)data.start_class);
5141 PerlIO_printf(Perl_debug_log,
5142 "synthetic stclass \"%s\".\n",
5143 SvPVX_const(sv));});
5146 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5147 if (longest_fixed_length > longest_float_length) {
5148 r->check_end_shift = r->anchored_end_shift;
5149 r->check_substr = r->anchored_substr;
5150 r->check_utf8 = r->anchored_utf8;
5151 r->check_offset_min = r->check_offset_max = r->anchored_offset;
5152 if (r->extflags & RXf_ANCH_SINGLE)
5153 r->extflags |= RXf_NOSCAN;
5156 r->check_end_shift = r->float_end_shift;
5157 r->check_substr = r->float_substr;
5158 r->check_utf8 = r->float_utf8;
5159 r->check_offset_min = r->float_min_offset;
5160 r->check_offset_max = r->float_max_offset;
5162 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5163 This should be changed ASAP! */
5164 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5165 r->extflags |= RXf_USE_INTUIT;
5166 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5167 r->extflags |= RXf_INTUIT_TAIL;
5169 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5170 if ( (STRLEN)minlen < longest_float_length )
5171 minlen= longest_float_length;
5172 if ( (STRLEN)minlen < longest_fixed_length )
5173 minlen= longest_fixed_length;
5177 /* Several toplevels. Best we can is to set minlen. */
5179 struct regnode_charclass_class ch_class;
5182 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5184 scan = ri->program + 1;
5185 cl_init(pRExC_state, &ch_class);
5186 data.start_class = &ch_class;
5187 data.last_closep = &last_close;
5190 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5191 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5195 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5196 = r->float_substr = r->float_utf8 = NULL;
5198 if (!(data.start_class->flags & ANYOF_EOS)
5199 && !cl_is_anything(data.start_class))
5201 const U32 n = add_data(pRExC_state, 1, "f");
5202 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5204 Newx(RExC_rxi->data->data[n], 1,
5205 struct regnode_charclass_class);
5206 StructCopy(data.start_class,
5207 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5208 struct regnode_charclass_class);
5209 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5210 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5211 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5212 regprop(r, sv, (regnode*)data.start_class);
5213 PerlIO_printf(Perl_debug_log,
5214 "synthetic stclass \"%s\".\n",
5215 SvPVX_const(sv));});
5219 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5220 the "real" pattern. */
5222 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5223 (IV)minlen, (IV)r->minlen);
5225 r->minlenret = minlen;
5226 if (r->minlen < minlen)
5229 if (RExC_seen & REG_SEEN_GPOS)
5230 r->extflags |= RXf_GPOS_SEEN;
5231 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5232 r->extflags |= RXf_LOOKBEHIND_SEEN;
5233 if (RExC_seen & REG_SEEN_EVAL)
5234 r->extflags |= RXf_EVAL_SEEN;
5235 if (RExC_seen & REG_SEEN_CANY)
5236 r->extflags |= RXf_CANY_SEEN;
5237 if (RExC_seen & REG_SEEN_VERBARG)
5238 r->intflags |= PREGf_VERBARG_SEEN;
5239 if (RExC_seen & REG_SEEN_CUTGROUP)
5240 r->intflags |= PREGf_CUTGROUP_SEEN;
5241 if (RExC_paren_names)
5242 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5244 RXp_PAREN_NAMES(r) = NULL;
5246 #ifdef STUPID_PATTERN_CHECKS
5247 if (RX_PRELEN(rx) == 0)
5248 r->extflags |= RXf_NULL;
5249 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5250 /* XXX: this should happen BEFORE we compile */
5251 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5252 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5253 r->extflags |= RXf_WHITE;
5254 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5255 r->extflags |= RXf_START_ONLY;
5257 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5258 /* XXX: this should happen BEFORE we compile */
5259 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5261 regnode *first = ri->program + 1;
5264 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5265 r->extflags |= RXf_NULL;
5266 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5267 r->extflags |= RXf_START_ONLY;
5268 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5269 && OP(regnext(first)) == END)
5270 r->extflags |= RXf_WHITE;
5274 if (RExC_paren_names) {
5275 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5276 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5279 ri->name_list_idx = 0;
5281 if (RExC_recurse_count) {
5282 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5283 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5284 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5287 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5288 /* assume we don't need to swap parens around before we match */
5291 PerlIO_printf(Perl_debug_log,"Final program:\n");
5294 #ifdef RE_TRACK_PATTERN_OFFSETS
5295 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5296 const U32 len = ri->u.offsets[0];
5298 GET_RE_DEBUG_FLAGS_DECL;
5299 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5300 for (i = 1; i <= len; i++) {
5301 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5302 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5303 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5305 PerlIO_printf(Perl_debug_log, "\n");
5311 #undef RE_ENGINE_PTR
5315 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5318 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5320 PERL_UNUSED_ARG(value);
5322 if (flags & RXapif_FETCH) {
5323 return reg_named_buff_fetch(rx, key, flags);
5324 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5325 Perl_croak_no_modify(aTHX);
5327 } else if (flags & RXapif_EXISTS) {
5328 return reg_named_buff_exists(rx, key, flags)
5331 } else if (flags & RXapif_REGNAMES) {
5332 return reg_named_buff_all(rx, flags);
5333 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5334 return reg_named_buff_scalar(rx, flags);
5336 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5342 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5345 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5346 PERL_UNUSED_ARG(lastkey);
5348 if (flags & RXapif_FIRSTKEY)
5349 return reg_named_buff_firstkey(rx, flags);
5350 else if (flags & RXapif_NEXTKEY)
5351 return reg_named_buff_nextkey(rx, flags);
5353 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5359 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5362 AV *retarray = NULL;
5364 struct regexp *const rx = (struct regexp *)SvANY(r);
5366 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5368 if (flags & RXapif_ALL)
5371 if (rx && RXp_PAREN_NAMES(rx)) {
5372 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5375 SV* sv_dat=HeVAL(he_str);
5376 I32 *nums=(I32*)SvPVX(sv_dat);
5377 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5378 if ((I32)(rx->nparens) >= nums[i]
5379 && rx->offs[nums[i]].start != -1
5380 && rx->offs[nums[i]].end != -1)
5383 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5387 ret = newSVsv(&PL_sv_undef);
5390 av_push(retarray, ret);
5393 return newRV_noinc(MUTABLE_SV(retarray));
5400 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5403 struct regexp *const rx = (struct regexp *)SvANY(r);
5405 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5407 if (rx && RXp_PAREN_NAMES(rx)) {
5408 if (flags & RXapif_ALL) {
5409 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5411 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5425 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5427 struct regexp *const rx = (struct regexp *)SvANY(r);
5429 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5431 if ( rx && RXp_PAREN_NAMES(rx) ) {
5432 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5434 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5441 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5443 struct regexp *const rx = (struct regexp *)SvANY(r);
5444 GET_RE_DEBUG_FLAGS_DECL;
5446 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5448 if (rx && RXp_PAREN_NAMES(rx)) {
5449 HV *hv = RXp_PAREN_NAMES(rx);
5451 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5454 SV* sv_dat = HeVAL(temphe);
5455 I32 *nums = (I32*)SvPVX(sv_dat);
5456 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5457 if ((I32)(rx->lastparen) >= nums[i] &&
5458 rx->offs[nums[i]].start != -1 &&
5459 rx->offs[nums[i]].end != -1)
5465 if (parno || flags & RXapif_ALL) {
5466 return newSVhek(HeKEY_hek(temphe));
5474 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5479 struct regexp *const rx = (struct regexp *)SvANY(r);
5481 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5483 if (rx && RXp_PAREN_NAMES(rx)) {
5484 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5485 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5486 } else if (flags & RXapif_ONE) {
5487 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5488 av = MUTABLE_AV(SvRV(ret));
5489 length = av_len(av);
5491 return newSViv(length + 1);
5493 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5497 return &PL_sv_undef;
5501 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5503 struct regexp *const rx = (struct regexp *)SvANY(r);
5506 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5508 if (rx && RXp_PAREN_NAMES(rx)) {
5509 HV *hv= RXp_PAREN_NAMES(rx);
5511 (void)hv_iterinit(hv);
5512 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5515 SV* sv_dat = HeVAL(temphe);
5516 I32 *nums = (I32*)SvPVX(sv_dat);
5517 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5518 if ((I32)(rx->lastparen) >= nums[i] &&
5519 rx->offs[nums[i]].start != -1 &&
5520 rx->offs[nums[i]].end != -1)
5526 if (parno || flags & RXapif_ALL) {
5527 av_push(av, newSVhek(HeKEY_hek(temphe)));
5532 return newRV_noinc(MUTABLE_SV(av));
5536 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5539 struct regexp *const rx = (struct regexp *)SvANY(r);
5544 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5547 sv_setsv(sv,&PL_sv_undef);
5551 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5553 i = rx->offs[0].start;
5557 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5559 s = rx->subbeg + rx->offs[0].end;
5560 i = rx->sublen - rx->offs[0].end;
5563 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5564 (s1 = rx->offs[paren].start) != -1 &&
5565 (t1 = rx->offs[paren].end) != -1)
5569 s = rx->subbeg + s1;
5571 sv_setsv(sv,&PL_sv_undef);
5574 assert(rx->sublen >= (s - rx->subbeg) + i );
5576 const int oldtainted = PL_tainted;
5578 sv_setpvn(sv, s, i);
5579 PL_tainted = oldtainted;
5580 if ( (rx->extflags & RXf_CANY_SEEN)
5581 ? (RXp_MATCH_UTF8(rx)
5582 && (!i || is_utf8_string((U8*)s, i)))
5583 : (RXp_MATCH_UTF8(rx)) )
5590 if (RXp_MATCH_TAINTED(rx)) {
5591 if (SvTYPE(sv) >= SVt_PVMG) {
5592 MAGIC* const mg = SvMAGIC(sv);
5595 SvMAGIC_set(sv, mg->mg_moremagic);
5597 if ((mgt = SvMAGIC(sv))) {
5598 mg->mg_moremagic = mgt;
5599 SvMAGIC_set(sv, mg);
5609 sv_setsv(sv,&PL_sv_undef);
5615 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5616 SV const * const value)
5618 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5620 PERL_UNUSED_ARG(rx);
5621 PERL_UNUSED_ARG(paren);
5622 PERL_UNUSED_ARG(value);
5625 Perl_croak_no_modify(aTHX);
5629 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5632 struct regexp *const rx = (struct regexp *)SvANY(r);
5636 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5638 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5640 /* $` / ${^PREMATCH} */
5641 case RX_BUFF_IDX_PREMATCH:
5642 if (rx->offs[0].start != -1) {
5643 i = rx->offs[0].start;
5651 /* $' / ${^POSTMATCH} */
5652 case RX_BUFF_IDX_POSTMATCH:
5653 if (rx->offs[0].end != -1) {
5654 i = rx->sublen - rx->offs[0].end;
5656 s1 = rx->offs[0].end;
5662 /* $& / ${^MATCH}, $1, $2, ... */
5664 if (paren <= (I32)rx->nparens &&
5665 (s1 = rx->offs[paren].start) != -1 &&
5666 (t1 = rx->offs[paren].end) != -1)
5671 if (ckWARN(WARN_UNINITIALIZED))
5672 report_uninit((const SV *)sv);
5677 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5678 const char * const s = rx->subbeg + s1;
5683 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5690 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5692 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5693 PERL_UNUSED_ARG(rx);
5697 return newSVpvs("Regexp");
5700 /* Scans the name of a named buffer from the pattern.
5701 * If flags is REG_RSN_RETURN_NULL returns null.
5702 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5703 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5704 * to the parsed name as looked up in the RExC_paren_names hash.
5705 * If there is an error throws a vFAIL().. type exception.
5708 #define REG_RSN_RETURN_NULL 0
5709 #define REG_RSN_RETURN_NAME 1
5710 #define REG_RSN_RETURN_DATA 2
5713 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5715 char *name_start = RExC_parse;
5717 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5719 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5720 /* skip IDFIRST by using do...while */
5723 RExC_parse += UTF8SKIP(RExC_parse);
5724 } while (isALNUM_utf8((U8*)RExC_parse));
5728 } while (isALNUM(*RExC_parse));
5733 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5734 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5735 if ( flags == REG_RSN_RETURN_NAME)
5737 else if (flags==REG_RSN_RETURN_DATA) {
5740 if ( ! sv_name ) /* should not happen*/
5741 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5742 if (RExC_paren_names)
5743 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5745 sv_dat = HeVAL(he_str);
5747 vFAIL("Reference to nonexistent named group");
5751 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5758 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5759 int rem=(int)(RExC_end - RExC_parse); \
5768 if (RExC_lastparse!=RExC_parse) \
5769 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5772 iscut ? "..." : "<" \
5775 PerlIO_printf(Perl_debug_log,"%16s",""); \
5778 num = RExC_size + 1; \
5780 num=REG_NODE_NUM(RExC_emit); \
5781 if (RExC_lastnum!=num) \
5782 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5784 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5785 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5786 (int)((depth*2)), "", \
5790 RExC_lastparse=RExC_parse; \
5795 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5796 DEBUG_PARSE_MSG((funcname)); \
5797 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5799 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5800 DEBUG_PARSE_MSG((funcname)); \
5801 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5804 /* This section of code defines the inversion list object and its methods. The
5805 * interfaces are highly subject to change, so as much as possible is static to
5806 * this file. An inversion list is here implemented as a malloc'd C array with
5807 * some added info. More will be coming when functionality is added later.
5809 * Some of the methods should always be private to the implementation, and some
5810 * should eventually be made public */
5812 #define INVLIST_INITIAL_LEN 10
5813 #define INVLIST_ARRAY_KEY "array"
5814 #define INVLIST_MAX_KEY "max"
5815 #define INVLIST_LEN_KEY "len"
5817 PERL_STATIC_INLINE UV*
5818 S_invlist_array(pTHX_ HV* const invlist)
5820 /* Returns the pointer to the inversion list's array. Every time the
5821 * length changes, this needs to be called in case malloc or realloc moved
5824 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5826 PERL_ARGS_ASSERT_INVLIST_ARRAY;
5828 if (list_ptr == NULL) {
5829 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5833 return INT2PTR(UV *, SvUV(*list_ptr));
5836 PERL_STATIC_INLINE void
5837 S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5839 PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5841 /* Sets the array stored in the inversion list to the memory beginning with
5844 if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5845 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5850 PERL_STATIC_INLINE UV
5851 S_invlist_len(pTHX_ HV* const invlist)
5853 /* Returns the current number of elements in the inversion list's array */
5855 SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5857 PERL_ARGS_ASSERT_INVLIST_LEN;
5859 if (len_ptr == NULL) {
5860 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5864 return SvUV(*len_ptr);
5867 PERL_STATIC_INLINE UV
5868 S_invlist_max(pTHX_ HV* const invlist)
5870 /* Returns the maximum number of elements storable in the inversion list's
5871 * array, without having to realloc() */
5873 SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5875 PERL_ARGS_ASSERT_INVLIST_MAX;
5877 if (max_ptr == NULL) {
5878 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5882 return SvUV(*max_ptr);
5885 PERL_STATIC_INLINE void
5886 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5888 /* Sets the current number of elements stored in the inversion list */
5890 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5892 if (len != 0 && len > invlist_max(invlist)) {
5893 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5896 if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5897 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5902 PERL_STATIC_INLINE void
5903 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5906 /* Sets the maximum number of elements storable in the inversion list
5907 * without having to realloc() */
5909 PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5911 if (max < invlist_len(invlist)) {
5912 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5915 if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5916 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5921 #ifndef PERL_IN_XSUB_RE
5923 Perl__new_invlist(pTHX_ IV initial_size)
5926 /* Return a pointer to a newly constructed inversion list, with enough
5927 * space to store 'initial_size' elements. If that number is negative, a
5928 * system default is used instead */
5930 HV* invlist = newHV();
5933 if (initial_size < 0) {
5934 initial_size = INVLIST_INITIAL_LEN;
5937 /* Allocate the initial space */
5938 Newx(list, initial_size, UV);
5939 invlist_set_array(invlist, list);
5941 /* set_len has to come before set_max, as the latter inspects the len */
5942 invlist_set_len(invlist, 0);
5943 invlist_set_max(invlist, initial_size);
5949 PERL_STATIC_INLINE void
5950 S_invlist_destroy(pTHX_ HV* const invlist)
5952 /* Inversion list destructor */
5954 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5956 PERL_ARGS_ASSERT_INVLIST_DESTROY;
5958 if (list_ptr != NULL) {
5959 UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5965 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5967 /* Change the maximum size of an inversion list (up or down) */
5971 const UV old_max = invlist_max(invlist);
5973 PERL_ARGS_ASSERT_INVLIST_EXTEND;
5975 if (old_max == new_max) { /* If a no-op */
5979 array = orig_array = invlist_array(invlist);
5980 Renew(array, new_max, UV);
5982 /* If the size change moved the list in memory, set the new one */
5983 if (array != orig_array) {
5984 invlist_set_array(invlist, array);
5987 invlist_set_max(invlist, new_max);
5991 PERL_STATIC_INLINE void
5992 S_invlist_trim(pTHX_ HV* const invlist)
5994 PERL_ARGS_ASSERT_INVLIST_TRIM;
5996 /* Change the length of the inversion list to how many entries it currently
5999 invlist_extend(invlist, invlist_len(invlist));
6002 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6005 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
6007 #ifndef PERL_IN_XSUB_RE
6009 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
6011 /* Subject to change or removal. Append the range from 'start' to 'end' at
6012 * the end of the inversion list. The range must be above any existing
6015 UV* array = invlist_array(invlist);
6016 UV max = invlist_max(invlist);
6017 UV len = invlist_len(invlist);
6019 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6023 /* Here, the existing list is non-empty. The current max entry in the
6024 * list is generally the first value not in the set, except when the
6025 * set extends to the end of permissible values, in which case it is
6026 * the first entry in that final set, and so this call is an attempt to
6027 * append out-of-order */
6029 UV final_element = len - 1;
6030 if (array[final_element] > start
6031 || ELEMENT_IN_INVLIST_SET(final_element))
6033 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
6036 /* Here, it is a legal append. If the new range begins with the first
6037 * value not in the set, it is extending the set, so the new first
6038 * value not in the set is one greater than the newly extended range.
6040 if (array[final_element] == start) {
6041 if (end != UV_MAX) {
6042 array[final_element] = end + 1;
6045 /* But if the end is the maximum representable on the machine,
6046 * just let the range that this would extend have no end */
6047 invlist_set_len(invlist, len - 1);
6053 /* Here the new range doesn't extend any existing set. Add it */
6055 len += 2; /* Includes an element each for the start and end of range */
6057 /* If overflows the existing space, extend, which may cause the array to be
6060 invlist_extend(invlist, len);
6061 array = invlist_array(invlist);
6064 invlist_set_len(invlist, len);
6066 /* The next item on the list starts the range, the one after that is
6067 * one past the new range. */
6068 array[len - 2] = start;
6069 if (end != UV_MAX) {
6070 array[len - 1] = end + 1;
6073 /* But if the end is the maximum representable on the machine, just let
6074 * the range have no end */
6075 invlist_set_len(invlist, len - 1);
6081 S_invlist_union(pTHX_ HV* const a, HV* const b)
6083 /* Return a new inversion list which is the union of two inversion lists.
6084 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6085 * Richard Gillam, published by Addison-Wesley, and explained at some
6086 * length there. The preface says to incorporate its examples into your
6087 * code at your own risk.
6089 * The algorithm is like a merge sort.
6091 * XXX A potential performance improvement is to keep track as we go along
6092 * if only one of the inputs contributes to the result, meaning the other
6093 * is a subset of that one. In that case, we can skip the final copy and
6094 * return the larger of the input lists */
6096 UV* array_a = invlist_array(a); /* a's array */
6097 UV* array_b = invlist_array(b);
6098 UV len_a = invlist_len(a); /* length of a's array */
6099 UV len_b = invlist_len(b);
6101 HV* u; /* the resulting union */
6105 UV i_a = 0; /* current index into a's array */
6109 /* running count, as explained in the algorithm source book; items are
6110 * stopped accumulating and are output when the count changes to/from 0.
6111 * The count is incremented when we start a range that's in the set, and
6112 * decremented when we start a range that's not in the set. So its range
6113 * is 0 to 2. Only when the count is zero is something not in the set.
6117 PERL_ARGS_ASSERT_INVLIST_UNION;
6119 /* Size the union for the worst case: that the sets are completely
6121 u = _new_invlist(len_a + len_b);
6122 array_u = invlist_array(u);
6124 /* Go through each list item by item, stopping when exhausted one of
6126 while (i_a < len_a && i_b < len_b) {
6127 UV cp; /* The element to potentially add to the union's array */
6128 bool cp_in_set; /* is it in the the input list's set or not */
6130 /* We need to take one or the other of the two inputs for the union.
6131 * Since we are merging two sorted lists, we take the smaller of the
6132 * next items. In case of a tie, we take the one that is in its set
6133 * first. If we took one not in the set first, it would decrement the
6134 * count, possibly to 0 which would cause it to be output as ending the
6135 * range, and the next time through we would take the same number, and
6136 * output it again as beginning the next range. By doing it the
6137 * opposite way, there is no possibility that the count will be
6138 * momentarily decremented to 0, and thus the two adjoining ranges will
6139 * be seamlessly merged. (In a tie and both are in the set or both not
6140 * in the set, it doesn't matter which we take first.) */
6141 if (array_a[i_a] < array_b[i_b]
6142 || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6144 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6148 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6152 /* Here, have chosen which of the two inputs to look at. Only output
6153 * if the running count changes to/from 0, which marks the
6154 * beginning/end of a range in that's in the set */
6157 array_u[i_u++] = cp;
6164 array_u[i_u++] = cp;
6169 /* Here, we are finished going through at least one of the lists, which
6170 * means there is something remaining in at most one. We check if the list
6171 * that hasn't been exhausted is positioned such that we are in the middle
6172 * of a range in its set or not. (We are in the set if the next item in
6173 * the array marks the beginning of something not in the set) If in the
6174 * set, we decrement 'count'; if 0, there is potentially more to output.
6175 * There are four cases:
6176 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6177 * in the union is entirely from the non-exhausted set.
6178 * 2) Both were in their sets, count is 2. Nothing further should
6179 * be output, as everything that remains will be in the exhausted
6180 * list's set, hence in the union; decrementing to 1 but not 0 insures
6182 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6183 * Nothing further should be output because the union includes
6184 * everything from the exhausted set. Not decrementing insures that.
6185 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6186 * decrementing to 0 insures that we look at the remainder of the
6187 * non-exhausted set */
6188 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6189 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6194 /* The final length is what we've output so far, plus what else is about to
6195 * be output. (If 'count' is non-zero, then the input list we exhausted
6196 * has everything remaining up to the machine's limit in its set, and hence
6197 * in the union, so there will be no further output. */
6200 /* At most one of the subexpressions will be non-zero */
6201 len_u += (len_a - i_a) + (len_b - i_b);
6204 /* Set result to final length, which can change the pointer to array_u, so
6206 if (len_u != invlist_len(u)) {
6207 invlist_set_len(u, len_u);
6209 array_u = invlist_array(u);
6212 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6213 * the other) ended with everything above it not in its set. That means
6214 * that the remaining part of the union is precisely the same as the
6215 * non-exhausted list, so can just copy it unchanged. (If both list were
6216 * exhausted at the same time, then the operations below will be both 0.)
6219 IV copy_count; /* At most one will have a non-zero copy count */
6220 if ((copy_count = len_a - i_a) > 0) {
6221 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6223 else if ((copy_count = len_b - i_b) > 0) {
6224 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6232 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6234 /* Return the intersection of two inversion lists. The basis for this
6235 * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6236 * by Addison-Wesley, and explained at some length there. The preface says
6237 * to incorporate its examples into your code at your own risk.
6239 * The algorithm is like a merge sort, and is essentially the same as the
6243 UV* array_a = invlist_array(a); /* a's array */
6244 UV* array_b = invlist_array(b);
6245 UV len_a = invlist_len(a); /* length of a's array */
6246 UV len_b = invlist_len(b);
6248 HV* r; /* the resulting intersection */
6252 UV i_a = 0; /* current index into a's array */
6256 /* running count, as explained in the algorithm source book; items are
6257 * stopped accumulating and are output when the count changes to/from 2.
6258 * The count is incremented when we start a range that's in the set, and
6259 * decremented when we start a range that's not in the set. So its range
6260 * is 0 to 2. Only when the count is 2 is something in the intersection.
6264 PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6266 /* Size the intersection for the worst case: that the intersection ends up
6267 * fragmenting everything to be completely disjoint */
6268 r= _new_invlist(len_a + len_b);
6269 array_r = invlist_array(r);
6271 /* Go through each list item by item, stopping when exhausted one of
6273 while (i_a < len_a && i_b < len_b) {
6274 UV cp; /* The element to potentially add to the intersection's
6276 bool cp_in_set; /* Is it in the input list's set or not */
6278 /* We need to take one or the other of the two inputs for the union.
6279 * Since we are merging two sorted lists, we take the smaller of the
6280 * next items. In case of a tie, we take the one that is not in its
6281 * set first (a difference from the union algorithm). If we took one
6282 * in the set first, it would increment the count, possibly to 2 which
6283 * would cause it to be output as starting a range in the intersection,
6284 * and the next time through we would take that same number, and output
6285 * it again as ending the set. By doing it the opposite of this, we
6286 * there is no possibility that the count will be momentarily
6287 * incremented to 2. (In a tie and both are in the set or both not in
6288 * the set, it doesn't matter which we take first.) */
6289 if (array_a[i_a] < array_b[i_b]
6290 || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6292 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6296 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6300 /* Here, have chosen which of the two inputs to look at. Only output
6301 * if the running count changes to/from 2, which marks the
6302 * beginning/end of a range that's in the intersection */
6306 array_r[i_r++] = cp;
6311 array_r[i_r++] = cp;
6317 /* Here, we are finished going through at least one of the sets, which
6318 * means there is something remaining in at most one. See the comments in
6320 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6321 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6326 /* The final length is what we've output so far plus what else is in the
6327 * intersection. Only one of the subexpressions below will be non-zero */
6330 len_r += (len_a - i_a) + (len_b - i_b);
6333 /* Set result to final length, which can change the pointer to array_r, so
6335 if (len_r != invlist_len(r)) {
6336 invlist_set_len(r, len_r);
6338 array_r = invlist_array(r);
6341 /* Finish outputting any remaining */
6342 if (count == 2) { /* Only one of will have a non-zero copy count */
6344 if ((copy_count = len_a - i_a) > 0) {
6345 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6347 else if ((copy_count = len_b - i_b) > 0) {
6348 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6356 S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
6358 /* Add the range from 'start' to 'end' inclusive to the inversion list's
6359 * set. A pointer to the inversion list is returned. This may actually be
6360 * a new list, in which case the passed in one has been destroyed. The
6361 * passed in inversion list can be NULL, in which case a new one is created
6362 * with just the one range in it */
6368 if (invlist == NULL) {
6369 invlist = _new_invlist(2);
6373 len = invlist_len(invlist);
6376 /* If comes after the final entry, can just append it to the end */
6378 || start >= invlist_array(invlist)
6379 [invlist_len(invlist) - 1])
6381 _append_range_to_invlist(invlist, start, end);
6385 /* Here, can't just append things, create and return a new inversion list
6386 * which is the union of this range and the existing inversion list */
6387 range_invlist = _new_invlist(2);
6388 _append_range_to_invlist(range_invlist, start, end);
6390 added_invlist = invlist_union(invlist, range_invlist);
6392 /* The passed in list can be freed, as well as our temporary */
6393 invlist_destroy(range_invlist);
6394 if (invlist != added_invlist) {
6395 invlist_destroy(invlist);
6398 return added_invlist;
6401 PERL_STATIC_INLINE HV*
6402 S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6403 return add_range_to_invlist(invlist, cp, cp);
6406 /* End of inversion list object */
6409 - reg - regular expression, i.e. main body or parenthesized thing
6411 * Caller must absorb opening parenthesis.
6413 * Combining parenthesis handling with the base level of regular expression
6414 * is a trifle forced, but the need to tie the tails of the branches to what
6415 * follows makes it hard to avoid.
6417 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6419 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6421 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6425 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6426 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6429 register regnode *ret; /* Will be the head of the group. */
6430 register regnode *br;
6431 register regnode *lastbr;
6432 register regnode *ender = NULL;
6433 register I32 parno = 0;
6435 U32 oregflags = RExC_flags;
6436 bool have_branch = 0;
6438 I32 freeze_paren = 0;
6439 I32 after_freeze = 0;
6441 /* for (?g), (?gc), and (?o) warnings; warning
6442 about (?c) will warn about (?g) -- japhy */
6444 #define WASTED_O 0x01
6445 #define WASTED_G 0x02
6446 #define WASTED_C 0x04
6447 #define WASTED_GC (0x02|0x04)
6448 I32 wastedflags = 0x00;
6450 char * parse_start = RExC_parse; /* MJD */
6451 char * const oregcomp_parse = RExC_parse;
6453 GET_RE_DEBUG_FLAGS_DECL;
6455 PERL_ARGS_ASSERT_REG;
6456 DEBUG_PARSE("reg ");
6458 *flagp = 0; /* Tentatively. */
6461 /* Make an OPEN node, if parenthesized. */
6463 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6464 char *start_verb = RExC_parse;
6465 STRLEN verb_len = 0;
6466 char *start_arg = NULL;
6467 unsigned char op = 0;
6469 int internal_argval = 0; /* internal_argval is only useful if !argok */
6470 while ( *RExC_parse && *RExC_parse != ')' ) {
6471 if ( *RExC_parse == ':' ) {
6472 start_arg = RExC_parse + 1;
6478 verb_len = RExC_parse - start_verb;
6481 while ( *RExC_parse && *RExC_parse != ')' )
6483 if ( *RExC_parse != ')' )
6484 vFAIL("Unterminated verb pattern argument");
6485 if ( RExC_parse == start_arg )
6488 if ( *RExC_parse != ')' )
6489 vFAIL("Unterminated verb pattern");
6492 switch ( *start_verb ) {
6493 case 'A': /* (*ACCEPT) */
6494 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6496 internal_argval = RExC_nestroot;
6499 case 'C': /* (*COMMIT) */
6500 if ( memEQs(start_verb,verb_len,"COMMIT") )
6503 case 'F': /* (*FAIL) */
6504 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6509 case ':': /* (*:NAME) */
6510 case 'M': /* (*MARK:NAME) */
6511 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6516 case 'P': /* (*PRUNE) */
6517 if ( memEQs(start_verb,verb_len,"PRUNE") )
6520 case 'S': /* (*SKIP) */
6521 if ( memEQs(start_verb,verb_len,"SKIP") )
6524 case 'T': /* (*THEN) */
6525 /* [19:06] <TimToady> :: is then */
6526 if ( memEQs(start_verb,verb_len,"THEN") ) {
6528 RExC_seen |= REG_SEEN_CUTGROUP;
6534 vFAIL3("Unknown verb pattern '%.*s'",
6535 verb_len, start_verb);
6538 if ( start_arg && internal_argval ) {
6539 vFAIL3("Verb pattern '%.*s' may not have an argument",
6540 verb_len, start_verb);
6541 } else if ( argok < 0 && !start_arg ) {
6542 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6543 verb_len, start_verb);
6545 ret = reganode(pRExC_state, op, internal_argval);
6546 if ( ! internal_argval && ! SIZE_ONLY ) {
6548 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6549 ARG(ret) = add_data( pRExC_state, 1, "S" );
6550 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6557 if (!internal_argval)
6558 RExC_seen |= REG_SEEN_VERBARG;
6559 } else if ( start_arg ) {
6560 vFAIL3("Verb pattern '%.*s' may not have an argument",
6561 verb_len, start_verb);
6563 ret = reg_node(pRExC_state, op);
6565 nextchar(pRExC_state);
6568 if (*RExC_parse == '?') { /* (?...) */
6569 bool is_logical = 0;
6570 const char * const seqstart = RExC_parse;
6571 bool has_use_defaults = FALSE;
6574 paren = *RExC_parse++;
6575 ret = NULL; /* For look-ahead/behind. */
6578 case 'P': /* (?P...) variants for those used to PCRE/Python */
6579 paren = *RExC_parse++;
6580 if ( paren == '<') /* (?P<...>) named capture */
6582 else if (paren == '>') { /* (?P>name) named recursion */
6583 goto named_recursion;
6585 else if (paren == '=') { /* (?P=...) named backref */
6586 /* this pretty much dupes the code for \k<NAME> in regatom(), if
6587 you change this make sure you change that */
6588 char* name_start = RExC_parse;
6590 SV *sv_dat = reg_scan_name(pRExC_state,
6591 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6592 if (RExC_parse == name_start || *RExC_parse != ')')
6593 vFAIL2("Sequence %.3s... not terminated",parse_start);
6596 num = add_data( pRExC_state, 1, "S" );
6597 RExC_rxi->data->data[num]=(void*)sv_dat;
6598 SvREFCNT_inc_simple_void(sv_dat);
6601 ret = reganode(pRExC_state,
6604 : (MORE_ASCII_RESTRICTED)
6606 : (AT_LEAST_UNI_SEMANTICS)
6614 Set_Node_Offset(ret, parse_start+1);
6615 Set_Node_Cur_Length(ret); /* MJD */
6617 nextchar(pRExC_state);
6621 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6623 case '<': /* (?<...) */
6624 if (*RExC_parse == '!')
6626 else if (*RExC_parse != '=')
6632 case '\'': /* (?'...') */
6633 name_start= RExC_parse;
6634 svname = reg_scan_name(pRExC_state,
6635 SIZE_ONLY ? /* reverse test from the others */
6636 REG_RSN_RETURN_NAME :
6637 REG_RSN_RETURN_NULL);
6638 if (RExC_parse == name_start) {
6640 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6643 if (*RExC_parse != paren)
6644 vFAIL2("Sequence (?%c... not terminated",
6645 paren=='>' ? '<' : paren);
6649 if (!svname) /* shouldn't happen */
6651 "panic: reg_scan_name returned NULL");
6652 if (!RExC_paren_names) {
6653 RExC_paren_names= newHV();
6654 sv_2mortal(MUTABLE_SV(RExC_paren_names));
6656 RExC_paren_name_list= newAV();
6657 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6660 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6662 sv_dat = HeVAL(he_str);
6664 /* croak baby croak */
6666 "panic: paren_name hash element allocation failed");
6667 } else if ( SvPOK(sv_dat) ) {
6668 /* (?|...) can mean we have dupes so scan to check
6669 its already been stored. Maybe a flag indicating
6670 we are inside such a construct would be useful,
6671 but the arrays are likely to be quite small, so
6672 for now we punt -- dmq */
6673 IV count = SvIV(sv_dat);
6674 I32 *pv = (I32*)SvPVX(sv_dat);
6676 for ( i = 0 ; i < count ; i++ ) {
6677 if ( pv[i] == RExC_npar ) {
6683 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6684 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6685 pv[count] = RExC_npar;
6686 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6689 (void)SvUPGRADE(sv_dat,SVt_PVNV);
6690 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6692 SvIV_set(sv_dat, 1);
6695 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6696 SvREFCNT_dec(svname);
6699 /*sv_dump(sv_dat);*/
6701 nextchar(pRExC_state);
6703 goto capturing_parens;
6705 RExC_seen |= REG_SEEN_LOOKBEHIND;
6706 RExC_in_lookbehind++;
6708 case '=': /* (?=...) */
6709 RExC_seen_zerolen++;
6711 case '!': /* (?!...) */
6712 RExC_seen_zerolen++;
6713 if (*RExC_parse == ')') {
6714 ret=reg_node(pRExC_state, OPFAIL);
6715 nextchar(pRExC_state);
6719 case '|': /* (?|...) */
6720 /* branch reset, behave like a (?:...) except that
6721 buffers in alternations share the same numbers */
6723 after_freeze = freeze_paren = RExC_npar;
6725 case ':': /* (?:...) */
6726 case '>': /* (?>...) */
6728 case '$': /* (?$...) */
6729 case '@': /* (?@...) */
6730 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6732 case '#': /* (?#...) */
6733 while (*RExC_parse && *RExC_parse != ')')
6735 if (*RExC_parse != ')')
6736 FAIL("Sequence (?#... not terminated");
6737 nextchar(pRExC_state);
6740 case '0' : /* (?0) */
6741 case 'R' : /* (?R) */
6742 if (*RExC_parse != ')')
6743 FAIL("Sequence (?R) not terminated");
6744 ret = reg_node(pRExC_state, GOSTART);
6745 *flagp |= POSTPONED;
6746 nextchar(pRExC_state);
6749 { /* named and numeric backreferences */
6751 case '&': /* (?&NAME) */
6752 parse_start = RExC_parse - 1;
6755 SV *sv_dat = reg_scan_name(pRExC_state,
6756 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6757 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6759 goto gen_recurse_regop;
6762 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6764 vFAIL("Illegal pattern");
6766 goto parse_recursion;
6768 case '-': /* (?-1) */
6769 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6770 RExC_parse--; /* rewind to let it be handled later */
6774 case '1': case '2': case '3': case '4': /* (?1) */
6775 case '5': case '6': case '7': case '8': case '9':
6778 num = atoi(RExC_parse);
6779 parse_start = RExC_parse - 1; /* MJD */
6780 if (*RExC_parse == '-')
6782 while (isDIGIT(*RExC_parse))
6784 if (*RExC_parse!=')')
6785 vFAIL("Expecting close bracket");
6788 if ( paren == '-' ) {
6790 Diagram of capture buffer numbering.
6791 Top line is the normal capture buffer numbers
6792 Bottom line is the negative indexing as from
6796 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6800 num = RExC_npar + num;
6803 vFAIL("Reference to nonexistent group");
6805 } else if ( paren == '+' ) {
6806 num = RExC_npar + num - 1;
6809 ret = reganode(pRExC_state, GOSUB, num);
6811 if (num > (I32)RExC_rx->nparens) {
6813 vFAIL("Reference to nonexistent group");
6815 ARG2L_SET( ret, RExC_recurse_count++);
6817 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6818 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6822 RExC_seen |= REG_SEEN_RECURSE;
6823 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6824 Set_Node_Offset(ret, parse_start); /* MJD */
6826 *flagp |= POSTPONED;
6827 nextchar(pRExC_state);
6829 } /* named and numeric backreferences */
6832 case '?': /* (??...) */
6834 if (*RExC_parse != '{') {
6836 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6839 *flagp |= POSTPONED;
6840 paren = *RExC_parse++;
6842 case '{': /* (?{...}) */
6847 char *s = RExC_parse;
6849 RExC_seen_zerolen++;
6850 RExC_seen |= REG_SEEN_EVAL;
6851 while (count && (c = *RExC_parse)) {
6862 if (*RExC_parse != ')') {
6864 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6868 OP_4tree *sop, *rop;
6869 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6872 Perl_save_re_context(aTHX);
6873 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6874 sop->op_private |= OPpREFCOUNTED;
6875 /* re_dup will OpREFCNT_inc */
6876 OpREFCNT_set(sop, 1);
6879 n = add_data(pRExC_state, 3, "nop");
6880 RExC_rxi->data->data[n] = (void*)rop;
6881 RExC_rxi->data->data[n+1] = (void*)sop;
6882 RExC_rxi->data->data[n+2] = (void*)pad;
6885 else { /* First pass */
6886 if (PL_reginterp_cnt < ++RExC_seen_evals
6888 /* No compiled RE interpolated, has runtime
6889 components ===> unsafe. */
6890 FAIL("Eval-group not allowed at runtime, use re 'eval'");
6891 if (PL_tainting && PL_tainted)
6892 FAIL("Eval-group in insecure regular expression");
6893 #if PERL_VERSION > 8
6894 if (IN_PERL_COMPILETIME)
6899 nextchar(pRExC_state);
6901 ret = reg_node(pRExC_state, LOGICAL);
6904 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6905 /* deal with the length of this later - MJD */
6908 ret = reganode(pRExC_state, EVAL, n);
6909 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6910 Set_Node_Offset(ret, parse_start);
6913 case '(': /* (?(?{...})...) and (?(?=...)...) */
6916 if (RExC_parse[0] == '?') { /* (?(?...)) */
6917 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6918 || RExC_parse[1] == '<'
6919 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6922 ret = reg_node(pRExC_state, LOGICAL);
6925 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6929 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6930 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6932 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6933 char *name_start= RExC_parse++;
6935 SV *sv_dat=reg_scan_name(pRExC_state,
6936 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6937 if (RExC_parse == name_start || *RExC_parse != ch)
6938 vFAIL2("Sequence (?(%c... not terminated",
6939 (ch == '>' ? '<' : ch));
6942 num = add_data( pRExC_state, 1, "S" );
6943 RExC_rxi->data->data[num]=(void*)sv_dat;
6944 SvREFCNT_inc_simple_void(sv_dat);
6946 ret = reganode(pRExC_state,NGROUPP,num);
6947 goto insert_if_check_paren;
6949 else if (RExC_parse[0] == 'D' &&
6950 RExC_parse[1] == 'E' &&
6951 RExC_parse[2] == 'F' &&
6952 RExC_parse[3] == 'I' &&
6953 RExC_parse[4] == 'N' &&
6954 RExC_parse[5] == 'E')
6956 ret = reganode(pRExC_state,DEFINEP,0);
6959 goto insert_if_check_paren;
6961 else if (RExC_parse[0] == 'R') {
6964 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6965 parno = atoi(RExC_parse++);
6966 while (isDIGIT(*RExC_parse))
6968 } else if (RExC_parse[0] == '&') {
6971 sv_dat = reg_scan_name(pRExC_state,
6972 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6973 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6975 ret = reganode(pRExC_state,INSUBP,parno);
6976 goto insert_if_check_paren;
6978 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6981 parno = atoi(RExC_parse++);
6983 while (isDIGIT(*RExC_parse))
6985 ret = reganode(pRExC_state, GROUPP, parno);
6987 insert_if_check_paren:
6988 if ((c = *nextchar(pRExC_state)) != ')')
6989 vFAIL("Switch condition not recognized");
6991 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6992 br = regbranch(pRExC_state, &flags, 1,depth+1);
6994 br = reganode(pRExC_state, LONGJMP, 0);
6996 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6997 c = *nextchar(pRExC_state);
7002 vFAIL("(?(DEFINE)....) does not allow branches");
7003 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7004 regbranch(pRExC_state, &flags, 1,depth+1);
7005 REGTAIL(pRExC_state, ret, lastbr);
7008 c = *nextchar(pRExC_state);
7013 vFAIL("Switch (?(condition)... contains too many branches");
7014 ender = reg_node(pRExC_state, TAIL);
7015 REGTAIL(pRExC_state, br, ender);
7017 REGTAIL(pRExC_state, lastbr, ender);
7018 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7021 REGTAIL(pRExC_state, ret, ender);
7022 RExC_size++; /* XXX WHY do we need this?!!
7023 For large programs it seems to be required
7024 but I can't figure out why. -- dmq*/
7028 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7032 RExC_parse--; /* for vFAIL to print correctly */
7033 vFAIL("Sequence (? incomplete");
7035 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
7037 has_use_defaults = TRUE;
7038 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7039 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
7040 ? REGEX_UNICODE_CHARSET
7041 : REGEX_DEPENDS_CHARSET);
7045 parse_flags: /* (?i) */
7047 U32 posflags = 0, negflags = 0;
7048 U32 *flagsp = &posflags;
7049 bool has_charset_modifier = 0;
7050 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
7051 ? REGEX_UNICODE_CHARSET
7052 : REGEX_DEPENDS_CHARSET;
7054 while (*RExC_parse) {
7055 /* && strchr("iogcmsx", *RExC_parse) */
7056 /* (?g), (?gc) and (?o) are useless here
7057 and must be globally applied -- japhy */
7058 switch (*RExC_parse) {
7059 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7060 case LOCALE_PAT_MOD:
7061 if (has_charset_modifier || flagsp == &negflags) {
7062 goto fail_modifiers;
7064 cs = REGEX_LOCALE_CHARSET;
7065 has_charset_modifier = 1;
7066 RExC_contains_locale = 1;
7068 case UNICODE_PAT_MOD:
7069 if (has_charset_modifier || flagsp == &negflags) {
7070 goto fail_modifiers;
7072 cs = REGEX_UNICODE_CHARSET;
7073 has_charset_modifier = 1;
7075 case ASCII_RESTRICT_PAT_MOD:
7076 if (has_charset_modifier || flagsp == &negflags) {
7077 goto fail_modifiers;
7079 if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
7080 /* Doubled modifier implies more restricted */
7081 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7085 cs = REGEX_ASCII_RESTRICTED_CHARSET;
7087 has_charset_modifier = 1;
7089 case DEPENDS_PAT_MOD:
7090 if (has_use_defaults
7091 || has_charset_modifier
7092 || flagsp == &negflags)
7094 goto fail_modifiers;
7097 /* The dual charset means unicode semantics if the
7098 * pattern (or target, not known until runtime) are
7099 * utf8, or something in the pattern indicates unicode
7101 cs = (RExC_utf8 || RExC_uni_semantics)
7102 ? REGEX_UNICODE_CHARSET
7103 : REGEX_DEPENDS_CHARSET;
7104 has_charset_modifier = 1;
7106 case ONCE_PAT_MOD: /* 'o' */
7107 case GLOBAL_PAT_MOD: /* 'g' */
7108 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7109 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7110 if (! (wastedflags & wflagbit) ) {
7111 wastedflags |= wflagbit;
7114 "Useless (%s%c) - %suse /%c modifier",
7115 flagsp == &negflags ? "?-" : "?",
7117 flagsp == &negflags ? "don't " : "",
7124 case CONTINUE_PAT_MOD: /* 'c' */
7125 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7126 if (! (wastedflags & WASTED_C) ) {
7127 wastedflags |= WASTED_GC;
7130 "Useless (%sc) - %suse /gc modifier",
7131 flagsp == &negflags ? "?-" : "?",
7132 flagsp == &negflags ? "don't " : ""
7137 case KEEPCOPY_PAT_MOD: /* 'p' */
7138 if (flagsp == &negflags) {
7140 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7142 *flagsp |= RXf_PMf_KEEPCOPY;
7146 /* A flag is a default iff it is following a minus, so
7147 * if there is a minus, it means will be trying to
7148 * re-specify a default which is an error */
7149 if (has_use_defaults || flagsp == &negflags) {
7152 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7156 wastedflags = 0; /* reset so (?g-c) warns twice */
7162 RExC_flags |= posflags;
7163 RExC_flags &= ~negflags;
7164 set_regex_charset(&RExC_flags, cs);
7166 oregflags |= posflags;
7167 oregflags &= ~negflags;
7168 set_regex_charset(&oregflags, cs);
7170 nextchar(pRExC_state);
7181 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7186 }} /* one for the default block, one for the switch */
7193 ret = reganode(pRExC_state, OPEN, parno);
7196 RExC_nestroot = parno;
7197 if (RExC_seen & REG_SEEN_RECURSE
7198 && !RExC_open_parens[parno-1])
7200 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7201 "Setting open paren #%"IVdf" to %d\n",
7202 (IV)parno, REG_NODE_NUM(ret)));
7203 RExC_open_parens[parno-1]= ret;
7206 Set_Node_Length(ret, 1); /* MJD */
7207 Set_Node_Offset(ret, RExC_parse); /* MJD */
7215 /* Pick up the branches, linking them together. */
7216 parse_start = RExC_parse; /* MJD */
7217 br = regbranch(pRExC_state, &flags, 1,depth+1);
7219 /* branch_len = (paren != 0); */
7223 if (*RExC_parse == '|') {
7224 if (!SIZE_ONLY && RExC_extralen) {
7225 reginsert(pRExC_state, BRANCHJ, br, depth+1);
7228 reginsert(pRExC_state, BRANCH, br, depth+1);
7229 Set_Node_Length(br, paren != 0);
7230 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7234 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
7236 else if (paren == ':') {
7237 *flagp |= flags&SIMPLE;
7239 if (is_open) { /* Starts with OPEN. */
7240 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
7242 else if (paren != '?') /* Not Conditional */
7244 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7246 while (*RExC_parse == '|') {
7247 if (!SIZE_ONLY && RExC_extralen) {
7248 ender = reganode(pRExC_state, LONGJMP,0);
7249 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7252 RExC_extralen += 2; /* Account for LONGJMP. */
7253 nextchar(pRExC_state);
7255 if (RExC_npar > after_freeze)
7256 after_freeze = RExC_npar;
7257 RExC_npar = freeze_paren;
7259 br = regbranch(pRExC_state, &flags, 0, depth+1);
7263 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
7265 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7268 if (have_branch || paren != ':') {
7269 /* Make a closing node, and hook it on the end. */
7272 ender = reg_node(pRExC_state, TAIL);
7275 ender = reganode(pRExC_state, CLOSE, parno);
7276 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7277 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7278 "Setting close paren #%"IVdf" to %d\n",
7279 (IV)parno, REG_NODE_NUM(ender)));
7280 RExC_close_parens[parno-1]= ender;
7281 if (RExC_nestroot == parno)
7284 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7285 Set_Node_Length(ender,1); /* MJD */
7291 *flagp &= ~HASWIDTH;
7294 ender = reg_node(pRExC_state, SUCCEED);
7297 ender = reg_node(pRExC_state, END);
7299 assert(!RExC_opend); /* there can only be one! */
7304 REGTAIL(pRExC_state, lastbr, ender);
7306 if (have_branch && !SIZE_ONLY) {
7308 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7310 /* Hook the tails of the branches to the closing node. */
7311 for (br = ret; br; br = regnext(br)) {
7312 const U8 op = PL_regkind[OP(br)];
7314 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7316 else if (op == BRANCHJ) {
7317 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7325 static const char parens[] = "=!<,>";
7327 if (paren && (p = strchr(parens, paren))) {
7328 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7329 int flag = (p - parens) > 1;
7332 node = SUSPEND, flag = 0;
7333 reginsert(pRExC_state, node,ret, depth+1);
7334 Set_Node_Cur_Length(ret);
7335 Set_Node_Offset(ret, parse_start + 1);
7337 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7341 /* Check for proper termination. */
7343 RExC_flags = oregflags;
7344 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7345 RExC_parse = oregcomp_parse;
7346 vFAIL("Unmatched (");
7349 else if (!paren && RExC_parse < RExC_end) {
7350 if (*RExC_parse == ')') {
7352 vFAIL("Unmatched )");
7355 FAIL("Junk on end of regexp"); /* "Can't happen". */
7359 if (RExC_in_lookbehind) {
7360 RExC_in_lookbehind--;
7362 if (after_freeze > RExC_npar)
7363 RExC_npar = after_freeze;
7368 - regbranch - one alternative of an | operator
7370 * Implements the concatenation operator.
7373 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7376 register regnode *ret;
7377 register regnode *chain = NULL;
7378 register regnode *latest;
7379 I32 flags = 0, c = 0;
7380 GET_RE_DEBUG_FLAGS_DECL;
7382 PERL_ARGS_ASSERT_REGBRANCH;
7384 DEBUG_PARSE("brnc");
7389 if (!SIZE_ONLY && RExC_extralen)
7390 ret = reganode(pRExC_state, BRANCHJ,0);
7392 ret = reg_node(pRExC_state, BRANCH);
7393 Set_Node_Length(ret, 1);
7397 if (!first && SIZE_ONLY)
7398 RExC_extralen += 1; /* BRANCHJ */
7400 *flagp = WORST; /* Tentatively. */
7403 nextchar(pRExC_state);
7404 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7406 latest = regpiece(pRExC_state, &flags,depth+1);
7407 if (latest == NULL) {
7408 if (flags & TRYAGAIN)
7412 else if (ret == NULL)
7414 *flagp |= flags&(HASWIDTH|POSTPONED);
7415 if (chain == NULL) /* First piece. */
7416 *flagp |= flags&SPSTART;
7419 REGTAIL(pRExC_state, chain, latest);
7424 if (chain == NULL) { /* Loop ran zero times. */
7425 chain = reg_node(pRExC_state, NOTHING);
7430 *flagp |= flags&SIMPLE;
7437 - regpiece - something followed by possible [*+?]
7439 * Note that the branching code sequences used for ? and the general cases
7440 * of * and + are somewhat optimized: they use the same NOTHING node as
7441 * both the endmarker for their branch list and the body of the last branch.
7442 * It might seem that this node could be dispensed with entirely, but the
7443 * endmarker role is not redundant.
7446 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7449 register regnode *ret;
7451 register char *next;
7453 const char * const origparse = RExC_parse;
7455 I32 max = REG_INFTY;
7457 const char *maxpos = NULL;
7458 GET_RE_DEBUG_FLAGS_DECL;
7460 PERL_ARGS_ASSERT_REGPIECE;
7462 DEBUG_PARSE("piec");
7464 ret = regatom(pRExC_state, &flags,depth+1);
7466 if (flags & TRYAGAIN)
7473 if (op == '{' && regcurly(RExC_parse)) {
7475 parse_start = RExC_parse; /* MJD */
7476 next = RExC_parse + 1;
7477 while (isDIGIT(*next) || *next == ',') {
7486 if (*next == '}') { /* got one */
7490 min = atoi(RExC_parse);
7494 maxpos = RExC_parse;
7496 if (!max && *maxpos != '0')
7497 max = REG_INFTY; /* meaning "infinity" */
7498 else if (max >= REG_INFTY)
7499 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7501 nextchar(pRExC_state);
7504 if ((flags&SIMPLE)) {
7505 RExC_naughty += 2 + RExC_naughty / 2;
7506 reginsert(pRExC_state, CURLY, ret, depth+1);
7507 Set_Node_Offset(ret, parse_start+1); /* MJD */
7508 Set_Node_Cur_Length(ret);
7511 regnode * const w = reg_node(pRExC_state, WHILEM);
7514 REGTAIL(pRExC_state, ret, w);
7515 if (!SIZE_ONLY && RExC_extralen) {
7516 reginsert(pRExC_state, LONGJMP,ret, depth+1);
7517 reginsert(pRExC_state, NOTHING,ret, depth+1);
7518 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
7520 reginsert(pRExC_state, CURLYX,ret, depth+1);
7522 Set_Node_Offset(ret, parse_start+1);
7523 Set_Node_Length(ret,
7524 op == '{' ? (RExC_parse - parse_start) : 1);
7526 if (!SIZE_ONLY && RExC_extralen)
7527 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
7528 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7530 RExC_whilem_seen++, RExC_extralen += 3;
7531 RExC_naughty += 4 + RExC_naughty; /* compound interest */
7540 vFAIL("Can't do {n,m} with n > m");
7542 ARG1_SET(ret, (U16)min);
7543 ARG2_SET(ret, (U16)max);
7555 #if 0 /* Now runtime fix should be reliable. */
7557 /* if this is reinstated, don't forget to put this back into perldiag:
7559 =item Regexp *+ operand could be empty at {#} in regex m/%s/
7561 (F) The part of the regexp subject to either the * or + quantifier
7562 could match an empty string. The {#} shows in the regular
7563 expression about where the problem was discovered.
7567 if (!(flags&HASWIDTH) && op != '?')
7568 vFAIL("Regexp *+ operand could be empty");
7571 parse_start = RExC_parse;
7572 nextchar(pRExC_state);
7574 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7576 if (op == '*' && (flags&SIMPLE)) {
7577 reginsert(pRExC_state, STAR, ret, depth+1);
7581 else if (op == '*') {
7585 else if (op == '+' && (flags&SIMPLE)) {
7586 reginsert(pRExC_state, PLUS, ret, depth+1);
7590 else if (op == '+') {
7594 else if (op == '?') {
7599 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7600 ckWARN3reg(RExC_parse,
7601 "%.*s matches null string many times",
7602 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7606 if (RExC_parse < RExC_end && *RExC_parse == '?') {
7607 nextchar(pRExC_state);
7608 reginsert(pRExC_state, MINMOD, ret, depth+1);
7609 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7611 #ifndef REG_ALLOW_MINMOD_SUSPEND
7614 if (RExC_parse < RExC_end && *RExC_parse == '+') {
7616 nextchar(pRExC_state);
7617 ender = reg_node(pRExC_state, SUCCEED);
7618 REGTAIL(pRExC_state, ret, ender);
7619 reginsert(pRExC_state, SUSPEND, ret, depth+1);
7621 ender = reg_node(pRExC_state, TAIL);
7622 REGTAIL(pRExC_state, ret, ender);
7626 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7628 vFAIL("Nested quantifiers");
7635 /* reg_namedseq(pRExC_state,UVp)
7637 This is expected to be called by a parser routine that has
7638 recognized '\N' and needs to handle the rest. RExC_parse is
7639 expected to point at the first char following the N at the time
7642 The \N may be inside (indicated by valuep not being NULL) or outside a
7645 \N may begin either a named sequence, or if outside a character class, mean
7646 to match a non-newline. For non single-quoted regexes, the tokenizer has
7647 attempted to decide which, and in the case of a named sequence converted it
7648 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7649 where c1... are the characters in the sequence. For single-quoted regexes,
7650 the tokenizer passes the \N sequence through unchanged; this code will not
7651 attempt to determine this nor expand those. The net effect is that if the
7652 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7653 signals that this \N occurrence means to match a non-newline.
7655 Only the \N{U+...} form should occur in a character class, for the same
7656 reason that '.' inside a character class means to just match a period: it
7657 just doesn't make sense.
7659 If valuep is non-null then it is assumed that we are parsing inside
7660 of a charclass definition and the first codepoint in the resolved
7661 string is returned via *valuep and the routine will return NULL.
7662 In this mode if a multichar string is returned from the charnames
7663 handler, a warning will be issued, and only the first char in the
7664 sequence will be examined. If the string returned is zero length
7665 then the value of *valuep is undefined and NON-NULL will
7666 be returned to indicate failure. (This will NOT be a valid pointer
7669 If valuep is null then it is assumed that we are parsing normal text and a
7670 new EXACT node is inserted into the program containing the resolved string,
7671 and a pointer to the new node is returned. But if the string is zero length
7672 a NOTHING node is emitted instead.
7674 On success RExC_parse is set to the char following the endbrace.
7675 Parsing failures will generate a fatal error via vFAIL(...)
7678 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
7680 char * endbrace; /* '}' following the name */
7681 regnode *ret = NULL;
7683 char* parse_start = RExC_parse - 2; /* points to the '\N' */
7687 GET_RE_DEBUG_FLAGS_DECL;
7689 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7693 /* The [^\n] meaning of \N ignores spaces and comments under the /x
7694 * modifier. The other meaning does not */
7695 p = (RExC_flags & RXf_PMf_EXTENDED)
7696 ? regwhite( pRExC_state, RExC_parse )
7699 /* Disambiguate between \N meaning a named character versus \N meaning
7700 * [^\n]. The former is assumed when it can't be the latter. */
7701 if (*p != '{' || regcurly(p)) {
7704 /* no bare \N in a charclass */
7705 vFAIL("\\N in a character class must be a named character: \\N{...}");
7707 nextchar(pRExC_state);
7708 ret = reg_node(pRExC_state, REG_ANY);
7709 *flagp |= HASWIDTH|SIMPLE;
7712 Set_Node_Length(ret, 1); /* MJD */
7716 /* Here, we have decided it should be a named sequence */
7718 /* The test above made sure that the next real character is a '{', but
7719 * under the /x modifier, it could be separated by space (or a comment and
7720 * \n) and this is not allowed (for consistency with \x{...} and the
7721 * tokenizer handling of \N{NAME}). */
7722 if (*RExC_parse != '{') {
7723 vFAIL("Missing braces on \\N{}");
7726 RExC_parse++; /* Skip past the '{' */
7728 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7729 || ! (endbrace == RExC_parse /* nothing between the {} */
7730 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
7731 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7733 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
7734 vFAIL("\\N{NAME} must be resolved by the lexer");
7737 if (endbrace == RExC_parse) { /* empty: \N{} */
7739 RExC_parse = endbrace + 1;
7740 return reg_node(pRExC_state,NOTHING);
7744 ckWARNreg(RExC_parse,
7745 "Ignoring zero length \\N{} in character class"
7747 RExC_parse = endbrace + 1;
7750 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7753 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
7754 RExC_parse += 2; /* Skip past the 'U+' */
7756 if (valuep) { /* In a bracketed char class */
7757 /* We only pay attention to the first char of
7758 multichar strings being returned. I kinda wonder
7759 if this makes sense as it does change the behaviour
7760 from earlier versions, OTOH that behaviour was broken
7761 as well. XXX Solution is to recharacterize as
7762 [rest-of-class]|multi1|multi2... */
7764 STRLEN length_of_hex;
7765 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7766 | PERL_SCAN_DISALLOW_PREFIX
7767 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7769 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7770 if (endchar < endbrace) {
7771 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7774 length_of_hex = (STRLEN)(endchar - RExC_parse);
7775 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7777 /* The tokenizer should have guaranteed validity, but it's possible to
7778 * bypass it by using single quoting, so check */
7779 if (length_of_hex == 0
7780 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7782 RExC_parse += length_of_hex; /* Includes all the valid */
7783 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7784 ? UTF8SKIP(RExC_parse)
7786 /* Guard against malformed utf8 */
7787 if (RExC_parse >= endchar) RExC_parse = endchar;
7788 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7791 RExC_parse = endbrace + 1;
7792 if (endchar == endbrace) return NULL;
7794 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
7796 else { /* Not a char class */
7797 char *s; /* String to put in generated EXACT node */
7798 STRLEN len = 0; /* Its current byte length */
7799 char *endchar; /* Points to '.' or '}' ending cur char in the input
7801 ret = reg_node(pRExC_state,
7802 (U8) ((! FOLD) ? EXACT
7805 : (MORE_ASCII_RESTRICTED)
7807 : (AT_LEAST_UNI_SEMANTICS)
7812 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7813 * the input which is of the form now 'c1.c2.c3...}' until find the
7814 * ending brace or exceed length 255. The characters that exceed this
7815 * limit are dropped. The limit could be relaxed should it become
7816 * desirable by reparsing this as (?:\N{NAME}), so could generate
7817 * multiple EXACT nodes, as is done for just regular input. But this
7818 * is primarily a named character, and not intended to be a huge long
7819 * string, so 255 bytes should be good enough */
7821 STRLEN length_of_hex;
7822 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7823 | PERL_SCAN_DISALLOW_PREFIX
7824 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7825 UV cp; /* Ord of current character */
7826 bool use_this_char_fold = FOLD;
7828 /* Code points are separated by dots. If none, there is only one
7829 * code point, and is terminated by the brace */
7830 endchar = RExC_parse + strcspn(RExC_parse, ".}");
7832 /* The values are Unicode even on EBCDIC machines */
7833 length_of_hex = (STRLEN)(endchar - RExC_parse);
7834 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7835 if ( length_of_hex == 0
7836 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7838 RExC_parse += length_of_hex; /* Includes all the valid */
7839 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7840 ? UTF8SKIP(RExC_parse)
7842 /* Guard against malformed utf8 */
7843 if (RExC_parse >= endchar) RExC_parse = endchar;
7844 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7847 /* XXX ? Change to ANYOF node
7849 && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
7850 && is_TRICKYFOLD_cp(cp))
7855 /* Under /aa, we can't mix ASCII with non- in a fold. If we are
7856 * folding, and the source isn't ASCII, look through all the
7857 * characters it folds to. If any one of them is ASCII, forbid
7858 * this fold. (cp is uni, so the 127 below is correct even for
7859 * EBCDIC). Similarly under locale rules, we don't mix under 256
7860 * with above 255. XXX It really doesn't make sense to have \N{}
7861 * which means a Unicode rules under locale. I (khw) think this
7862 * should be warned about, but the counter argument is that people
7863 * who have programmed around Perl's earlier lack of specifying the
7864 * rules and used \N{} to force Unicode things in a local
7865 * environment shouldn't get suddenly a warning */
7866 if (use_this_char_fold) {
7867 if (LOC && cp < 256) { /* Fold not known until run-time */
7868 use_this_char_fold = FALSE;
7870 else if ((cp > 127 && MORE_ASCII_RESTRICTED)
7871 || (cp > 255 && LOC))
7873 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
7878 (void) toFOLD_uni(cp, tmpbuf, &foldlen);
7883 || (LOC && (UTF8_IS_INVARIANT(*s)
7884 || UTF8_IS_DOWNGRADEABLE_START(*s))))
7886 use_this_char_fold = FALSE;
7894 if (! use_this_char_fold) { /* Not folding, just append to the
7898 /* Quit before adding this character if would exceed limit */
7899 if (len + UNISKIP(cp) > U8_MAX) break;
7901 unilen = reguni(pRExC_state, cp, s);
7906 } else { /* Folding, output the folded equivalent */
7907 STRLEN foldlen,numlen;
7908 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7909 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7911 /* Quit before exceeding size limit */
7912 if (len + foldlen > U8_MAX) break;
7914 for (foldbuf = tmpbuf;
7918 cp = utf8_to_uvchr(foldbuf, &numlen);
7920 const STRLEN unilen = reguni(pRExC_state, cp, s);
7923 /* In EBCDIC the numlen and unilen can differ. */
7925 if (numlen >= foldlen)
7929 break; /* "Can't happen." */
7933 /* Point to the beginning of the next character in the sequence. */
7934 RExC_parse = endchar + 1;
7936 /* Quit if no more characters */
7937 if (RExC_parse >= endbrace) break;
7942 if (RExC_parse < endbrace) {
7943 ckWARNreg(RExC_parse - 1,
7944 "Using just the first characters returned by \\N{}");
7947 RExC_size += STR_SZ(len);
7950 RExC_emit += STR_SZ(len);
7953 RExC_parse = endbrace + 1;
7955 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7956 with malformed in t/re/pat_advanced.t */
7958 Set_Node_Cur_Length(ret); /* MJD */
7959 nextchar(pRExC_state);
7969 * It returns the code point in utf8 for the value in *encp.
7970 * value: a code value in the source encoding
7971 * encp: a pointer to an Encode object
7973 * If the result from Encode is not a single character,
7974 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7977 S_reg_recode(pTHX_ const char value, SV **encp)
7980 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7981 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7982 const STRLEN newlen = SvCUR(sv);
7983 UV uv = UNICODE_REPLACEMENT;
7985 PERL_ARGS_ASSERT_REG_RECODE;
7989 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7992 if (!newlen || numlen != newlen) {
7993 uv = UNICODE_REPLACEMENT;
8001 - regatom - the lowest level
8003 Try to identify anything special at the start of the pattern. If there
8004 is, then handle it as required. This may involve generating a single regop,
8005 such as for an assertion; or it may involve recursing, such as to
8006 handle a () structure.
8008 If the string doesn't start with something special then we gobble up
8009 as much literal text as we can.
8011 Once we have been able to handle whatever type of thing started the
8012 sequence, we return.
8014 Note: we have to be careful with escapes, as they can be both literal
8015 and special, and in the case of \10 and friends can either, depending
8016 on context. Specifically there are two separate switches for handling
8017 escape sequences, with the one for handling literal escapes requiring
8018 a dummy entry for all of the special escapes that are actually handled
8023 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8026 register regnode *ret = NULL;
8028 char *parse_start = RExC_parse;
8030 GET_RE_DEBUG_FLAGS_DECL;
8031 DEBUG_PARSE("atom");
8032 *flagp = WORST; /* Tentatively. */
8034 PERL_ARGS_ASSERT_REGATOM;
8037 switch ((U8)*RExC_parse) {
8039 RExC_seen_zerolen++;
8040 nextchar(pRExC_state);
8041 if (RExC_flags & RXf_PMf_MULTILINE)
8042 ret = reg_node(pRExC_state, MBOL);
8043 else if (RExC_flags & RXf_PMf_SINGLELINE)
8044 ret = reg_node(pRExC_state, SBOL);
8046 ret = reg_node(pRExC_state, BOL);
8047 Set_Node_Length(ret, 1); /* MJD */
8050 nextchar(pRExC_state);
8052 RExC_seen_zerolen++;
8053 if (RExC_flags & RXf_PMf_MULTILINE)
8054 ret = reg_node(pRExC_state, MEOL);
8055 else if (RExC_flags & RXf_PMf_SINGLELINE)
8056 ret = reg_node(pRExC_state, SEOL);
8058 ret = reg_node(pRExC_state, EOL);
8059 Set_Node_Length(ret, 1); /* MJD */
8062 nextchar(pRExC_state);
8063 if (RExC_flags & RXf_PMf_SINGLELINE)
8064 ret = reg_node(pRExC_state, SANY);
8066 ret = reg_node(pRExC_state, REG_ANY);
8067 *flagp |= HASWIDTH|SIMPLE;
8069 Set_Node_Length(ret, 1); /* MJD */
8073 char * const oregcomp_parse = ++RExC_parse;
8074 ret = regclass(pRExC_state,depth+1);
8075 if (*RExC_parse != ']') {
8076 RExC_parse = oregcomp_parse;
8077 vFAIL("Unmatched [");
8079 nextchar(pRExC_state);
8080 *flagp |= HASWIDTH|SIMPLE;
8081 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8085 nextchar(pRExC_state);
8086 ret = reg(pRExC_state, 1, &flags,depth+1);
8088 if (flags & TRYAGAIN) {
8089 if (RExC_parse == RExC_end) {
8090 /* Make parent create an empty node if needed. */
8098 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8102 if (flags & TRYAGAIN) {
8106 vFAIL("Internal urp");
8107 /* Supposed to be caught earlier. */
8110 if (!regcurly(RExC_parse)) {
8119 vFAIL("Quantifier follows nothing");
8121 case LATIN_SMALL_LETTER_SHARP_S:
8122 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8123 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8124 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
8125 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
8126 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
8131 len=0; /* silence a spurious compiler warning */
8132 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
8133 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
8134 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
8135 ret = reganode(pRExC_state, FOLDCHAR, cp);
8136 Set_Node_Length(ret, 1); /* MJD */
8137 nextchar(pRExC_state); /* kill whitespace under /x */
8145 This switch handles escape sequences that resolve to some kind
8146 of special regop and not to literal text. Escape sequnces that
8147 resolve to literal text are handled below in the switch marked
8150 Every entry in this switch *must* have a corresponding entry
8151 in the literal escape switch. However, the opposite is not
8152 required, as the default for this switch is to jump to the
8153 literal text handling code.
8155 switch ((U8)*++RExC_parse) {
8156 case LATIN_SMALL_LETTER_SHARP_S:
8157 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8158 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8160 /* Special Escapes */
8162 RExC_seen_zerolen++;
8163 ret = reg_node(pRExC_state, SBOL);
8165 goto finish_meta_pat;
8167 ret = reg_node(pRExC_state, GPOS);
8168 RExC_seen |= REG_SEEN_GPOS;
8170 goto finish_meta_pat;
8172 RExC_seen_zerolen++;
8173 ret = reg_node(pRExC_state, KEEPS);
8175 /* XXX:dmq : disabling in-place substitution seems to
8176 * be necessary here to avoid cases of memory corruption, as
8177 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8179 RExC_seen |= REG_SEEN_LOOKBEHIND;
8180 goto finish_meta_pat;
8182 ret = reg_node(pRExC_state, SEOL);
8184 RExC_seen_zerolen++; /* Do not optimize RE away */
8185 goto finish_meta_pat;
8187 ret = reg_node(pRExC_state, EOS);
8189 RExC_seen_zerolen++; /* Do not optimize RE away */
8190 goto finish_meta_pat;
8192 ret = reg_node(pRExC_state, CANY);
8193 RExC_seen |= REG_SEEN_CANY;
8194 *flagp |= HASWIDTH|SIMPLE;
8195 goto finish_meta_pat;
8197 ret = reg_node(pRExC_state, CLUMP);
8199 goto finish_meta_pat;
8201 switch (get_regex_charset(RExC_flags)) {
8202 case REGEX_LOCALE_CHARSET:
8205 case REGEX_UNICODE_CHARSET:
8208 case REGEX_ASCII_RESTRICTED_CHARSET:
8209 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8212 case REGEX_DEPENDS_CHARSET:
8218 ret = reg_node(pRExC_state, op);
8219 *flagp |= HASWIDTH|SIMPLE;
8220 goto finish_meta_pat;
8222 switch (get_regex_charset(RExC_flags)) {
8223 case REGEX_LOCALE_CHARSET:
8226 case REGEX_UNICODE_CHARSET:
8229 case REGEX_ASCII_RESTRICTED_CHARSET:
8230 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8233 case REGEX_DEPENDS_CHARSET:
8239 ret = reg_node(pRExC_state, op);
8240 *flagp |= HASWIDTH|SIMPLE;
8241 goto finish_meta_pat;
8243 RExC_seen_zerolen++;
8244 RExC_seen |= REG_SEEN_LOOKBEHIND;
8245 switch (get_regex_charset(RExC_flags)) {
8246 case REGEX_LOCALE_CHARSET:
8249 case REGEX_UNICODE_CHARSET:
8252 case REGEX_ASCII_RESTRICTED_CHARSET:
8253 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8256 case REGEX_DEPENDS_CHARSET:
8262 ret = reg_node(pRExC_state, op);
8263 FLAGS(ret) = get_regex_charset(RExC_flags);
8265 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8266 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8268 goto finish_meta_pat;
8270 RExC_seen_zerolen++;
8271 RExC_seen |= REG_SEEN_LOOKBEHIND;
8272 switch (get_regex_charset(RExC_flags)) {
8273 case REGEX_LOCALE_CHARSET:
8276 case REGEX_UNICODE_CHARSET:
8279 case REGEX_ASCII_RESTRICTED_CHARSET:
8280 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8283 case REGEX_DEPENDS_CHARSET:
8289 ret = reg_node(pRExC_state, op);
8290 FLAGS(ret) = get_regex_charset(RExC_flags);
8292 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8293 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8295 goto finish_meta_pat;
8297 switch (get_regex_charset(RExC_flags)) {
8298 case REGEX_LOCALE_CHARSET:
8301 case REGEX_UNICODE_CHARSET:
8304 case REGEX_ASCII_RESTRICTED_CHARSET:
8305 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8308 case REGEX_DEPENDS_CHARSET:
8314 ret = reg_node(pRExC_state, op);
8315 *flagp |= HASWIDTH|SIMPLE;
8316 goto finish_meta_pat;
8318 switch (get_regex_charset(RExC_flags)) {
8319 case REGEX_LOCALE_CHARSET:
8322 case REGEX_UNICODE_CHARSET:
8325 case REGEX_ASCII_RESTRICTED_CHARSET:
8326 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8329 case REGEX_DEPENDS_CHARSET:
8335 ret = reg_node(pRExC_state, op);
8336 *flagp |= HASWIDTH|SIMPLE;
8337 goto finish_meta_pat;
8339 switch (get_regex_charset(RExC_flags)) {
8340 case REGEX_LOCALE_CHARSET:
8343 case REGEX_ASCII_RESTRICTED_CHARSET:
8344 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8347 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8348 case REGEX_UNICODE_CHARSET:
8354 ret = reg_node(pRExC_state, op);
8355 *flagp |= HASWIDTH|SIMPLE;
8356 goto finish_meta_pat;
8358 switch (get_regex_charset(RExC_flags)) {
8359 case REGEX_LOCALE_CHARSET:
8362 case REGEX_ASCII_RESTRICTED_CHARSET:
8363 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8366 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8367 case REGEX_UNICODE_CHARSET:
8373 ret = reg_node(pRExC_state, op);
8374 *flagp |= HASWIDTH|SIMPLE;
8375 goto finish_meta_pat;
8377 ret = reg_node(pRExC_state, LNBREAK);
8378 *flagp |= HASWIDTH|SIMPLE;
8379 goto finish_meta_pat;
8381 ret = reg_node(pRExC_state, HORIZWS);
8382 *flagp |= HASWIDTH|SIMPLE;
8383 goto finish_meta_pat;
8385 ret = reg_node(pRExC_state, NHORIZWS);
8386 *flagp |= HASWIDTH|SIMPLE;
8387 goto finish_meta_pat;
8389 ret = reg_node(pRExC_state, VERTWS);
8390 *flagp |= HASWIDTH|SIMPLE;
8391 goto finish_meta_pat;
8393 ret = reg_node(pRExC_state, NVERTWS);
8394 *flagp |= HASWIDTH|SIMPLE;
8396 nextchar(pRExC_state);
8397 Set_Node_Length(ret, 2); /* MJD */
8402 char* const oldregxend = RExC_end;
8404 char* parse_start = RExC_parse - 2;
8407 if (RExC_parse[1] == '{') {
8408 /* a lovely hack--pretend we saw [\pX] instead */
8409 RExC_end = strchr(RExC_parse, '}');
8411 const U8 c = (U8)*RExC_parse;
8413 RExC_end = oldregxend;
8414 vFAIL2("Missing right brace on \\%c{}", c);
8419 RExC_end = RExC_parse + 2;
8420 if (RExC_end > oldregxend)
8421 RExC_end = oldregxend;
8425 ret = regclass(pRExC_state,depth+1);
8427 RExC_end = oldregxend;
8430 Set_Node_Offset(ret, parse_start + 2);
8431 Set_Node_Cur_Length(ret);
8432 nextchar(pRExC_state);
8433 *flagp |= HASWIDTH|SIMPLE;
8437 /* Handle \N and \N{NAME} here and not below because it can be
8438 multicharacter. join_exact() will join them up later on.
8439 Also this makes sure that things like /\N{BLAH}+/ and
8440 \N{BLAH} being multi char Just Happen. dmq*/
8442 ret= reg_namedseq(pRExC_state, NULL, flagp);
8444 case 'k': /* Handle \k<NAME> and \k'NAME' */
8447 char ch= RExC_parse[1];
8448 if (ch != '<' && ch != '\'' && ch != '{') {
8450 vFAIL2("Sequence %.2s... not terminated",parse_start);
8452 /* this pretty much dupes the code for (?P=...) in reg(), if
8453 you change this make sure you change that */
8454 char* name_start = (RExC_parse += 2);
8456 SV *sv_dat = reg_scan_name(pRExC_state,
8457 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8458 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8459 if (RExC_parse == name_start || *RExC_parse != ch)
8460 vFAIL2("Sequence %.3s... not terminated",parse_start);
8463 num = add_data( pRExC_state, 1, "S" );
8464 RExC_rxi->data->data[num]=(void*)sv_dat;
8465 SvREFCNT_inc_simple_void(sv_dat);
8469 ret = reganode(pRExC_state,
8472 : (MORE_ASCII_RESTRICTED)
8474 : (AT_LEAST_UNI_SEMANTICS)
8482 /* override incorrect value set in reganode MJD */
8483 Set_Node_Offset(ret, parse_start+1);
8484 Set_Node_Cur_Length(ret); /* MJD */
8485 nextchar(pRExC_state);
8491 case '1': case '2': case '3': case '4':
8492 case '5': case '6': case '7': case '8': case '9':
8495 bool isg = *RExC_parse == 'g';
8500 if (*RExC_parse == '{') {
8504 if (*RExC_parse == '-') {
8508 if (hasbrace && !isDIGIT(*RExC_parse)) {
8509 if (isrel) RExC_parse--;
8511 goto parse_named_seq;
8513 num = atoi(RExC_parse);
8514 if (isg && num == 0)
8515 vFAIL("Reference to invalid group 0");
8517 num = RExC_npar - num;
8519 vFAIL("Reference to nonexistent or unclosed group");
8521 if (!isg && num > 9 && num >= RExC_npar)
8524 char * const parse_start = RExC_parse - 1; /* MJD */
8525 while (isDIGIT(*RExC_parse))
8527 if (parse_start == RExC_parse - 1)
8528 vFAIL("Unterminated \\g... pattern");
8530 if (*RExC_parse != '}')
8531 vFAIL("Unterminated \\g{...} pattern");
8535 if (num > (I32)RExC_rx->nparens)
8536 vFAIL("Reference to nonexistent group");
8539 ret = reganode(pRExC_state,
8542 : (MORE_ASCII_RESTRICTED)
8544 : (AT_LEAST_UNI_SEMANTICS)
8552 /* override incorrect value set in reganode MJD */
8553 Set_Node_Offset(ret, parse_start+1);
8554 Set_Node_Cur_Length(ret); /* MJD */
8556 nextchar(pRExC_state);
8561 if (RExC_parse >= RExC_end)
8562 FAIL("Trailing \\");
8565 /* Do not generate "unrecognized" warnings here, we fall
8566 back into the quick-grab loop below */
8573 if (RExC_flags & RXf_PMf_EXTENDED) {
8574 if ( reg_skipcomment( pRExC_state ) )
8581 register STRLEN len;
8586 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8587 regnode * orig_emit;
8589 parse_start = RExC_parse - 1;
8595 orig_emit = RExC_emit; /* Save the original output node position in
8596 case we need to output a different node
8598 ret = reg_node(pRExC_state,
8599 (U8) ((! FOLD) ? EXACT
8602 : (MORE_ASCII_RESTRICTED)
8604 : (AT_LEAST_UNI_SEMANTICS)
8609 for (len = 0, p = RExC_parse - 1;
8610 len < 127 && p < RExC_end;
8613 char * const oldp = p;
8615 if (RExC_flags & RXf_PMf_EXTENDED)
8616 p = regwhite( pRExC_state, p );
8618 case LATIN_SMALL_LETTER_SHARP_S:
8619 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8620 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8621 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8622 goto normal_default;
8632 /* Literal Escapes Switch
8634 This switch is meant to handle escape sequences that
8635 resolve to a literal character.
8637 Every escape sequence that represents something
8638 else, like an assertion or a char class, is handled
8639 in the switch marked 'Special Escapes' above in this
8640 routine, but also has an entry here as anything that
8641 isn't explicitly mentioned here will be treated as
8642 an unescaped equivalent literal.
8646 /* These are all the special escapes. */
8647 case LATIN_SMALL_LETTER_SHARP_S:
8648 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8649 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8650 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8651 goto normal_default;
8652 case 'A': /* Start assertion */
8653 case 'b': case 'B': /* Word-boundary assertion*/
8654 case 'C': /* Single char !DANGEROUS! */
8655 case 'd': case 'D': /* digit class */
8656 case 'g': case 'G': /* generic-backref, pos assertion */
8657 case 'h': case 'H': /* HORIZWS */
8658 case 'k': case 'K': /* named backref, keep marker */
8659 case 'N': /* named char sequence */
8660 case 'p': case 'P': /* Unicode property */
8661 case 'R': /* LNBREAK */
8662 case 's': case 'S': /* space class */
8663 case 'v': case 'V': /* VERTWS */
8664 case 'w': case 'W': /* word class */
8665 case 'X': /* eXtended Unicode "combining character sequence" */
8666 case 'z': case 'Z': /* End of line/string assertion */
8670 /* Anything after here is an escape that resolves to a
8671 literal. (Except digits, which may or may not)
8690 ender = ASCII_TO_NATIVE('\033');
8694 ender = ASCII_TO_NATIVE('\007');
8699 STRLEN brace_len = len;
8701 const char* error_msg;
8703 bool valid = grok_bslash_o(p,
8710 RExC_parse = p; /* going to die anyway; point
8711 to exact spot of failure */
8718 if (PL_encoding && ender < 0x100) {
8719 goto recode_encoding;
8728 char* const e = strchr(p, '}');
8732 vFAIL("Missing right brace on \\x{}");
8735 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8736 | PERL_SCAN_DISALLOW_PREFIX;
8737 STRLEN numlen = e - p - 1;
8738 ender = grok_hex(p + 1, &numlen, &flags, NULL);
8745 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8747 ender = grok_hex(p, &numlen, &flags, NULL);
8750 if (PL_encoding && ender < 0x100)
8751 goto recode_encoding;
8755 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8757 case '0': case '1': case '2': case '3':case '4':
8758 case '5': case '6': case '7': case '8':case '9':
8760 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8762 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8764 ender = grok_oct(p, &numlen, &flags, NULL);
8774 if (PL_encoding && ender < 0x100)
8775 goto recode_encoding;
8779 SV* enc = PL_encoding;
8780 ender = reg_recode((const char)(U8)ender, &enc);
8781 if (!enc && SIZE_ONLY)
8782 ckWARNreg(p, "Invalid escape in the specified encoding");
8788 FAIL("Trailing \\");
8791 if (!SIZE_ONLY&& isALPHA(*p)) {
8792 /* Include any { following the alpha to emphasize
8793 * that it could be part of an escape at some point
8795 int len = (*(p + 1) == '{') ? 2 : 1;
8796 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8798 goto normal_default;
8803 if (UTF8_IS_START(*p) && UTF) {
8805 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8806 &numlen, UTF8_ALLOW_DEFAULT);
8812 } /* End of switch on the literal */
8814 /* Certain characters are problematic because their folded
8815 * length is so different from their original length that it
8816 * isn't handleable by the optimizer. They are therefore not
8817 * placed in an EXACTish node; and are here handled specially.
8818 * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8819 * putting it in a special node keeps regexec from having to
8820 * deal with a non-utf8 multi-char fold */
8822 && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
8823 && is_TRICKYFOLD_cp(ender))
8825 /* If is in middle of outputting characters into an
8826 * EXACTish node, go output what we have so far, and
8827 * position the parse so that this will be called again
8835 /* Here we are ready to output our tricky fold
8836 * character. What's done is to pretend it's in a
8837 * [bracketed] class, and let the code that deals with
8838 * those handle it, as that code has all the
8839 * intelligence necessary. First save the current
8840 * parse state, get rid of the already allocated EXACT
8841 * node that the ANYOFV node will replace, and point
8842 * the parse to a buffer which we fill with the
8843 * character we want the regclass code to think is
8845 char* const oldregxend = RExC_end;
8847 RExC_emit = orig_emit;
8848 RExC_parse = tmpbuf;
8850 tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
8851 tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
8852 RExC_end = RExC_parse + 2;
8855 tmpbuf[0] = (char) ender;
8856 RExC_end = RExC_parse + 1;
8859 ret = regclass(pRExC_state,depth+1);
8861 /* Here, have parsed the buffer. Reset the parse to
8862 * the actual input, and return */
8863 RExC_end = oldregxend;
8866 Set_Node_Offset(ret, RExC_parse);
8867 Set_Node_Cur_Length(ret);
8868 nextchar(pRExC_state);
8869 *flagp |= HASWIDTH|SIMPLE;
8874 if ( RExC_flags & RXf_PMf_EXTENDED)
8875 p = regwhite( pRExC_state, p );
8877 /* Prime the casefolded buffer. Locale rules, which apply
8878 * only to code points < 256, aren't known until execution,
8879 * so for them, just output the original character using
8881 if (LOC && ender < 256) {
8882 if (UNI_IS_INVARIANT(ender)) {
8883 *tmpbuf = (U8) ender;
8886 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8887 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8891 else if (isASCII(ender)) { /* Note: Here can't also be LOC
8893 ender = toLOWER(ender);
8894 *tmpbuf = (U8) ender;
8897 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8899 /* Locale and /aa require more selectivity about the
8900 * fold, so are handled below. Otherwise, here, just
8902 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8905 /* Under locale rules or /aa we are not to mix,
8906 * respectively, ords < 256 or ASCII with non-. So
8907 * reject folds that mix them, using only the
8908 * non-folded code point. So do the fold to a
8909 * temporary, and inspect each character in it. */
8910 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8912 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8913 U8* e = s + foldlen;
8914 bool fold_ok = TRUE;
8918 || (LOC && (UTF8_IS_INVARIANT(*s)
8919 || UTF8_IS_DOWNGRADEABLE_START(*s))))
8927 Copy(trialbuf, tmpbuf, foldlen, U8);
8931 uvuni_to_utf8(tmpbuf, ender);
8932 foldlen = UNISKIP(ender);
8936 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8941 /* Emit all the Unicode characters. */
8943 for (foldbuf = tmpbuf;
8945 foldlen -= numlen) {
8946 ender = utf8_to_uvchr(foldbuf, &numlen);
8948 const STRLEN unilen = reguni(pRExC_state, ender, s);
8951 /* In EBCDIC the numlen
8952 * and unilen can differ. */
8954 if (numlen >= foldlen)
8958 break; /* "Can't happen." */
8962 const STRLEN unilen = reguni(pRExC_state, ender, s);
8971 REGC((char)ender, s++);
8977 /* Emit all the Unicode characters. */
8979 for (foldbuf = tmpbuf;
8981 foldlen -= numlen) {
8982 ender = utf8_to_uvchr(foldbuf, &numlen);
8984 const STRLEN unilen = reguni(pRExC_state, ender, s);
8987 /* In EBCDIC the numlen
8988 * and unilen can differ. */
8990 if (numlen >= foldlen)
8998 const STRLEN unilen = reguni(pRExC_state, ender, s);
9007 REGC((char)ender, s++);
9009 loopdone: /* Jumped to when encounters something that shouldn't be in
9012 Set_Node_Cur_Length(ret); /* MJD */
9013 nextchar(pRExC_state);
9015 /* len is STRLEN which is unsigned, need to copy to signed */
9018 vFAIL("Internal disaster");
9022 if (len == 1 && UNI_IS_INVARIANT(ender))
9026 RExC_size += STR_SZ(len);
9029 RExC_emit += STR_SZ(len);
9037 /* Jumped to when an unrecognized character set is encountered */
9039 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9044 S_regwhite( RExC_state_t *pRExC_state, char *p )
9046 const char *e = RExC_end;
9048 PERL_ARGS_ASSERT_REGWHITE;
9053 else if (*p == '#') {
9062 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9070 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9071 Character classes ([:foo:]) can also be negated ([:^foo:]).
9072 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9073 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9074 but trigger failures because they are currently unimplemented. */
9076 #define POSIXCC_DONE(c) ((c) == ':')
9077 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9078 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9081 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9084 I32 namedclass = OOB_NAMEDCLASS;
9086 PERL_ARGS_ASSERT_REGPPOSIXCC;
9088 if (value == '[' && RExC_parse + 1 < RExC_end &&
9089 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9090 POSIXCC(UCHARAT(RExC_parse))) {
9091 const char c = UCHARAT(RExC_parse);
9092 char* const s = RExC_parse++;
9094 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9096 if (RExC_parse == RExC_end)
9097 /* Grandfather lone [:, [=, [. */
9100 const char* const t = RExC_parse++; /* skip over the c */
9103 if (UCHARAT(RExC_parse) == ']') {
9104 const char *posixcc = s + 1;
9105 RExC_parse++; /* skip over the ending ] */
9108 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9109 const I32 skip = t - posixcc;
9111 /* Initially switch on the length of the name. */
9114 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9115 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9118 /* Names all of length 5. */
9119 /* alnum alpha ascii blank cntrl digit graph lower
9120 print punct space upper */
9121 /* Offset 4 gives the best switch position. */
9122 switch (posixcc[4]) {
9124 if (memEQ(posixcc, "alph", 4)) /* alpha */
9125 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9128 if (memEQ(posixcc, "spac", 4)) /* space */
9129 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9132 if (memEQ(posixcc, "grap", 4)) /* graph */
9133 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9136 if (memEQ(posixcc, "asci", 4)) /* ascii */
9137 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9140 if (memEQ(posixcc, "blan", 4)) /* blank */
9141 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9144 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9145 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9148 if (memEQ(posixcc, "alnu", 4)) /* alnum */
9149 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9152 if (memEQ(posixcc, "lowe", 4)) /* lower */
9153 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9154 else if (memEQ(posixcc, "uppe", 4)) /* upper */
9155 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9158 if (memEQ(posixcc, "digi", 4)) /* digit */
9159 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9160 else if (memEQ(posixcc, "prin", 4)) /* print */
9161 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9162 else if (memEQ(posixcc, "punc", 4)) /* punct */
9163 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9168 if (memEQ(posixcc, "xdigit", 6))
9169 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9173 if (namedclass == OOB_NAMEDCLASS)
9174 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9176 assert (posixcc[skip] == ':');
9177 assert (posixcc[skip+1] == ']');
9178 } else if (!SIZE_ONLY) {
9179 /* [[=foo=]] and [[.foo.]] are still future. */
9181 /* adjust RExC_parse so the warning shows after
9183 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9185 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9188 /* Maternal grandfather:
9189 * "[:" ending in ":" but not in ":]" */
9199 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9203 PERL_ARGS_ASSERT_CHECKPOSIXCC;
9205 if (POSIXCC(UCHARAT(RExC_parse))) {
9206 const char *s = RExC_parse;
9207 const char c = *s++;
9211 if (*s && c == *s && s[1] == ']') {
9213 "POSIX syntax [%c %c] belongs inside character classes",
9216 /* [[=foo=]] and [[.foo.]] are still future. */
9217 if (POSIXCC_NOTYET(c)) {
9218 /* adjust RExC_parse so the error shows after
9220 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9222 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9228 /* No locale test, and always Unicode semantics */
9229 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
9231 for (value = 0; value < 256; value++) \
9233 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9237 case ANYOF_N##NAME: \
9238 for (value = 0; value < 256; value++) \
9240 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9245 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9246 * there are two tests passed in, to use depending on that. There aren't any
9247 * cases where the label is different from the name, so no need for that
9249 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
9251 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
9252 else if (UNI_SEMANTICS) { \
9253 for (value = 0; value < 256; value++) { \
9254 if (TEST_8(value)) stored += \
9255 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9259 for (value = 0; value < 128; value++) { \
9260 if (TEST_7(UNI_TO_NATIVE(value))) stored += \
9261 set_regclass_bit(pRExC_state, ret, \
9262 (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9268 case ANYOF_N##NAME: \
9269 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
9270 else if (UNI_SEMANTICS) { \
9271 for (value = 0; value < 256; value++) { \
9272 if (! TEST_8(value)) stored += \
9273 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9277 for (value = 0; value < 128; value++) { \
9278 if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
9279 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9281 if (AT_LEAST_ASCII_RESTRICTED) { \
9282 for (value = 128; value < 256; value++) { \
9283 stored += set_regclass_bit( \
9284 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9286 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
9289 /* For a non-ut8 target string with DEPENDS semantics, all above \
9290 * ASCII Latin1 code points match the complement of any of the \
9291 * classes. But in utf8, they have their Unicode semantics, so \
9292 * can't just set them in the bitmap, or else regexec.c will think \
9293 * they matched when they shouldn't. */ \
9294 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
9302 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9305 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9306 * Locale folding is done at run-time, so this function should not be
9307 * called for nodes that are for locales.
9309 * This function sets the bit corresponding to the fold of the input
9310 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
9313 * It also knows about the characters that are in the bitmap that have
9314 * folds that are matchable only outside it, and sets the appropriate lists
9317 * It returns the number of bits that actually changed from 0 to 1 */
9322 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9324 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9327 /* It assumes the bit for 'value' has already been set */
9328 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9329 ANYOF_BITMAP_SET(node, fold);
9332 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9333 /* Certain Latin1 characters have matches outside the bitmap. To get
9334 * here, 'value' is one of those characters. None of these matches is
9335 * valid for ASCII characters under /aa, which have been excluded by
9336 * the 'if' above. The matches fall into three categories:
9337 * 1) They are singly folded-to or -from an above 255 character, as
9338 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9340 * 2) They are part of a multi-char fold with another character in the
9341 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9342 * 3) They are part of a multi-char fold with a character not in the
9343 * bitmap, such as various ligatures.
9344 * We aren't dealing fully with multi-char folds, except we do deal
9345 * with the pattern containing a character that has a multi-char fold
9346 * (not so much the inverse).
9347 * For types 1) and 3), the matches only happen when the target string
9348 * is utf8; that's not true for 2), and we set a flag for it.
9350 * The code below adds to the passed in inversion list the single fold
9351 * closures for 'value'. The values are hard-coded here so that an
9352 * innocent-looking character class, like /[ks]/i won't have to go out
9353 * to disk to find the possible matches. XXX It would be better to
9354 * generate these via regen, in case a new version of the Unicode
9355 * standard adds new mappings, though that is not really likely. */
9360 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9364 /* LATIN SMALL LETTER LONG S */
9365 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9368 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9369 GREEK_SMALL_LETTER_MU);
9370 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9371 GREEK_CAPITAL_LETTER_MU);
9373 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9374 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9376 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9377 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
9378 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9379 PL_fold_latin1[value]);
9382 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9383 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9384 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9386 case LATIN_SMALL_LETTER_SHARP_S:
9387 /* 0x1E9E is LATIN CAPITAL LETTER SHARP S */
9388 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x1E9E);
9390 /* Under /a, /d, and /u, this can match the two chars "ss" */
9391 if (! MORE_ASCII_RESTRICTED) {
9392 add_alternate(alternate_ptr, (U8 *) "ss", 2);
9394 /* And under /u or /a, it can match even if the target is
9396 if (AT_LEAST_UNI_SEMANTICS) {
9397 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9405 /* These all are targets of multi-character folds, which can
9406 * occur with only non-Latin1 characters in the fold, so they
9407 * can match if the target string isn't UTF-8 */
9408 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9416 /* These all are targets of multi-character folds, which occur
9417 * only with a non-Latin1 character as part of the fold, so
9418 * they can't match unless the target string is in UTF-8, so no
9419 * action here is necessary */
9422 /* Use deprecated warning to increase the chances of this
9424 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9428 else if (DEPENDS_SEMANTICS
9430 && PL_fold_latin1[value] != value)
9432 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9433 * folds only when the target string is in UTF-8. We add the fold
9434 * here to the list of things to match outside the bitmap, which
9435 * won't be looked at unless it is UTF8 (or else if something else
9436 * says to look even if not utf8, but those things better not happen
9437 * under DEPENDS semantics. */
9438 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
9445 PERL_STATIC_INLINE U8
9446 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9448 /* This inline function sets a bit in the bitmap if not already set, and if
9449 * appropriate, its fold, returning the number of bits that actually
9450 * changed from 0 to 1 */
9454 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9456 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
9460 ANYOF_BITMAP_SET(node, value);
9463 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
9464 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
9471 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9473 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9474 * alternate list, pointed to by 'alternate_ptr'. This is an array of
9475 * the multi-character folds of characters in the node */
9478 PERL_ARGS_ASSERT_ADD_ALTERNATE;
9480 if (! *alternate_ptr) {
9481 *alternate_ptr = newAV();
9483 sv = newSVpvn_utf8((char*)string, len, TRUE);
9484 av_push(*alternate_ptr, sv);
9489 parse a class specification and produce either an ANYOF node that
9490 matches the pattern or perhaps will be optimized into an EXACTish node
9491 instead. The node contains a bit map for the first 256 characters, with the
9492 corresponding bit set if that character is in the list. For characters
9493 above 255, a range list is used */
9496 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9499 register UV nextvalue;
9500 register IV prevvalue = OOB_UNICODE;
9501 register IV range = 0;
9502 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9503 register regnode *ret;
9506 char *rangebegin = NULL;
9507 bool need_class = 0;
9509 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9510 than just initialized. */
9513 /* code points this node matches that can't be stored in the bitmap */
9514 HV* nonbitmap = NULL;
9516 /* The items that are to match that aren't stored in the bitmap, but are a
9517 * result of things that are stored there. This is the fold closure of
9518 * such a character, either because it has DEPENDS semantics and shouldn't
9519 * be matched unless the target string is utf8, or is a code point that is
9520 * too large for the bit map, as for example, the fold of the MICRO SIGN is
9521 * above 255. This all is solely for performance reasons. By having this
9522 * code know the outside-the-bitmap folds that the bitmapped characters are
9523 * involved with, we don't have to go out to disk to find the list of
9524 * matches, unless the character class includes code points that aren't
9525 * storable in the bit map. That means that a character class with an 's'
9526 * in it, for example, doesn't need to go out to disk to find everything
9527 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
9528 * empty unless there is something whose fold we don't know about, and will
9529 * have to go out to the disk to find. */
9530 HV* l1_fold_invlist = NULL;
9532 /* List of multi-character folds that are matched by this node */
9533 AV* unicode_alternate = NULL;
9535 UV literal_endpoint = 0;
9537 UV stored = 0; /* how many chars stored in the bitmap */
9539 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9540 case we need to change the emitted regop to an EXACT. */
9541 const char * orig_parse = RExC_parse;
9542 GET_RE_DEBUG_FLAGS_DECL;
9544 PERL_ARGS_ASSERT_REGCLASS;
9546 PERL_UNUSED_ARG(depth);
9549 DEBUG_PARSE("clas");
9551 /* Assume we are going to generate an ANYOF node. */
9552 ret = reganode(pRExC_state, ANYOF, 0);
9556 ANYOF_FLAGS(ret) = 0;
9559 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
9563 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9567 RExC_size += ANYOF_SKIP;
9568 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9571 RExC_emit += ANYOF_SKIP;
9573 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9575 ANYOF_BITMAP_ZERO(ret);
9576 listsv = newSVpvs("# comment\n");
9577 initial_listsv_len = SvCUR(listsv);
9580 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9582 if (!SIZE_ONLY && POSIXCC(nextvalue))
9583 checkposixcc(pRExC_state);
9585 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9586 if (UCHARAT(RExC_parse) == ']')
9590 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9594 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9597 rangebegin = RExC_parse;
9599 value = utf8n_to_uvchr((U8*)RExC_parse,
9600 RExC_end - RExC_parse,
9601 &numlen, UTF8_ALLOW_DEFAULT);
9602 RExC_parse += numlen;
9605 value = UCHARAT(RExC_parse++);
9607 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9608 if (value == '[' && POSIXCC(nextvalue))
9609 namedclass = regpposixcc(pRExC_state, value);
9610 else if (value == '\\') {
9612 value = utf8n_to_uvchr((U8*)RExC_parse,
9613 RExC_end - RExC_parse,
9614 &numlen, UTF8_ALLOW_DEFAULT);
9615 RExC_parse += numlen;
9618 value = UCHARAT(RExC_parse++);
9619 /* Some compilers cannot handle switching on 64-bit integer
9620 * values, therefore value cannot be an UV. Yes, this will
9621 * be a problem later if we want switch on Unicode.
9622 * A similar issue a little bit later when switching on
9623 * namedclass. --jhi */
9624 switch ((I32)value) {
9625 case 'w': namedclass = ANYOF_ALNUM; break;
9626 case 'W': namedclass = ANYOF_NALNUM; break;
9627 case 's': namedclass = ANYOF_SPACE; break;
9628 case 'S': namedclass = ANYOF_NSPACE; break;
9629 case 'd': namedclass = ANYOF_DIGIT; break;
9630 case 'D': namedclass = ANYOF_NDIGIT; break;
9631 case 'v': namedclass = ANYOF_VERTWS; break;
9632 case 'V': namedclass = ANYOF_NVERTWS; break;
9633 case 'h': namedclass = ANYOF_HORIZWS; break;
9634 case 'H': namedclass = ANYOF_NHORIZWS; break;
9635 case 'N': /* Handle \N{NAME} in class */
9637 /* We only pay attention to the first char of
9638 multichar strings being returned. I kinda wonder
9639 if this makes sense as it does change the behaviour
9640 from earlier versions, OTOH that behaviour was broken
9642 UV v; /* value is register so we cant & it /grrr */
9643 if (reg_namedseq(pRExC_state, &v, NULL)) {
9653 if (RExC_parse >= RExC_end)
9654 vFAIL2("Empty \\%c{}", (U8)value);
9655 if (*RExC_parse == '{') {
9656 const U8 c = (U8)value;
9657 e = strchr(RExC_parse++, '}');
9659 vFAIL2("Missing right brace on \\%c{}", c);
9660 while (isSPACE(UCHARAT(RExC_parse)))
9662 if (e == RExC_parse)
9663 vFAIL2("Empty \\%c{}", c);
9665 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9673 if (UCHARAT(RExC_parse) == '^') {
9676 value = value == 'p' ? 'P' : 'p'; /* toggle */
9677 while (isSPACE(UCHARAT(RExC_parse))) {
9683 /* Add the property name to the list. If /i matching, give
9684 * a different name which consists of the normal name
9685 * sandwiched between two underscores and '_i'. The design
9686 * is discussed in the commit message for this. */
9687 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9688 (value=='p' ? '+' : '!'),
9697 /* The \p could match something in the Latin1 range, hence
9698 * something that isn't utf8 */
9699 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9700 namedclass = ANYOF_MAX; /* no official name, but it's named */
9702 /* \p means they want Unicode semantics */
9703 RExC_uni_semantics = 1;
9706 case 'n': value = '\n'; break;
9707 case 'r': value = '\r'; break;
9708 case 't': value = '\t'; break;
9709 case 'f': value = '\f'; break;
9710 case 'b': value = '\b'; break;
9711 case 'e': value = ASCII_TO_NATIVE('\033');break;
9712 case 'a': value = ASCII_TO_NATIVE('\007');break;
9714 RExC_parse--; /* function expects to be pointed at the 'o' */
9716 const char* error_msg;
9717 bool valid = grok_bslash_o(RExC_parse,
9722 RExC_parse += numlen;
9727 if (PL_encoding && value < 0x100) {
9728 goto recode_encoding;
9732 if (*RExC_parse == '{') {
9733 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9734 | PERL_SCAN_DISALLOW_PREFIX;
9735 char * const e = strchr(RExC_parse++, '}');
9737 vFAIL("Missing right brace on \\x{}");
9739 numlen = e - RExC_parse;
9740 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9744 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9746 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9747 RExC_parse += numlen;
9749 if (PL_encoding && value < 0x100)
9750 goto recode_encoding;
9753 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9755 case '0': case '1': case '2': case '3': case '4':
9756 case '5': case '6': case '7':
9758 /* Take 1-3 octal digits */
9759 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9761 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9762 RExC_parse += numlen;
9763 if (PL_encoding && value < 0x100)
9764 goto recode_encoding;
9769 SV* enc = PL_encoding;
9770 value = reg_recode((const char)(U8)value, &enc);
9771 if (!enc && SIZE_ONLY)
9772 ckWARNreg(RExC_parse,
9773 "Invalid escape in the specified encoding");
9777 /* Allow \_ to not give an error */
9778 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9779 ckWARN2reg(RExC_parse,
9780 "Unrecognized escape \\%c in character class passed through",
9785 } /* end of \blah */
9791 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9793 /* What matches in a locale is not known until runtime, so need to
9794 * (one time per class) allocate extra space to pass to regexec.
9795 * The space will contain a bit for each named class that is to be
9796 * matched against. This isn't needed for \p{} and pseudo-classes,
9797 * as they are not affected by locale, and hence are dealt with
9799 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9802 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9805 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9806 ANYOF_CLASS_ZERO(ret);
9808 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9811 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
9812 * literal, as is the character that began the false range, i.e.
9813 * the 'a' in the examples */
9817 RExC_parse >= rangebegin ?
9818 RExC_parse - rangebegin : 0;
9819 ckWARN4reg(RExC_parse,
9820 "False [] range \"%*.*s\"",
9824 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9825 if (prevvalue < 256) {
9827 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
9830 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
9834 range = 0; /* this was not a true range */
9840 const char *what = NULL;
9843 /* Possible truncation here but in some 64-bit environments
9844 * the compiler gets heartburn about switch on 64-bit values.
9845 * A similar issue a little earlier when switching on value.
9847 switch ((I32)namedclass) {
9849 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9850 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9851 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9852 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9853 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9854 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9855 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9856 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9857 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9858 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9859 /* \s, \w match all unicode if utf8. */
9860 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9861 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9862 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9863 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9864 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9867 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9869 for (value = 0; value < 128; value++)
9871 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9874 what = NULL; /* Doesn't match outside ascii, so
9875 don't want to add +utf8:: */
9879 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9881 for (value = 128; value < 256; value++)
9883 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9885 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9891 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9893 /* consecutive digits assumed */
9894 for (value = '0'; value <= '9'; value++)
9896 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9903 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9905 /* consecutive digits assumed */
9906 for (value = 0; value < '0'; value++)
9908 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9909 for (value = '9' + 1; value < 256; value++)
9911 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9915 if (AT_LEAST_ASCII_RESTRICTED ) {
9916 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9920 /* this is to handle \p and \P */
9923 vFAIL("Invalid [::] class");
9926 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9927 /* Strings such as "+utf8::isWord\n" */
9928 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9933 } /* end of namedclass \blah */
9936 if (prevvalue > (IV)value) /* b-a */ {
9937 const int w = RExC_parse - rangebegin;
9938 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9939 range = 0; /* not a valid range */
9943 prevvalue = value; /* save the beginning of the range */
9944 if (RExC_parse+1 < RExC_end
9945 && *RExC_parse == '-'
9946 && RExC_parse[1] != ']')
9950 /* a bad range like \w-, [:word:]- ? */
9951 if (namedclass > OOB_NAMEDCLASS) {
9952 if (ckWARN(WARN_REGEXP)) {
9954 RExC_parse >= rangebegin ?
9955 RExC_parse - rangebegin : 0;
9957 "False [] range \"%*.*s\"",
9962 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9964 range = 1; /* yeah, it's a range! */
9965 continue; /* but do it the next time */
9969 /* non-Latin1 code point implies unicode semantics. Must be set in
9970 * pass1 so is there for the whole of pass 2 */
9972 RExC_uni_semantics = 1;
9975 /* now is the next time */
9977 if (prevvalue < 256) {
9978 const IV ceilvalue = value < 256 ? value : 255;
9981 /* In EBCDIC [\x89-\x91] should include
9982 * the \x8e but [i-j] should not. */
9983 if (literal_endpoint == 2 &&
9984 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9985 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9987 if (isLOWER(prevvalue)) {
9988 for (i = prevvalue; i <= ceilvalue; i++)
9989 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9991 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9994 for (i = prevvalue; i <= ceilvalue; i++)
9995 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9997 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10003 for (i = prevvalue; i <= ceilvalue; i++) {
10004 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10008 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
10009 const UV natvalue = NATIVE_TO_UNI(value);
10010 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
10013 literal_endpoint = 0;
10017 range = 0; /* this range (if it was one) is done now */
10024 /****** !SIZE_ONLY AFTER HERE *********/
10026 /* If folding and there are code points above 255, we calculate all
10027 * characters that could fold to or from the ones already on the list */
10028 if (FOLD && nonbitmap) {
10031 HV* fold_intersection;
10034 /* This is a list of all the characters that participate in folds
10035 * (except marks, etc in multi-char folds */
10036 if (! PL_utf8_foldable) {
10037 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
10038 PL_utf8_foldable = _swash_to_invlist(swash);
10041 /* This is a hash that for a particular fold gives all characters
10042 * that are involved in it */
10043 if (! PL_utf8_foldclosures) {
10045 /* If we were unable to find any folds, then we likely won't be
10046 * able to find the closures. So just create an empty list.
10047 * Folding will effectively be restricted to the non-Unicode rules
10048 * hard-coded into Perl. (This case happens legitimately during
10049 * compilation of Perl itself before the Unicode tables are
10051 if (invlist_len(PL_utf8_foldable) == 0) {
10052 PL_utf8_foldclosures = _new_invlist(0);
10054 /* If the folds haven't been read in, call a fold function
10056 if (! PL_utf8_tofold) {
10057 U8 dummy[UTF8_MAXBYTES+1];
10059 to_utf8_fold((U8*) "A", dummy, &dummy_len);
10061 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10065 /* Only the characters in this class that participate in folds need
10066 * be checked. Get the intersection of this class and all the
10067 * possible characters that are foldable. This can quickly narrow
10068 * down a large class */
10069 fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
10071 /* Now look at the foldable characters in this class individually */
10072 fold_list = invlist_array(fold_intersection);
10073 for (i = 0; i < invlist_len(fold_intersection); i++) {
10076 /* The next entry is the beginning of the range that is in the
10078 UV start = fold_list[i++];
10081 /* The next entry is the beginning of the next range, which
10082 * isn't in the class, so the end of the current range is one
10083 * less than that */
10084 UV end = fold_list[i] - 1;
10086 /* Look at every character in the range */
10087 for (j = start; j <= end; j++) {
10090 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10092 const UV f = to_uni_fold(j, foldbuf, &foldlen);
10094 if (foldlen > (STRLEN)UNISKIP(f)) {
10096 /* Any multicharacter foldings (disallowed in
10097 * lookbehind patterns) require the following
10098 * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10099 * E folds into "pq" and F folds into "rst", all other
10100 * characters fold to single characters. We save away
10101 * these multicharacter foldings, to be later saved as
10102 * part of the additional "s" data. */
10103 if (! RExC_in_lookbehind) {
10105 U8* e = foldbuf + foldlen;
10107 /* If any of the folded characters of this are in
10108 * the Latin1 range, tell the regex engine that
10109 * this can match a non-utf8 target string. The
10110 * only multi-byte fold whose source is in the
10111 * Latin1 range (U+00DF) applies only when the
10112 * target string is utf8, or under unicode rules */
10113 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10116 /* Can't mix ascii with non- under /aa */
10117 if (MORE_ASCII_RESTRICTED
10118 && (isASCII(*loc) != isASCII(j)))
10120 goto end_multi_fold;
10122 if (UTF8_IS_INVARIANT(*loc)
10123 || UTF8_IS_DOWNGRADEABLE_START(*loc))
10125 /* Can't mix above and below 256 under
10128 goto end_multi_fold;
10131 |= ANYOF_NONBITMAP_NON_UTF8;
10134 loc += UTF8SKIP(loc);
10138 add_alternate(&unicode_alternate, foldbuf, foldlen);
10143 /* Single character fold. Add everything in its fold
10144 * closure to the list that this node should match */
10147 /* The fold closures data structure is a hash with the
10148 * keys being every character that is folded to, like
10149 * 'k', and the values each an array of everything that
10150 * folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
10151 if ((listp = hv_fetch(PL_utf8_foldclosures,
10152 (char *) foldbuf, foldlen, FALSE)))
10154 AV* list = (AV*) *listp;
10156 for (k = 0; k <= av_len(list); k++) {
10157 SV** c_p = av_fetch(list, k, FALSE);
10160 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10164 /* /aa doesn't allow folds between ASCII and
10165 * non-; /l doesn't allow them between above
10167 if ((MORE_ASCII_RESTRICTED
10168 && (isASCII(c) != isASCII(j)))
10169 || (LOC && ((c < 256) != (j < 256))))
10174 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10175 stored += set_regclass_bit(pRExC_state,
10178 &l1_fold_invlist, &unicode_alternate);
10180 /* It may be that the code point is already
10181 * in this range or already in the bitmap,
10182 * in which case we need do nothing */
10183 else if ((c < start || c > end)
10185 || ! ANYOF_BITMAP_TEST(ret, c)))
10187 nonbitmap = add_cp_to_invlist(nonbitmap, c);
10194 invlist_destroy(fold_intersection);
10197 /* Combine the two lists into one. */
10198 if (l1_fold_invlist) {
10200 nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
10203 nonbitmap = l1_fold_invlist;
10207 /* Here, we have calculated what code points should be in the character
10208 * class. Now we can see about various optimizations. Fold calculation
10209 * needs to take place before inversion. Otherwise /[^k]/i would invert to
10210 * include K, which under /i would match k. */
10212 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
10213 * set the FOLD flag yet, so this this does optimize those. It doesn't
10214 * optimize locale. Doing so perhaps could be done as long as there is
10215 * nothing like \w in it; some thought also would have to be given to the
10216 * interaction with above 0x100 chars */
10218 && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10219 && ! unicode_alternate
10221 && SvCUR(listsv) == initial_listsv_len)
10223 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10224 ANYOF_BITMAP(ret)[value] ^= 0xFF;
10225 stored = 256 - stored;
10227 /* The inversion means that everything above 255 is matched; and at the
10228 * same time we clear the invert flag */
10229 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
10232 /* Folding in the bitmap is taken care of above, but not for locale (for
10233 * which we have to wait to see what folding is in effect at runtime), and
10234 * for things not in the bitmap. Set run-time fold flag for these */
10235 if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
10236 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10239 /* A single character class can be "optimized" into an EXACTish node.
10240 * Note that since we don't currently count how many characters there are
10241 * outside the bitmap, we are XXX missing optimization possibilities for
10242 * them. This optimization can't happen unless this is a truly single
10243 * character class, which means that it can't be an inversion into a
10244 * many-character class, and there must be no possibility of there being
10245 * things outside the bitmap. 'stored' (only) for locales doesn't include
10246 * \w, etc, so have to make a special test that they aren't present
10248 * Similarly A 2-character class of the very special form like [bB] can be
10249 * optimized into an EXACTFish node, but only for non-locales, and for
10250 * characters which only have the two folds; so things like 'fF' and 'Ii'
10251 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10254 && ! unicode_alternate
10255 && SvCUR(listsv) == initial_listsv_len
10256 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
10257 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10258 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10259 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10260 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10261 /* If the latest code point has a fold whose
10262 * bit is set, it must be the only other one */
10263 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10264 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10266 /* Note that the information needed to decide to do this optimization
10267 * is not currently available until the 2nd pass, and that the actually
10268 * used EXACTish node takes less space than the calculated ANYOF node,
10269 * and hence the amount of space calculated in the first pass is larger
10270 * than actually used, so this optimization doesn't gain us any space.
10271 * But an EXACT node is faster than an ANYOF node, and can be combined
10272 * with any adjacent EXACT nodes later by the optimizer for further
10273 * gains. The speed of executing an EXACTF is similar to an ANYOF
10274 * node, so the optimization advantage comes from the ability to join
10275 * it to adjacent EXACT nodes */
10277 const char * cur_parse= RExC_parse;
10279 RExC_emit = (regnode *)orig_emit;
10280 RExC_parse = (char *)orig_parse;
10284 /* A locale node with one point can be folded; all the other cases
10285 * with folding will have two points, since we calculate them above
10287 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10293 } /* else 2 chars in the bit map: the folds of each other */
10294 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10296 /* To join adjacent nodes, they must be the exact EXACTish type.
10297 * Try to use the most likely type, by using EXACTFU if the regex
10298 * calls for them, or is required because the character is
10302 else { /* Otherwise, more likely to be EXACTF type */
10306 ret = reg_node(pRExC_state, op);
10307 RExC_parse = (char *)cur_parse;
10308 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10309 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10310 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10312 RExC_emit += STR_SZ(2);
10315 *STRING(ret)= (char)value;
10317 RExC_emit += STR_SZ(1);
10319 SvREFCNT_dec(listsv);
10324 UV* nonbitmap_array = invlist_array(nonbitmap);
10325 UV nonbitmap_len = invlist_len(nonbitmap);
10328 /* Here have the full list of items to match that aren't in the
10329 * bitmap. Convert to the structure that the rest of the code is
10330 * expecting. XXX That rest of the code should convert to this
10332 for (i = 0; i < nonbitmap_len; i++) {
10334 /* The next entry is the beginning of the range that is in the
10336 UV start = nonbitmap_array[i++];
10339 /* The next entry is the beginning of the next range, which isn't
10340 * in the class, so the end of the current range is one less than
10341 * that. But if there is no next range, it means that the range
10342 * begun by 'start' extends to infinity, which for this platform
10343 * ends at UV_MAX */
10344 if (i == nonbitmap_len) {
10348 end = nonbitmap_array[i] - 1;
10351 if (start == end) {
10352 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10355 /* The \t sets the whole range */
10356 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10361 invlist_destroy(nonbitmap);
10364 if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10365 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10366 SvREFCNT_dec(listsv);
10367 SvREFCNT_dec(unicode_alternate);
10371 AV * const av = newAV();
10373 /* The 0th element stores the character class description
10374 * in its textual form: used later (regexec.c:Perl_regclass_swash())
10375 * to initialize the appropriate swash (which gets stored in
10376 * the 1st element), and also useful for dumping the regnode.
10377 * The 2nd element stores the multicharacter foldings,
10378 * used later (regexec.c:S_reginclass()). */
10379 av_store(av, 0, listsv);
10380 av_store(av, 1, NULL);
10381 av_store(av, 2, MUTABLE_SV(unicode_alternate));
10382 if (unicode_alternate) { /* This node is variable length */
10385 rv = newRV_noinc(MUTABLE_SV(av));
10386 n = add_data(pRExC_state, 1, "s");
10387 RExC_rxi->data->data[n] = (void*)rv;
10395 /* reg_skipcomment()
10397 Absorbs an /x style # comments from the input stream.
10398 Returns true if there is more text remaining in the stream.
10399 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10400 terminates the pattern without including a newline.
10402 Note its the callers responsibility to ensure that we are
10403 actually in /x mode
10408 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10412 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10414 while (RExC_parse < RExC_end)
10415 if (*RExC_parse++ == '\n') {
10420 /* we ran off the end of the pattern without ending
10421 the comment, so we have to add an \n when wrapping */
10422 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10430 Advances the parse position, and optionally absorbs
10431 "whitespace" from the inputstream.
10433 Without /x "whitespace" means (?#...) style comments only,
10434 with /x this means (?#...) and # comments and whitespace proper.
10436 Returns the RExC_parse point from BEFORE the scan occurs.
10438 This is the /x friendly way of saying RExC_parse++.
10442 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10444 char* const retval = RExC_parse++;
10446 PERL_ARGS_ASSERT_NEXTCHAR;
10449 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10450 RExC_parse[2] == '#') {
10451 while (*RExC_parse != ')') {
10452 if (RExC_parse == RExC_end)
10453 FAIL("Sequence (?#... not terminated");
10459 if (RExC_flags & RXf_PMf_EXTENDED) {
10460 if (isSPACE(*RExC_parse)) {
10464 else if (*RExC_parse == '#') {
10465 if ( reg_skipcomment( pRExC_state ) )
10474 - reg_node - emit a node
10476 STATIC regnode * /* Location. */
10477 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10480 register regnode *ptr;
10481 regnode * const ret = RExC_emit;
10482 GET_RE_DEBUG_FLAGS_DECL;
10484 PERL_ARGS_ASSERT_REG_NODE;
10487 SIZE_ALIGN(RExC_size);
10491 if (RExC_emit >= RExC_emit_bound)
10492 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10494 NODE_ALIGN_FILL(ret);
10496 FILL_ADVANCE_NODE(ptr, op);
10497 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
10498 #ifdef RE_TRACK_PATTERN_OFFSETS
10499 if (RExC_offsets) { /* MJD */
10500 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10501 "reg_node", __LINE__,
10503 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10504 ? "Overwriting end of array!\n" : "OK",
10505 (UV)(RExC_emit - RExC_emit_start),
10506 (UV)(RExC_parse - RExC_start),
10507 (UV)RExC_offsets[0]));
10508 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10516 - reganode - emit a node with an argument
10518 STATIC regnode * /* Location. */
10519 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10522 register regnode *ptr;
10523 regnode * const ret = RExC_emit;
10524 GET_RE_DEBUG_FLAGS_DECL;
10526 PERL_ARGS_ASSERT_REGANODE;
10529 SIZE_ALIGN(RExC_size);
10534 assert(2==regarglen[op]+1);
10536 Anything larger than this has to allocate the extra amount.
10537 If we changed this to be:
10539 RExC_size += (1 + regarglen[op]);
10541 then it wouldn't matter. Its not clear what side effect
10542 might come from that so its not done so far.
10547 if (RExC_emit >= RExC_emit_bound)
10548 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10550 NODE_ALIGN_FILL(ret);
10552 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10553 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
10554 #ifdef RE_TRACK_PATTERN_OFFSETS
10555 if (RExC_offsets) { /* MJD */
10556 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10560 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10561 "Overwriting end of array!\n" : "OK",
10562 (UV)(RExC_emit - RExC_emit_start),
10563 (UV)(RExC_parse - RExC_start),
10564 (UV)RExC_offsets[0]));
10565 Set_Cur_Node_Offset;
10573 - reguni - emit (if appropriate) a Unicode character
10576 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10580 PERL_ARGS_ASSERT_REGUNI;
10582 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10586 - reginsert - insert an operator in front of already-emitted operand
10588 * Means relocating the operand.
10591 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10594 register regnode *src;
10595 register regnode *dst;
10596 register regnode *place;
10597 const int offset = regarglen[(U8)op];
10598 const int size = NODE_STEP_REGNODE + offset;
10599 GET_RE_DEBUG_FLAGS_DECL;
10601 PERL_ARGS_ASSERT_REGINSERT;
10602 PERL_UNUSED_ARG(depth);
10603 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10604 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10613 if (RExC_open_parens) {
10615 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10616 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10617 if ( RExC_open_parens[paren] >= opnd ) {
10618 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10619 RExC_open_parens[paren] += size;
10621 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10623 if ( RExC_close_parens[paren] >= opnd ) {
10624 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10625 RExC_close_parens[paren] += size;
10627 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10632 while (src > opnd) {
10633 StructCopy(--src, --dst, regnode);
10634 #ifdef RE_TRACK_PATTERN_OFFSETS
10635 if (RExC_offsets) { /* MJD 20010112 */
10636 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10640 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10641 ? "Overwriting end of array!\n" : "OK",
10642 (UV)(src - RExC_emit_start),
10643 (UV)(dst - RExC_emit_start),
10644 (UV)RExC_offsets[0]));
10645 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10646 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10652 place = opnd; /* Op node, where operand used to be. */
10653 #ifdef RE_TRACK_PATTERN_OFFSETS
10654 if (RExC_offsets) { /* MJD */
10655 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10659 (UV)(place - RExC_emit_start) > RExC_offsets[0]
10660 ? "Overwriting end of array!\n" : "OK",
10661 (UV)(place - RExC_emit_start),
10662 (UV)(RExC_parse - RExC_start),
10663 (UV)RExC_offsets[0]));
10664 Set_Node_Offset(place, RExC_parse);
10665 Set_Node_Length(place, 1);
10668 src = NEXTOPER(place);
10669 FILL_ADVANCE_NODE(place, op);
10670 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
10671 Zero(src, offset, regnode);
10675 - regtail - set the next-pointer at the end of a node chain of p to val.
10676 - SEE ALSO: regtail_study
10678 /* TODO: All three parms should be const */
10680 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10683 register regnode *scan;
10684 GET_RE_DEBUG_FLAGS_DECL;
10686 PERL_ARGS_ASSERT_REGTAIL;
10688 PERL_UNUSED_ARG(depth);
10694 /* Find last node. */
10697 regnode * const temp = regnext(scan);
10699 SV * const mysv=sv_newmortal();
10700 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10701 regprop(RExC_rx, mysv, scan);
10702 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10703 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10704 (temp == NULL ? "->" : ""),
10705 (temp == NULL ? PL_reg_name[OP(val)] : "")
10713 if (reg_off_by_arg[OP(scan)]) {
10714 ARG_SET(scan, val - scan);
10717 NEXT_OFF(scan) = val - scan;
10723 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10724 - Look for optimizable sequences at the same time.
10725 - currently only looks for EXACT chains.
10727 This is experimental code. The idea is to use this routine to perform
10728 in place optimizations on branches and groups as they are constructed,
10729 with the long term intention of removing optimization from study_chunk so
10730 that it is purely analytical.
10732 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10733 to control which is which.
10736 /* TODO: All four parms should be const */
10739 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10742 register regnode *scan;
10744 #ifdef EXPERIMENTAL_INPLACESCAN
10747 GET_RE_DEBUG_FLAGS_DECL;
10749 PERL_ARGS_ASSERT_REGTAIL_STUDY;
10755 /* Find last node. */
10759 regnode * const temp = regnext(scan);
10760 #ifdef EXPERIMENTAL_INPLACESCAN
10761 if (PL_regkind[OP(scan)] == EXACT)
10762 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10766 switch (OP(scan)) {
10772 if( exact == PSEUDO )
10774 else if ( exact != OP(scan) )
10783 SV * const mysv=sv_newmortal();
10784 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10785 regprop(RExC_rx, mysv, scan);
10786 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10787 SvPV_nolen_const(mysv),
10788 REG_NODE_NUM(scan),
10789 PL_reg_name[exact]);
10796 SV * const mysv_val=sv_newmortal();
10797 DEBUG_PARSE_MSG("");
10798 regprop(RExC_rx, mysv_val, val);
10799 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10800 SvPV_nolen_const(mysv_val),
10801 (IV)REG_NODE_NUM(val),
10805 if (reg_off_by_arg[OP(scan)]) {
10806 ARG_SET(scan, val - scan);
10809 NEXT_OFF(scan) = val - scan;
10817 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10821 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10827 for (bit=0; bit<32; bit++) {
10828 if (flags & (1<<bit)) {
10829 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
10832 if (!set++ && lead)
10833 PerlIO_printf(Perl_debug_log, "%s",lead);
10834 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10837 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10838 if (!set++ && lead) {
10839 PerlIO_printf(Perl_debug_log, "%s",lead);
10842 case REGEX_UNICODE_CHARSET:
10843 PerlIO_printf(Perl_debug_log, "UNICODE");
10845 case REGEX_LOCALE_CHARSET:
10846 PerlIO_printf(Perl_debug_log, "LOCALE");
10848 case REGEX_ASCII_RESTRICTED_CHARSET:
10849 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10851 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10852 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10855 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10861 PerlIO_printf(Perl_debug_log, "\n");
10863 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10869 Perl_regdump(pTHX_ const regexp *r)
10873 SV * const sv = sv_newmortal();
10874 SV *dsv= sv_newmortal();
10875 RXi_GET_DECL(r,ri);
10876 GET_RE_DEBUG_FLAGS_DECL;
10878 PERL_ARGS_ASSERT_REGDUMP;
10880 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10882 /* Header fields of interest. */
10883 if (r->anchored_substr) {
10884 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10885 RE_SV_DUMPLEN(r->anchored_substr), 30);
10886 PerlIO_printf(Perl_debug_log,
10887 "anchored %s%s at %"IVdf" ",
10888 s, RE_SV_TAIL(r->anchored_substr),
10889 (IV)r->anchored_offset);
10890 } else if (r->anchored_utf8) {
10891 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10892 RE_SV_DUMPLEN(r->anchored_utf8), 30);
10893 PerlIO_printf(Perl_debug_log,
10894 "anchored utf8 %s%s at %"IVdf" ",
10895 s, RE_SV_TAIL(r->anchored_utf8),
10896 (IV)r->anchored_offset);
10898 if (r->float_substr) {
10899 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10900 RE_SV_DUMPLEN(r->float_substr), 30);
10901 PerlIO_printf(Perl_debug_log,
10902 "floating %s%s at %"IVdf"..%"UVuf" ",
10903 s, RE_SV_TAIL(r->float_substr),
10904 (IV)r->float_min_offset, (UV)r->float_max_offset);
10905 } else if (r->float_utf8) {
10906 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10907 RE_SV_DUMPLEN(r->float_utf8), 30);
10908 PerlIO_printf(Perl_debug_log,
10909 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10910 s, RE_SV_TAIL(r->float_utf8),
10911 (IV)r->float_min_offset, (UV)r->float_max_offset);
10913 if (r->check_substr || r->check_utf8)
10914 PerlIO_printf(Perl_debug_log,
10916 (r->check_substr == r->float_substr
10917 && r->check_utf8 == r->float_utf8
10918 ? "(checking floating" : "(checking anchored"));
10919 if (r->extflags & RXf_NOSCAN)
10920 PerlIO_printf(Perl_debug_log, " noscan");
10921 if (r->extflags & RXf_CHECK_ALL)
10922 PerlIO_printf(Perl_debug_log, " isall");
10923 if (r->check_substr || r->check_utf8)
10924 PerlIO_printf(Perl_debug_log, ") ");
10926 if (ri->regstclass) {
10927 regprop(r, sv, ri->regstclass);
10928 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10930 if (r->extflags & RXf_ANCH) {
10931 PerlIO_printf(Perl_debug_log, "anchored");
10932 if (r->extflags & RXf_ANCH_BOL)
10933 PerlIO_printf(Perl_debug_log, "(BOL)");
10934 if (r->extflags & RXf_ANCH_MBOL)
10935 PerlIO_printf(Perl_debug_log, "(MBOL)");
10936 if (r->extflags & RXf_ANCH_SBOL)
10937 PerlIO_printf(Perl_debug_log, "(SBOL)");
10938 if (r->extflags & RXf_ANCH_GPOS)
10939 PerlIO_printf(Perl_debug_log, "(GPOS)");
10940 PerlIO_putc(Perl_debug_log, ' ');
10942 if (r->extflags & RXf_GPOS_SEEN)
10943 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10944 if (r->intflags & PREGf_SKIP)
10945 PerlIO_printf(Perl_debug_log, "plus ");
10946 if (r->intflags & PREGf_IMPLICIT)
10947 PerlIO_printf(Perl_debug_log, "implicit ");
10948 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10949 if (r->extflags & RXf_EVAL_SEEN)
10950 PerlIO_printf(Perl_debug_log, "with eval ");
10951 PerlIO_printf(Perl_debug_log, "\n");
10952 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10954 PERL_ARGS_ASSERT_REGDUMP;
10955 PERL_UNUSED_CONTEXT;
10956 PERL_UNUSED_ARG(r);
10957 #endif /* DEBUGGING */
10961 - regprop - printable representation of opcode
10963 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10966 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10967 if (flags & ANYOF_INVERT) \
10968 /*make sure the invert info is in each */ \
10969 sv_catpvs(sv, "^"); \
10975 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10980 RXi_GET_DECL(prog,progi);
10981 GET_RE_DEBUG_FLAGS_DECL;
10983 PERL_ARGS_ASSERT_REGPROP;
10987 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
10988 /* It would be nice to FAIL() here, but this may be called from
10989 regexec.c, and it would be hard to supply pRExC_state. */
10990 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
10991 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
10993 k = PL_regkind[OP(o)];
10996 sv_catpvs(sv, " ");
10997 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10998 * is a crude hack but it may be the best for now since
10999 * we have no flag "this EXACTish node was UTF-8"
11001 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
11002 PERL_PV_ESCAPE_UNI_DETECT |
11003 PERL_PV_ESCAPE_NONASCII |
11004 PERL_PV_PRETTY_ELLIPSES |
11005 PERL_PV_PRETTY_LTGT |
11006 PERL_PV_PRETTY_NOCLEAR
11008 } else if (k == TRIE) {
11009 /* print the details of the trie in dumpuntil instead, as
11010 * progi->data isn't available here */
11011 const char op = OP(o);
11012 const U32 n = ARG(o);
11013 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
11014 (reg_ac_data *)progi->data->data[n] :
11016 const reg_trie_data * const trie
11017 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
11019 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
11020 DEBUG_TRIE_COMPILE_r(
11021 Perl_sv_catpvf(aTHX_ sv,
11022 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
11023 (UV)trie->startstate,
11024 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
11025 (UV)trie->wordcount,
11028 (UV)TRIE_CHARCOUNT(trie),
11029 (UV)trie->uniquecharcount
11032 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
11034 int rangestart = -1;
11035 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
11036 sv_catpvs(sv, "[");
11037 for (i = 0; i <= 256; i++) {
11038 if (i < 256 && BITMAP_TEST(bitmap,i)) {
11039 if (rangestart == -1)
11041 } else if (rangestart != -1) {
11042 if (i <= rangestart + 3)
11043 for (; rangestart < i; rangestart++)
11044 put_byte(sv, rangestart);
11046 put_byte(sv, rangestart);
11047 sv_catpvs(sv, "-");
11048 put_byte(sv, i - 1);
11053 sv_catpvs(sv, "]");
11056 } else if (k == CURLY) {
11057 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
11058 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
11059 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
11061 else if (k == WHILEM && o->flags) /* Ordinal/of */
11062 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
11063 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
11064 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
11065 if ( RXp_PAREN_NAMES(prog) ) {
11066 if ( k != REF || (OP(o) < NREF)) {
11067 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
11068 SV **name= av_fetch(list, ARG(o), 0 );
11070 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11073 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
11074 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
11075 I32 *nums=(I32*)SvPVX(sv_dat);
11076 SV **name= av_fetch(list, nums[0], 0 );
11079 for ( n=0; n<SvIVX(sv_dat); n++ ) {
11080 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
11081 (n ? "," : ""), (IV)nums[n]);
11083 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11087 } else if (k == GOSUB)
11088 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
11089 else if (k == VERB) {
11091 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
11092 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
11093 } else if (k == LOGICAL)
11094 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
11095 else if (k == FOLDCHAR)
11096 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
11097 else if (k == ANYOF) {
11098 int i, rangestart = -1;
11099 const U8 flags = ANYOF_FLAGS(o);
11102 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11103 static const char * const anyofs[] = {
11136 if (flags & ANYOF_LOCALE)
11137 sv_catpvs(sv, "{loc}");
11138 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
11139 sv_catpvs(sv, "{i}");
11140 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
11141 if (flags & ANYOF_INVERT)
11142 sv_catpvs(sv, "^");
11144 /* output what the standard cp 0-255 bitmap matches */
11145 for (i = 0; i <= 256; i++) {
11146 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11147 if (rangestart == -1)
11149 } else if (rangestart != -1) {
11150 if (i <= rangestart + 3)
11151 for (; rangestart < i; rangestart++)
11152 put_byte(sv, rangestart);
11154 put_byte(sv, rangestart);
11155 sv_catpvs(sv, "-");
11156 put_byte(sv, i - 1);
11163 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11164 /* output any special charclass tests (used entirely under use locale) */
11165 if (ANYOF_CLASS_TEST_ANY_SET(o))
11166 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11167 if (ANYOF_CLASS_TEST(o,i)) {
11168 sv_catpv(sv, anyofs[i]);
11172 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11174 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11175 sv_catpvs(sv, "{non-utf8-latin1-all}");
11178 /* output information about the unicode matching */
11179 if (flags & ANYOF_UNICODE_ALL)
11180 sv_catpvs(sv, "{unicode_all}");
11181 else if (ANYOF_NONBITMAP(o))
11182 sv_catpvs(sv, "{unicode}");
11183 if (flags & ANYOF_NONBITMAP_NON_UTF8)
11184 sv_catpvs(sv, "{outside bitmap}");
11186 if (ANYOF_NONBITMAP(o)) {
11188 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11192 U8 s[UTF8_MAXBYTES_CASE+1];
11194 for (i = 0; i <= 256; i++) { /* just the first 256 */
11195 uvchr_to_utf8(s, i);
11197 if (i < 256 && swash_fetch(sw, s, TRUE)) {
11198 if (rangestart == -1)
11200 } else if (rangestart != -1) {
11201 if (i <= rangestart + 3)
11202 for (; rangestart < i; rangestart++) {
11203 const U8 * const e = uvchr_to_utf8(s,rangestart);
11205 for(p = s; p < e; p++)
11209 const U8 *e = uvchr_to_utf8(s,rangestart);
11211 for (p = s; p < e; p++)
11213 sv_catpvs(sv, "-");
11214 e = uvchr_to_utf8(s, i-1);
11215 for (p = s; p < e; p++)
11222 sv_catpvs(sv, "..."); /* et cetera */
11226 char *s = savesvpv(lv);
11227 char * const origs = s;
11229 while (*s && *s != '\n')
11233 const char * const t = ++s;
11251 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11253 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11254 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11256 PERL_UNUSED_CONTEXT;
11257 PERL_UNUSED_ARG(sv);
11258 PERL_UNUSED_ARG(o);
11259 PERL_UNUSED_ARG(prog);
11260 #endif /* DEBUGGING */
11264 Perl_re_intuit_string(pTHX_ REGEXP * const r)
11265 { /* Assume that RE_INTUIT is set */
11267 struct regexp *const prog = (struct regexp *)SvANY(r);
11268 GET_RE_DEBUG_FLAGS_DECL;
11270 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11271 PERL_UNUSED_CONTEXT;
11275 const char * const s = SvPV_nolen_const(prog->check_substr
11276 ? prog->check_substr : prog->check_utf8);
11278 if (!PL_colorset) reginitcolors();
11279 PerlIO_printf(Perl_debug_log,
11280 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11282 prog->check_substr ? "" : "utf8 ",
11283 PL_colors[5],PL_colors[0],
11286 (strlen(s) > 60 ? "..." : ""));
11289 return prog->check_substr ? prog->check_substr : prog->check_utf8;
11295 handles refcounting and freeing the perl core regexp structure. When
11296 it is necessary to actually free the structure the first thing it
11297 does is call the 'free' method of the regexp_engine associated to
11298 the regexp, allowing the handling of the void *pprivate; member
11299 first. (This routine is not overridable by extensions, which is why
11300 the extensions free is called first.)
11302 See regdupe and regdupe_internal if you change anything here.
11304 #ifndef PERL_IN_XSUB_RE
11306 Perl_pregfree(pTHX_ REGEXP *r)
11312 Perl_pregfree2(pTHX_ REGEXP *rx)
11315 struct regexp *const r = (struct regexp *)SvANY(rx);
11316 GET_RE_DEBUG_FLAGS_DECL;
11318 PERL_ARGS_ASSERT_PREGFREE2;
11320 if (r->mother_re) {
11321 ReREFCNT_dec(r->mother_re);
11323 CALLREGFREE_PVT(rx); /* free the private data */
11324 SvREFCNT_dec(RXp_PAREN_NAMES(r));
11327 SvREFCNT_dec(r->anchored_substr);
11328 SvREFCNT_dec(r->anchored_utf8);
11329 SvREFCNT_dec(r->float_substr);
11330 SvREFCNT_dec(r->float_utf8);
11331 Safefree(r->substrs);
11333 RX_MATCH_COPY_FREE(rx);
11334 #ifdef PERL_OLD_COPY_ON_WRITE
11335 SvREFCNT_dec(r->saved_copy);
11342 This is a hacky workaround to the structural issue of match results
11343 being stored in the regexp structure which is in turn stored in
11344 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11345 could be PL_curpm in multiple contexts, and could require multiple
11346 result sets being associated with the pattern simultaneously, such
11347 as when doing a recursive match with (??{$qr})
11349 The solution is to make a lightweight copy of the regexp structure
11350 when a qr// is returned from the code executed by (??{$qr}) this
11351 lightweight copy doesn't actually own any of its data except for
11352 the starp/end and the actual regexp structure itself.
11358 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11360 struct regexp *ret;
11361 struct regexp *const r = (struct regexp *)SvANY(rx);
11362 register const I32 npar = r->nparens+1;
11364 PERL_ARGS_ASSERT_REG_TEMP_COPY;
11367 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11368 ret = (struct regexp *)SvANY(ret_x);
11370 (void)ReREFCNT_inc(rx);
11371 /* We can take advantage of the existing "copied buffer" mechanism in SVs
11372 by pointing directly at the buffer, but flagging that the allocated
11373 space in the copy is zero. As we've just done a struct copy, it's now
11374 a case of zero-ing that, rather than copying the current length. */
11375 SvPV_set(ret_x, RX_WRAPPED(rx));
11376 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11377 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11378 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11379 SvLEN_set(ret_x, 0);
11380 SvSTASH_set(ret_x, NULL);
11381 SvMAGIC_set(ret_x, NULL);
11382 Newx(ret->offs, npar, regexp_paren_pair);
11383 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11385 Newx(ret->substrs, 1, struct reg_substr_data);
11386 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11388 SvREFCNT_inc_void(ret->anchored_substr);
11389 SvREFCNT_inc_void(ret->anchored_utf8);
11390 SvREFCNT_inc_void(ret->float_substr);
11391 SvREFCNT_inc_void(ret->float_utf8);
11393 /* check_substr and check_utf8, if non-NULL, point to either their
11394 anchored or float namesakes, and don't hold a second reference. */
11396 RX_MATCH_COPIED_off(ret_x);
11397 #ifdef PERL_OLD_COPY_ON_WRITE
11398 ret->saved_copy = NULL;
11400 ret->mother_re = rx;
11406 /* regfree_internal()
11408 Free the private data in a regexp. This is overloadable by
11409 extensions. Perl takes care of the regexp structure in pregfree(),
11410 this covers the *pprivate pointer which technically perl doesn't
11411 know about, however of course we have to handle the
11412 regexp_internal structure when no extension is in use.
11414 Note this is called before freeing anything in the regexp
11419 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11422 struct regexp *const r = (struct regexp *)SvANY(rx);
11423 RXi_GET_DECL(r,ri);
11424 GET_RE_DEBUG_FLAGS_DECL;
11426 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11432 SV *dsv= sv_newmortal();
11433 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11434 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11435 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11436 PL_colors[4],PL_colors[5],s);
11439 #ifdef RE_TRACK_PATTERN_OFFSETS
11441 Safefree(ri->u.offsets); /* 20010421 MJD */
11444 int n = ri->data->count;
11445 PAD* new_comppad = NULL;
11450 /* If you add a ->what type here, update the comment in regcomp.h */
11451 switch (ri->data->what[n]) {
11456 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11459 Safefree(ri->data->data[n]);
11462 new_comppad = MUTABLE_AV(ri->data->data[n]);
11465 if (new_comppad == NULL)
11466 Perl_croak(aTHX_ "panic: pregfree comppad");
11467 PAD_SAVE_LOCAL(old_comppad,
11468 /* Watch out for global destruction's random ordering. */
11469 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11472 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11475 op_free((OP_4tree*)ri->data->data[n]);
11477 PAD_RESTORE_LOCAL(old_comppad);
11478 SvREFCNT_dec(MUTABLE_SV(new_comppad));
11479 new_comppad = NULL;
11484 { /* Aho Corasick add-on structure for a trie node.
11485 Used in stclass optimization only */
11487 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11489 refcount = --aho->refcount;
11492 PerlMemShared_free(aho->states);
11493 PerlMemShared_free(aho->fail);
11494 /* do this last!!!! */
11495 PerlMemShared_free(ri->data->data[n]);
11496 PerlMemShared_free(ri->regstclass);
11502 /* trie structure. */
11504 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11506 refcount = --trie->refcount;
11509 PerlMemShared_free(trie->charmap);
11510 PerlMemShared_free(trie->states);
11511 PerlMemShared_free(trie->trans);
11513 PerlMemShared_free(trie->bitmap);
11515 PerlMemShared_free(trie->jump);
11516 PerlMemShared_free(trie->wordinfo);
11517 /* do this last!!!! */
11518 PerlMemShared_free(ri->data->data[n]);
11523 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11526 Safefree(ri->data->what);
11527 Safefree(ri->data);
11533 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11534 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11535 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
11538 re_dup - duplicate a regexp.
11540 This routine is expected to clone a given regexp structure. It is only
11541 compiled under USE_ITHREADS.
11543 After all of the core data stored in struct regexp is duplicated
11544 the regexp_engine.dupe method is used to copy any private data
11545 stored in the *pprivate pointer. This allows extensions to handle
11546 any duplication it needs to do.
11548 See pregfree() and regfree_internal() if you change anything here.
11550 #if defined(USE_ITHREADS)
11551 #ifndef PERL_IN_XSUB_RE
11553 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11557 const struct regexp *r = (const struct regexp *)SvANY(sstr);
11558 struct regexp *ret = (struct regexp *)SvANY(dstr);
11560 PERL_ARGS_ASSERT_RE_DUP_GUTS;
11562 npar = r->nparens+1;
11563 Newx(ret->offs, npar, regexp_paren_pair);
11564 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11566 /* no need to copy these */
11567 Newx(ret->swap, npar, regexp_paren_pair);
11570 if (ret->substrs) {
11571 /* Do it this way to avoid reading from *r after the StructCopy().
11572 That way, if any of the sv_dup_inc()s dislodge *r from the L1
11573 cache, it doesn't matter. */
11574 const bool anchored = r->check_substr
11575 ? r->check_substr == r->anchored_substr
11576 : r->check_utf8 == r->anchored_utf8;
11577 Newx(ret->substrs, 1, struct reg_substr_data);
11578 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11580 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11581 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11582 ret->float_substr = sv_dup_inc(ret->float_substr, param);
11583 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11585 /* check_substr and check_utf8, if non-NULL, point to either their
11586 anchored or float namesakes, and don't hold a second reference. */
11588 if (ret->check_substr) {
11590 assert(r->check_utf8 == r->anchored_utf8);
11591 ret->check_substr = ret->anchored_substr;
11592 ret->check_utf8 = ret->anchored_utf8;
11594 assert(r->check_substr == r->float_substr);
11595 assert(r->check_utf8 == r->float_utf8);
11596 ret->check_substr = ret->float_substr;
11597 ret->check_utf8 = ret->float_utf8;
11599 } else if (ret->check_utf8) {
11601 ret->check_utf8 = ret->anchored_utf8;
11603 ret->check_utf8 = ret->float_utf8;
11608 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11611 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11613 if (RX_MATCH_COPIED(dstr))
11614 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
11616 ret->subbeg = NULL;
11617 #ifdef PERL_OLD_COPY_ON_WRITE
11618 ret->saved_copy = NULL;
11621 if (ret->mother_re) {
11622 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11623 /* Our storage points directly to our mother regexp, but that's
11624 1: a buffer in a different thread
11625 2: something we no longer hold a reference on
11626 so we need to copy it locally. */
11627 /* Note we need to sue SvCUR() on our mother_re, because it, in
11628 turn, may well be pointing to its own mother_re. */
11629 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11630 SvCUR(ret->mother_re)+1));
11631 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11633 ret->mother_re = NULL;
11637 #endif /* PERL_IN_XSUB_RE */
11642 This is the internal complement to regdupe() which is used to copy
11643 the structure pointed to by the *pprivate pointer in the regexp.
11644 This is the core version of the extension overridable cloning hook.
11645 The regexp structure being duplicated will be copied by perl prior
11646 to this and will be provided as the regexp *r argument, however
11647 with the /old/ structures pprivate pointer value. Thus this routine
11648 may override any copying normally done by perl.
11650 It returns a pointer to the new regexp_internal structure.
11654 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11657 struct regexp *const r = (struct regexp *)SvANY(rx);
11658 regexp_internal *reti;
11660 RXi_GET_DECL(r,ri);
11662 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11664 npar = r->nparens+1;
11667 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11668 Copy(ri->program, reti->program, len+1, regnode);
11671 reti->regstclass = NULL;
11674 struct reg_data *d;
11675 const int count = ri->data->count;
11678 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11679 char, struct reg_data);
11680 Newx(d->what, count, U8);
11683 for (i = 0; i < count; i++) {
11684 d->what[i] = ri->data->what[i];
11685 switch (d->what[i]) {
11686 /* legal options are one of: sSfpontTua
11687 see also regcomp.h and pregfree() */
11688 case 'a': /* actually an AV, but the dup function is identical. */
11691 case 'p': /* actually an AV, but the dup function is identical. */
11692 case 'u': /* actually an HV, but the dup function is identical. */
11693 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11696 /* This is cheating. */
11697 Newx(d->data[i], 1, struct regnode_charclass_class);
11698 StructCopy(ri->data->data[i], d->data[i],
11699 struct regnode_charclass_class);
11700 reti->regstclass = (regnode*)d->data[i];
11703 /* Compiled op trees are readonly and in shared memory,
11704 and can thus be shared without duplication. */
11706 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11710 /* Trie stclasses are readonly and can thus be shared
11711 * without duplication. We free the stclass in pregfree
11712 * when the corresponding reg_ac_data struct is freed.
11714 reti->regstclass= ri->regstclass;
11718 ((reg_trie_data*)ri->data->data[i])->refcount++;
11722 d->data[i] = ri->data->data[i];
11725 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11734 reti->name_list_idx = ri->name_list_idx;
11736 #ifdef RE_TRACK_PATTERN_OFFSETS
11737 if (ri->u.offsets) {
11738 Newx(reti->u.offsets, 2*len+1, U32);
11739 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11742 SetProgLen(reti,len);
11745 return (void*)reti;
11748 #endif /* USE_ITHREADS */
11750 #ifndef PERL_IN_XSUB_RE
11753 - regnext - dig the "next" pointer out of a node
11756 Perl_regnext(pTHX_ register regnode *p)
11759 register I32 offset;
11764 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
11765 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11768 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11777 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11780 STRLEN l1 = strlen(pat1);
11781 STRLEN l2 = strlen(pat2);
11784 const char *message;
11786 PERL_ARGS_ASSERT_RE_CROAK2;
11792 Copy(pat1, buf, l1 , char);
11793 Copy(pat2, buf + l1, l2 , char);
11794 buf[l1 + l2] = '\n';
11795 buf[l1 + l2 + 1] = '\0';
11797 /* ANSI variant takes additional second argument */
11798 va_start(args, pat2);
11802 msv = vmess(buf, &args);
11804 message = SvPV_const(msv,l1);
11807 Copy(message, buf, l1 , char);
11808 buf[l1-1] = '\0'; /* Overwrite \n */
11809 Perl_croak(aTHX_ "%s", buf);
11812 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
11814 #ifndef PERL_IN_XSUB_RE
11816 Perl_save_re_context(pTHX)
11820 struct re_save_state *state;
11822 SAVEVPTR(PL_curcop);
11823 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11825 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11826 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11827 SSPUSHUV(SAVEt_RE_STATE);
11829 Copy(&PL_reg_state, state, 1, struct re_save_state);
11831 PL_reg_start_tmp = 0;
11832 PL_reg_start_tmpl = 0;
11833 PL_reg_oldsaved = NULL;
11834 PL_reg_oldsavedlen = 0;
11835 PL_reg_maxiter = 0;
11836 PL_reg_leftiter = 0;
11837 PL_reg_poscache = NULL;
11838 PL_reg_poscache_size = 0;
11839 #ifdef PERL_OLD_COPY_ON_WRITE
11843 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11845 const REGEXP * const rx = PM_GETRE(PL_curpm);
11848 for (i = 1; i <= RX_NPARENS(rx); i++) {
11849 char digits[TYPE_CHARS(long)];
11850 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11851 GV *const *const gvp
11852 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11855 GV * const gv = *gvp;
11856 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11866 clear_re(pTHX_ void *r)
11869 ReREFCNT_dec((REGEXP *)r);
11875 S_put_byte(pTHX_ SV *sv, int c)
11877 PERL_ARGS_ASSERT_PUT_BYTE;
11879 /* Our definition of isPRINT() ignores locales, so only bytes that are
11880 not part of UTF-8 are considered printable. I assume that the same
11881 holds for UTF-EBCDIC.
11882 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11883 which Wikipedia says:
11885 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11886 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11887 identical, to the ASCII delete (DEL) or rubout control character.
11888 ) So the old condition can be simplified to !isPRINT(c) */
11891 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11894 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11898 const char string = c;
11899 if (c == '-' || c == ']' || c == '\\' || c == '^')
11900 sv_catpvs(sv, "\\");
11901 sv_catpvn(sv, &string, 1);
11906 #define CLEAR_OPTSTART \
11907 if (optstart) STMT_START { \
11908 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11912 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11914 STATIC const regnode *
11915 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11916 const regnode *last, const regnode *plast,
11917 SV* sv, I32 indent, U32 depth)
11920 register U8 op = PSEUDO; /* Arbitrary non-END op. */
11921 register const regnode *next;
11922 const regnode *optstart= NULL;
11924 RXi_GET_DECL(r,ri);
11925 GET_RE_DEBUG_FLAGS_DECL;
11927 PERL_ARGS_ASSERT_DUMPUNTIL;
11929 #ifdef DEBUG_DUMPUNTIL
11930 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11931 last ? last-start : 0,plast ? plast-start : 0);
11934 if (plast && plast < last)
11937 while (PL_regkind[op] != END && (!last || node < last)) {
11938 /* While that wasn't END last time... */
11941 if (op == CLOSE || op == WHILEM)
11943 next = regnext((regnode *)node);
11946 if (OP(node) == OPTIMIZED) {
11947 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11954 regprop(r, sv, node);
11955 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11956 (int)(2*indent + 1), "", SvPVX_const(sv));
11958 if (OP(node) != OPTIMIZED) {
11959 if (next == NULL) /* Next ptr. */
11960 PerlIO_printf(Perl_debug_log, " (0)");
11961 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11962 PerlIO_printf(Perl_debug_log, " (FAIL)");
11964 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11965 (void)PerlIO_putc(Perl_debug_log, '\n');
11969 if (PL_regkind[(U8)op] == BRANCHJ) {
11972 register const regnode *nnode = (OP(next) == LONGJMP
11973 ? regnext((regnode *)next)
11975 if (last && nnode > last)
11977 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11980 else if (PL_regkind[(U8)op] == BRANCH) {
11982 DUMPUNTIL(NEXTOPER(node), next);
11984 else if ( PL_regkind[(U8)op] == TRIE ) {
11985 const regnode *this_trie = node;
11986 const char op = OP(node);
11987 const U32 n = ARG(node);
11988 const reg_ac_data * const ac = op>=AHOCORASICK ?
11989 (reg_ac_data *)ri->data->data[n] :
11991 const reg_trie_data * const trie =
11992 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
11994 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
11996 const regnode *nextbranch= NULL;
11999 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
12000 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
12002 PerlIO_printf(Perl_debug_log, "%*s%s ",
12003 (int)(2*(indent+3)), "",
12004 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
12005 PL_colors[0], PL_colors[1],
12006 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
12007 PERL_PV_PRETTY_ELLIPSES |
12008 PERL_PV_PRETTY_LTGT
12013 U16 dist= trie->jump[word_idx+1];
12014 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
12015 (UV)((dist ? this_trie + dist : next) - start));
12018 nextbranch= this_trie + trie->jump[0];
12019 DUMPUNTIL(this_trie + dist, nextbranch);
12021 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
12022 nextbranch= regnext((regnode *)nextbranch);
12024 PerlIO_printf(Perl_debug_log, "\n");
12027 if (last && next > last)
12032 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
12033 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
12034 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
12036 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
12038 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
12040 else if ( op == PLUS || op == STAR) {
12041 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
12043 else if (PL_regkind[(U8)op] == ANYOF) {
12044 /* arglen 1 + class block */
12045 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
12046 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
12047 node = NEXTOPER(node);
12049 else if (PL_regkind[(U8)op] == EXACT) {
12050 /* Literal string, where present. */
12051 node += NODE_SZ_STR(node) - 1;
12052 node = NEXTOPER(node);
12055 node = NEXTOPER(node);
12056 node += regarglen[(U8)op];
12058 if (op == CURLYX || op == OPEN)
12062 #ifdef DEBUG_DUMPUNTIL
12063 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
12068 #endif /* DEBUGGING */
12072 * c-indentation-style: bsd
12073 * c-basic-offset: 4
12074 * indent-tabs-mode: t
12077 * ex: set ts=8 sts=4 sw=4 noet: