5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
95 # if defined(BUGGY_MSC6)
96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
97 # pragma optimize("a",off)
98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
99 # pragma optimize("w",on )
100 # endif /* BUGGY_MSC6 */
104 #define STATIC static
107 typedef struct RExC_state_t {
108 U32 flags; /* are we folding, multilining? */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
140 HV *paren_names; /* Paren names */
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
147 char *starttry; /* -Dr: where regtry was called. */
148 #define RExC_starttry (pRExC_state->starttry)
151 const char *lastparse;
153 AV *paren_name_list; /* idx -> name */
154 #define RExC_lastparse (pRExC_state->lastparse)
155 #define RExC_lastnum (pRExC_state->lastnum)
156 #define RExC_paren_name_list (pRExC_state->paren_name_list)
160 #define RExC_flags (pRExC_state->flags)
161 #define RExC_precomp (pRExC_state->precomp)
162 #define RExC_rx_sv (pRExC_state->rx_sv)
163 #define RExC_rx (pRExC_state->rx)
164 #define RExC_rxi (pRExC_state->rxi)
165 #define RExC_start (pRExC_state->start)
166 #define RExC_end (pRExC_state->end)
167 #define RExC_parse (pRExC_state->parse)
168 #define RExC_whilem_seen (pRExC_state->whilem_seen)
169 #ifdef RE_TRACK_PATTERN_OFFSETS
170 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
172 #define RExC_emit (pRExC_state->emit)
173 #define RExC_emit_start (pRExC_state->emit_start)
174 #define RExC_emit_bound (pRExC_state->emit_bound)
175 #define RExC_naughty (pRExC_state->naughty)
176 #define RExC_sawback (pRExC_state->sawback)
177 #define RExC_seen (pRExC_state->seen)
178 #define RExC_size (pRExC_state->size)
179 #define RExC_npar (pRExC_state->npar)
180 #define RExC_nestroot (pRExC_state->nestroot)
181 #define RExC_extralen (pRExC_state->extralen)
182 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
183 #define RExC_seen_evals (pRExC_state->seen_evals)
184 #define RExC_utf8 (pRExC_state->utf8)
185 #define RExC_uni_semantics (pRExC_state->uni_semantics)
186 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
187 #define RExC_open_parens (pRExC_state->open_parens)
188 #define RExC_close_parens (pRExC_state->close_parens)
189 #define RExC_opend (pRExC_state->opend)
190 #define RExC_paren_names (pRExC_state->paren_names)
191 #define RExC_recurse (pRExC_state->recurse)
192 #define RExC_recurse_count (pRExC_state->recurse_count)
193 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
194 #define RExC_contains_locale (pRExC_state->contains_locale)
197 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
198 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
199 ((*s) == '{' && regcurly(s)))
202 #undef SPSTART /* dratted cpp namespace... */
205 * Flags to be passed up and down.
207 #define WORST 0 /* Worst case. */
208 #define HASWIDTH 0x01 /* Known to match non-null strings. */
210 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
211 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
213 #define SPSTART 0x04 /* Starts with * or +. */
214 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
215 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
217 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
219 /* whether trie related optimizations are enabled */
220 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
221 #define TRIE_STUDY_OPT
222 #define FULL_TRIE_STUDY
228 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
229 #define PBITVAL(paren) (1 << ((paren) & 7))
230 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
231 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
232 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
234 /* If not already in utf8, do a longjmp back to the beginning */
235 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
236 #define REQUIRE_UTF8 STMT_START { \
237 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
240 /* About scan_data_t.
242 During optimisation we recurse through the regexp program performing
243 various inplace (keyhole style) optimisations. In addition study_chunk
244 and scan_commit populate this data structure with information about
245 what strings MUST appear in the pattern. We look for the longest
246 string that must appear at a fixed location, and we look for the
247 longest string that may appear at a floating location. So for instance
252 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
253 strings (because they follow a .* construct). study_chunk will identify
254 both FOO and BAR as being the longest fixed and floating strings respectively.
256 The strings can be composites, for instance
260 will result in a composite fixed substring 'foo'.
262 For each string some basic information is maintained:
264 - offset or min_offset
265 This is the position the string must appear at, or not before.
266 It also implicitly (when combined with minlenp) tells us how many
267 characters must match before the string we are searching for.
268 Likewise when combined with minlenp and the length of the string it
269 tells us how many characters must appear after the string we have
273 Only used for floating strings. This is the rightmost point that
274 the string can appear at. If set to I32 max it indicates that the
275 string can occur infinitely far to the right.
278 A pointer to the minimum length of the pattern that the string
279 was found inside. This is important as in the case of positive
280 lookahead or positive lookbehind we can have multiple patterns
285 The minimum length of the pattern overall is 3, the minimum length
286 of the lookahead part is 3, but the minimum length of the part that
287 will actually match is 1. So 'FOO's minimum length is 3, but the
288 minimum length for the F is 1. This is important as the minimum length
289 is used to determine offsets in front of and behind the string being
290 looked for. Since strings can be composites this is the length of the
291 pattern at the time it was committed with a scan_commit. Note that
292 the length is calculated by study_chunk, so that the minimum lengths
293 are not known until the full pattern has been compiled, thus the
294 pointer to the value.
298 In the case of lookbehind the string being searched for can be
299 offset past the start point of the final matching string.
300 If this value was just blithely removed from the min_offset it would
301 invalidate some of the calculations for how many chars must match
302 before or after (as they are derived from min_offset and minlen and
303 the length of the string being searched for).
304 When the final pattern is compiled and the data is moved from the
305 scan_data_t structure into the regexp structure the information
306 about lookbehind is factored in, with the information that would
307 have been lost precalculated in the end_shift field for the
310 The fields pos_min and pos_delta are used to store the minimum offset
311 and the delta to the maximum offset at the current point in the pattern.
315 typedef struct scan_data_t {
316 /*I32 len_min; unused */
317 /*I32 len_delta; unused */
321 I32 last_end; /* min value, <0 unless valid. */
324 SV **longest; /* Either &l_fixed, or &l_float. */
325 SV *longest_fixed; /* longest fixed string found in pattern */
326 I32 offset_fixed; /* offset where it starts */
327 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
328 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
329 SV *longest_float; /* longest floating string found in pattern */
330 I32 offset_float_min; /* earliest point in string it can appear */
331 I32 offset_float_max; /* latest point in string it can appear */
332 I32 *minlen_float; /* pointer to the minlen relevant to the string */
333 I32 lookbehind_float; /* is the position of the string modified by LB */
337 struct regnode_charclass_class *start_class;
341 * Forward declarations for pregcomp()'s friends.
344 static const scan_data_t zero_scan_data =
345 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
347 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
348 #define SF_BEFORE_SEOL 0x0001
349 #define SF_BEFORE_MEOL 0x0002
350 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
351 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
354 # define SF_FIX_SHIFT_EOL (0+2)
355 # define SF_FL_SHIFT_EOL (0+4)
357 # define SF_FIX_SHIFT_EOL (+2)
358 # define SF_FL_SHIFT_EOL (+4)
361 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
362 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
364 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
365 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
366 #define SF_IS_INF 0x0040
367 #define SF_HAS_PAR 0x0080
368 #define SF_IN_PAR 0x0100
369 #define SF_HAS_EVAL 0x0200
370 #define SCF_DO_SUBSTR 0x0400
371 #define SCF_DO_STCLASS_AND 0x0800
372 #define SCF_DO_STCLASS_OR 0x1000
373 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
374 #define SCF_WHILEM_VISITED_POS 0x2000
376 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
377 #define SCF_SEEN_ACCEPT 0x8000
379 #define UTF cBOOL(RExC_utf8)
380 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
381 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
382 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
383 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
384 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
385 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
386 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
388 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
390 #define OOB_UNICODE 12345678
391 #define OOB_NAMEDCLASS -1
393 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
394 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
397 /* length of regex to show in messages that don't mark a position within */
398 #define RegexLengthToShowInErrorMessages 127
401 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
402 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
403 * op/pragma/warn/regcomp.
405 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
406 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
408 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
411 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
412 * arg. Show regex, up to a maximum length. If it's too long, chop and add
415 #define _FAIL(code) STMT_START { \
416 const char *ellipses = ""; \
417 IV len = RExC_end - RExC_precomp; \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
421 if (len > RegexLengthToShowInErrorMessages) { \
422 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
423 len = RegexLengthToShowInErrorMessages - 10; \
429 #define FAIL(msg) _FAIL( \
430 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
431 msg, (int)len, RExC_precomp, ellipses))
433 #define FAIL2(msg,arg) _FAIL( \
434 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
435 arg, (int)len, RExC_precomp, ellipses))
438 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
440 #define Simple_vFAIL(m) STMT_START { \
441 const IV offset = RExC_parse - RExC_precomp; \
442 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
443 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
447 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
449 #define vFAIL(m) STMT_START { \
451 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
456 * Like Simple_vFAIL(), but accepts two arguments.
458 #define Simple_vFAIL2(m,a1) STMT_START { \
459 const IV offset = RExC_parse - RExC_precomp; \
460 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
461 (int)offset, RExC_precomp, RExC_precomp + offset); \
465 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
467 #define vFAIL2(m,a1) STMT_START { \
469 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
470 Simple_vFAIL2(m, a1); \
475 * Like Simple_vFAIL(), but accepts three arguments.
477 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
478 const IV offset = RExC_parse - RExC_precomp; \
479 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
480 (int)offset, RExC_precomp, RExC_precomp + offset); \
484 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
486 #define vFAIL3(m,a1,a2) STMT_START { \
488 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
489 Simple_vFAIL3(m, a1, a2); \
493 * Like Simple_vFAIL(), but accepts four arguments.
495 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
496 const IV offset = RExC_parse - RExC_precomp; \
497 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
498 (int)offset, RExC_precomp, RExC_precomp + offset); \
501 #define ckWARNreg(loc,m) STMT_START { \
502 const IV offset = loc - RExC_precomp; \
503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 (int)offset, RExC_precomp, RExC_precomp + offset); \
507 #define ckWARNregdep(loc,m) STMT_START { \
508 const IV offset = loc - RExC_precomp; \
509 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
511 (int)offset, RExC_precomp, RExC_precomp + offset); \
514 #define ckWARN2regdep(loc,m, a1) STMT_START { \
515 const IV offset = loc - RExC_precomp; \
516 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
518 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
521 #define ckWARN2reg(loc, m, a1) STMT_START { \
522 const IV offset = loc - RExC_precomp; \
523 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
524 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
527 #define vWARN3(loc, m, a1, a2) STMT_START { \
528 const IV offset = loc - RExC_precomp; \
529 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
530 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
533 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
534 const IV offset = loc - RExC_precomp; \
535 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
536 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
539 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
540 const IV offset = loc - RExC_precomp; \
541 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
542 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
545 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
546 const IV offset = loc - RExC_precomp; \
547 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
548 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
551 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
552 const IV offset = loc - RExC_precomp; \
553 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
554 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
558 /* Allow for side effects in s */
559 #define REGC(c,s) STMT_START { \
560 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
563 /* Macros for recording node offsets. 20001227 mjd@plover.com
564 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
565 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
566 * Element 0 holds the number n.
567 * Position is 1 indexed.
569 #ifndef RE_TRACK_PATTERN_OFFSETS
570 #define Set_Node_Offset_To_R(node,byte)
571 #define Set_Node_Offset(node,byte)
572 #define Set_Cur_Node_Offset
573 #define Set_Node_Length_To_R(node,len)
574 #define Set_Node_Length(node,len)
575 #define Set_Node_Cur_Length(node)
576 #define Node_Offset(n)
577 #define Node_Length(n)
578 #define Set_Node_Offset_Length(node,offset,len)
579 #define ProgLen(ri) ri->u.proglen
580 #define SetProgLen(ri,x) ri->u.proglen = x
582 #define ProgLen(ri) ri->u.offsets[0]
583 #define SetProgLen(ri,x) ri->u.offsets[0] = x
584 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
586 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
587 __LINE__, (int)(node), (int)(byte))); \
589 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
591 RExC_offsets[2*(node)-1] = (byte); \
596 #define Set_Node_Offset(node,byte) \
597 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
598 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
600 #define Set_Node_Length_To_R(node,len) STMT_START { \
602 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
603 __LINE__, (int)(node), (int)(len))); \
605 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
607 RExC_offsets[2*(node)] = (len); \
612 #define Set_Node_Length(node,len) \
613 Set_Node_Length_To_R((node)-RExC_emit_start, len)
614 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
615 #define Set_Node_Cur_Length(node) \
616 Set_Node_Length(node, RExC_parse - parse_start)
618 /* Get offsets and lengths */
619 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
620 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
622 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
623 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
624 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
628 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
629 #define EXPERIMENTAL_INPLACESCAN
630 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
632 #define DEBUG_STUDYDATA(str,data,depth) \
633 DEBUG_OPTIMISE_MORE_r(if(data){ \
634 PerlIO_printf(Perl_debug_log, \
635 "%*s" str "Pos:%"IVdf"/%"IVdf \
636 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
637 (int)(depth)*2, "", \
638 (IV)((data)->pos_min), \
639 (IV)((data)->pos_delta), \
640 (UV)((data)->flags), \
641 (IV)((data)->whilem_c), \
642 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
643 is_inf ? "INF " : "" \
645 if ((data)->last_found) \
646 PerlIO_printf(Perl_debug_log, \
647 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
648 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
649 SvPVX_const((data)->last_found), \
650 (IV)((data)->last_end), \
651 (IV)((data)->last_start_min), \
652 (IV)((data)->last_start_max), \
653 ((data)->longest && \
654 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
655 SvPVX_const((data)->longest_fixed), \
656 (IV)((data)->offset_fixed), \
657 ((data)->longest && \
658 (data)->longest==&((data)->longest_float)) ? "*" : "", \
659 SvPVX_const((data)->longest_float), \
660 (IV)((data)->offset_float_min), \
661 (IV)((data)->offset_float_max) \
663 PerlIO_printf(Perl_debug_log,"\n"); \
666 static void clear_re(pTHX_ void *r);
668 /* Mark that we cannot extend a found fixed substring at this point.
669 Update the longest found anchored substring and the longest found
670 floating substrings if needed. */
673 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
675 const STRLEN l = CHR_SVLEN(data->last_found);
676 const STRLEN old_l = CHR_SVLEN(*data->longest);
677 GET_RE_DEBUG_FLAGS_DECL;
679 PERL_ARGS_ASSERT_SCAN_COMMIT;
681 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
682 SvSetMagicSV(*data->longest, data->last_found);
683 if (*data->longest == data->longest_fixed) {
684 data->offset_fixed = l ? data->last_start_min : data->pos_min;
685 if (data->flags & SF_BEFORE_EOL)
687 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
689 data->flags &= ~SF_FIX_BEFORE_EOL;
690 data->minlen_fixed=minlenp;
691 data->lookbehind_fixed=0;
693 else { /* *data->longest == data->longest_float */
694 data->offset_float_min = l ? data->last_start_min : data->pos_min;
695 data->offset_float_max = (l
696 ? data->last_start_max
697 : data->pos_min + data->pos_delta);
698 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
699 data->offset_float_max = I32_MAX;
700 if (data->flags & SF_BEFORE_EOL)
702 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
704 data->flags &= ~SF_FL_BEFORE_EOL;
705 data->minlen_float=minlenp;
706 data->lookbehind_float=0;
709 SvCUR_set(data->last_found, 0);
711 SV * const sv = data->last_found;
712 if (SvUTF8(sv) && SvMAGICAL(sv)) {
713 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
719 data->flags &= ~SF_BEFORE_EOL;
720 DEBUG_STUDYDATA("commit: ",data,0);
723 /* Can match anything (initialization) */
725 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
727 PERL_ARGS_ASSERT_CL_ANYTHING;
729 ANYOF_BITMAP_SETALL(cl);
730 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
731 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL
732 /* Even though no bitmap is in use here, we need to set
733 * the flag below so an AND with a node that does have one
734 * doesn't lose that one. The flag should get cleared if
735 * the other one doesn't; and the code in regexec.c is
736 * structured so this being set when not needed does no
737 * harm. It seemed a little cleaner to set it here than do
738 * a special case in cl_and() */
739 |ANYOF_NONBITMAP_NON_UTF8;
741 /* If any portion of the regex is to operate under locale rules,
742 * initialization includes it. The reason this isn't done for all regexes
743 * is that the optimizer was written under the assumption that locale was
744 * all-or-nothing. Given the complexity and lack of documentation in the
745 * optimizer, and that there are inadequate test cases for locale, so many
746 * parts of it may not work properly, it is safest to avoid locale unless
748 if (RExC_contains_locale) {
749 ANYOF_CLASS_SETALL(cl); /* /l uses class */
750 cl->flags |= ANYOF_LOCALE;
753 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
757 /* Can match anything (initialization) */
759 S_cl_is_anything(const struct regnode_charclass_class *cl)
763 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
765 for (value = 0; value <= ANYOF_MAX; value += 2)
766 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
768 if (!(cl->flags & ANYOF_UNICODE_ALL))
770 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
775 /* Can match anything (initialization) */
777 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
779 PERL_ARGS_ASSERT_CL_INIT;
781 Zero(cl, 1, struct regnode_charclass_class);
783 cl_anything(pRExC_state, cl);
784 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
787 /* These two functions currently do the exact same thing */
788 #define cl_init_zero S_cl_init
790 /* 'AND' a given class with another one. Can create false positives. 'cl'
791 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
792 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
794 S_cl_and(struct regnode_charclass_class *cl,
795 const struct regnode_charclass_class *and_with)
797 PERL_ARGS_ASSERT_CL_AND;
799 assert(and_with->type == ANYOF);
801 /* I (khw) am not sure all these restrictions are necessary XXX */
802 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
803 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
804 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
805 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
806 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
809 if (and_with->flags & ANYOF_INVERT)
810 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
811 cl->bitmap[i] &= ~and_with->bitmap[i];
813 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
814 cl->bitmap[i] &= and_with->bitmap[i];
815 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
817 if (and_with->flags & ANYOF_INVERT) {
819 /* Here, the and'ed node is inverted. Get the AND of the flags that
820 * aren't affected by the inversion. Those that are affected are
821 * handled individually below */
822 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
823 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
824 cl->flags |= affected_flags;
826 /* We currently don't know how to deal with things that aren't in the
827 * bitmap, but we know that the intersection is no greater than what
828 * is already in cl, so let there be false positives that get sorted
829 * out after the synthetic start class succeeds, and the node is
830 * matched for real. */
832 /* The inversion of these two flags indicate that the resulting
833 * intersection doesn't have them */
834 if (and_with->flags & ANYOF_UNICODE_ALL) {
835 cl->flags &= ~ANYOF_UNICODE_ALL;
837 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
838 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
841 else { /* and'd node is not inverted */
842 if (! ANYOF_NONBITMAP(and_with)) {
844 /* Here 'and_with' doesn't match anything outside the bitmap
845 * (except possibly ANYOF_UNICODE_ALL), which means the
846 * intersection can't either, except for ANYOF_UNICODE_ALL, in
847 * which case we don't know what the intersection is, but it's no
848 * greater than what cl already has, so can just leave it alone,
849 * with possible false positives */
850 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
851 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
852 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
855 else if (! ANYOF_NONBITMAP(cl)) {
857 /* Here, 'and_with' does match something outside the bitmap, and cl
858 * doesn't have a list of things to match outside the bitmap. If
859 * cl can match all code points above 255, the intersection will
860 * be those above-255 code points that 'and_with' matches. There
861 * may be false positives from code points in 'and_with' that are
862 * outside the bitmap but below 256, but those get sorted out
863 * after the synthetic start class succeeds). If cl can't match
864 * all Unicode code points, it means here that it can't match *
865 * anything outside the bitmap, so we leave the bitmap empty */
866 if (cl->flags & ANYOF_UNICODE_ALL) {
867 ARG_SET(cl, ARG(and_with));
871 /* Here, both 'and_with' and cl match something outside the
872 * bitmap. Currently we do not do the intersection, so just match
873 * whatever cl had at the beginning. */
877 /* Take the intersection of the two sets of flags */
878 cl->flags &= and_with->flags;
882 /* 'OR' a given class with another one. Can create false positives. 'cl'
883 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
884 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
886 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
888 PERL_ARGS_ASSERT_CL_OR;
890 if (or_with->flags & ANYOF_INVERT) {
892 /* Here, the or'd node is to be inverted. This means we take the
893 * complement of everything not in the bitmap, but currently we don't
894 * know what that is, so give up and match anything */
895 if (ANYOF_NONBITMAP(or_with)) {
896 cl_anything(pRExC_state, cl);
899 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
900 * <= (B1 | !B2) | (CL1 | !CL2)
901 * which is wasteful if CL2 is small, but we ignore CL2:
902 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
903 * XXXX Can we handle case-fold? Unclear:
904 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
905 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
907 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
908 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
909 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
912 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
913 cl->bitmap[i] |= ~or_with->bitmap[i];
914 } /* XXXX: logic is complicated otherwise */
916 cl_anything(pRExC_state, cl);
919 /* And, we can just take the union of the flags that aren't affected
920 * by the inversion */
921 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
923 /* For the remaining flags:
924 ANYOF_UNICODE_ALL and inverted means to not match anything above
925 255, which means that the union with cl should just be
926 what cl has in it, so can ignore this flag
927 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
928 is 127-255 to match them, but then invert that, so the
929 union with cl should just be what cl has in it, so can
932 } else { /* 'or_with' is not inverted */
933 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
934 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
935 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
936 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
939 /* OR char bitmap and class bitmap separately */
940 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
941 cl->bitmap[i] |= or_with->bitmap[i];
942 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
943 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
944 cl->classflags[i] |= or_with->classflags[i];
945 cl->flags |= ANYOF_CLASS;
948 else { /* XXXX: logic is complicated, leave it along for a moment. */
949 cl_anything(pRExC_state, cl);
952 if (ANYOF_NONBITMAP(or_with)) {
954 /* Use the added node's outside-the-bit-map match if there isn't a
955 * conflict. If there is a conflict (both nodes match something
956 * outside the bitmap, but what they match outside is not the same
957 * pointer, and hence not easily compared until XXX we extend
958 * inversion lists this far), give up and allow the start class to
959 * match everything outside the bitmap. If that stuff is all above
960 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
961 if (! ANYOF_NONBITMAP(cl)) {
962 ARG_SET(cl, ARG(or_with));
964 else if (ARG(cl) != ARG(or_with)) {
966 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
967 cl_anything(pRExC_state, cl);
970 cl->flags |= ANYOF_UNICODE_ALL;
975 cl->flags |= or_with->flags;
980 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
981 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
982 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
983 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
988 dump_trie(trie,widecharmap,revcharmap)
989 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
990 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
992 These routines dump out a trie in a somewhat readable format.
993 The _interim_ variants are used for debugging the interim
994 tables that are used to generate the final compressed
995 representation which is what dump_trie expects.
997 Part of the reason for their existence is to provide a form
998 of documentation as to how the different representations function.
1003 Dumps the final compressed table form of the trie to Perl_debug_log.
1004 Used for debugging make_trie().
1008 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1009 AV *revcharmap, U32 depth)
1012 SV *sv=sv_newmortal();
1013 int colwidth= widecharmap ? 6 : 4;
1015 GET_RE_DEBUG_FLAGS_DECL;
1017 PERL_ARGS_ASSERT_DUMP_TRIE;
1019 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1020 (int)depth * 2 + 2,"",
1021 "Match","Base","Ofs" );
1023 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1024 SV ** const tmp = av_fetch( revcharmap, state, 0);
1026 PerlIO_printf( Perl_debug_log, "%*s",
1028 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1029 PL_colors[0], PL_colors[1],
1030 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1031 PERL_PV_ESCAPE_FIRSTCHAR
1036 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1037 (int)depth * 2 + 2,"");
1039 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1040 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1041 PerlIO_printf( Perl_debug_log, "\n");
1043 for( state = 1 ; state < trie->statecount ; state++ ) {
1044 const U32 base = trie->states[ state ].trans.base;
1046 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1048 if ( trie->states[ state ].wordnum ) {
1049 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1051 PerlIO_printf( Perl_debug_log, "%6s", "" );
1054 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1059 while( ( base + ofs < trie->uniquecharcount ) ||
1060 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1061 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1064 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1066 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1067 if ( ( base + ofs >= trie->uniquecharcount ) &&
1068 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1069 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1071 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1073 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1075 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1079 PerlIO_printf( Perl_debug_log, "]");
1082 PerlIO_printf( Perl_debug_log, "\n" );
1084 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1085 for (word=1; word <= trie->wordcount; word++) {
1086 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1087 (int)word, (int)(trie->wordinfo[word].prev),
1088 (int)(trie->wordinfo[word].len));
1090 PerlIO_printf(Perl_debug_log, "\n" );
1093 Dumps a fully constructed but uncompressed trie in list form.
1094 List tries normally only are used for construction when the number of
1095 possible chars (trie->uniquecharcount) is very high.
1096 Used for debugging make_trie().
1099 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1100 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1104 SV *sv=sv_newmortal();
1105 int colwidth= widecharmap ? 6 : 4;
1106 GET_RE_DEBUG_FLAGS_DECL;
1108 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1110 /* print out the table precompression. */
1111 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1112 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1113 "------:-----+-----------------\n" );
1115 for( state=1 ; state < next_alloc ; state ++ ) {
1118 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1119 (int)depth * 2 + 2,"", (UV)state );
1120 if ( ! trie->states[ state ].wordnum ) {
1121 PerlIO_printf( Perl_debug_log, "%5s| ","");
1123 PerlIO_printf( Perl_debug_log, "W%4x| ",
1124 trie->states[ state ].wordnum
1127 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1128 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1130 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1132 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1133 PL_colors[0], PL_colors[1],
1134 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1135 PERL_PV_ESCAPE_FIRSTCHAR
1137 TRIE_LIST_ITEM(state,charid).forid,
1138 (UV)TRIE_LIST_ITEM(state,charid).newstate
1141 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1142 (int)((depth * 2) + 14), "");
1145 PerlIO_printf( Perl_debug_log, "\n");
1150 Dumps a fully constructed but uncompressed trie in table form.
1151 This is the normal DFA style state transition table, with a few
1152 twists to facilitate compression later.
1153 Used for debugging make_trie().
1156 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1157 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1162 SV *sv=sv_newmortal();
1163 int colwidth= widecharmap ? 6 : 4;
1164 GET_RE_DEBUG_FLAGS_DECL;
1166 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1169 print out the table precompression so that we can do a visual check
1170 that they are identical.
1173 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1175 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1176 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1178 PerlIO_printf( Perl_debug_log, "%*s",
1180 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1181 PL_colors[0], PL_colors[1],
1182 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1183 PERL_PV_ESCAPE_FIRSTCHAR
1189 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1191 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1192 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1195 PerlIO_printf( Perl_debug_log, "\n" );
1197 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1199 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1200 (int)depth * 2 + 2,"",
1201 (UV)TRIE_NODENUM( state ) );
1203 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1204 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1206 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1208 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1210 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1211 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1213 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1214 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1222 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1223 startbranch: the first branch in the whole branch sequence
1224 first : start branch of sequence of branch-exact nodes.
1225 May be the same as startbranch
1226 last : Thing following the last branch.
1227 May be the same as tail.
1228 tail : item following the branch sequence
1229 count : words in the sequence
1230 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1231 depth : indent depth
1233 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1235 A trie is an N'ary tree where the branches are determined by digital
1236 decomposition of the key. IE, at the root node you look up the 1st character and
1237 follow that branch repeat until you find the end of the branches. Nodes can be
1238 marked as "accepting" meaning they represent a complete word. Eg:
1242 would convert into the following structure. Numbers represent states, letters
1243 following numbers represent valid transitions on the letter from that state, if
1244 the number is in square brackets it represents an accepting state, otherwise it
1245 will be in parenthesis.
1247 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1251 (1) +-i->(6)-+-s->[7]
1253 +-s->(3)-+-h->(4)-+-e->[5]
1255 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1257 This shows that when matching against the string 'hers' we will begin at state 1
1258 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1259 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1260 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1261 single traverse. We store a mapping from accepting to state to which word was
1262 matched, and then when we have multiple possibilities we try to complete the
1263 rest of the regex in the order in which they occured in the alternation.
1265 The only prior NFA like behaviour that would be changed by the TRIE support is
1266 the silent ignoring of duplicate alternations which are of the form:
1268 / (DUPE|DUPE) X? (?{ ... }) Y /x
1270 Thus EVAL blocks following a trie may be called a different number of times with
1271 and without the optimisation. With the optimisations dupes will be silently
1272 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1273 the following demonstrates:
1275 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1277 which prints out 'word' three times, but
1279 'words'=~/(word|word|word)(?{ print $1 })S/
1281 which doesnt print it out at all. This is due to other optimisations kicking in.
1283 Example of what happens on a structural level:
1285 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1287 1: CURLYM[1] {1,32767}(18)
1298 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1299 and should turn into:
1301 1: CURLYM[1] {1,32767}(18)
1303 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1311 Cases where tail != last would be like /(?foo|bar)baz/:
1321 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1322 and would end up looking like:
1325 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1332 d = uvuni_to_utf8_flags(d, uv, 0);
1334 is the recommended Unicode-aware way of saying
1339 #define TRIE_STORE_REVCHAR \
1342 SV *zlopp = newSV(2); \
1343 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1344 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1345 SvCUR_set(zlopp, kapow - flrbbbbb); \
1348 av_push(revcharmap, zlopp); \
1350 char ooooff = (char)uvc; \
1351 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1355 #define TRIE_READ_CHAR STMT_START { \
1359 if ( foldlen > 0 ) { \
1360 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1365 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1366 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1367 foldlen -= UNISKIP( uvc ); \
1368 scan = foldbuf + UNISKIP( uvc ); \
1371 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1381 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1382 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1383 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1384 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1386 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1387 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1388 TRIE_LIST_CUR( state )++; \
1391 #define TRIE_LIST_NEW(state) STMT_START { \
1392 Newxz( trie->states[ state ].trans.list, \
1393 4, reg_trie_trans_le ); \
1394 TRIE_LIST_CUR( state ) = 1; \
1395 TRIE_LIST_LEN( state ) = 4; \
1398 #define TRIE_HANDLE_WORD(state) STMT_START { \
1399 U16 dupe= trie->states[ state ].wordnum; \
1400 regnode * const noper_next = regnext( noper ); \
1403 /* store the word for dumping */ \
1405 if (OP(noper) != NOTHING) \
1406 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1408 tmp = newSVpvn_utf8( "", 0, UTF ); \
1409 av_push( trie_words, tmp ); \
1413 trie->wordinfo[curword].prev = 0; \
1414 trie->wordinfo[curword].len = wordlen; \
1415 trie->wordinfo[curword].accept = state; \
1417 if ( noper_next < tail ) { \
1419 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1420 trie->jump[curword] = (U16)(noper_next - convert); \
1422 jumper = noper_next; \
1424 nextbranch= regnext(cur); \
1428 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1429 /* chain, so that when the bits of chain are later */\
1430 /* linked together, the dups appear in the chain */\
1431 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1432 trie->wordinfo[dupe].prev = curword; \
1434 /* we haven't inserted this word yet. */ \
1435 trie->states[ state ].wordnum = curword; \
1440 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1441 ( ( base + charid >= ucharcount \
1442 && base + charid < ubound \
1443 && state == trie->trans[ base - ucharcount + charid ].check \
1444 && trie->trans[ base - ucharcount + charid ].next ) \
1445 ? trie->trans[ base - ucharcount + charid ].next \
1446 : ( state==1 ? special : 0 ) \
1450 #define MADE_JUMP_TRIE 2
1451 #define MADE_EXACT_TRIE 4
1454 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1457 /* first pass, loop through and scan words */
1458 reg_trie_data *trie;
1459 HV *widecharmap = NULL;
1460 AV *revcharmap = newAV();
1462 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1467 regnode *jumper = NULL;
1468 regnode *nextbranch = NULL;
1469 regnode *convert = NULL;
1470 U32 *prev_states; /* temp array mapping each state to previous one */
1471 /* we just use folder as a flag in utf8 */
1472 const U8 * folder = NULL;
1475 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1476 AV *trie_words = NULL;
1477 /* along with revcharmap, this only used during construction but both are
1478 * useful during debugging so we store them in the struct when debugging.
1481 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1482 STRLEN trie_charcount=0;
1484 SV *re_trie_maxbuff;
1485 GET_RE_DEBUG_FLAGS_DECL;
1487 PERL_ARGS_ASSERT_MAKE_TRIE;
1489 PERL_UNUSED_ARG(depth);
1494 case EXACTFU: folder = PL_fold_latin1; break;
1495 case EXACTF: folder = PL_fold; break;
1496 case EXACTFL: folder = PL_fold_locale; break;
1499 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1501 trie->startstate = 1;
1502 trie->wordcount = word_count;
1503 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1504 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1505 if (!(UTF && folder))
1506 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1507 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1508 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1511 trie_words = newAV();
1514 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1515 if (!SvIOK(re_trie_maxbuff)) {
1516 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1519 PerlIO_printf( Perl_debug_log,
1520 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1521 (int)depth * 2 + 2, "",
1522 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1523 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1527 /* Find the node we are going to overwrite */
1528 if ( first == startbranch && OP( last ) != BRANCH ) {
1529 /* whole branch chain */
1532 /* branch sub-chain */
1533 convert = NEXTOPER( first );
1536 /* -- First loop and Setup --
1538 We first traverse the branches and scan each word to determine if it
1539 contains widechars, and how many unique chars there are, this is
1540 important as we have to build a table with at least as many columns as we
1543 We use an array of integers to represent the character codes 0..255
1544 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1545 native representation of the character value as the key and IV's for the
1548 *TODO* If we keep track of how many times each character is used we can
1549 remap the columns so that the table compression later on is more
1550 efficient in terms of memory by ensuring the most common value is in the
1551 middle and the least common are on the outside. IMO this would be better
1552 than a most to least common mapping as theres a decent chance the most
1553 common letter will share a node with the least common, meaning the node
1554 will not be compressible. With a middle is most common approach the worst
1555 case is when we have the least common nodes twice.
1559 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1560 regnode * const noper = NEXTOPER( cur );
1561 const U8 *uc = (U8*)STRING( noper );
1562 const U8 * const e = uc + STR_LEN( noper );
1564 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1565 const U8 *scan = (U8*)NULL;
1566 U32 wordlen = 0; /* required init */
1568 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1570 if (OP(noper) == NOTHING) {
1574 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1575 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1576 regardless of encoding */
1578 for ( ; uc < e ; uc += len ) {
1579 TRIE_CHARCOUNT(trie)++;
1583 if ( !trie->charmap[ uvc ] ) {
1584 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1586 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1590 /* store the codepoint in the bitmap, and its folded
1592 TRIE_BITMAP_SET(trie,uvc);
1594 /* store the folded codepoint */
1595 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1598 /* store first byte of utf8 representation of
1599 variant codepoints */
1600 if (! UNI_IS_INVARIANT(uvc)) {
1601 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1604 set_bit = 0; /* We've done our bit :-) */
1609 widecharmap = newHV();
1611 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1614 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1616 if ( !SvTRUE( *svpp ) ) {
1617 sv_setiv( *svpp, ++trie->uniquecharcount );
1622 if( cur == first ) {
1625 } else if (chars < trie->minlen) {
1627 } else if (chars > trie->maxlen) {
1631 } /* end first pass */
1632 DEBUG_TRIE_COMPILE_r(
1633 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1634 (int)depth * 2 + 2,"",
1635 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1636 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1637 (int)trie->minlen, (int)trie->maxlen )
1641 We now know what we are dealing with in terms of unique chars and
1642 string sizes so we can calculate how much memory a naive
1643 representation using a flat table will take. If it's over a reasonable
1644 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1645 conservative but potentially much slower representation using an array
1648 At the end we convert both representations into the same compressed
1649 form that will be used in regexec.c for matching with. The latter
1650 is a form that cannot be used to construct with but has memory
1651 properties similar to the list form and access properties similar
1652 to the table form making it both suitable for fast searches and
1653 small enough that its feasable to store for the duration of a program.
1655 See the comment in the code where the compressed table is produced
1656 inplace from the flat tabe representation for an explanation of how
1657 the compression works.
1662 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1665 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1667 Second Pass -- Array Of Lists Representation
1669 Each state will be represented by a list of charid:state records
1670 (reg_trie_trans_le) the first such element holds the CUR and LEN
1671 points of the allocated array. (See defines above).
1673 We build the initial structure using the lists, and then convert
1674 it into the compressed table form which allows faster lookups
1675 (but cant be modified once converted).
1678 STRLEN transcount = 1;
1680 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1681 "%*sCompiling trie using list compiler\n",
1682 (int)depth * 2 + 2, ""));
1684 trie->states = (reg_trie_state *)
1685 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1686 sizeof(reg_trie_state) );
1690 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1692 regnode * const noper = NEXTOPER( cur );
1693 U8 *uc = (U8*)STRING( noper );
1694 const U8 * const e = uc + STR_LEN( noper );
1695 U32 state = 1; /* required init */
1696 U16 charid = 0; /* sanity init */
1697 U8 *scan = (U8*)NULL; /* sanity init */
1698 STRLEN foldlen = 0; /* required init */
1699 U32 wordlen = 0; /* required init */
1700 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1702 if (OP(noper) != NOTHING) {
1703 for ( ; uc < e ; uc += len ) {
1708 charid = trie->charmap[ uvc ];
1710 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1714 charid=(U16)SvIV( *svpp );
1717 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1724 if ( !trie->states[ state ].trans.list ) {
1725 TRIE_LIST_NEW( state );
1727 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1728 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1729 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1734 newstate = next_alloc++;
1735 prev_states[newstate] = state;
1736 TRIE_LIST_PUSH( state, charid, newstate );
1741 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1745 TRIE_HANDLE_WORD(state);
1747 } /* end second pass */
1749 /* next alloc is the NEXT state to be allocated */
1750 trie->statecount = next_alloc;
1751 trie->states = (reg_trie_state *)
1752 PerlMemShared_realloc( trie->states,
1754 * sizeof(reg_trie_state) );
1756 /* and now dump it out before we compress it */
1757 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1758 revcharmap, next_alloc,
1762 trie->trans = (reg_trie_trans *)
1763 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1770 for( state=1 ; state < next_alloc ; state ++ ) {
1774 DEBUG_TRIE_COMPILE_MORE_r(
1775 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1779 if (trie->states[state].trans.list) {
1780 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1784 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1785 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1786 if ( forid < minid ) {
1788 } else if ( forid > maxid ) {
1792 if ( transcount < tp + maxid - minid + 1) {
1794 trie->trans = (reg_trie_trans *)
1795 PerlMemShared_realloc( trie->trans,
1797 * sizeof(reg_trie_trans) );
1798 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1800 base = trie->uniquecharcount + tp - minid;
1801 if ( maxid == minid ) {
1803 for ( ; zp < tp ; zp++ ) {
1804 if ( ! trie->trans[ zp ].next ) {
1805 base = trie->uniquecharcount + zp - minid;
1806 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1807 trie->trans[ zp ].check = state;
1813 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1814 trie->trans[ tp ].check = state;
1819 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1820 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1821 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1822 trie->trans[ tid ].check = state;
1824 tp += ( maxid - minid + 1 );
1826 Safefree(trie->states[ state ].trans.list);
1829 DEBUG_TRIE_COMPILE_MORE_r(
1830 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1833 trie->states[ state ].trans.base=base;
1835 trie->lasttrans = tp + 1;
1839 Second Pass -- Flat Table Representation.
1841 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1842 We know that we will need Charcount+1 trans at most to store the data
1843 (one row per char at worst case) So we preallocate both structures
1844 assuming worst case.
1846 We then construct the trie using only the .next slots of the entry
1849 We use the .check field of the first entry of the node temporarily to
1850 make compression both faster and easier by keeping track of how many non
1851 zero fields are in the node.
1853 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1856 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1857 number representing the first entry of the node, and state as a
1858 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1859 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1860 are 2 entrys per node. eg:
1868 The table is internally in the right hand, idx form. However as we also
1869 have to deal with the states array which is indexed by nodenum we have to
1870 use TRIE_NODENUM() to convert.
1873 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1874 "%*sCompiling trie using table compiler\n",
1875 (int)depth * 2 + 2, ""));
1877 trie->trans = (reg_trie_trans *)
1878 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1879 * trie->uniquecharcount + 1,
1880 sizeof(reg_trie_trans) );
1881 trie->states = (reg_trie_state *)
1882 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1883 sizeof(reg_trie_state) );
1884 next_alloc = trie->uniquecharcount + 1;
1887 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1889 regnode * const noper = NEXTOPER( cur );
1890 const U8 *uc = (U8*)STRING( noper );
1891 const U8 * const e = uc + STR_LEN( noper );
1893 U32 state = 1; /* required init */
1895 U16 charid = 0; /* sanity init */
1896 U32 accept_state = 0; /* sanity init */
1897 U8 *scan = (U8*)NULL; /* sanity init */
1899 STRLEN foldlen = 0; /* required init */
1900 U32 wordlen = 0; /* required init */
1901 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1903 if ( OP(noper) != NOTHING ) {
1904 for ( ; uc < e ; uc += len ) {
1909 charid = trie->charmap[ uvc ];
1911 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1912 charid = svpp ? (U16)SvIV(*svpp) : 0;
1916 if ( !trie->trans[ state + charid ].next ) {
1917 trie->trans[ state + charid ].next = next_alloc;
1918 trie->trans[ state ].check++;
1919 prev_states[TRIE_NODENUM(next_alloc)]
1920 = TRIE_NODENUM(state);
1921 next_alloc += trie->uniquecharcount;
1923 state = trie->trans[ state + charid ].next;
1925 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1927 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1930 accept_state = TRIE_NODENUM( state );
1931 TRIE_HANDLE_WORD(accept_state);
1933 } /* end second pass */
1935 /* and now dump it out before we compress it */
1936 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1938 next_alloc, depth+1));
1942 * Inplace compress the table.*
1944 For sparse data sets the table constructed by the trie algorithm will
1945 be mostly 0/FAIL transitions or to put it another way mostly empty.
1946 (Note that leaf nodes will not contain any transitions.)
1948 This algorithm compresses the tables by eliminating most such
1949 transitions, at the cost of a modest bit of extra work during lookup:
1951 - Each states[] entry contains a .base field which indicates the
1952 index in the state[] array wheres its transition data is stored.
1954 - If .base is 0 there are no valid transitions from that node.
1956 - If .base is nonzero then charid is added to it to find an entry in
1959 -If trans[states[state].base+charid].check!=state then the
1960 transition is taken to be a 0/Fail transition. Thus if there are fail
1961 transitions at the front of the node then the .base offset will point
1962 somewhere inside the previous nodes data (or maybe even into a node
1963 even earlier), but the .check field determines if the transition is
1967 The following process inplace converts the table to the compressed
1968 table: We first do not compress the root node 1,and mark all its
1969 .check pointers as 1 and set its .base pointer as 1 as well. This
1970 allows us to do a DFA construction from the compressed table later,
1971 and ensures that any .base pointers we calculate later are greater
1974 - We set 'pos' to indicate the first entry of the second node.
1976 - We then iterate over the columns of the node, finding the first and
1977 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1978 and set the .check pointers accordingly, and advance pos
1979 appropriately and repreat for the next node. Note that when we copy
1980 the next pointers we have to convert them from the original
1981 NODEIDX form to NODENUM form as the former is not valid post
1984 - If a node has no transitions used we mark its base as 0 and do not
1985 advance the pos pointer.
1987 - If a node only has one transition we use a second pointer into the
1988 structure to fill in allocated fail transitions from other states.
1989 This pointer is independent of the main pointer and scans forward
1990 looking for null transitions that are allocated to a state. When it
1991 finds one it writes the single transition into the "hole". If the
1992 pointer doesnt find one the single transition is appended as normal.
1994 - Once compressed we can Renew/realloc the structures to release the
1997 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1998 specifically Fig 3.47 and the associated pseudocode.
2002 const U32 laststate = TRIE_NODENUM( next_alloc );
2005 trie->statecount = laststate;
2007 for ( state = 1 ; state < laststate ; state++ ) {
2009 const U32 stateidx = TRIE_NODEIDX( state );
2010 const U32 o_used = trie->trans[ stateidx ].check;
2011 U32 used = trie->trans[ stateidx ].check;
2012 trie->trans[ stateidx ].check = 0;
2014 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2015 if ( flag || trie->trans[ stateidx + charid ].next ) {
2016 if ( trie->trans[ stateidx + charid ].next ) {
2018 for ( ; zp < pos ; zp++ ) {
2019 if ( ! trie->trans[ zp ].next ) {
2023 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2024 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2025 trie->trans[ zp ].check = state;
2026 if ( ++zp > pos ) pos = zp;
2033 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2035 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2036 trie->trans[ pos ].check = state;
2041 trie->lasttrans = pos + 1;
2042 trie->states = (reg_trie_state *)
2043 PerlMemShared_realloc( trie->states, laststate
2044 * sizeof(reg_trie_state) );
2045 DEBUG_TRIE_COMPILE_MORE_r(
2046 PerlIO_printf( Perl_debug_log,
2047 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2048 (int)depth * 2 + 2,"",
2049 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2052 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2055 } /* end table compress */
2057 DEBUG_TRIE_COMPILE_MORE_r(
2058 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2059 (int)depth * 2 + 2, "",
2060 (UV)trie->statecount,
2061 (UV)trie->lasttrans)
2063 /* resize the trans array to remove unused space */
2064 trie->trans = (reg_trie_trans *)
2065 PerlMemShared_realloc( trie->trans, trie->lasttrans
2066 * sizeof(reg_trie_trans) );
2068 { /* Modify the program and insert the new TRIE node */
2069 U8 nodetype =(U8)(flags & 0xFF);
2073 regnode *optimize = NULL;
2074 #ifdef RE_TRACK_PATTERN_OFFSETS
2077 U32 mjd_nodelen = 0;
2078 #endif /* RE_TRACK_PATTERN_OFFSETS */
2079 #endif /* DEBUGGING */
2081 This means we convert either the first branch or the first Exact,
2082 depending on whether the thing following (in 'last') is a branch
2083 or not and whther first is the startbranch (ie is it a sub part of
2084 the alternation or is it the whole thing.)
2085 Assuming its a sub part we convert the EXACT otherwise we convert
2086 the whole branch sequence, including the first.
2088 /* Find the node we are going to overwrite */
2089 if ( first != startbranch || OP( last ) == BRANCH ) {
2090 /* branch sub-chain */
2091 NEXT_OFF( first ) = (U16)(last - first);
2092 #ifdef RE_TRACK_PATTERN_OFFSETS
2094 mjd_offset= Node_Offset((convert));
2095 mjd_nodelen= Node_Length((convert));
2098 /* whole branch chain */
2100 #ifdef RE_TRACK_PATTERN_OFFSETS
2103 const regnode *nop = NEXTOPER( convert );
2104 mjd_offset= Node_Offset((nop));
2105 mjd_nodelen= Node_Length((nop));
2109 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2110 (int)depth * 2 + 2, "",
2111 (UV)mjd_offset, (UV)mjd_nodelen)
2114 /* But first we check to see if there is a common prefix we can
2115 split out as an EXACT and put in front of the TRIE node. */
2116 trie->startstate= 1;
2117 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2119 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2123 const U32 base = trie->states[ state ].trans.base;
2125 if ( trie->states[state].wordnum )
2128 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2129 if ( ( base + ofs >= trie->uniquecharcount ) &&
2130 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2131 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2133 if ( ++count > 1 ) {
2134 SV **tmp = av_fetch( revcharmap, ofs, 0);
2135 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2136 if ( state == 1 ) break;
2138 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2140 PerlIO_printf(Perl_debug_log,
2141 "%*sNew Start State=%"UVuf" Class: [",
2142 (int)depth * 2 + 2, "",
2145 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2146 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2148 TRIE_BITMAP_SET(trie,*ch);
2150 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2152 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2156 TRIE_BITMAP_SET(trie,*ch);
2158 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2159 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2165 SV **tmp = av_fetch( revcharmap, idx, 0);
2167 char *ch = SvPV( *tmp, len );
2169 SV *sv=sv_newmortal();
2170 PerlIO_printf( Perl_debug_log,
2171 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2172 (int)depth * 2 + 2, "",
2174 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2175 PL_colors[0], PL_colors[1],
2176 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2177 PERL_PV_ESCAPE_FIRSTCHAR
2182 OP( convert ) = nodetype;
2183 str=STRING(convert);
2186 STR_LEN(convert) += len;
2192 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2197 trie->prefixlen = (state-1);
2199 regnode *n = convert+NODE_SZ_STR(convert);
2200 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2201 trie->startstate = state;
2202 trie->minlen -= (state - 1);
2203 trie->maxlen -= (state - 1);
2205 /* At least the UNICOS C compiler choked on this
2206 * being argument to DEBUG_r(), so let's just have
2209 #ifdef PERL_EXT_RE_BUILD
2215 regnode *fix = convert;
2216 U32 word = trie->wordcount;
2218 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2219 while( ++fix < n ) {
2220 Set_Node_Offset_Length(fix, 0, 0);
2223 SV ** const tmp = av_fetch( trie_words, word, 0 );
2225 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2226 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2228 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2236 NEXT_OFF(convert) = (U16)(tail - convert);
2237 DEBUG_r(optimize= n);
2243 if ( trie->maxlen ) {
2244 NEXT_OFF( convert ) = (U16)(tail - convert);
2245 ARG_SET( convert, data_slot );
2246 /* Store the offset to the first unabsorbed branch in
2247 jump[0], which is otherwise unused by the jump logic.
2248 We use this when dumping a trie and during optimisation. */
2250 trie->jump[0] = (U16)(nextbranch - convert);
2252 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2253 * and there is a bitmap
2254 * and the first "jump target" node we found leaves enough room
2255 * then convert the TRIE node into a TRIEC node, with the bitmap
2256 * embedded inline in the opcode - this is hypothetically faster.
2258 if ( !trie->states[trie->startstate].wordnum
2260 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2262 OP( convert ) = TRIEC;
2263 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2264 PerlMemShared_free(trie->bitmap);
2267 OP( convert ) = TRIE;
2269 /* store the type in the flags */
2270 convert->flags = nodetype;
2274 + regarglen[ OP( convert ) ];
2276 /* XXX We really should free up the resource in trie now,
2277 as we won't use them - (which resources?) dmq */
2279 /* needed for dumping*/
2280 DEBUG_r(if (optimize) {
2281 regnode *opt = convert;
2283 while ( ++opt < optimize) {
2284 Set_Node_Offset_Length(opt,0,0);
2287 Try to clean up some of the debris left after the
2290 while( optimize < jumper ) {
2291 mjd_nodelen += Node_Length((optimize));
2292 OP( optimize ) = OPTIMIZED;
2293 Set_Node_Offset_Length(optimize,0,0);
2296 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2298 } /* end node insert */
2300 /* Finish populating the prev field of the wordinfo array. Walk back
2301 * from each accept state until we find another accept state, and if
2302 * so, point the first word's .prev field at the second word. If the
2303 * second already has a .prev field set, stop now. This will be the
2304 * case either if we've already processed that word's accept state,
2305 * or that state had multiple words, and the overspill words were
2306 * already linked up earlier.
2313 for (word=1; word <= trie->wordcount; word++) {
2315 if (trie->wordinfo[word].prev)
2317 state = trie->wordinfo[word].accept;
2319 state = prev_states[state];
2322 prev = trie->states[state].wordnum;
2326 trie->wordinfo[word].prev = prev;
2328 Safefree(prev_states);
2332 /* and now dump out the compressed format */
2333 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2335 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2337 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2338 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2340 SvREFCNT_dec(revcharmap);
2344 : trie->startstate>1
2350 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2352 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2354 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2355 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2358 We find the fail state for each state in the trie, this state is the longest proper
2359 suffix of the current state's 'word' that is also a proper prefix of another word in our
2360 trie. State 1 represents the word '' and is thus the default fail state. This allows
2361 the DFA not to have to restart after its tried and failed a word at a given point, it
2362 simply continues as though it had been matching the other word in the first place.
2364 'abcdgu'=~/abcdefg|cdgu/
2365 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2366 fail, which would bring us to the state representing 'd' in the second word where we would
2367 try 'g' and succeed, proceeding to match 'cdgu'.
2369 /* add a fail transition */
2370 const U32 trie_offset = ARG(source);
2371 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2373 const U32 ucharcount = trie->uniquecharcount;
2374 const U32 numstates = trie->statecount;
2375 const U32 ubound = trie->lasttrans + ucharcount;
2379 U32 base = trie->states[ 1 ].trans.base;
2382 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2383 GET_RE_DEBUG_FLAGS_DECL;
2385 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2387 PERL_UNUSED_ARG(depth);
2391 ARG_SET( stclass, data_slot );
2392 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2393 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2394 aho->trie=trie_offset;
2395 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2396 Copy( trie->states, aho->states, numstates, reg_trie_state );
2397 Newxz( q, numstates, U32);
2398 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2401 /* initialize fail[0..1] to be 1 so that we always have
2402 a valid final fail state */
2403 fail[ 0 ] = fail[ 1 ] = 1;
2405 for ( charid = 0; charid < ucharcount ; charid++ ) {
2406 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2408 q[ q_write ] = newstate;
2409 /* set to point at the root */
2410 fail[ q[ q_write++ ] ]=1;
2413 while ( q_read < q_write) {
2414 const U32 cur = q[ q_read++ % numstates ];
2415 base = trie->states[ cur ].trans.base;
2417 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2418 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2420 U32 fail_state = cur;
2423 fail_state = fail[ fail_state ];
2424 fail_base = aho->states[ fail_state ].trans.base;
2425 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2427 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2428 fail[ ch_state ] = fail_state;
2429 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2431 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2433 q[ q_write++ % numstates] = ch_state;
2437 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2438 when we fail in state 1, this allows us to use the
2439 charclass scan to find a valid start char. This is based on the principle
2440 that theres a good chance the string being searched contains lots of stuff
2441 that cant be a start char.
2443 fail[ 0 ] = fail[ 1 ] = 0;
2444 DEBUG_TRIE_COMPILE_r({
2445 PerlIO_printf(Perl_debug_log,
2446 "%*sStclass Failtable (%"UVuf" states): 0",
2447 (int)(depth * 2), "", (UV)numstates
2449 for( q_read=1; q_read<numstates; q_read++ ) {
2450 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2452 PerlIO_printf(Perl_debug_log, "\n");
2455 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2460 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2461 * These need to be revisited when a newer toolchain becomes available.
2463 #if defined(__sparc64__) && defined(__GNUC__)
2464 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2465 # undef SPARC64_GCC_WORKAROUND
2466 # define SPARC64_GCC_WORKAROUND 1
2470 #define DEBUG_PEEP(str,scan,depth) \
2471 DEBUG_OPTIMISE_r({if (scan){ \
2472 SV * const mysv=sv_newmortal(); \
2473 regnode *Next = regnext(scan); \
2474 regprop(RExC_rx, mysv, scan); \
2475 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2476 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2477 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2484 #define JOIN_EXACT(scan,min,flags) \
2485 if (PL_regkind[OP(scan)] == EXACT) \
2486 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2489 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2490 /* Merge several consecutive EXACTish nodes into one. */
2491 regnode *n = regnext(scan);
2493 regnode *next = scan + NODE_SZ_STR(scan);
2497 regnode *stop = scan;
2498 GET_RE_DEBUG_FLAGS_DECL;
2500 PERL_UNUSED_ARG(depth);
2503 PERL_ARGS_ASSERT_JOIN_EXACT;
2504 #ifndef EXPERIMENTAL_INPLACESCAN
2505 PERL_UNUSED_ARG(flags);
2506 PERL_UNUSED_ARG(val);
2508 DEBUG_PEEP("join",scan,depth);
2510 /* Skip NOTHING, merge EXACT*. */
2512 ( PL_regkind[OP(n)] == NOTHING ||
2513 (stringok && (OP(n) == OP(scan))))
2515 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2517 if (OP(n) == TAIL || n > next)
2519 if (PL_regkind[OP(n)] == NOTHING) {
2520 DEBUG_PEEP("skip:",n,depth);
2521 NEXT_OFF(scan) += NEXT_OFF(n);
2522 next = n + NODE_STEP_REGNODE;
2529 else if (stringok) {
2530 const unsigned int oldl = STR_LEN(scan);
2531 regnode * const nnext = regnext(n);
2533 DEBUG_PEEP("merg",n,depth);
2536 if (oldl + STR_LEN(n) > U8_MAX)
2538 NEXT_OFF(scan) += NEXT_OFF(n);
2539 STR_LEN(scan) += STR_LEN(n);
2540 next = n + NODE_SZ_STR(n);
2541 /* Now we can overwrite *n : */
2542 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2550 #ifdef EXPERIMENTAL_INPLACESCAN
2551 if (flags && !NEXT_OFF(n)) {
2552 DEBUG_PEEP("atch", val, depth);
2553 if (reg_off_by_arg[OP(n)]) {
2554 ARG_SET(n, val - n);
2557 NEXT_OFF(n) = val - n;
2563 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2564 #define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2565 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2566 #define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2569 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2570 && ( STR_LEN(scan) >= 6 ) )
2573 Two problematic code points in Unicode casefolding of EXACT nodes:
2575 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2576 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2582 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2583 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2585 This means that in case-insensitive matching (or "loose matching",
2586 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2587 length of the above casefolded versions) can match a target string
2588 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2589 This would rather mess up the minimum length computation.
2591 What we'll do is to look for the tail four bytes, and then peek
2592 at the preceding two bytes to see whether we need to decrease
2593 the minimum length by four (six minus two).
2595 Thanks to the design of UTF-8, there cannot be false matches:
2596 A sequence of valid UTF-8 bytes cannot be a subsequence of
2597 another valid sequence of UTF-8 bytes.
2600 char * const s0 = STRING(scan), *s, *t;
2601 char * const s1 = s0 + STR_LEN(scan) - 1;
2602 char * const s2 = s1 - 4;
2603 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2604 const char t0[] = "\xaf\x49\xaf\x42";
2606 const char t0[] = "\xcc\x88\xcc\x81";
2608 const char * const t1 = t0 + 3;
2611 s < s2 && (t = ninstr(s, s1, t0, t1));
2614 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2615 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2617 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2618 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2626 n = scan + NODE_SZ_STR(scan);
2628 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2635 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2639 /* REx optimizer. Converts nodes into quicker variants "in place".
2640 Finds fixed substrings. */
2642 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2643 to the position after last scanned or to NULL. */
2645 #define INIT_AND_WITHP \
2646 assert(!and_withp); \
2647 Newx(and_withp,1,struct regnode_charclass_class); \
2648 SAVEFREEPV(and_withp)
2650 /* this is a chain of data about sub patterns we are processing that
2651 need to be handled separately/specially in study_chunk. Its so
2652 we can simulate recursion without losing state. */
2654 typedef struct scan_frame {
2655 regnode *last; /* last node to process in this frame */
2656 regnode *next; /* next node to process when last is reached */
2657 struct scan_frame *prev; /*previous frame*/
2658 I32 stop; /* what stopparen do we use */
2662 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2664 #define CASE_SYNST_FNC(nAmE) \
2666 if (flags & SCF_DO_STCLASS_AND) { \
2667 for (value = 0; value < 256; value++) \
2668 if (!is_ ## nAmE ## _cp(value)) \
2669 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2672 for (value = 0; value < 256; value++) \
2673 if (is_ ## nAmE ## _cp(value)) \
2674 ANYOF_BITMAP_SET(data->start_class, value); \
2678 if (flags & SCF_DO_STCLASS_AND) { \
2679 for (value = 0; value < 256; value++) \
2680 if (is_ ## nAmE ## _cp(value)) \
2681 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2684 for (value = 0; value < 256; value++) \
2685 if (!is_ ## nAmE ## _cp(value)) \
2686 ANYOF_BITMAP_SET(data->start_class, value); \
2693 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2694 I32 *minlenp, I32 *deltap,
2699 struct regnode_charclass_class *and_withp,
2700 U32 flags, U32 depth)
2701 /* scanp: Start here (read-write). */
2702 /* deltap: Write maxlen-minlen here. */
2703 /* last: Stop before this one. */
2704 /* data: string data about the pattern */
2705 /* stopparen: treat close N as END */
2706 /* recursed: which subroutines have we recursed into */
2707 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2710 I32 min = 0, pars = 0, code;
2711 regnode *scan = *scanp, *next;
2713 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2714 int is_inf_internal = 0; /* The studied chunk is infinite */
2715 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2716 scan_data_t data_fake;
2717 SV *re_trie_maxbuff = NULL;
2718 regnode *first_non_open = scan;
2719 I32 stopmin = I32_MAX;
2720 scan_frame *frame = NULL;
2721 GET_RE_DEBUG_FLAGS_DECL;
2723 PERL_ARGS_ASSERT_STUDY_CHUNK;
2726 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2730 while (first_non_open && OP(first_non_open) == OPEN)
2731 first_non_open=regnext(first_non_open);
2736 while ( scan && OP(scan) != END && scan < last ){
2737 /* Peephole optimizer: */
2738 DEBUG_STUDYDATA("Peep:", data,depth);
2739 DEBUG_PEEP("Peep",scan,depth);
2740 JOIN_EXACT(scan,&min,0);
2742 /* Follow the next-chain of the current node and optimize
2743 away all the NOTHINGs from it. */
2744 if (OP(scan) != CURLYX) {
2745 const int max = (reg_off_by_arg[OP(scan)]
2747 /* I32 may be smaller than U16 on CRAYs! */
2748 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2749 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2753 /* Skip NOTHING and LONGJMP. */
2754 while ((n = regnext(n))
2755 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2756 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2757 && off + noff < max)
2759 if (reg_off_by_arg[OP(scan)])
2762 NEXT_OFF(scan) = off;
2767 /* The principal pseudo-switch. Cannot be a switch, since we
2768 look into several different things. */
2769 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2770 || OP(scan) == IFTHEN) {
2771 next = regnext(scan);
2773 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2775 if (OP(next) == code || code == IFTHEN) {
2776 /* NOTE - There is similar code to this block below for handling
2777 TRIE nodes on a re-study. If you change stuff here check there
2779 I32 max1 = 0, min1 = I32_MAX, num = 0;
2780 struct regnode_charclass_class accum;
2781 regnode * const startbranch=scan;
2783 if (flags & SCF_DO_SUBSTR)
2784 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2785 if (flags & SCF_DO_STCLASS)
2786 cl_init_zero(pRExC_state, &accum);
2788 while (OP(scan) == code) {
2789 I32 deltanext, minnext, f = 0, fake;
2790 struct regnode_charclass_class this_class;
2793 data_fake.flags = 0;
2795 data_fake.whilem_c = data->whilem_c;
2796 data_fake.last_closep = data->last_closep;
2799 data_fake.last_closep = &fake;
2801 data_fake.pos_delta = delta;
2802 next = regnext(scan);
2803 scan = NEXTOPER(scan);
2805 scan = NEXTOPER(scan);
2806 if (flags & SCF_DO_STCLASS) {
2807 cl_init(pRExC_state, &this_class);
2808 data_fake.start_class = &this_class;
2809 f = SCF_DO_STCLASS_AND;
2811 if (flags & SCF_WHILEM_VISITED_POS)
2812 f |= SCF_WHILEM_VISITED_POS;
2814 /* we suppose the run is continuous, last=next...*/
2815 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2817 stopparen, recursed, NULL, f,depth+1);
2820 if (max1 < minnext + deltanext)
2821 max1 = minnext + deltanext;
2822 if (deltanext == I32_MAX)
2823 is_inf = is_inf_internal = 1;
2825 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2827 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2828 if ( stopmin > minnext)
2829 stopmin = min + min1;
2830 flags &= ~SCF_DO_SUBSTR;
2832 data->flags |= SCF_SEEN_ACCEPT;
2835 if (data_fake.flags & SF_HAS_EVAL)
2836 data->flags |= SF_HAS_EVAL;
2837 data->whilem_c = data_fake.whilem_c;
2839 if (flags & SCF_DO_STCLASS)
2840 cl_or(pRExC_state, &accum, &this_class);
2842 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2844 if (flags & SCF_DO_SUBSTR) {
2845 data->pos_min += min1;
2846 data->pos_delta += max1 - min1;
2847 if (max1 != min1 || is_inf)
2848 data->longest = &(data->longest_float);
2851 delta += max1 - min1;
2852 if (flags & SCF_DO_STCLASS_OR) {
2853 cl_or(pRExC_state, data->start_class, &accum);
2855 cl_and(data->start_class, and_withp);
2856 flags &= ~SCF_DO_STCLASS;
2859 else if (flags & SCF_DO_STCLASS_AND) {
2861 cl_and(data->start_class, &accum);
2862 flags &= ~SCF_DO_STCLASS;
2865 /* Switch to OR mode: cache the old value of
2866 * data->start_class */
2868 StructCopy(data->start_class, and_withp,
2869 struct regnode_charclass_class);
2870 flags &= ~SCF_DO_STCLASS_AND;
2871 StructCopy(&accum, data->start_class,
2872 struct regnode_charclass_class);
2873 flags |= SCF_DO_STCLASS_OR;
2874 data->start_class->flags |= ANYOF_EOS;
2878 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2881 Assuming this was/is a branch we are dealing with: 'scan' now
2882 points at the item that follows the branch sequence, whatever
2883 it is. We now start at the beginning of the sequence and look
2890 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2892 If we can find such a subsequence we need to turn the first
2893 element into a trie and then add the subsequent branch exact
2894 strings to the trie.
2898 1. patterns where the whole set of branches can be converted.
2900 2. patterns where only a subset can be converted.
2902 In case 1 we can replace the whole set with a single regop
2903 for the trie. In case 2 we need to keep the start and end
2906 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2907 becomes BRANCH TRIE; BRANCH X;
2909 There is an additional case, that being where there is a
2910 common prefix, which gets split out into an EXACT like node
2911 preceding the TRIE node.
2913 If x(1..n)==tail then we can do a simple trie, if not we make
2914 a "jump" trie, such that when we match the appropriate word
2915 we "jump" to the appropriate tail node. Essentially we turn
2916 a nested if into a case structure of sorts.
2921 if (!re_trie_maxbuff) {
2922 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2923 if (!SvIOK(re_trie_maxbuff))
2924 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2926 if ( SvIV(re_trie_maxbuff)>=0 ) {
2928 regnode *first = (regnode *)NULL;
2929 regnode *last = (regnode *)NULL;
2930 regnode *tail = scan;
2935 SV * const mysv = sv_newmortal(); /* for dumping */
2937 /* var tail is used because there may be a TAIL
2938 regop in the way. Ie, the exacts will point to the
2939 thing following the TAIL, but the last branch will
2940 point at the TAIL. So we advance tail. If we
2941 have nested (?:) we may have to move through several
2945 while ( OP( tail ) == TAIL ) {
2946 /* this is the TAIL generated by (?:) */
2947 tail = regnext( tail );
2952 regprop(RExC_rx, mysv, tail );
2953 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2954 (int)depth * 2 + 2, "",
2955 "Looking for TRIE'able sequences. Tail node is: ",
2956 SvPV_nolen_const( mysv )
2962 step through the branches, cur represents each
2963 branch, noper is the first thing to be matched
2964 as part of that branch and noper_next is the
2965 regnext() of that node. if noper is an EXACT
2966 and noper_next is the same as scan (our current
2967 position in the regex) then the EXACT branch is
2968 a possible optimization target. Once we have
2969 two or more consecutive such branches we can
2970 create a trie of the EXACT's contents and stich
2971 it in place. If the sequence represents all of
2972 the branches we eliminate the whole thing and
2973 replace it with a single TRIE. If it is a
2974 subsequence then we need to stitch it in. This
2975 means the first branch has to remain, and needs
2976 to be repointed at the item on the branch chain
2977 following the last branch optimized. This could
2978 be either a BRANCH, in which case the
2979 subsequence is internal, or it could be the
2980 item following the branch sequence in which
2981 case the subsequence is at the end.
2985 /* dont use tail as the end marker for this traverse */
2986 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2987 regnode * const noper = NEXTOPER( cur );
2988 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2989 regnode * const noper_next = regnext( noper );
2993 regprop(RExC_rx, mysv, cur);
2994 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2995 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2997 regprop(RExC_rx, mysv, noper);
2998 PerlIO_printf( Perl_debug_log, " -> %s",
2999 SvPV_nolen_const(mysv));
3002 regprop(RExC_rx, mysv, noper_next );
3003 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3004 SvPV_nolen_const(mysv));
3006 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3007 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3009 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3010 : PL_regkind[ OP( noper ) ] == EXACT )
3011 || OP(noper) == NOTHING )
3013 && noper_next == tail
3018 if ( !first || optype == NOTHING ) {
3019 if (!first) first = cur;
3020 optype = OP( noper );
3026 Currently we do not believe that the trie logic can
3027 handle case insensitive matching properly when the
3028 pattern is not unicode (thus forcing unicode semantics).
3030 If/when this is fixed the following define can be swapped
3031 in below to fully enable trie logic.
3033 XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
3036 #define TRIE_TYPE_IS_SAFE 1
3039 #define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
3041 if ( last && TRIE_TYPE_IS_SAFE ) {
3042 make_trie( pRExC_state,
3043 startbranch, first, cur, tail, count,
3046 if ( PL_regkind[ OP( noper ) ] == EXACT
3048 && noper_next == tail
3053 optype = OP( noper );
3063 regprop(RExC_rx, mysv, cur);
3064 PerlIO_printf( Perl_debug_log,
3065 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3066 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3070 if ( last && TRIE_TYPE_IS_SAFE ) {
3071 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3072 #ifdef TRIE_STUDY_OPT
3073 if ( ((made == MADE_EXACT_TRIE &&
3074 startbranch == first)
3075 || ( first_non_open == first )) &&
3077 flags |= SCF_TRIE_RESTUDY;
3078 if ( startbranch == first
3081 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3091 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3092 scan = NEXTOPER(NEXTOPER(scan));
3093 } else /* single branch is optimized. */
3094 scan = NEXTOPER(scan);
3096 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3097 scan_frame *newframe = NULL;
3102 if (OP(scan) != SUSPEND) {
3103 /* set the pointer */
3104 if (OP(scan) == GOSUB) {
3106 RExC_recurse[ARG2L(scan)] = scan;
3107 start = RExC_open_parens[paren-1];
3108 end = RExC_close_parens[paren-1];
3111 start = RExC_rxi->program + 1;
3115 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3116 SAVEFREEPV(recursed);
3118 if (!PAREN_TEST(recursed,paren+1)) {
3119 PAREN_SET(recursed,paren+1);
3120 Newx(newframe,1,scan_frame);
3122 if (flags & SCF_DO_SUBSTR) {
3123 SCAN_COMMIT(pRExC_state,data,minlenp);
3124 data->longest = &(data->longest_float);
3126 is_inf = is_inf_internal = 1;
3127 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3128 cl_anything(pRExC_state, data->start_class);
3129 flags &= ~SCF_DO_STCLASS;
3132 Newx(newframe,1,scan_frame);
3135 end = regnext(scan);
3140 SAVEFREEPV(newframe);
3141 newframe->next = regnext(scan);
3142 newframe->last = last;
3143 newframe->stop = stopparen;
3144 newframe->prev = frame;
3154 else if (OP(scan) == EXACT) {
3155 I32 l = STR_LEN(scan);
3158 const U8 * const s = (U8*)STRING(scan);
3159 l = utf8_length(s, s + l);
3160 uc = utf8_to_uvchr(s, NULL);
3162 uc = *((U8*)STRING(scan));
3165 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3166 /* The code below prefers earlier match for fixed
3167 offset, later match for variable offset. */
3168 if (data->last_end == -1) { /* Update the start info. */
3169 data->last_start_min = data->pos_min;
3170 data->last_start_max = is_inf
3171 ? I32_MAX : data->pos_min + data->pos_delta;
3173 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3175 SvUTF8_on(data->last_found);
3177 SV * const sv = data->last_found;
3178 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3179 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3180 if (mg && mg->mg_len >= 0)
3181 mg->mg_len += utf8_length((U8*)STRING(scan),
3182 (U8*)STRING(scan)+STR_LEN(scan));
3184 data->last_end = data->pos_min + l;
3185 data->pos_min += l; /* As in the first entry. */
3186 data->flags &= ~SF_BEFORE_EOL;
3188 if (flags & SCF_DO_STCLASS_AND) {
3189 /* Check whether it is compatible with what we know already! */
3193 /* If compatible, we or it in below. It is compatible if is
3194 * in the bitmp and either 1) its bit or its fold is set, or 2)
3195 * it's for a locale. Even if there isn't unicode semantics
3196 * here, at runtime there may be because of matching against a
3197 * utf8 string, so accept a possible false positive for
3198 * latin1-range folds */
3200 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3201 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3202 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3203 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3208 ANYOF_CLASS_ZERO(data->start_class);
3209 ANYOF_BITMAP_ZERO(data->start_class);
3211 ANYOF_BITMAP_SET(data->start_class, uc);
3212 else if (uc >= 0x100) {
3215 /* Some Unicode code points fold to the Latin1 range; as
3216 * XXX temporary code, instead of figuring out if this is
3217 * one, just assume it is and set all the start class bits
3218 * that could be some such above 255 code point's fold
3219 * which will generate fals positives. As the code
3220 * elsewhere that does compute the fold settles down, it
3221 * can be extracted out and re-used here */
3222 for (i = 0; i < 256; i++){
3223 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3224 ANYOF_BITMAP_SET(data->start_class, i);
3228 data->start_class->flags &= ~ANYOF_EOS;
3230 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3232 else if (flags & SCF_DO_STCLASS_OR) {
3233 /* false positive possible if the class is case-folded */
3235 ANYOF_BITMAP_SET(data->start_class, uc);
3237 data->start_class->flags |= ANYOF_UNICODE_ALL;
3238 data->start_class->flags &= ~ANYOF_EOS;
3239 cl_and(data->start_class, and_withp);
3241 flags &= ~SCF_DO_STCLASS;
3243 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3244 I32 l = STR_LEN(scan);
3245 UV uc = *((U8*)STRING(scan));
3247 /* Search for fixed substrings supports EXACT only. */
3248 if (flags & SCF_DO_SUBSTR) {
3250 SCAN_COMMIT(pRExC_state, data, minlenp);
3253 const U8 * const s = (U8 *)STRING(scan);
3254 l = utf8_length(s, s + l);
3255 uc = utf8_to_uvchr(s, NULL);
3258 if (flags & SCF_DO_SUBSTR)
3260 if (flags & SCF_DO_STCLASS_AND) {
3261 /* Check whether it is compatible with what we know already! */
3264 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3265 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3266 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3270 ANYOF_CLASS_ZERO(data->start_class);
3271 ANYOF_BITMAP_ZERO(data->start_class);
3273 ANYOF_BITMAP_SET(data->start_class, uc);
3274 data->start_class->flags &= ~ANYOF_EOS;
3275 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3276 if (OP(scan) == EXACTFL) {
3277 /* XXX This set is probably no longer necessary, and
3278 * probably wrong as LOCALE now is on in the initial
3280 data->start_class->flags |= ANYOF_LOCALE;
3284 /* Also set the other member of the fold pair. In case
3285 * that unicode semantics is called for at runtime, use
3286 * the full latin1 fold. (Can't do this for locale,
3287 * because not known until runtime */
3288 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3291 else if (uc >= 0x100) {
3293 for (i = 0; i < 256; i++){
3294 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3295 ANYOF_BITMAP_SET(data->start_class, i);
3300 else if (flags & SCF_DO_STCLASS_OR) {
3301 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3302 /* false positive possible if the class is case-folded.
3303 Assume that the locale settings are the same... */
3305 ANYOF_BITMAP_SET(data->start_class, uc);
3306 if (OP(scan) != EXACTFL) {
3308 /* And set the other member of the fold pair, but
3309 * can't do that in locale because not known until
3311 ANYOF_BITMAP_SET(data->start_class,
3312 PL_fold_latin1[uc]);
3315 data->start_class->flags &= ~ANYOF_EOS;
3317 cl_and(data->start_class, and_withp);
3319 flags &= ~SCF_DO_STCLASS;
3321 else if (REGNODE_VARIES(OP(scan))) {
3322 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3323 I32 f = flags, pos_before = 0;
3324 regnode * const oscan = scan;
3325 struct regnode_charclass_class this_class;
3326 struct regnode_charclass_class *oclass = NULL;
3327 I32 next_is_eval = 0;
3329 switch (PL_regkind[OP(scan)]) {
3330 case WHILEM: /* End of (?:...)* . */
3331 scan = NEXTOPER(scan);
3334 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3335 next = NEXTOPER(scan);
3336 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3338 maxcount = REG_INFTY;
3339 next = regnext(scan);
3340 scan = NEXTOPER(scan);
3344 if (flags & SCF_DO_SUBSTR)
3349 if (flags & SCF_DO_STCLASS) {
3351 maxcount = REG_INFTY;
3352 next = regnext(scan);
3353 scan = NEXTOPER(scan);
3356 is_inf = is_inf_internal = 1;
3357 scan = regnext(scan);
3358 if (flags & SCF_DO_SUBSTR) {
3359 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3360 data->longest = &(data->longest_float);
3362 goto optimize_curly_tail;
3364 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3365 && (scan->flags == stopparen))
3370 mincount = ARG1(scan);
3371 maxcount = ARG2(scan);
3373 next = regnext(scan);
3374 if (OP(scan) == CURLYX) {
3375 I32 lp = (data ? *(data->last_closep) : 0);
3376 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3378 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3379 next_is_eval = (OP(scan) == EVAL);
3381 if (flags & SCF_DO_SUBSTR) {
3382 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3383 pos_before = data->pos_min;
3387 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3389 data->flags |= SF_IS_INF;
3391 if (flags & SCF_DO_STCLASS) {
3392 cl_init(pRExC_state, &this_class);
3393 oclass = data->start_class;
3394 data->start_class = &this_class;
3395 f |= SCF_DO_STCLASS_AND;
3396 f &= ~SCF_DO_STCLASS_OR;
3398 /* Exclude from super-linear cache processing any {n,m}
3399 regops for which the combination of input pos and regex
3400 pos is not enough information to determine if a match
3403 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3404 regex pos at the \s*, the prospects for a match depend not
3405 only on the input position but also on how many (bar\s*)
3406 repeats into the {4,8} we are. */
3407 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3408 f &= ~SCF_WHILEM_VISITED_POS;
3410 /* This will finish on WHILEM, setting scan, or on NULL: */
3411 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3412 last, data, stopparen, recursed, NULL,
3414 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3416 if (flags & SCF_DO_STCLASS)
3417 data->start_class = oclass;
3418 if (mincount == 0 || minnext == 0) {
3419 if (flags & SCF_DO_STCLASS_OR) {
3420 cl_or(pRExC_state, data->start_class, &this_class);
3422 else if (flags & SCF_DO_STCLASS_AND) {
3423 /* Switch to OR mode: cache the old value of
3424 * data->start_class */
3426 StructCopy(data->start_class, and_withp,
3427 struct regnode_charclass_class);
3428 flags &= ~SCF_DO_STCLASS_AND;
3429 StructCopy(&this_class, data->start_class,
3430 struct regnode_charclass_class);
3431 flags |= SCF_DO_STCLASS_OR;
3432 data->start_class->flags |= ANYOF_EOS;
3434 } else { /* Non-zero len */
3435 if (flags & SCF_DO_STCLASS_OR) {
3436 cl_or(pRExC_state, data->start_class, &this_class);
3437 cl_and(data->start_class, and_withp);
3439 else if (flags & SCF_DO_STCLASS_AND)
3440 cl_and(data->start_class, &this_class);
3441 flags &= ~SCF_DO_STCLASS;
3443 if (!scan) /* It was not CURLYX, but CURLY. */
3445 if ( /* ? quantifier ok, except for (?{ ... }) */
3446 (next_is_eval || !(mincount == 0 && maxcount == 1))
3447 && (minnext == 0) && (deltanext == 0)
3448 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3449 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3451 ckWARNreg(RExC_parse,
3452 "Quantifier unexpected on zero-length expression");
3455 min += minnext * mincount;
3456 is_inf_internal |= ((maxcount == REG_INFTY
3457 && (minnext + deltanext) > 0)
3458 || deltanext == I32_MAX);
3459 is_inf |= is_inf_internal;
3460 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3462 /* Try powerful optimization CURLYX => CURLYN. */
3463 if ( OP(oscan) == CURLYX && data
3464 && data->flags & SF_IN_PAR
3465 && !(data->flags & SF_HAS_EVAL)
3466 && !deltanext && minnext == 1 ) {
3467 /* Try to optimize to CURLYN. */
3468 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3469 regnode * const nxt1 = nxt;
3476 if (!REGNODE_SIMPLE(OP(nxt))
3477 && !(PL_regkind[OP(nxt)] == EXACT
3478 && STR_LEN(nxt) == 1))
3484 if (OP(nxt) != CLOSE)
3486 if (RExC_open_parens) {
3487 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3488 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3490 /* Now we know that nxt2 is the only contents: */
3491 oscan->flags = (U8)ARG(nxt);
3493 OP(nxt1) = NOTHING; /* was OPEN. */
3496 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3497 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3498 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3499 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3500 OP(nxt + 1) = OPTIMIZED; /* was count. */
3501 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3506 /* Try optimization CURLYX => CURLYM. */
3507 if ( OP(oscan) == CURLYX && data
3508 && !(data->flags & SF_HAS_PAR)
3509 && !(data->flags & SF_HAS_EVAL)
3510 && !deltanext /* atom is fixed width */
3511 && minnext != 0 /* CURLYM can't handle zero width */
3513 /* XXXX How to optimize if data == 0? */
3514 /* Optimize to a simpler form. */
3515 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3519 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3520 && (OP(nxt2) != WHILEM))
3522 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3523 /* Need to optimize away parenths. */
3524 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3525 /* Set the parenth number. */
3526 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3528 oscan->flags = (U8)ARG(nxt);
3529 if (RExC_open_parens) {
3530 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3531 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3533 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3534 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3537 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3538 OP(nxt + 1) = OPTIMIZED; /* was count. */
3539 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3540 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3543 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3544 regnode *nnxt = regnext(nxt1);
3546 if (reg_off_by_arg[OP(nxt1)])
3547 ARG_SET(nxt1, nxt2 - nxt1);
3548 else if (nxt2 - nxt1 < U16_MAX)
3549 NEXT_OFF(nxt1) = nxt2 - nxt1;
3551 OP(nxt) = NOTHING; /* Cannot beautify */
3556 /* Optimize again: */
3557 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3558 NULL, stopparen, recursed, NULL, 0,depth+1);
3563 else if ((OP(oscan) == CURLYX)
3564 && (flags & SCF_WHILEM_VISITED_POS)
3565 /* See the comment on a similar expression above.
3566 However, this time it's not a subexpression
3567 we care about, but the expression itself. */
3568 && (maxcount == REG_INFTY)
3569 && data && ++data->whilem_c < 16) {
3570 /* This stays as CURLYX, we can put the count/of pair. */
3571 /* Find WHILEM (as in regexec.c) */
3572 regnode *nxt = oscan + NEXT_OFF(oscan);
3574 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3576 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3577 | (RExC_whilem_seen << 4)); /* On WHILEM */
3579 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3581 if (flags & SCF_DO_SUBSTR) {
3582 SV *last_str = NULL;
3583 int counted = mincount != 0;
3585 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3586 #if defined(SPARC64_GCC_WORKAROUND)
3589 const char *s = NULL;
3592 if (pos_before >= data->last_start_min)
3595 b = data->last_start_min;
3598 s = SvPV_const(data->last_found, l);
3599 old = b - data->last_start_min;
3602 I32 b = pos_before >= data->last_start_min
3603 ? pos_before : data->last_start_min;
3605 const char * const s = SvPV_const(data->last_found, l);
3606 I32 old = b - data->last_start_min;
3610 old = utf8_hop((U8*)s, old) - (U8*)s;
3612 /* Get the added string: */
3613 last_str = newSVpvn_utf8(s + old, l, UTF);
3614 if (deltanext == 0 && pos_before == b) {
3615 /* What was added is a constant string */
3617 SvGROW(last_str, (mincount * l) + 1);
3618 repeatcpy(SvPVX(last_str) + l,
3619 SvPVX_const(last_str), l, mincount - 1);
3620 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3621 /* Add additional parts. */
3622 SvCUR_set(data->last_found,
3623 SvCUR(data->last_found) - l);
3624 sv_catsv(data->last_found, last_str);
3626 SV * sv = data->last_found;
3628 SvUTF8(sv) && SvMAGICAL(sv) ?
3629 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3630 if (mg && mg->mg_len >= 0)
3631 mg->mg_len += CHR_SVLEN(last_str) - l;
3633 data->last_end += l * (mincount - 1);
3636 /* start offset must point into the last copy */
3637 data->last_start_min += minnext * (mincount - 1);
3638 data->last_start_max += is_inf ? I32_MAX
3639 : (maxcount - 1) * (minnext + data->pos_delta);
3642 /* It is counted once already... */
3643 data->pos_min += minnext * (mincount - counted);
3644 data->pos_delta += - counted * deltanext +
3645 (minnext + deltanext) * maxcount - minnext * mincount;
3646 if (mincount != maxcount) {
3647 /* Cannot extend fixed substrings found inside
3649 SCAN_COMMIT(pRExC_state,data,minlenp);
3650 if (mincount && last_str) {
3651 SV * const sv = data->last_found;
3652 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3653 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3657 sv_setsv(sv, last_str);
3658 data->last_end = data->pos_min;
3659 data->last_start_min =
3660 data->pos_min - CHR_SVLEN(last_str);
3661 data->last_start_max = is_inf
3663 : data->pos_min + data->pos_delta
3664 - CHR_SVLEN(last_str);
3666 data->longest = &(data->longest_float);
3668 SvREFCNT_dec(last_str);
3670 if (data && (fl & SF_HAS_EVAL))
3671 data->flags |= SF_HAS_EVAL;
3672 optimize_curly_tail:
3673 if (OP(oscan) != CURLYX) {
3674 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3676 NEXT_OFF(oscan) += NEXT_OFF(next);
3679 default: /* REF, ANYOFV, and CLUMP only? */
3680 if (flags & SCF_DO_SUBSTR) {
3681 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3682 data->longest = &(data->longest_float);
3684 is_inf = is_inf_internal = 1;
3685 if (flags & SCF_DO_STCLASS_OR)
3686 cl_anything(pRExC_state, data->start_class);
3687 flags &= ~SCF_DO_STCLASS;
3691 else if (OP(scan) == LNBREAK) {
3692 if (flags & SCF_DO_STCLASS) {
3694 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3695 if (flags & SCF_DO_STCLASS_AND) {
3696 for (value = 0; value < 256; value++)
3697 if (!is_VERTWS_cp(value))
3698 ANYOF_BITMAP_CLEAR(data->start_class, value);
3701 for (value = 0; value < 256; value++)
3702 if (is_VERTWS_cp(value))
3703 ANYOF_BITMAP_SET(data->start_class, value);
3705 if (flags & SCF_DO_STCLASS_OR)
3706 cl_and(data->start_class, and_withp);
3707 flags &= ~SCF_DO_STCLASS;
3711 if (flags & SCF_DO_SUBSTR) {
3712 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3714 data->pos_delta += 1;
3715 data->longest = &(data->longest_float);
3718 else if (OP(scan) == FOLDCHAR) {
3719 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3720 flags &= ~SCF_DO_STCLASS;
3723 if (flags & SCF_DO_SUBSTR) {
3724 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3726 data->pos_delta += d;
3727 data->longest = &(data->longest_float);
3730 else if (REGNODE_SIMPLE(OP(scan))) {
3733 if (flags & SCF_DO_SUBSTR) {
3734 SCAN_COMMIT(pRExC_state,data,minlenp);
3738 if (flags & SCF_DO_STCLASS) {
3739 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3741 /* Some of the logic below assumes that switching
3742 locale on will only add false positives. */
3743 switch (PL_regkind[OP(scan)]) {
3747 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3748 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3749 cl_anything(pRExC_state, data->start_class);
3752 if (OP(scan) == SANY)
3754 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3755 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3756 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3757 cl_anything(pRExC_state, data->start_class);
3759 if (flags & SCF_DO_STCLASS_AND || !value)
3760 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3763 if (flags & SCF_DO_STCLASS_AND)
3764 cl_and(data->start_class,
3765 (struct regnode_charclass_class*)scan);
3767 cl_or(pRExC_state, data->start_class,
3768 (struct regnode_charclass_class*)scan);
3771 if (flags & SCF_DO_STCLASS_AND) {
3772 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3773 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3774 if (OP(scan) == ALNUMU) {
3775 for (value = 0; value < 256; value++) {
3776 if (!isWORDCHAR_L1(value)) {
3777 ANYOF_BITMAP_CLEAR(data->start_class, value);
3781 for (value = 0; value < 256; value++) {
3782 if (!isALNUM(value)) {
3783 ANYOF_BITMAP_CLEAR(data->start_class, value);
3790 if (data->start_class->flags & ANYOF_LOCALE)
3791 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3793 /* Even if under locale, set the bits for non-locale
3794 * in case it isn't a true locale-node. This will
3795 * create false positives if it truly is locale */
3796 if (OP(scan) == ALNUMU) {
3797 for (value = 0; value < 256; value++) {
3798 if (isWORDCHAR_L1(value)) {
3799 ANYOF_BITMAP_SET(data->start_class, value);
3803 for (value = 0; value < 256; value++) {
3804 if (isALNUM(value)) {
3805 ANYOF_BITMAP_SET(data->start_class, value);
3812 if (flags & SCF_DO_STCLASS_AND) {
3813 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3814 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3815 if (OP(scan) == NALNUMU) {
3816 for (value = 0; value < 256; value++) {
3817 if (isWORDCHAR_L1(value)) {
3818 ANYOF_BITMAP_CLEAR(data->start_class, value);
3822 for (value = 0; value < 256; value++) {
3823 if (isALNUM(value)) {
3824 ANYOF_BITMAP_CLEAR(data->start_class, value);
3831 if (data->start_class->flags & ANYOF_LOCALE)
3832 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3834 /* Even if under locale, set the bits for non-locale in
3835 * case it isn't a true locale-node. This will create
3836 * false positives if it truly is locale */
3837 if (OP(scan) == NALNUMU) {
3838 for (value = 0; value < 256; value++) {
3839 if (! isWORDCHAR_L1(value)) {
3840 ANYOF_BITMAP_SET(data->start_class, value);
3844 for (value = 0; value < 256; value++) {
3845 if (! isALNUM(value)) {
3846 ANYOF_BITMAP_SET(data->start_class, value);
3853 if (flags & SCF_DO_STCLASS_AND) {
3854 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3855 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3856 if (OP(scan) == SPACEU) {
3857 for (value = 0; value < 256; value++) {
3858 if (!isSPACE_L1(value)) {
3859 ANYOF_BITMAP_CLEAR(data->start_class, value);
3863 for (value = 0; value < 256; value++) {
3864 if (!isSPACE(value)) {
3865 ANYOF_BITMAP_CLEAR(data->start_class, value);
3872 if (data->start_class->flags & ANYOF_LOCALE) {
3873 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3875 if (OP(scan) == SPACEU) {
3876 for (value = 0; value < 256; value++) {
3877 if (isSPACE_L1(value)) {
3878 ANYOF_BITMAP_SET(data->start_class, value);
3882 for (value = 0; value < 256; value++) {
3883 if (isSPACE(value)) {
3884 ANYOF_BITMAP_SET(data->start_class, value);
3891 if (flags & SCF_DO_STCLASS_AND) {
3892 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3893 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3894 if (OP(scan) == NSPACEU) {
3895 for (value = 0; value < 256; value++) {
3896 if (isSPACE_L1(value)) {
3897 ANYOF_BITMAP_CLEAR(data->start_class, value);
3901 for (value = 0; value < 256; value++) {
3902 if (isSPACE(value)) {
3903 ANYOF_BITMAP_CLEAR(data->start_class, value);
3910 if (data->start_class->flags & ANYOF_LOCALE)
3911 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3912 if (OP(scan) == NSPACEU) {
3913 for (value = 0; value < 256; value++) {
3914 if (!isSPACE_L1(value)) {
3915 ANYOF_BITMAP_SET(data->start_class, value);
3920 for (value = 0; value < 256; value++) {
3921 if (!isSPACE(value)) {
3922 ANYOF_BITMAP_SET(data->start_class, value);
3929 if (flags & SCF_DO_STCLASS_AND) {
3930 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3931 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3932 for (value = 0; value < 256; value++)
3933 if (!isDIGIT(value))
3934 ANYOF_BITMAP_CLEAR(data->start_class, value);
3938 if (data->start_class->flags & ANYOF_LOCALE)
3939 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3940 for (value = 0; value < 256; value++)
3942 ANYOF_BITMAP_SET(data->start_class, value);
3946 if (flags & SCF_DO_STCLASS_AND) {
3947 if (!(data->start_class->flags & ANYOF_LOCALE))
3948 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3949 for (value = 0; value < 256; value++)
3951 ANYOF_BITMAP_CLEAR(data->start_class, value);
3954 if (data->start_class->flags & ANYOF_LOCALE)
3955 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3956 for (value = 0; value < 256; value++)
3957 if (!isDIGIT(value))
3958 ANYOF_BITMAP_SET(data->start_class, value);
3961 CASE_SYNST_FNC(VERTWS);
3962 CASE_SYNST_FNC(HORIZWS);
3965 if (flags & SCF_DO_STCLASS_OR)
3966 cl_and(data->start_class, and_withp);
3967 flags &= ~SCF_DO_STCLASS;
3970 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3971 data->flags |= (OP(scan) == MEOL
3975 else if ( PL_regkind[OP(scan)] == BRANCHJ
3976 /* Lookbehind, or need to calculate parens/evals/stclass: */
3977 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3978 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3979 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3980 || OP(scan) == UNLESSM )
3982 /* Negative Lookahead/lookbehind
3983 In this case we can't do fixed string optimisation.
3986 I32 deltanext, minnext, fake = 0;
3988 struct regnode_charclass_class intrnl;
3991 data_fake.flags = 0;
3993 data_fake.whilem_c = data->whilem_c;
3994 data_fake.last_closep = data->last_closep;
3997 data_fake.last_closep = &fake;
3998 data_fake.pos_delta = delta;
3999 if ( flags & SCF_DO_STCLASS && !scan->flags
4000 && OP(scan) == IFMATCH ) { /* Lookahead */
4001 cl_init(pRExC_state, &intrnl);
4002 data_fake.start_class = &intrnl;
4003 f |= SCF_DO_STCLASS_AND;
4005 if (flags & SCF_WHILEM_VISITED_POS)
4006 f |= SCF_WHILEM_VISITED_POS;
4007 next = regnext(scan);
4008 nscan = NEXTOPER(NEXTOPER(scan));
4009 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4010 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4013 FAIL("Variable length lookbehind not implemented");
4015 else if (minnext > (I32)U8_MAX) {
4016 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4018 scan->flags = (U8)minnext;
4021 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4023 if (data_fake.flags & SF_HAS_EVAL)
4024 data->flags |= SF_HAS_EVAL;
4025 data->whilem_c = data_fake.whilem_c;
4027 if (f & SCF_DO_STCLASS_AND) {
4028 if (flags & SCF_DO_STCLASS_OR) {
4029 /* OR before, AND after: ideally we would recurse with
4030 * data_fake to get the AND applied by study of the
4031 * remainder of the pattern, and then derecurse;
4032 * *** HACK *** for now just treat as "no information".
4033 * See [perl #56690].
4035 cl_init(pRExC_state, data->start_class);
4037 /* AND before and after: combine and continue */
4038 const int was = (data->start_class->flags & ANYOF_EOS);
4040 cl_and(data->start_class, &intrnl);
4042 data->start_class->flags |= ANYOF_EOS;
4046 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4048 /* Positive Lookahead/lookbehind
4049 In this case we can do fixed string optimisation,
4050 but we must be careful about it. Note in the case of
4051 lookbehind the positions will be offset by the minimum
4052 length of the pattern, something we won't know about
4053 until after the recurse.
4055 I32 deltanext, fake = 0;
4057 struct regnode_charclass_class intrnl;
4059 /* We use SAVEFREEPV so that when the full compile
4060 is finished perl will clean up the allocated
4061 minlens when it's all done. This way we don't
4062 have to worry about freeing them when we know
4063 they wont be used, which would be a pain.
4066 Newx( minnextp, 1, I32 );
4067 SAVEFREEPV(minnextp);
4070 StructCopy(data, &data_fake, scan_data_t);
4071 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4074 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4075 data_fake.last_found=newSVsv(data->last_found);
4079 data_fake.last_closep = &fake;
4080 data_fake.flags = 0;
4081 data_fake.pos_delta = delta;
4083 data_fake.flags |= SF_IS_INF;
4084 if ( flags & SCF_DO_STCLASS && !scan->flags
4085 && OP(scan) == IFMATCH ) { /* Lookahead */
4086 cl_init(pRExC_state, &intrnl);
4087 data_fake.start_class = &intrnl;
4088 f |= SCF_DO_STCLASS_AND;
4090 if (flags & SCF_WHILEM_VISITED_POS)
4091 f |= SCF_WHILEM_VISITED_POS;
4092 next = regnext(scan);
4093 nscan = NEXTOPER(NEXTOPER(scan));
4095 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4096 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4099 FAIL("Variable length lookbehind not implemented");
4101 else if (*minnextp > (I32)U8_MAX) {
4102 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4104 scan->flags = (U8)*minnextp;
4109 if (f & SCF_DO_STCLASS_AND) {
4110 const int was = (data->start_class->flags & ANYOF_EOS);
4112 cl_and(data->start_class, &intrnl);
4114 data->start_class->flags |= ANYOF_EOS;
4117 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4119 if (data_fake.flags & SF_HAS_EVAL)
4120 data->flags |= SF_HAS_EVAL;
4121 data->whilem_c = data_fake.whilem_c;
4122 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4123 if (RExC_rx->minlen<*minnextp)
4124 RExC_rx->minlen=*minnextp;
4125 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4126 SvREFCNT_dec(data_fake.last_found);
4128 if ( data_fake.minlen_fixed != minlenp )
4130 data->offset_fixed= data_fake.offset_fixed;
4131 data->minlen_fixed= data_fake.minlen_fixed;
4132 data->lookbehind_fixed+= scan->flags;
4134 if ( data_fake.minlen_float != minlenp )
4136 data->minlen_float= data_fake.minlen_float;
4137 data->offset_float_min=data_fake.offset_float_min;
4138 data->offset_float_max=data_fake.offset_float_max;
4139 data->lookbehind_float+= scan->flags;
4148 else if (OP(scan) == OPEN) {
4149 if (stopparen != (I32)ARG(scan))
4152 else if (OP(scan) == CLOSE) {
4153 if (stopparen == (I32)ARG(scan)) {
4156 if ((I32)ARG(scan) == is_par) {
4157 next = regnext(scan);
4159 if ( next && (OP(next) != WHILEM) && next < last)
4160 is_par = 0; /* Disable optimization */
4163 *(data->last_closep) = ARG(scan);
4165 else if (OP(scan) == EVAL) {
4167 data->flags |= SF_HAS_EVAL;
4169 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4170 if (flags & SCF_DO_SUBSTR) {
4171 SCAN_COMMIT(pRExC_state,data,minlenp);
4172 flags &= ~SCF_DO_SUBSTR;
4174 if (data && OP(scan)==ACCEPT) {
4175 data->flags |= SCF_SEEN_ACCEPT;
4180 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4182 if (flags & SCF_DO_SUBSTR) {
4183 SCAN_COMMIT(pRExC_state,data,minlenp);
4184 data->longest = &(data->longest_float);
4186 is_inf = is_inf_internal = 1;
4187 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4188 cl_anything(pRExC_state, data->start_class);
4189 flags &= ~SCF_DO_STCLASS;
4191 else if (OP(scan) == GPOS) {
4192 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4193 !(delta || is_inf || (data && data->pos_delta)))
4195 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4196 RExC_rx->extflags |= RXf_ANCH_GPOS;
4197 if (RExC_rx->gofs < (U32)min)
4198 RExC_rx->gofs = min;
4200 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4204 #ifdef TRIE_STUDY_OPT
4205 #ifdef FULL_TRIE_STUDY
4206 else if (PL_regkind[OP(scan)] == TRIE) {
4207 /* NOTE - There is similar code to this block above for handling
4208 BRANCH nodes on the initial study. If you change stuff here
4210 regnode *trie_node= scan;
4211 regnode *tail= regnext(scan);
4212 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4213 I32 max1 = 0, min1 = I32_MAX;
4214 struct regnode_charclass_class accum;
4216 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4217 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4218 if (flags & SCF_DO_STCLASS)
4219 cl_init_zero(pRExC_state, &accum);
4225 const regnode *nextbranch= NULL;
4228 for ( word=1 ; word <= trie->wordcount ; word++)
4230 I32 deltanext=0, minnext=0, f = 0, fake;
4231 struct regnode_charclass_class this_class;
4233 data_fake.flags = 0;
4235 data_fake.whilem_c = data->whilem_c;
4236 data_fake.last_closep = data->last_closep;
4239 data_fake.last_closep = &fake;
4240 data_fake.pos_delta = delta;
4241 if (flags & SCF_DO_STCLASS) {
4242 cl_init(pRExC_state, &this_class);
4243 data_fake.start_class = &this_class;
4244 f = SCF_DO_STCLASS_AND;
4246 if (flags & SCF_WHILEM_VISITED_POS)
4247 f |= SCF_WHILEM_VISITED_POS;
4249 if (trie->jump[word]) {
4251 nextbranch = trie_node + trie->jump[0];
4252 scan= trie_node + trie->jump[word];
4253 /* We go from the jump point to the branch that follows
4254 it. Note this means we need the vestigal unused branches
4255 even though they arent otherwise used.
4257 minnext = study_chunk(pRExC_state, &scan, minlenp,
4258 &deltanext, (regnode *)nextbranch, &data_fake,
4259 stopparen, recursed, NULL, f,depth+1);
4261 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4262 nextbranch= regnext((regnode*)nextbranch);
4264 if (min1 > (I32)(minnext + trie->minlen))
4265 min1 = minnext + trie->minlen;
4266 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4267 max1 = minnext + deltanext + trie->maxlen;
4268 if (deltanext == I32_MAX)
4269 is_inf = is_inf_internal = 1;
4271 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4273 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4274 if ( stopmin > min + min1)
4275 stopmin = min + min1;
4276 flags &= ~SCF_DO_SUBSTR;
4278 data->flags |= SCF_SEEN_ACCEPT;
4281 if (data_fake.flags & SF_HAS_EVAL)
4282 data->flags |= SF_HAS_EVAL;
4283 data->whilem_c = data_fake.whilem_c;
4285 if (flags & SCF_DO_STCLASS)
4286 cl_or(pRExC_state, &accum, &this_class);
4289 if (flags & SCF_DO_SUBSTR) {
4290 data->pos_min += min1;
4291 data->pos_delta += max1 - min1;
4292 if (max1 != min1 || is_inf)
4293 data->longest = &(data->longest_float);
4296 delta += max1 - min1;
4297 if (flags & SCF_DO_STCLASS_OR) {
4298 cl_or(pRExC_state, data->start_class, &accum);
4300 cl_and(data->start_class, and_withp);
4301 flags &= ~SCF_DO_STCLASS;
4304 else if (flags & SCF_DO_STCLASS_AND) {
4306 cl_and(data->start_class, &accum);
4307 flags &= ~SCF_DO_STCLASS;
4310 /* Switch to OR mode: cache the old value of
4311 * data->start_class */
4313 StructCopy(data->start_class, and_withp,
4314 struct regnode_charclass_class);
4315 flags &= ~SCF_DO_STCLASS_AND;
4316 StructCopy(&accum, data->start_class,
4317 struct regnode_charclass_class);
4318 flags |= SCF_DO_STCLASS_OR;
4319 data->start_class->flags |= ANYOF_EOS;
4326 else if (PL_regkind[OP(scan)] == TRIE) {
4327 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4330 min += trie->minlen;
4331 delta += (trie->maxlen - trie->minlen);
4332 flags &= ~SCF_DO_STCLASS; /* xxx */
4333 if (flags & SCF_DO_SUBSTR) {
4334 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4335 data->pos_min += trie->minlen;
4336 data->pos_delta += (trie->maxlen - trie->minlen);
4337 if (trie->maxlen != trie->minlen)
4338 data->longest = &(data->longest_float);
4340 if (trie->jump) /* no more substrings -- for now /grr*/
4341 flags &= ~SCF_DO_SUBSTR;
4343 #endif /* old or new */
4344 #endif /* TRIE_STUDY_OPT */
4346 /* Else: zero-length, ignore. */
4347 scan = regnext(scan);
4352 stopparen = frame->stop;
4353 frame = frame->prev;
4354 goto fake_study_recurse;
4359 DEBUG_STUDYDATA("pre-fin:",data,depth);
4362 *deltap = is_inf_internal ? I32_MAX : delta;
4363 if (flags & SCF_DO_SUBSTR && is_inf)
4364 data->pos_delta = I32_MAX - data->pos_min;
4365 if (is_par > (I32)U8_MAX)
4367 if (is_par && pars==1 && data) {
4368 data->flags |= SF_IN_PAR;
4369 data->flags &= ~SF_HAS_PAR;
4371 else if (pars && data) {
4372 data->flags |= SF_HAS_PAR;
4373 data->flags &= ~SF_IN_PAR;
4375 if (flags & SCF_DO_STCLASS_OR)
4376 cl_and(data->start_class, and_withp);
4377 if (flags & SCF_TRIE_RESTUDY)
4378 data->flags |= SCF_TRIE_RESTUDY;
4380 DEBUG_STUDYDATA("post-fin:",data,depth);
4382 return min < stopmin ? min : stopmin;
4386 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4388 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4390 PERL_ARGS_ASSERT_ADD_DATA;
4392 Renewc(RExC_rxi->data,
4393 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4394 char, struct reg_data);
4396 Renew(RExC_rxi->data->what, count + n, U8);
4398 Newx(RExC_rxi->data->what, n, U8);
4399 RExC_rxi->data->count = count + n;
4400 Copy(s, RExC_rxi->data->what + count, n, U8);
4404 /*XXX: todo make this not included in a non debugging perl */
4405 #ifndef PERL_IN_XSUB_RE
4407 Perl_reginitcolors(pTHX)
4410 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4412 char *t = savepv(s);
4416 t = strchr(t, '\t');
4422 PL_colors[i] = t = (char *)"";
4427 PL_colors[i++] = (char *)"";
4434 #ifdef TRIE_STUDY_OPT
4435 #define CHECK_RESTUDY_GOTO \
4437 (data.flags & SCF_TRIE_RESTUDY) \
4441 #define CHECK_RESTUDY_GOTO
4445 - pregcomp - compile a regular expression into internal code
4447 * We can't allocate space until we know how big the compiled form will be,
4448 * but we can't compile it (and thus know how big it is) until we've got a
4449 * place to put the code. So we cheat: we compile it twice, once with code
4450 * generation turned off and size counting turned on, and once "for real".
4451 * This also means that we don't allocate space until we are sure that the
4452 * thing really will compile successfully, and we never have to move the
4453 * code and thus invalidate pointers into it. (Note that it has to be in
4454 * one piece because free() must be able to free it all.) [NB: not true in perl]
4456 * Beware that the optimization-preparation code in here knows about some
4457 * of the structure of the compiled regexp. [I'll say.]
4462 #ifndef PERL_IN_XSUB_RE
4463 #define RE_ENGINE_PTR &PL_core_reg_engine
4465 extern const struct regexp_engine my_reg_engine;
4466 #define RE_ENGINE_PTR &my_reg_engine
4469 #ifndef PERL_IN_XSUB_RE
4471 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4474 HV * const table = GvHV(PL_hintgv);
4476 PERL_ARGS_ASSERT_PREGCOMP;
4478 /* Dispatch a request to compile a regexp to correct
4481 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4482 GET_RE_DEBUG_FLAGS_DECL;
4483 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4484 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4486 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4489 return CALLREGCOMP_ENG(eng, pattern, flags);
4492 return Perl_re_compile(aTHX_ pattern, flags);
4497 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4502 register regexp_internal *ri;
4511 /* these are all flags - maybe they should be turned
4512 * into a single int with different bit masks */
4513 I32 sawlookahead = 0;
4516 bool used_setjump = FALSE;
4517 regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4522 RExC_state_t RExC_state;
4523 RExC_state_t * const pRExC_state = &RExC_state;
4524 #ifdef TRIE_STUDY_OPT
4526 RExC_state_t copyRExC_state;
4528 GET_RE_DEBUG_FLAGS_DECL;
4530 PERL_ARGS_ASSERT_RE_COMPILE;
4532 DEBUG_r(if (!PL_colorset) reginitcolors());
4534 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4535 RExC_uni_semantics = 0;
4536 RExC_contains_locale = 0;
4538 /****************** LONG JUMP TARGET HERE***********************/
4539 /* Longjmp back to here if have to switch in midstream to utf8 */
4540 if (! RExC_orig_utf8) {
4541 JMPENV_PUSH(jump_ret);
4542 used_setjump = TRUE;
4545 if (jump_ret == 0) { /* First time through */
4546 exp = SvPV(pattern, plen);
4548 /* ignore the utf8ness if the pattern is 0 length */
4550 RExC_utf8 = RExC_orig_utf8 = 0;
4554 SV *dsv= sv_newmortal();
4555 RE_PV_QUOTED_DECL(s, RExC_utf8,
4556 dsv, exp, plen, 60);
4557 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4558 PL_colors[4],PL_colors[5],s);
4561 else { /* longjumped back */
4564 /* If the cause for the longjmp was other than changing to utf8, pop
4565 * our own setjmp, and longjmp to the correct handler */
4566 if (jump_ret != UTF8_LONGJMP) {
4568 JMPENV_JUMP(jump_ret);
4573 /* It's possible to write a regexp in ascii that represents Unicode
4574 codepoints outside of the byte range, such as via \x{100}. If we
4575 detect such a sequence we have to convert the entire pattern to utf8
4576 and then recompile, as our sizing calculation will have been based
4577 on 1 byte == 1 character, but we will need to use utf8 to encode
4578 at least some part of the pattern, and therefore must convert the whole
4581 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4582 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4583 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4585 RExC_orig_utf8 = RExC_utf8 = 1;
4589 #ifdef TRIE_STUDY_OPT
4593 pm_flags = orig_pm_flags;
4595 if (initial_charset == REGEX_LOCALE_CHARSET) {
4596 RExC_contains_locale = 1;
4598 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4600 /* Set to use unicode semantics if the pattern is in utf8 and has the
4601 * 'depends' charset specified, as it means unicode when utf8 */
4602 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4606 RExC_flags = pm_flags;
4610 RExC_in_lookbehind = 0;
4611 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4612 RExC_seen_evals = 0;
4615 /* First pass: determine size, legality. */
4623 RExC_emit = &PL_regdummy;
4624 RExC_whilem_seen = 0;
4625 RExC_open_parens = NULL;
4626 RExC_close_parens = NULL;
4628 RExC_paren_names = NULL;
4630 RExC_paren_name_list = NULL;
4632 RExC_recurse = NULL;
4633 RExC_recurse_count = 0;
4635 #if 0 /* REGC() is (currently) a NOP at the first pass.
4636 * Clever compilers notice this and complain. --jhi */
4637 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4639 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4640 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4641 RExC_precomp = NULL;
4645 /* Here, finished first pass. Get rid of any added setjmp */
4651 PerlIO_printf(Perl_debug_log,
4652 "Required size %"IVdf" nodes\n"
4653 "Starting second pass (creation)\n",
4656 RExC_lastparse=NULL;
4659 /* The first pass could have found things that force Unicode semantics */
4660 if ((RExC_utf8 || RExC_uni_semantics)
4661 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4663 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4666 /* Small enough for pointer-storage convention?
4667 If extralen==0, this means that we will not need long jumps. */
4668 if (RExC_size >= 0x10000L && RExC_extralen)
4669 RExC_size += RExC_extralen;
4672 if (RExC_whilem_seen > 15)
4673 RExC_whilem_seen = 15;
4675 /* Allocate space and zero-initialize. Note, the two step process
4676 of zeroing when in debug mode, thus anything assigned has to
4677 happen after that */
4678 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4679 r = (struct regexp*)SvANY(rx);
4680 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4681 char, regexp_internal);
4682 if ( r == NULL || ri == NULL )
4683 FAIL("Regexp out of space");
4685 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4686 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4688 /* bulk initialize base fields with 0. */
4689 Zero(ri, sizeof(regexp_internal), char);
4692 /* non-zero initialization begins here */
4694 r->engine= RE_ENGINE_PTR;
4695 r->extflags = pm_flags;
4697 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4698 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4700 /* The caret is output if there are any defaults: if not all the STD
4701 * flags are set, or if no character set specifier is needed */
4703 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4705 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4706 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4707 >> RXf_PMf_STD_PMMOD_SHIFT);
4708 const char *fptr = STD_PAT_MODS; /*"msix"*/
4710 /* Allocate for the worst case, which is all the std flags are turned
4711 * on. If more precision is desired, we could do a population count of
4712 * the flags set. This could be done with a small lookup table, or by
4713 * shifting, masking and adding, or even, when available, assembly
4714 * language for a machine-language population count.
4715 * We never output a minus, as all those are defaults, so are
4716 * covered by the caret */
4717 const STRLEN wraplen = plen + has_p + has_runon
4718 + has_default /* If needs a caret */
4720 /* If needs a character set specifier */
4721 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4722 + (sizeof(STD_PAT_MODS) - 1)
4723 + (sizeof("(?:)") - 1);
4725 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4727 SvFLAGS(rx) |= SvUTF8(pattern);
4730 /* If a default, cover it using the caret */
4732 *p++= DEFAULT_PAT_MOD;
4736 const char* const name = get_regex_charset_name(r->extflags, &len);
4737 Copy(name, p, len, char);
4741 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4744 while((ch = *fptr++)) {
4752 Copy(RExC_precomp, p, plen, char);
4753 assert ((RX_WRAPPED(rx) - p) < 16);
4754 r->pre_prefix = p - RX_WRAPPED(rx);
4760 SvCUR_set(rx, p - SvPVX_const(rx));
4764 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4766 if (RExC_seen & REG_SEEN_RECURSE) {
4767 Newxz(RExC_open_parens, RExC_npar,regnode *);
4768 SAVEFREEPV(RExC_open_parens);
4769 Newxz(RExC_close_parens,RExC_npar,regnode *);
4770 SAVEFREEPV(RExC_close_parens);
4773 /* Useful during FAIL. */
4774 #ifdef RE_TRACK_PATTERN_OFFSETS
4775 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4776 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4777 "%s %"UVuf" bytes for offset annotations.\n",
4778 ri->u.offsets ? "Got" : "Couldn't get",
4779 (UV)((2*RExC_size+1) * sizeof(U32))));
4781 SetProgLen(ri,RExC_size);
4786 /* Second pass: emit code. */
4787 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4792 RExC_emit_start = ri->program;
4793 RExC_emit = ri->program;
4794 RExC_emit_bound = ri->program + RExC_size + 1;
4796 /* Store the count of eval-groups for security checks: */
4797 RExC_rx->seen_evals = RExC_seen_evals;
4798 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4799 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4803 /* XXXX To minimize changes to RE engine we always allocate
4804 3-units-long substrs field. */
4805 Newx(r->substrs, 1, struct reg_substr_data);
4806 if (RExC_recurse_count) {
4807 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4808 SAVEFREEPV(RExC_recurse);
4812 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4813 Zero(r->substrs, 1, struct reg_substr_data);
4815 #ifdef TRIE_STUDY_OPT
4817 StructCopy(&zero_scan_data, &data, scan_data_t);
4818 copyRExC_state = RExC_state;
4821 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4823 RExC_state = copyRExC_state;
4824 if (seen & REG_TOP_LEVEL_BRANCHES)
4825 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4827 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4828 if (data.last_found) {
4829 SvREFCNT_dec(data.longest_fixed);
4830 SvREFCNT_dec(data.longest_float);
4831 SvREFCNT_dec(data.last_found);
4833 StructCopy(&zero_scan_data, &data, scan_data_t);
4836 StructCopy(&zero_scan_data, &data, scan_data_t);
4839 /* Dig out information for optimizations. */
4840 r->extflags = RExC_flags; /* was pm_op */
4841 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4844 SvUTF8_on(rx); /* Unicode in it? */
4845 ri->regstclass = NULL;
4846 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4847 r->intflags |= PREGf_NAUGHTY;
4848 scan = ri->program + 1; /* First BRANCH. */
4850 /* testing for BRANCH here tells us whether there is "must appear"
4851 data in the pattern. If there is then we can use it for optimisations */
4852 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4854 STRLEN longest_float_length, longest_fixed_length;
4855 struct regnode_charclass_class ch_class; /* pointed to by data */
4857 I32 last_close = 0; /* pointed to by data */
4858 regnode *first= scan;
4859 regnode *first_next= regnext(first);
4861 * Skip introductions and multiplicators >= 1
4862 * so that we can extract the 'meat' of the pattern that must
4863 * match in the large if() sequence following.
4864 * NOTE that EXACT is NOT covered here, as it is normally
4865 * picked up by the optimiser separately.
4867 * This is unfortunate as the optimiser isnt handling lookahead
4868 * properly currently.
4871 while ((OP(first) == OPEN && (sawopen = 1)) ||
4872 /* An OR of *one* alternative - should not happen now. */
4873 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4874 /* for now we can't handle lookbehind IFMATCH*/
4875 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4876 (OP(first) == PLUS) ||
4877 (OP(first) == MINMOD) ||
4878 /* An {n,m} with n>0 */
4879 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4880 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4883 * the only op that could be a regnode is PLUS, all the rest
4884 * will be regnode_1 or regnode_2.
4887 if (OP(first) == PLUS)
4890 first += regarglen[OP(first)];
4892 first = NEXTOPER(first);
4893 first_next= regnext(first);
4896 /* Starting-point info. */
4898 DEBUG_PEEP("first:",first,0);
4899 /* Ignore EXACT as we deal with it later. */
4900 if (PL_regkind[OP(first)] == EXACT) {
4901 if (OP(first) == EXACT)
4902 NOOP; /* Empty, get anchored substr later. */
4904 ri->regstclass = first;
4907 else if (PL_regkind[OP(first)] == TRIE &&
4908 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4911 /* this can happen only on restudy */
4912 if ( OP(first) == TRIE ) {
4913 struct regnode_1 *trieop = (struct regnode_1 *)
4914 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4915 StructCopy(first,trieop,struct regnode_1);
4916 trie_op=(regnode *)trieop;
4918 struct regnode_charclass *trieop = (struct regnode_charclass *)
4919 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4920 StructCopy(first,trieop,struct regnode_charclass);
4921 trie_op=(regnode *)trieop;
4924 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4925 ri->regstclass = trie_op;
4928 else if (REGNODE_SIMPLE(OP(first)))
4929 ri->regstclass = first;
4930 else if (PL_regkind[OP(first)] == BOUND ||
4931 PL_regkind[OP(first)] == NBOUND)
4932 ri->regstclass = first;
4933 else if (PL_regkind[OP(first)] == BOL) {
4934 r->extflags |= (OP(first) == MBOL
4936 : (OP(first) == SBOL
4939 first = NEXTOPER(first);
4942 else if (OP(first) == GPOS) {
4943 r->extflags |= RXf_ANCH_GPOS;
4944 first = NEXTOPER(first);
4947 else if ((!sawopen || !RExC_sawback) &&
4948 (OP(first) == STAR &&
4949 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4950 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4952 /* turn .* into ^.* with an implied $*=1 */
4954 (OP(NEXTOPER(first)) == REG_ANY)
4957 r->extflags |= type;
4958 r->intflags |= PREGf_IMPLICIT;
4959 first = NEXTOPER(first);
4962 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4963 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4964 /* x+ must match at the 1st pos of run of x's */
4965 r->intflags |= PREGf_SKIP;
4967 /* Scan is after the zeroth branch, first is atomic matcher. */
4968 #ifdef TRIE_STUDY_OPT
4971 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4972 (IV)(first - scan + 1))
4976 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4977 (IV)(first - scan + 1))
4983 * If there's something expensive in the r.e., find the
4984 * longest literal string that must appear and make it the
4985 * regmust. Resolve ties in favor of later strings, since
4986 * the regstart check works with the beginning of the r.e.
4987 * and avoiding duplication strengthens checking. Not a
4988 * strong reason, but sufficient in the absence of others.
4989 * [Now we resolve ties in favor of the earlier string if
4990 * it happens that c_offset_min has been invalidated, since the
4991 * earlier string may buy us something the later one won't.]
4994 data.longest_fixed = newSVpvs("");
4995 data.longest_float = newSVpvs("");
4996 data.last_found = newSVpvs("");
4997 data.longest = &(data.longest_fixed);
4999 if (!ri->regstclass) {
5000 cl_init(pRExC_state, &ch_class);
5001 data.start_class = &ch_class;
5002 stclass_flag = SCF_DO_STCLASS_AND;
5003 } else /* XXXX Check for BOUND? */
5005 data.last_closep = &last_close;
5007 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5008 &data, -1, NULL, NULL,
5009 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5015 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5016 && data.last_start_min == 0 && data.last_end > 0
5017 && !RExC_seen_zerolen
5018 && !(RExC_seen & REG_SEEN_VERBARG)
5019 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5020 r->extflags |= RXf_CHECK_ALL;
5021 scan_commit(pRExC_state, &data,&minlen,0);
5022 SvREFCNT_dec(data.last_found);
5024 /* Note that code very similar to this but for anchored string
5025 follows immediately below, changes may need to be made to both.
5028 longest_float_length = CHR_SVLEN(data.longest_float);
5029 if (longest_float_length
5030 || (data.flags & SF_FL_BEFORE_EOL
5031 && (!(data.flags & SF_FL_BEFORE_MEOL)
5032 || (RExC_flags & RXf_PMf_MULTILINE))))
5036 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
5037 && data.offset_fixed == data.offset_float_min
5038 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
5039 goto remove_float; /* As in (a)+. */
5041 /* copy the information about the longest float from the reg_scan_data
5042 over to the program. */
5043 if (SvUTF8(data.longest_float)) {
5044 r->float_utf8 = data.longest_float;
5045 r->float_substr = NULL;
5047 r->float_substr = data.longest_float;
5048 r->float_utf8 = NULL;
5050 /* float_end_shift is how many chars that must be matched that
5051 follow this item. We calculate it ahead of time as once the
5052 lookbehind offset is added in we lose the ability to correctly
5054 ml = data.minlen_float ? *(data.minlen_float)
5055 : (I32)longest_float_length;
5056 r->float_end_shift = ml - data.offset_float_min
5057 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5058 + data.lookbehind_float;
5059 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5060 r->float_max_offset = data.offset_float_max;
5061 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5062 r->float_max_offset -= data.lookbehind_float;
5064 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5065 && (!(data.flags & SF_FL_BEFORE_MEOL)
5066 || (RExC_flags & RXf_PMf_MULTILINE)));
5067 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5071 r->float_substr = r->float_utf8 = NULL;
5072 SvREFCNT_dec(data.longest_float);
5073 longest_float_length = 0;
5076 /* Note that code very similar to this but for floating string
5077 is immediately above, changes may need to be made to both.
5080 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5081 if (longest_fixed_length
5082 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5083 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5084 || (RExC_flags & RXf_PMf_MULTILINE))))
5088 /* copy the information about the longest fixed
5089 from the reg_scan_data over to the program. */
5090 if (SvUTF8(data.longest_fixed)) {
5091 r->anchored_utf8 = data.longest_fixed;
5092 r->anchored_substr = NULL;
5094 r->anchored_substr = data.longest_fixed;
5095 r->anchored_utf8 = NULL;
5097 /* fixed_end_shift is how many chars that must be matched that
5098 follow this item. We calculate it ahead of time as once the
5099 lookbehind offset is added in we lose the ability to correctly
5101 ml = data.minlen_fixed ? *(data.minlen_fixed)
5102 : (I32)longest_fixed_length;
5103 r->anchored_end_shift = ml - data.offset_fixed
5104 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5105 + data.lookbehind_fixed;
5106 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5108 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5109 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5110 || (RExC_flags & RXf_PMf_MULTILINE)));
5111 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5114 r->anchored_substr = r->anchored_utf8 = NULL;
5115 SvREFCNT_dec(data.longest_fixed);
5116 longest_fixed_length = 0;
5119 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5120 ri->regstclass = NULL;
5122 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5124 && !(data.start_class->flags & ANYOF_EOS)
5125 && !cl_is_anything(data.start_class))
5127 const U32 n = add_data(pRExC_state, 1, "f");
5128 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5130 Newx(RExC_rxi->data->data[n], 1,
5131 struct regnode_charclass_class);
5132 StructCopy(data.start_class,
5133 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5134 struct regnode_charclass_class);
5135 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5136 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5137 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5138 regprop(r, sv, (regnode*)data.start_class);
5139 PerlIO_printf(Perl_debug_log,
5140 "synthetic stclass \"%s\".\n",
5141 SvPVX_const(sv));});
5144 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5145 if (longest_fixed_length > longest_float_length) {
5146 r->check_end_shift = r->anchored_end_shift;
5147 r->check_substr = r->anchored_substr;
5148 r->check_utf8 = r->anchored_utf8;
5149 r->check_offset_min = r->check_offset_max = r->anchored_offset;
5150 if (r->extflags & RXf_ANCH_SINGLE)
5151 r->extflags |= RXf_NOSCAN;
5154 r->check_end_shift = r->float_end_shift;
5155 r->check_substr = r->float_substr;
5156 r->check_utf8 = r->float_utf8;
5157 r->check_offset_min = r->float_min_offset;
5158 r->check_offset_max = r->float_max_offset;
5160 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5161 This should be changed ASAP! */
5162 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5163 r->extflags |= RXf_USE_INTUIT;
5164 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5165 r->extflags |= RXf_INTUIT_TAIL;
5167 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5168 if ( (STRLEN)minlen < longest_float_length )
5169 minlen= longest_float_length;
5170 if ( (STRLEN)minlen < longest_fixed_length )
5171 minlen= longest_fixed_length;
5175 /* Several toplevels. Best we can is to set minlen. */
5177 struct regnode_charclass_class ch_class;
5180 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5182 scan = ri->program + 1;
5183 cl_init(pRExC_state, &ch_class);
5184 data.start_class = &ch_class;
5185 data.last_closep = &last_close;
5188 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5189 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5193 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5194 = r->float_substr = r->float_utf8 = NULL;
5196 if (!(data.start_class->flags & ANYOF_EOS)
5197 && !cl_is_anything(data.start_class))
5199 const U32 n = add_data(pRExC_state, 1, "f");
5200 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5202 Newx(RExC_rxi->data->data[n], 1,
5203 struct regnode_charclass_class);
5204 StructCopy(data.start_class,
5205 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5206 struct regnode_charclass_class);
5207 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5208 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5209 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5210 regprop(r, sv, (regnode*)data.start_class);
5211 PerlIO_printf(Perl_debug_log,
5212 "synthetic stclass \"%s\".\n",
5213 SvPVX_const(sv));});
5217 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5218 the "real" pattern. */
5220 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5221 (IV)minlen, (IV)r->minlen);
5223 r->minlenret = minlen;
5224 if (r->minlen < minlen)
5227 if (RExC_seen & REG_SEEN_GPOS)
5228 r->extflags |= RXf_GPOS_SEEN;
5229 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5230 r->extflags |= RXf_LOOKBEHIND_SEEN;
5231 if (RExC_seen & REG_SEEN_EVAL)
5232 r->extflags |= RXf_EVAL_SEEN;
5233 if (RExC_seen & REG_SEEN_CANY)
5234 r->extflags |= RXf_CANY_SEEN;
5235 if (RExC_seen & REG_SEEN_VERBARG)
5236 r->intflags |= PREGf_VERBARG_SEEN;
5237 if (RExC_seen & REG_SEEN_CUTGROUP)
5238 r->intflags |= PREGf_CUTGROUP_SEEN;
5239 if (RExC_paren_names)
5240 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5242 RXp_PAREN_NAMES(r) = NULL;
5244 #ifdef STUPID_PATTERN_CHECKS
5245 if (RX_PRELEN(rx) == 0)
5246 r->extflags |= RXf_NULL;
5247 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5248 /* XXX: this should happen BEFORE we compile */
5249 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5250 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5251 r->extflags |= RXf_WHITE;
5252 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5253 r->extflags |= RXf_START_ONLY;
5255 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5256 /* XXX: this should happen BEFORE we compile */
5257 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5259 regnode *first = ri->program + 1;
5262 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5263 r->extflags |= RXf_NULL;
5264 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5265 r->extflags |= RXf_START_ONLY;
5266 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5267 && OP(regnext(first)) == END)
5268 r->extflags |= RXf_WHITE;
5272 if (RExC_paren_names) {
5273 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5274 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5277 ri->name_list_idx = 0;
5279 if (RExC_recurse_count) {
5280 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5281 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5282 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5285 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5286 /* assume we don't need to swap parens around before we match */
5289 PerlIO_printf(Perl_debug_log,"Final program:\n");
5292 #ifdef RE_TRACK_PATTERN_OFFSETS
5293 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5294 const U32 len = ri->u.offsets[0];
5296 GET_RE_DEBUG_FLAGS_DECL;
5297 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5298 for (i = 1; i <= len; i++) {
5299 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5300 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5301 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5303 PerlIO_printf(Perl_debug_log, "\n");
5309 #undef RE_ENGINE_PTR
5313 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5316 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5318 PERL_UNUSED_ARG(value);
5320 if (flags & RXapif_FETCH) {
5321 return reg_named_buff_fetch(rx, key, flags);
5322 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5323 Perl_croak_no_modify(aTHX);
5325 } else if (flags & RXapif_EXISTS) {
5326 return reg_named_buff_exists(rx, key, flags)
5329 } else if (flags & RXapif_REGNAMES) {
5330 return reg_named_buff_all(rx, flags);
5331 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5332 return reg_named_buff_scalar(rx, flags);
5334 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5340 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5343 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5344 PERL_UNUSED_ARG(lastkey);
5346 if (flags & RXapif_FIRSTKEY)
5347 return reg_named_buff_firstkey(rx, flags);
5348 else if (flags & RXapif_NEXTKEY)
5349 return reg_named_buff_nextkey(rx, flags);
5351 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5357 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5360 AV *retarray = NULL;
5362 struct regexp *const rx = (struct regexp *)SvANY(r);
5364 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5366 if (flags & RXapif_ALL)
5369 if (rx && RXp_PAREN_NAMES(rx)) {
5370 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5373 SV* sv_dat=HeVAL(he_str);
5374 I32 *nums=(I32*)SvPVX(sv_dat);
5375 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5376 if ((I32)(rx->nparens) >= nums[i]
5377 && rx->offs[nums[i]].start != -1
5378 && rx->offs[nums[i]].end != -1)
5381 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5385 ret = newSVsv(&PL_sv_undef);
5388 av_push(retarray, ret);
5391 return newRV_noinc(MUTABLE_SV(retarray));
5398 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5401 struct regexp *const rx = (struct regexp *)SvANY(r);
5403 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5405 if (rx && RXp_PAREN_NAMES(rx)) {
5406 if (flags & RXapif_ALL) {
5407 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5409 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5423 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5425 struct regexp *const rx = (struct regexp *)SvANY(r);
5427 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5429 if ( rx && RXp_PAREN_NAMES(rx) ) {
5430 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5432 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5439 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5441 struct regexp *const rx = (struct regexp *)SvANY(r);
5442 GET_RE_DEBUG_FLAGS_DECL;
5444 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5446 if (rx && RXp_PAREN_NAMES(rx)) {
5447 HV *hv = RXp_PAREN_NAMES(rx);
5449 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5452 SV* sv_dat = HeVAL(temphe);
5453 I32 *nums = (I32*)SvPVX(sv_dat);
5454 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5455 if ((I32)(rx->lastparen) >= nums[i] &&
5456 rx->offs[nums[i]].start != -1 &&
5457 rx->offs[nums[i]].end != -1)
5463 if (parno || flags & RXapif_ALL) {
5464 return newSVhek(HeKEY_hek(temphe));
5472 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5477 struct regexp *const rx = (struct regexp *)SvANY(r);
5479 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5481 if (rx && RXp_PAREN_NAMES(rx)) {
5482 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5483 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5484 } else if (flags & RXapif_ONE) {
5485 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5486 av = MUTABLE_AV(SvRV(ret));
5487 length = av_len(av);
5489 return newSViv(length + 1);
5491 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5495 return &PL_sv_undef;
5499 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5501 struct regexp *const rx = (struct regexp *)SvANY(r);
5504 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5506 if (rx && RXp_PAREN_NAMES(rx)) {
5507 HV *hv= RXp_PAREN_NAMES(rx);
5509 (void)hv_iterinit(hv);
5510 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5513 SV* sv_dat = HeVAL(temphe);
5514 I32 *nums = (I32*)SvPVX(sv_dat);
5515 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5516 if ((I32)(rx->lastparen) >= nums[i] &&
5517 rx->offs[nums[i]].start != -1 &&
5518 rx->offs[nums[i]].end != -1)
5524 if (parno || flags & RXapif_ALL) {
5525 av_push(av, newSVhek(HeKEY_hek(temphe)));
5530 return newRV_noinc(MUTABLE_SV(av));
5534 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5537 struct regexp *const rx = (struct regexp *)SvANY(r);
5542 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5545 sv_setsv(sv,&PL_sv_undef);
5549 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5551 i = rx->offs[0].start;
5555 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5557 s = rx->subbeg + rx->offs[0].end;
5558 i = rx->sublen - rx->offs[0].end;
5561 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5562 (s1 = rx->offs[paren].start) != -1 &&
5563 (t1 = rx->offs[paren].end) != -1)
5567 s = rx->subbeg + s1;
5569 sv_setsv(sv,&PL_sv_undef);
5572 assert(rx->sublen >= (s - rx->subbeg) + i );
5574 const int oldtainted = PL_tainted;
5576 sv_setpvn(sv, s, i);
5577 PL_tainted = oldtainted;
5578 if ( (rx->extflags & RXf_CANY_SEEN)
5579 ? (RXp_MATCH_UTF8(rx)
5580 && (!i || is_utf8_string((U8*)s, i)))
5581 : (RXp_MATCH_UTF8(rx)) )
5588 if (RXp_MATCH_TAINTED(rx)) {
5589 if (SvTYPE(sv) >= SVt_PVMG) {
5590 MAGIC* const mg = SvMAGIC(sv);
5593 SvMAGIC_set(sv, mg->mg_moremagic);
5595 if ((mgt = SvMAGIC(sv))) {
5596 mg->mg_moremagic = mgt;
5597 SvMAGIC_set(sv, mg);
5607 sv_setsv(sv,&PL_sv_undef);
5613 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5614 SV const * const value)
5616 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5618 PERL_UNUSED_ARG(rx);
5619 PERL_UNUSED_ARG(paren);
5620 PERL_UNUSED_ARG(value);
5623 Perl_croak_no_modify(aTHX);
5627 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5630 struct regexp *const rx = (struct regexp *)SvANY(r);
5634 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5636 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5638 /* $` / ${^PREMATCH} */
5639 case RX_BUFF_IDX_PREMATCH:
5640 if (rx->offs[0].start != -1) {
5641 i = rx->offs[0].start;
5649 /* $' / ${^POSTMATCH} */
5650 case RX_BUFF_IDX_POSTMATCH:
5651 if (rx->offs[0].end != -1) {
5652 i = rx->sublen - rx->offs[0].end;
5654 s1 = rx->offs[0].end;
5660 /* $& / ${^MATCH}, $1, $2, ... */
5662 if (paren <= (I32)rx->nparens &&
5663 (s1 = rx->offs[paren].start) != -1 &&
5664 (t1 = rx->offs[paren].end) != -1)
5669 if (ckWARN(WARN_UNINITIALIZED))
5670 report_uninit((const SV *)sv);
5675 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5676 const char * const s = rx->subbeg + s1;
5681 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5688 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5690 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5691 PERL_UNUSED_ARG(rx);
5695 return newSVpvs("Regexp");
5698 /* Scans the name of a named buffer from the pattern.
5699 * If flags is REG_RSN_RETURN_NULL returns null.
5700 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5701 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5702 * to the parsed name as looked up in the RExC_paren_names hash.
5703 * If there is an error throws a vFAIL().. type exception.
5706 #define REG_RSN_RETURN_NULL 0
5707 #define REG_RSN_RETURN_NAME 1
5708 #define REG_RSN_RETURN_DATA 2
5711 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5713 char *name_start = RExC_parse;
5715 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5717 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5718 /* skip IDFIRST by using do...while */
5721 RExC_parse += UTF8SKIP(RExC_parse);
5722 } while (isALNUM_utf8((U8*)RExC_parse));
5726 } while (isALNUM(*RExC_parse));
5731 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5732 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5733 if ( flags == REG_RSN_RETURN_NAME)
5735 else if (flags==REG_RSN_RETURN_DATA) {
5738 if ( ! sv_name ) /* should not happen*/
5739 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5740 if (RExC_paren_names)
5741 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5743 sv_dat = HeVAL(he_str);
5745 vFAIL("Reference to nonexistent named group");
5749 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5756 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5757 int rem=(int)(RExC_end - RExC_parse); \
5766 if (RExC_lastparse!=RExC_parse) \
5767 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5770 iscut ? "..." : "<" \
5773 PerlIO_printf(Perl_debug_log,"%16s",""); \
5776 num = RExC_size + 1; \
5778 num=REG_NODE_NUM(RExC_emit); \
5779 if (RExC_lastnum!=num) \
5780 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5782 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5783 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5784 (int)((depth*2)), "", \
5788 RExC_lastparse=RExC_parse; \
5793 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5794 DEBUG_PARSE_MSG((funcname)); \
5795 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5797 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5798 DEBUG_PARSE_MSG((funcname)); \
5799 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5802 /* This section of code defines the inversion list object and its methods. The
5803 * interfaces are highly subject to change, so as much as possible is static to
5804 * this file. An inversion list is here implemented as a malloc'd C array with
5805 * some added info. More will be coming when functionality is added later.
5807 * Some of the methods should always be private to the implementation, and some
5808 * should eventually be made public */
5810 #define INVLIST_INITIAL_LEN 10
5811 #define INVLIST_ARRAY_KEY "array"
5812 #define INVLIST_MAX_KEY "max"
5813 #define INVLIST_LEN_KEY "len"
5815 PERL_STATIC_INLINE UV*
5816 S_invlist_array(pTHX_ HV* const invlist)
5818 /* Returns the pointer to the inversion list's array. Every time the
5819 * length changes, this needs to be called in case malloc or realloc moved
5822 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5824 PERL_ARGS_ASSERT_INVLIST_ARRAY;
5826 if (list_ptr == NULL) {
5827 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5831 return INT2PTR(UV *, SvUV(*list_ptr));
5834 PERL_STATIC_INLINE void
5835 S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5837 PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5839 /* Sets the array stored in the inversion list to the memory beginning with
5842 if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5843 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5848 PERL_STATIC_INLINE UV
5849 S_invlist_len(pTHX_ HV* const invlist)
5851 /* Returns the current number of elements in the inversion list's array */
5853 SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5855 PERL_ARGS_ASSERT_INVLIST_LEN;
5857 if (len_ptr == NULL) {
5858 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5862 return SvUV(*len_ptr);
5865 PERL_STATIC_INLINE UV
5866 S_invlist_max(pTHX_ HV* const invlist)
5868 /* Returns the maximum number of elements storable in the inversion list's
5869 * array, without having to realloc() */
5871 SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5873 PERL_ARGS_ASSERT_INVLIST_MAX;
5875 if (max_ptr == NULL) {
5876 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5880 return SvUV(*max_ptr);
5883 PERL_STATIC_INLINE void
5884 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5886 /* Sets the current number of elements stored in the inversion list */
5888 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5890 if (len != 0 && len > invlist_max(invlist)) {
5891 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5894 if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5895 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5900 PERL_STATIC_INLINE void
5901 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5904 /* Sets the maximum number of elements storable in the inversion list
5905 * without having to realloc() */
5907 PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5909 if (max < invlist_len(invlist)) {
5910 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5913 if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5914 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5919 #ifndef PERL_IN_XSUB_RE
5921 Perl__new_invlist(pTHX_ IV initial_size)
5924 /* Return a pointer to a newly constructed inversion list, with enough
5925 * space to store 'initial_size' elements. If that number is negative, a
5926 * system default is used instead */
5928 HV* invlist = newHV();
5931 if (initial_size < 0) {
5932 initial_size = INVLIST_INITIAL_LEN;
5935 /* Allocate the initial space */
5936 Newx(list, initial_size, UV);
5937 invlist_set_array(invlist, list);
5939 /* set_len has to come before set_max, as the latter inspects the len */
5940 invlist_set_len(invlist, 0);
5941 invlist_set_max(invlist, initial_size);
5947 PERL_STATIC_INLINE void
5948 S_invlist_destroy(pTHX_ HV* const invlist)
5950 /* Inversion list destructor */
5952 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5954 PERL_ARGS_ASSERT_INVLIST_DESTROY;
5956 if (list_ptr != NULL) {
5957 UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5963 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5965 /* Change the maximum size of an inversion list (up or down) */
5969 const UV old_max = invlist_max(invlist);
5971 PERL_ARGS_ASSERT_INVLIST_EXTEND;
5973 if (old_max == new_max) { /* If a no-op */
5977 array = orig_array = invlist_array(invlist);
5978 Renew(array, new_max, UV);
5980 /* If the size change moved the list in memory, set the new one */
5981 if (array != orig_array) {
5982 invlist_set_array(invlist, array);
5985 invlist_set_max(invlist, new_max);
5989 PERL_STATIC_INLINE void
5990 S_invlist_trim(pTHX_ HV* const invlist)
5992 PERL_ARGS_ASSERT_INVLIST_TRIM;
5994 /* Change the length of the inversion list to how many entries it currently
5997 invlist_extend(invlist, invlist_len(invlist));
6000 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6003 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
6005 #ifndef PERL_IN_XSUB_RE
6007 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
6009 /* Subject to change or removal. Append the range from 'start' to 'end' at
6010 * the end of the inversion list. The range must be above any existing
6013 UV* array = invlist_array(invlist);
6014 UV max = invlist_max(invlist);
6015 UV len = invlist_len(invlist);
6017 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6021 /* Here, the existing list is non-empty. The current max entry in the
6022 * list is generally the first value not in the set, except when the
6023 * set extends to the end of permissible values, in which case it is
6024 * the first entry in that final set, and so this call is an attempt to
6025 * append out-of-order */
6027 UV final_element = len - 1;
6028 if (array[final_element] > start
6029 || ELEMENT_IN_INVLIST_SET(final_element))
6031 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
6034 /* Here, it is a legal append. If the new range begins with the first
6035 * value not in the set, it is extending the set, so the new first
6036 * value not in the set is one greater than the newly extended range.
6038 if (array[final_element] == start) {
6039 if (end != UV_MAX) {
6040 array[final_element] = end + 1;
6043 /* But if the end is the maximum representable on the machine,
6044 * just let the range that this would extend have no end */
6045 invlist_set_len(invlist, len - 1);
6051 /* Here the new range doesn't extend any existing set. Add it */
6053 len += 2; /* Includes an element each for the start and end of range */
6055 /* If overflows the existing space, extend, which may cause the array to be
6058 invlist_extend(invlist, len);
6059 array = invlist_array(invlist);
6062 invlist_set_len(invlist, len);
6064 /* The next item on the list starts the range, the one after that is
6065 * one past the new range. */
6066 array[len - 2] = start;
6067 if (end != UV_MAX) {
6068 array[len - 1] = end + 1;
6071 /* But if the end is the maximum representable on the machine, just let
6072 * the range have no end */
6073 invlist_set_len(invlist, len - 1);
6079 S_invlist_union(pTHX_ HV* const a, HV* const b)
6081 /* Return a new inversion list which is the union of two inversion lists.
6082 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6083 * Richard Gillam, published by Addison-Wesley, and explained at some
6084 * length there. The preface says to incorporate its examples into your
6085 * code at your own risk.
6087 * The algorithm is like a merge sort.
6089 * XXX A potential performance improvement is to keep track as we go along
6090 * if only one of the inputs contributes to the result, meaning the other
6091 * is a subset of that one. In that case, we can skip the final copy and
6092 * return the larger of the input lists */
6094 UV* array_a = invlist_array(a); /* a's array */
6095 UV* array_b = invlist_array(b);
6096 UV len_a = invlist_len(a); /* length of a's array */
6097 UV len_b = invlist_len(b);
6099 HV* u; /* the resulting union */
6103 UV i_a = 0; /* current index into a's array */
6107 /* running count, as explained in the algorithm source book; items are
6108 * stopped accumulating and are output when the count changes to/from 0.
6109 * The count is incremented when we start a range that's in the set, and
6110 * decremented when we start a range that's not in the set. So its range
6111 * is 0 to 2. Only when the count is zero is something not in the set.
6115 PERL_ARGS_ASSERT_INVLIST_UNION;
6117 /* Size the union for the worst case: that the sets are completely
6119 u = _new_invlist(len_a + len_b);
6120 array_u = invlist_array(u);
6122 /* Go through each list item by item, stopping when exhausted one of
6124 while (i_a < len_a && i_b < len_b) {
6125 UV cp; /* The element to potentially add to the union's array */
6126 bool cp_in_set; /* is it in the the input list's set or not */
6128 /* We need to take one or the other of the two inputs for the union.
6129 * Since we are merging two sorted lists, we take the smaller of the
6130 * next items. In case of a tie, we take the one that is in its set
6131 * first. If we took one not in the set first, it would decrement the
6132 * count, possibly to 0 which would cause it to be output as ending the
6133 * range, and the next time through we would take the same number, and
6134 * output it again as beginning the next range. By doing it the
6135 * opposite way, there is no possibility that the count will be
6136 * momentarily decremented to 0, and thus the two adjoining ranges will
6137 * be seamlessly merged. (In a tie and both are in the set or both not
6138 * in the set, it doesn't matter which we take first.) */
6139 if (array_a[i_a] < array_b[i_b]
6140 || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6142 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6146 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6150 /* Here, have chosen which of the two inputs to look at. Only output
6151 * if the running count changes to/from 0, which marks the
6152 * beginning/end of a range in that's in the set */
6155 array_u[i_u++] = cp;
6162 array_u[i_u++] = cp;
6167 /* Here, we are finished going through at least one of the lists, which
6168 * means there is something remaining in at most one. We check if the list
6169 * that hasn't been exhausted is positioned such that we are in the middle
6170 * of a range in its set or not. (We are in the set if the next item in
6171 * the array marks the beginning of something not in the set) If in the
6172 * set, we decrement 'count'; if 0, there is potentially more to output.
6173 * There are four cases:
6174 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6175 * in the union is entirely from the non-exhausted set.
6176 * 2) Both were in their sets, count is 2. Nothing further should
6177 * be output, as everything that remains will be in the exhausted
6178 * list's set, hence in the union; decrementing to 1 but not 0 insures
6180 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6181 * Nothing further should be output because the union includes
6182 * everything from the exhausted set. Not decrementing insures that.
6183 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6184 * decrementing to 0 insures that we look at the remainder of the
6185 * non-exhausted set */
6186 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6187 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6192 /* The final length is what we've output so far, plus what else is about to
6193 * be output. (If 'count' is non-zero, then the input list we exhausted
6194 * has everything remaining up to the machine's limit in its set, and hence
6195 * in the union, so there will be no further output. */
6198 /* At most one of the subexpressions will be non-zero */
6199 len_u += (len_a - i_a) + (len_b - i_b);
6202 /* Set result to final length, which can change the pointer to array_u, so
6204 if (len_u != invlist_len(u)) {
6205 invlist_set_len(u, len_u);
6207 array_u = invlist_array(u);
6210 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6211 * the other) ended with everything above it not in its set. That means
6212 * that the remaining part of the union is precisely the same as the
6213 * non-exhausted list, so can just copy it unchanged. (If both list were
6214 * exhausted at the same time, then the operations below will be both 0.)
6217 IV copy_count; /* At most one will have a non-zero copy count */
6218 if ((copy_count = len_a - i_a) > 0) {
6219 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6221 else if ((copy_count = len_b - i_b) > 0) {
6222 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6230 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6232 /* Return the intersection of two inversion lists. The basis for this
6233 * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6234 * by Addison-Wesley, and explained at some length there. The preface says
6235 * to incorporate its examples into your code at your own risk.
6237 * The algorithm is like a merge sort, and is essentially the same as the
6241 UV* array_a = invlist_array(a); /* a's array */
6242 UV* array_b = invlist_array(b);
6243 UV len_a = invlist_len(a); /* length of a's array */
6244 UV len_b = invlist_len(b);
6246 HV* r; /* the resulting intersection */
6250 UV i_a = 0; /* current index into a's array */
6254 /* running count, as explained in the algorithm source book; items are
6255 * stopped accumulating and are output when the count changes to/from 2.
6256 * The count is incremented when we start a range that's in the set, and
6257 * decremented when we start a range that's not in the set. So its range
6258 * is 0 to 2. Only when the count is 2 is something in the intersection.
6262 PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6264 /* Size the intersection for the worst case: that the intersection ends up
6265 * fragmenting everything to be completely disjoint */
6266 r= _new_invlist(len_a + len_b);
6267 array_r = invlist_array(r);
6269 /* Go through each list item by item, stopping when exhausted one of
6271 while (i_a < len_a && i_b < len_b) {
6272 UV cp; /* The element to potentially add to the intersection's
6274 bool cp_in_set; /* Is it in the input list's set or not */
6276 /* We need to take one or the other of the two inputs for the union.
6277 * Since we are merging two sorted lists, we take the smaller of the
6278 * next items. In case of a tie, we take the one that is not in its
6279 * set first (a difference from the union algorithm). If we took one
6280 * in the set first, it would increment the count, possibly to 2 which
6281 * would cause it to be output as starting a range in the intersection,
6282 * and the next time through we would take that same number, and output
6283 * it again as ending the set. By doing it the opposite of this, we
6284 * there is no possibility that the count will be momentarily
6285 * incremented to 2. (In a tie and both are in the set or both not in
6286 * the set, it doesn't matter which we take first.) */
6287 if (array_a[i_a] < array_b[i_b]
6288 || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6290 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6294 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6298 /* Here, have chosen which of the two inputs to look at. Only output
6299 * if the running count changes to/from 2, which marks the
6300 * beginning/end of a range that's in the intersection */
6304 array_r[i_r++] = cp;
6309 array_r[i_r++] = cp;
6315 /* Here, we are finished going through at least one of the sets, which
6316 * means there is something remaining in at most one. See the comments in
6318 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6319 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6324 /* The final length is what we've output so far plus what else is in the
6325 * intersection. Only one of the subexpressions below will be non-zero */
6328 len_r += (len_a - i_a) + (len_b - i_b);
6331 /* Set result to final length, which can change the pointer to array_r, so
6333 if (len_r != invlist_len(r)) {
6334 invlist_set_len(r, len_r);
6336 array_r = invlist_array(r);
6339 /* Finish outputting any remaining */
6340 if (count == 2) { /* Only one of will have a non-zero copy count */
6342 if ((copy_count = len_a - i_a) > 0) {
6343 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6345 else if ((copy_count = len_b - i_b) > 0) {
6346 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6354 S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
6356 /* Add the range from 'start' to 'end' inclusive to the inversion list's
6357 * set. A pointer to the inversion list is returned. This may actually be
6358 * a new list, in which case the passed in one has been destroyed. The
6359 * passed in inversion list can be NULL, in which case a new one is created
6360 * with just the one range in it */
6366 if (invlist == NULL) {
6367 invlist = _new_invlist(2);
6371 len = invlist_len(invlist);
6374 /* If comes after the final entry, can just append it to the end */
6376 || start >= invlist_array(invlist)
6377 [invlist_len(invlist) - 1])
6379 _append_range_to_invlist(invlist, start, end);
6383 /* Here, can't just append things, create and return a new inversion list
6384 * which is the union of this range and the existing inversion list */
6385 range_invlist = _new_invlist(2);
6386 _append_range_to_invlist(range_invlist, start, end);
6388 added_invlist = invlist_union(invlist, range_invlist);
6390 /* The passed in list can be freed, as well as our temporary */
6391 invlist_destroy(range_invlist);
6392 if (invlist != added_invlist) {
6393 invlist_destroy(invlist);
6396 return added_invlist;
6399 PERL_STATIC_INLINE HV*
6400 S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6401 return add_range_to_invlist(invlist, cp, cp);
6404 /* End of inversion list object */
6407 - reg - regular expression, i.e. main body or parenthesized thing
6409 * Caller must absorb opening parenthesis.
6411 * Combining parenthesis handling with the base level of regular expression
6412 * is a trifle forced, but the need to tie the tails of the branches to what
6413 * follows makes it hard to avoid.
6415 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6417 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6419 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6423 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6424 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6427 register regnode *ret; /* Will be the head of the group. */
6428 register regnode *br;
6429 register regnode *lastbr;
6430 register regnode *ender = NULL;
6431 register I32 parno = 0;
6433 U32 oregflags = RExC_flags;
6434 bool have_branch = 0;
6436 I32 freeze_paren = 0;
6437 I32 after_freeze = 0;
6439 /* for (?g), (?gc), and (?o) warnings; warning
6440 about (?c) will warn about (?g) -- japhy */
6442 #define WASTED_O 0x01
6443 #define WASTED_G 0x02
6444 #define WASTED_C 0x04
6445 #define WASTED_GC (0x02|0x04)
6446 I32 wastedflags = 0x00;
6448 char * parse_start = RExC_parse; /* MJD */
6449 char * const oregcomp_parse = RExC_parse;
6451 GET_RE_DEBUG_FLAGS_DECL;
6453 PERL_ARGS_ASSERT_REG;
6454 DEBUG_PARSE("reg ");
6456 *flagp = 0; /* Tentatively. */
6459 /* Make an OPEN node, if parenthesized. */
6461 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6462 char *start_verb = RExC_parse;
6463 STRLEN verb_len = 0;
6464 char *start_arg = NULL;
6465 unsigned char op = 0;
6467 int internal_argval = 0; /* internal_argval is only useful if !argok */
6468 while ( *RExC_parse && *RExC_parse != ')' ) {
6469 if ( *RExC_parse == ':' ) {
6470 start_arg = RExC_parse + 1;
6476 verb_len = RExC_parse - start_verb;
6479 while ( *RExC_parse && *RExC_parse != ')' )
6481 if ( *RExC_parse != ')' )
6482 vFAIL("Unterminated verb pattern argument");
6483 if ( RExC_parse == start_arg )
6486 if ( *RExC_parse != ')' )
6487 vFAIL("Unterminated verb pattern");
6490 switch ( *start_verb ) {
6491 case 'A': /* (*ACCEPT) */
6492 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6494 internal_argval = RExC_nestroot;
6497 case 'C': /* (*COMMIT) */
6498 if ( memEQs(start_verb,verb_len,"COMMIT") )
6501 case 'F': /* (*FAIL) */
6502 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6507 case ':': /* (*:NAME) */
6508 case 'M': /* (*MARK:NAME) */
6509 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6514 case 'P': /* (*PRUNE) */
6515 if ( memEQs(start_verb,verb_len,"PRUNE") )
6518 case 'S': /* (*SKIP) */
6519 if ( memEQs(start_verb,verb_len,"SKIP") )
6522 case 'T': /* (*THEN) */
6523 /* [19:06] <TimToady> :: is then */
6524 if ( memEQs(start_verb,verb_len,"THEN") ) {
6526 RExC_seen |= REG_SEEN_CUTGROUP;
6532 vFAIL3("Unknown verb pattern '%.*s'",
6533 verb_len, start_verb);
6536 if ( start_arg && internal_argval ) {
6537 vFAIL3("Verb pattern '%.*s' may not have an argument",
6538 verb_len, start_verb);
6539 } else if ( argok < 0 && !start_arg ) {
6540 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6541 verb_len, start_verb);
6543 ret = reganode(pRExC_state, op, internal_argval);
6544 if ( ! internal_argval && ! SIZE_ONLY ) {
6546 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6547 ARG(ret) = add_data( pRExC_state, 1, "S" );
6548 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6555 if (!internal_argval)
6556 RExC_seen |= REG_SEEN_VERBARG;
6557 } else if ( start_arg ) {
6558 vFAIL3("Verb pattern '%.*s' may not have an argument",
6559 verb_len, start_verb);
6561 ret = reg_node(pRExC_state, op);
6563 nextchar(pRExC_state);
6566 if (*RExC_parse == '?') { /* (?...) */
6567 bool is_logical = 0;
6568 const char * const seqstart = RExC_parse;
6569 bool has_use_defaults = FALSE;
6572 paren = *RExC_parse++;
6573 ret = NULL; /* For look-ahead/behind. */
6576 case 'P': /* (?P...) variants for those used to PCRE/Python */
6577 paren = *RExC_parse++;
6578 if ( paren == '<') /* (?P<...>) named capture */
6580 else if (paren == '>') { /* (?P>name) named recursion */
6581 goto named_recursion;
6583 else if (paren == '=') { /* (?P=...) named backref */
6584 /* this pretty much dupes the code for \k<NAME> in regatom(), if
6585 you change this make sure you change that */
6586 char* name_start = RExC_parse;
6588 SV *sv_dat = reg_scan_name(pRExC_state,
6589 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6590 if (RExC_parse == name_start || *RExC_parse != ')')
6591 vFAIL2("Sequence %.3s... not terminated",parse_start);
6594 num = add_data( pRExC_state, 1, "S" );
6595 RExC_rxi->data->data[num]=(void*)sv_dat;
6596 SvREFCNT_inc_simple_void(sv_dat);
6599 ret = reganode(pRExC_state,
6602 : (MORE_ASCII_RESTRICTED)
6604 : (AT_LEAST_UNI_SEMANTICS)
6612 Set_Node_Offset(ret, parse_start+1);
6613 Set_Node_Cur_Length(ret); /* MJD */
6615 nextchar(pRExC_state);
6619 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6621 case '<': /* (?<...) */
6622 if (*RExC_parse == '!')
6624 else if (*RExC_parse != '=')
6630 case '\'': /* (?'...') */
6631 name_start= RExC_parse;
6632 svname = reg_scan_name(pRExC_state,
6633 SIZE_ONLY ? /* reverse test from the others */
6634 REG_RSN_RETURN_NAME :
6635 REG_RSN_RETURN_NULL);
6636 if (RExC_parse == name_start) {
6638 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6641 if (*RExC_parse != paren)
6642 vFAIL2("Sequence (?%c... not terminated",
6643 paren=='>' ? '<' : paren);
6647 if (!svname) /* shouldn't happen */
6649 "panic: reg_scan_name returned NULL");
6650 if (!RExC_paren_names) {
6651 RExC_paren_names= newHV();
6652 sv_2mortal(MUTABLE_SV(RExC_paren_names));
6654 RExC_paren_name_list= newAV();
6655 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6658 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6660 sv_dat = HeVAL(he_str);
6662 /* croak baby croak */
6664 "panic: paren_name hash element allocation failed");
6665 } else if ( SvPOK(sv_dat) ) {
6666 /* (?|...) can mean we have dupes so scan to check
6667 its already been stored. Maybe a flag indicating
6668 we are inside such a construct would be useful,
6669 but the arrays are likely to be quite small, so
6670 for now we punt -- dmq */
6671 IV count = SvIV(sv_dat);
6672 I32 *pv = (I32*)SvPVX(sv_dat);
6674 for ( i = 0 ; i < count ; i++ ) {
6675 if ( pv[i] == RExC_npar ) {
6681 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6682 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6683 pv[count] = RExC_npar;
6684 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6687 (void)SvUPGRADE(sv_dat,SVt_PVNV);
6688 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6690 SvIV_set(sv_dat, 1);
6693 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6694 SvREFCNT_dec(svname);
6697 /*sv_dump(sv_dat);*/
6699 nextchar(pRExC_state);
6701 goto capturing_parens;
6703 RExC_seen |= REG_SEEN_LOOKBEHIND;
6704 RExC_in_lookbehind++;
6706 case '=': /* (?=...) */
6707 RExC_seen_zerolen++;
6709 case '!': /* (?!...) */
6710 RExC_seen_zerolen++;
6711 if (*RExC_parse == ')') {
6712 ret=reg_node(pRExC_state, OPFAIL);
6713 nextchar(pRExC_state);
6717 case '|': /* (?|...) */
6718 /* branch reset, behave like a (?:...) except that
6719 buffers in alternations share the same numbers */
6721 after_freeze = freeze_paren = RExC_npar;
6723 case ':': /* (?:...) */
6724 case '>': /* (?>...) */
6726 case '$': /* (?$...) */
6727 case '@': /* (?@...) */
6728 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6730 case '#': /* (?#...) */
6731 while (*RExC_parse && *RExC_parse != ')')
6733 if (*RExC_parse != ')')
6734 FAIL("Sequence (?#... not terminated");
6735 nextchar(pRExC_state);
6738 case '0' : /* (?0) */
6739 case 'R' : /* (?R) */
6740 if (*RExC_parse != ')')
6741 FAIL("Sequence (?R) not terminated");
6742 ret = reg_node(pRExC_state, GOSTART);
6743 *flagp |= POSTPONED;
6744 nextchar(pRExC_state);
6747 { /* named and numeric backreferences */
6749 case '&': /* (?&NAME) */
6750 parse_start = RExC_parse - 1;
6753 SV *sv_dat = reg_scan_name(pRExC_state,
6754 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6755 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6757 goto gen_recurse_regop;
6760 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6762 vFAIL("Illegal pattern");
6764 goto parse_recursion;
6766 case '-': /* (?-1) */
6767 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6768 RExC_parse--; /* rewind to let it be handled later */
6772 case '1': case '2': case '3': case '4': /* (?1) */
6773 case '5': case '6': case '7': case '8': case '9':
6776 num = atoi(RExC_parse);
6777 parse_start = RExC_parse - 1; /* MJD */
6778 if (*RExC_parse == '-')
6780 while (isDIGIT(*RExC_parse))
6782 if (*RExC_parse!=')')
6783 vFAIL("Expecting close bracket");
6786 if ( paren == '-' ) {
6788 Diagram of capture buffer numbering.
6789 Top line is the normal capture buffer numbers
6790 Bottom line is the negative indexing as from
6794 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6798 num = RExC_npar + num;
6801 vFAIL("Reference to nonexistent group");
6803 } else if ( paren == '+' ) {
6804 num = RExC_npar + num - 1;
6807 ret = reganode(pRExC_state, GOSUB, num);
6809 if (num > (I32)RExC_rx->nparens) {
6811 vFAIL("Reference to nonexistent group");
6813 ARG2L_SET( ret, RExC_recurse_count++);
6815 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6816 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6820 RExC_seen |= REG_SEEN_RECURSE;
6821 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6822 Set_Node_Offset(ret, parse_start); /* MJD */
6824 *flagp |= POSTPONED;
6825 nextchar(pRExC_state);
6827 } /* named and numeric backreferences */
6830 case '?': /* (??...) */
6832 if (*RExC_parse != '{') {
6834 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6837 *flagp |= POSTPONED;
6838 paren = *RExC_parse++;
6840 case '{': /* (?{...}) */
6845 char *s = RExC_parse;
6847 RExC_seen_zerolen++;
6848 RExC_seen |= REG_SEEN_EVAL;
6849 while (count && (c = *RExC_parse)) {
6860 if (*RExC_parse != ')') {
6862 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6866 OP_4tree *sop, *rop;
6867 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6870 Perl_save_re_context(aTHX);
6871 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6872 sop->op_private |= OPpREFCOUNTED;
6873 /* re_dup will OpREFCNT_inc */
6874 OpREFCNT_set(sop, 1);
6877 n = add_data(pRExC_state, 3, "nop");
6878 RExC_rxi->data->data[n] = (void*)rop;
6879 RExC_rxi->data->data[n+1] = (void*)sop;
6880 RExC_rxi->data->data[n+2] = (void*)pad;
6883 else { /* First pass */
6884 if (PL_reginterp_cnt < ++RExC_seen_evals
6886 /* No compiled RE interpolated, has runtime
6887 components ===> unsafe. */
6888 FAIL("Eval-group not allowed at runtime, use re 'eval'");
6889 if (PL_tainting && PL_tainted)
6890 FAIL("Eval-group in insecure regular expression");
6891 #if PERL_VERSION > 8
6892 if (IN_PERL_COMPILETIME)
6897 nextchar(pRExC_state);
6899 ret = reg_node(pRExC_state, LOGICAL);
6902 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6903 /* deal with the length of this later - MJD */
6906 ret = reganode(pRExC_state, EVAL, n);
6907 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6908 Set_Node_Offset(ret, parse_start);
6911 case '(': /* (?(?{...})...) and (?(?=...)...) */
6914 if (RExC_parse[0] == '?') { /* (?(?...)) */
6915 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6916 || RExC_parse[1] == '<'
6917 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6920 ret = reg_node(pRExC_state, LOGICAL);
6923 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6927 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6928 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6930 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6931 char *name_start= RExC_parse++;
6933 SV *sv_dat=reg_scan_name(pRExC_state,
6934 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6935 if (RExC_parse == name_start || *RExC_parse != ch)
6936 vFAIL2("Sequence (?(%c... not terminated",
6937 (ch == '>' ? '<' : ch));
6940 num = add_data( pRExC_state, 1, "S" );
6941 RExC_rxi->data->data[num]=(void*)sv_dat;
6942 SvREFCNT_inc_simple_void(sv_dat);
6944 ret = reganode(pRExC_state,NGROUPP,num);
6945 goto insert_if_check_paren;
6947 else if (RExC_parse[0] == 'D' &&
6948 RExC_parse[1] == 'E' &&
6949 RExC_parse[2] == 'F' &&
6950 RExC_parse[3] == 'I' &&
6951 RExC_parse[4] == 'N' &&
6952 RExC_parse[5] == 'E')
6954 ret = reganode(pRExC_state,DEFINEP,0);
6957 goto insert_if_check_paren;
6959 else if (RExC_parse[0] == 'R') {
6962 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6963 parno = atoi(RExC_parse++);
6964 while (isDIGIT(*RExC_parse))
6966 } else if (RExC_parse[0] == '&') {
6969 sv_dat = reg_scan_name(pRExC_state,
6970 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6971 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6973 ret = reganode(pRExC_state,INSUBP,parno);
6974 goto insert_if_check_paren;
6976 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6979 parno = atoi(RExC_parse++);
6981 while (isDIGIT(*RExC_parse))
6983 ret = reganode(pRExC_state, GROUPP, parno);
6985 insert_if_check_paren:
6986 if ((c = *nextchar(pRExC_state)) != ')')
6987 vFAIL("Switch condition not recognized");
6989 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6990 br = regbranch(pRExC_state, &flags, 1,depth+1);
6992 br = reganode(pRExC_state, LONGJMP, 0);
6994 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6995 c = *nextchar(pRExC_state);
7000 vFAIL("(?(DEFINE)....) does not allow branches");
7001 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7002 regbranch(pRExC_state, &flags, 1,depth+1);
7003 REGTAIL(pRExC_state, ret, lastbr);
7006 c = *nextchar(pRExC_state);
7011 vFAIL("Switch (?(condition)... contains too many branches");
7012 ender = reg_node(pRExC_state, TAIL);
7013 REGTAIL(pRExC_state, br, ender);
7015 REGTAIL(pRExC_state, lastbr, ender);
7016 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7019 REGTAIL(pRExC_state, ret, ender);
7020 RExC_size++; /* XXX WHY do we need this?!!
7021 For large programs it seems to be required
7022 but I can't figure out why. -- dmq*/
7026 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7030 RExC_parse--; /* for vFAIL to print correctly */
7031 vFAIL("Sequence (? incomplete");
7033 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
7035 has_use_defaults = TRUE;
7036 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7037 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
7038 ? REGEX_UNICODE_CHARSET
7039 : REGEX_DEPENDS_CHARSET);
7043 parse_flags: /* (?i) */
7045 U32 posflags = 0, negflags = 0;
7046 U32 *flagsp = &posflags;
7047 bool has_charset_modifier = 0;
7048 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
7049 ? REGEX_UNICODE_CHARSET
7050 : REGEX_DEPENDS_CHARSET;
7052 while (*RExC_parse) {
7053 /* && strchr("iogcmsx", *RExC_parse) */
7054 /* (?g), (?gc) and (?o) are useless here
7055 and must be globally applied -- japhy */
7056 switch (*RExC_parse) {
7057 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7058 case LOCALE_PAT_MOD:
7059 if (has_charset_modifier || flagsp == &negflags) {
7060 goto fail_modifiers;
7062 cs = REGEX_LOCALE_CHARSET;
7063 has_charset_modifier = 1;
7064 RExC_contains_locale = 1;
7066 case UNICODE_PAT_MOD:
7067 if (has_charset_modifier || flagsp == &negflags) {
7068 goto fail_modifiers;
7070 cs = REGEX_UNICODE_CHARSET;
7071 has_charset_modifier = 1;
7073 case ASCII_RESTRICT_PAT_MOD:
7074 if (has_charset_modifier || flagsp == &negflags) {
7075 goto fail_modifiers;
7077 if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
7078 /* Doubled modifier implies more restricted */
7079 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7083 cs = REGEX_ASCII_RESTRICTED_CHARSET;
7085 has_charset_modifier = 1;
7087 case DEPENDS_PAT_MOD:
7088 if (has_use_defaults
7089 || has_charset_modifier
7090 || flagsp == &negflags)
7092 goto fail_modifiers;
7095 /* The dual charset means unicode semantics if the
7096 * pattern (or target, not known until runtime) are
7097 * utf8, or something in the pattern indicates unicode
7099 cs = (RExC_utf8 || RExC_uni_semantics)
7100 ? REGEX_UNICODE_CHARSET
7101 : REGEX_DEPENDS_CHARSET;
7102 has_charset_modifier = 1;
7104 case ONCE_PAT_MOD: /* 'o' */
7105 case GLOBAL_PAT_MOD: /* 'g' */
7106 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7107 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7108 if (! (wastedflags & wflagbit) ) {
7109 wastedflags |= wflagbit;
7112 "Useless (%s%c) - %suse /%c modifier",
7113 flagsp == &negflags ? "?-" : "?",
7115 flagsp == &negflags ? "don't " : "",
7122 case CONTINUE_PAT_MOD: /* 'c' */
7123 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7124 if (! (wastedflags & WASTED_C) ) {
7125 wastedflags |= WASTED_GC;
7128 "Useless (%sc) - %suse /gc modifier",
7129 flagsp == &negflags ? "?-" : "?",
7130 flagsp == &negflags ? "don't " : ""
7135 case KEEPCOPY_PAT_MOD: /* 'p' */
7136 if (flagsp == &negflags) {
7138 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7140 *flagsp |= RXf_PMf_KEEPCOPY;
7144 /* A flag is a default iff it is following a minus, so
7145 * if there is a minus, it means will be trying to
7146 * re-specify a default which is an error */
7147 if (has_use_defaults || flagsp == &negflags) {
7150 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7154 wastedflags = 0; /* reset so (?g-c) warns twice */
7160 RExC_flags |= posflags;
7161 RExC_flags &= ~negflags;
7162 set_regex_charset(&RExC_flags, cs);
7164 oregflags |= posflags;
7165 oregflags &= ~negflags;
7166 set_regex_charset(&oregflags, cs);
7168 nextchar(pRExC_state);
7179 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7184 }} /* one for the default block, one for the switch */
7191 ret = reganode(pRExC_state, OPEN, parno);
7194 RExC_nestroot = parno;
7195 if (RExC_seen & REG_SEEN_RECURSE
7196 && !RExC_open_parens[parno-1])
7198 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7199 "Setting open paren #%"IVdf" to %d\n",
7200 (IV)parno, REG_NODE_NUM(ret)));
7201 RExC_open_parens[parno-1]= ret;
7204 Set_Node_Length(ret, 1); /* MJD */
7205 Set_Node_Offset(ret, RExC_parse); /* MJD */
7213 /* Pick up the branches, linking them together. */
7214 parse_start = RExC_parse; /* MJD */
7215 br = regbranch(pRExC_state, &flags, 1,depth+1);
7217 /* branch_len = (paren != 0); */
7221 if (*RExC_parse == '|') {
7222 if (!SIZE_ONLY && RExC_extralen) {
7223 reginsert(pRExC_state, BRANCHJ, br, depth+1);
7226 reginsert(pRExC_state, BRANCH, br, depth+1);
7227 Set_Node_Length(br, paren != 0);
7228 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7232 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
7234 else if (paren == ':') {
7235 *flagp |= flags&SIMPLE;
7237 if (is_open) { /* Starts with OPEN. */
7238 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
7240 else if (paren != '?') /* Not Conditional */
7242 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7244 while (*RExC_parse == '|') {
7245 if (!SIZE_ONLY && RExC_extralen) {
7246 ender = reganode(pRExC_state, LONGJMP,0);
7247 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7250 RExC_extralen += 2; /* Account for LONGJMP. */
7251 nextchar(pRExC_state);
7253 if (RExC_npar > after_freeze)
7254 after_freeze = RExC_npar;
7255 RExC_npar = freeze_paren;
7257 br = regbranch(pRExC_state, &flags, 0, depth+1);
7261 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
7263 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7266 if (have_branch || paren != ':') {
7267 /* Make a closing node, and hook it on the end. */
7270 ender = reg_node(pRExC_state, TAIL);
7273 ender = reganode(pRExC_state, CLOSE, parno);
7274 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7275 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7276 "Setting close paren #%"IVdf" to %d\n",
7277 (IV)parno, REG_NODE_NUM(ender)));
7278 RExC_close_parens[parno-1]= ender;
7279 if (RExC_nestroot == parno)
7282 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7283 Set_Node_Length(ender,1); /* MJD */
7289 *flagp &= ~HASWIDTH;
7292 ender = reg_node(pRExC_state, SUCCEED);
7295 ender = reg_node(pRExC_state, END);
7297 assert(!RExC_opend); /* there can only be one! */
7302 REGTAIL(pRExC_state, lastbr, ender);
7304 if (have_branch && !SIZE_ONLY) {
7306 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7308 /* Hook the tails of the branches to the closing node. */
7309 for (br = ret; br; br = regnext(br)) {
7310 const U8 op = PL_regkind[OP(br)];
7312 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7314 else if (op == BRANCHJ) {
7315 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7323 static const char parens[] = "=!<,>";
7325 if (paren && (p = strchr(parens, paren))) {
7326 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7327 int flag = (p - parens) > 1;
7330 node = SUSPEND, flag = 0;
7331 reginsert(pRExC_state, node,ret, depth+1);
7332 Set_Node_Cur_Length(ret);
7333 Set_Node_Offset(ret, parse_start + 1);
7335 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7339 /* Check for proper termination. */
7341 RExC_flags = oregflags;
7342 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7343 RExC_parse = oregcomp_parse;
7344 vFAIL("Unmatched (");
7347 else if (!paren && RExC_parse < RExC_end) {
7348 if (*RExC_parse == ')') {
7350 vFAIL("Unmatched )");
7353 FAIL("Junk on end of regexp"); /* "Can't happen". */
7357 if (RExC_in_lookbehind) {
7358 RExC_in_lookbehind--;
7360 if (after_freeze > RExC_npar)
7361 RExC_npar = after_freeze;
7366 - regbranch - one alternative of an | operator
7368 * Implements the concatenation operator.
7371 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7374 register regnode *ret;
7375 register regnode *chain = NULL;
7376 register regnode *latest;
7377 I32 flags = 0, c = 0;
7378 GET_RE_DEBUG_FLAGS_DECL;
7380 PERL_ARGS_ASSERT_REGBRANCH;
7382 DEBUG_PARSE("brnc");
7387 if (!SIZE_ONLY && RExC_extralen)
7388 ret = reganode(pRExC_state, BRANCHJ,0);
7390 ret = reg_node(pRExC_state, BRANCH);
7391 Set_Node_Length(ret, 1);
7395 if (!first && SIZE_ONLY)
7396 RExC_extralen += 1; /* BRANCHJ */
7398 *flagp = WORST; /* Tentatively. */
7401 nextchar(pRExC_state);
7402 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7404 latest = regpiece(pRExC_state, &flags,depth+1);
7405 if (latest == NULL) {
7406 if (flags & TRYAGAIN)
7410 else if (ret == NULL)
7412 *flagp |= flags&(HASWIDTH|POSTPONED);
7413 if (chain == NULL) /* First piece. */
7414 *flagp |= flags&SPSTART;
7417 REGTAIL(pRExC_state, chain, latest);
7422 if (chain == NULL) { /* Loop ran zero times. */
7423 chain = reg_node(pRExC_state, NOTHING);
7428 *flagp |= flags&SIMPLE;
7435 - regpiece - something followed by possible [*+?]
7437 * Note that the branching code sequences used for ? and the general cases
7438 * of * and + are somewhat optimized: they use the same NOTHING node as
7439 * both the endmarker for their branch list and the body of the last branch.
7440 * It might seem that this node could be dispensed with entirely, but the
7441 * endmarker role is not redundant.
7444 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7447 register regnode *ret;
7449 register char *next;
7451 const char * const origparse = RExC_parse;
7453 I32 max = REG_INFTY;
7455 const char *maxpos = NULL;
7456 GET_RE_DEBUG_FLAGS_DECL;
7458 PERL_ARGS_ASSERT_REGPIECE;
7460 DEBUG_PARSE("piec");
7462 ret = regatom(pRExC_state, &flags,depth+1);
7464 if (flags & TRYAGAIN)
7471 if (op == '{' && regcurly(RExC_parse)) {
7473 parse_start = RExC_parse; /* MJD */
7474 next = RExC_parse + 1;
7475 while (isDIGIT(*next) || *next == ',') {
7484 if (*next == '}') { /* got one */
7488 min = atoi(RExC_parse);
7492 maxpos = RExC_parse;
7494 if (!max && *maxpos != '0')
7495 max = REG_INFTY; /* meaning "infinity" */
7496 else if (max >= REG_INFTY)
7497 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7499 nextchar(pRExC_state);
7502 if ((flags&SIMPLE)) {
7503 RExC_naughty += 2 + RExC_naughty / 2;
7504 reginsert(pRExC_state, CURLY, ret, depth+1);
7505 Set_Node_Offset(ret, parse_start+1); /* MJD */
7506 Set_Node_Cur_Length(ret);
7509 regnode * const w = reg_node(pRExC_state, WHILEM);
7512 REGTAIL(pRExC_state, ret, w);
7513 if (!SIZE_ONLY && RExC_extralen) {
7514 reginsert(pRExC_state, LONGJMP,ret, depth+1);
7515 reginsert(pRExC_state, NOTHING,ret, depth+1);
7516 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
7518 reginsert(pRExC_state, CURLYX,ret, depth+1);
7520 Set_Node_Offset(ret, parse_start+1);
7521 Set_Node_Length(ret,
7522 op == '{' ? (RExC_parse - parse_start) : 1);
7524 if (!SIZE_ONLY && RExC_extralen)
7525 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
7526 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7528 RExC_whilem_seen++, RExC_extralen += 3;
7529 RExC_naughty += 4 + RExC_naughty; /* compound interest */
7538 vFAIL("Can't do {n,m} with n > m");
7540 ARG1_SET(ret, (U16)min);
7541 ARG2_SET(ret, (U16)max);
7553 #if 0 /* Now runtime fix should be reliable. */
7555 /* if this is reinstated, don't forget to put this back into perldiag:
7557 =item Regexp *+ operand could be empty at {#} in regex m/%s/
7559 (F) The part of the regexp subject to either the * or + quantifier
7560 could match an empty string. The {#} shows in the regular
7561 expression about where the problem was discovered.
7565 if (!(flags&HASWIDTH) && op != '?')
7566 vFAIL("Regexp *+ operand could be empty");
7569 parse_start = RExC_parse;
7570 nextchar(pRExC_state);
7572 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7574 if (op == '*' && (flags&SIMPLE)) {
7575 reginsert(pRExC_state, STAR, ret, depth+1);
7579 else if (op == '*') {
7583 else if (op == '+' && (flags&SIMPLE)) {
7584 reginsert(pRExC_state, PLUS, ret, depth+1);
7588 else if (op == '+') {
7592 else if (op == '?') {
7597 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7598 ckWARN3reg(RExC_parse,
7599 "%.*s matches null string many times",
7600 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7604 if (RExC_parse < RExC_end && *RExC_parse == '?') {
7605 nextchar(pRExC_state);
7606 reginsert(pRExC_state, MINMOD, ret, depth+1);
7607 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7609 #ifndef REG_ALLOW_MINMOD_SUSPEND
7612 if (RExC_parse < RExC_end && *RExC_parse == '+') {
7614 nextchar(pRExC_state);
7615 ender = reg_node(pRExC_state, SUCCEED);
7616 REGTAIL(pRExC_state, ret, ender);
7617 reginsert(pRExC_state, SUSPEND, ret, depth+1);
7619 ender = reg_node(pRExC_state, TAIL);
7620 REGTAIL(pRExC_state, ret, ender);
7624 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7626 vFAIL("Nested quantifiers");
7633 /* reg_namedseq(pRExC_state,UVp)
7635 This is expected to be called by a parser routine that has
7636 recognized '\N' and needs to handle the rest. RExC_parse is
7637 expected to point at the first char following the N at the time
7640 The \N may be inside (indicated by valuep not being NULL) or outside a
7643 \N may begin either a named sequence, or if outside a character class, mean
7644 to match a non-newline. For non single-quoted regexes, the tokenizer has
7645 attempted to decide which, and in the case of a named sequence converted it
7646 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7647 where c1... are the characters in the sequence. For single-quoted regexes,
7648 the tokenizer passes the \N sequence through unchanged; this code will not
7649 attempt to determine this nor expand those. The net effect is that if the
7650 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7651 signals that this \N occurrence means to match a non-newline.
7653 Only the \N{U+...} form should occur in a character class, for the same
7654 reason that '.' inside a character class means to just match a period: it
7655 just doesn't make sense.
7657 If valuep is non-null then it is assumed that we are parsing inside
7658 of a charclass definition and the first codepoint in the resolved
7659 string is returned via *valuep and the routine will return NULL.
7660 In this mode if a multichar string is returned from the charnames
7661 handler, a warning will be issued, and only the first char in the
7662 sequence will be examined. If the string returned is zero length
7663 then the value of *valuep is undefined and NON-NULL will
7664 be returned to indicate failure. (This will NOT be a valid pointer
7667 If valuep is null then it is assumed that we are parsing normal text and a
7668 new EXACT node is inserted into the program containing the resolved string,
7669 and a pointer to the new node is returned. But if the string is zero length
7670 a NOTHING node is emitted instead.
7672 On success RExC_parse is set to the char following the endbrace.
7673 Parsing failures will generate a fatal error via vFAIL(...)
7676 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
7678 char * endbrace; /* '}' following the name */
7679 regnode *ret = NULL;
7681 char* parse_start = RExC_parse - 2; /* points to the '\N' */
7685 GET_RE_DEBUG_FLAGS_DECL;
7687 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7691 /* The [^\n] meaning of \N ignores spaces and comments under the /x
7692 * modifier. The other meaning does not */
7693 p = (RExC_flags & RXf_PMf_EXTENDED)
7694 ? regwhite( pRExC_state, RExC_parse )
7697 /* Disambiguate between \N meaning a named character versus \N meaning
7698 * [^\n]. The former is assumed when it can't be the latter. */
7699 if (*p != '{' || regcurly(p)) {
7702 /* no bare \N in a charclass */
7703 vFAIL("\\N in a character class must be a named character: \\N{...}");
7705 nextchar(pRExC_state);
7706 ret = reg_node(pRExC_state, REG_ANY);
7707 *flagp |= HASWIDTH|SIMPLE;
7710 Set_Node_Length(ret, 1); /* MJD */
7714 /* Here, we have decided it should be a named sequence */
7716 /* The test above made sure that the next real character is a '{', but
7717 * under the /x modifier, it could be separated by space (or a comment and
7718 * \n) and this is not allowed (for consistency with \x{...} and the
7719 * tokenizer handling of \N{NAME}). */
7720 if (*RExC_parse != '{') {
7721 vFAIL("Missing braces on \\N{}");
7724 RExC_parse++; /* Skip past the '{' */
7726 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7727 || ! (endbrace == RExC_parse /* nothing between the {} */
7728 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
7729 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7731 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
7732 vFAIL("\\N{NAME} must be resolved by the lexer");
7735 if (endbrace == RExC_parse) { /* empty: \N{} */
7737 RExC_parse = endbrace + 1;
7738 return reg_node(pRExC_state,NOTHING);
7742 ckWARNreg(RExC_parse,
7743 "Ignoring zero length \\N{} in character class"
7745 RExC_parse = endbrace + 1;
7748 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7751 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
7752 RExC_parse += 2; /* Skip past the 'U+' */
7754 if (valuep) { /* In a bracketed char class */
7755 /* We only pay attention to the first char of
7756 multichar strings being returned. I kinda wonder
7757 if this makes sense as it does change the behaviour
7758 from earlier versions, OTOH that behaviour was broken
7759 as well. XXX Solution is to recharacterize as
7760 [rest-of-class]|multi1|multi2... */
7762 STRLEN length_of_hex;
7763 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7764 | PERL_SCAN_DISALLOW_PREFIX
7765 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7767 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7768 if (endchar < endbrace) {
7769 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7772 length_of_hex = (STRLEN)(endchar - RExC_parse);
7773 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7775 /* The tokenizer should have guaranteed validity, but it's possible to
7776 * bypass it by using single quoting, so check */
7777 if (length_of_hex == 0
7778 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7780 RExC_parse += length_of_hex; /* Includes all the valid */
7781 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7782 ? UTF8SKIP(RExC_parse)
7784 /* Guard against malformed utf8 */
7785 if (RExC_parse >= endchar) RExC_parse = endchar;
7786 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7789 RExC_parse = endbrace + 1;
7790 if (endchar == endbrace) return NULL;
7792 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
7794 else { /* Not a char class */
7795 char *s; /* String to put in generated EXACT node */
7796 STRLEN len = 0; /* Its current byte length */
7797 char *endchar; /* Points to '.' or '}' ending cur char in the input
7799 ret = reg_node(pRExC_state,
7800 (U8) ((! FOLD) ? EXACT
7803 : (MORE_ASCII_RESTRICTED)
7805 : (AT_LEAST_UNI_SEMANTICS)
7810 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7811 * the input which is of the form now 'c1.c2.c3...}' until find the
7812 * ending brace or exceed length 255. The characters that exceed this
7813 * limit are dropped. The limit could be relaxed should it become
7814 * desirable by reparsing this as (?:\N{NAME}), so could generate
7815 * multiple EXACT nodes, as is done for just regular input. But this
7816 * is primarily a named character, and not intended to be a huge long
7817 * string, so 255 bytes should be good enough */
7819 STRLEN length_of_hex;
7820 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7821 | PERL_SCAN_DISALLOW_PREFIX
7822 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7823 UV cp; /* Ord of current character */
7824 bool use_this_char_fold = FOLD;
7826 /* Code points are separated by dots. If none, there is only one
7827 * code point, and is terminated by the brace */
7828 endchar = RExC_parse + strcspn(RExC_parse, ".}");
7830 /* The values are Unicode even on EBCDIC machines */
7831 length_of_hex = (STRLEN)(endchar - RExC_parse);
7832 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7833 if ( length_of_hex == 0
7834 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7836 RExC_parse += length_of_hex; /* Includes all the valid */
7837 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7838 ? UTF8SKIP(RExC_parse)
7840 /* Guard against malformed utf8 */
7841 if (RExC_parse >= endchar) RExC_parse = endchar;
7842 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7845 /* XXX ? Change to ANYOF node
7847 && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
7848 && is_TRICKYFOLD_cp(cp))
7853 /* Under /aa, we can't mix ASCII with non- in a fold. If we are
7854 * folding, and the source isn't ASCII, look through all the
7855 * characters it folds to. If any one of them is ASCII, forbid
7856 * this fold. (cp is uni, so the 127 below is correct even for
7857 * EBCDIC). Similarly under locale rules, we don't mix under 256
7858 * with above 255. XXX It really doesn't make sense to have \N{}
7859 * which means a Unicode rules under locale. I (khw) think this
7860 * should be warned about, but the counter argument is that people
7861 * who have programmed around Perl's earlier lack of specifying the
7862 * rules and used \N{} to force Unicode things in a local
7863 * environment shouldn't get suddenly a warning */
7864 if (use_this_char_fold) {
7865 if (LOC && cp < 256) { /* Fold not known until run-time */
7866 use_this_char_fold = FALSE;
7868 else if ((cp > 127 && MORE_ASCII_RESTRICTED)
7869 || (cp > 255 && LOC))
7871 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
7876 (void) toFOLD_uni(cp, tmpbuf, &foldlen);
7881 || (LOC && (UTF8_IS_INVARIANT(*s)
7882 || UTF8_IS_DOWNGRADEABLE_START(*s))))
7884 use_this_char_fold = FALSE;
7892 if (! use_this_char_fold) { /* Not folding, just append to the
7896 /* Quit before adding this character if would exceed limit */
7897 if (len + UNISKIP(cp) > U8_MAX) break;
7899 unilen = reguni(pRExC_state, cp, s);
7904 } else { /* Folding, output the folded equivalent */
7905 STRLEN foldlen,numlen;
7906 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7907 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7909 /* Quit before exceeding size limit */
7910 if (len + foldlen > U8_MAX) break;
7912 for (foldbuf = tmpbuf;
7916 cp = utf8_to_uvchr(foldbuf, &numlen);
7918 const STRLEN unilen = reguni(pRExC_state, cp, s);
7921 /* In EBCDIC the numlen and unilen can differ. */
7923 if (numlen >= foldlen)
7927 break; /* "Can't happen." */
7931 /* Point to the beginning of the next character in the sequence. */
7932 RExC_parse = endchar + 1;
7934 /* Quit if no more characters */
7935 if (RExC_parse >= endbrace) break;
7940 if (RExC_parse < endbrace) {
7941 ckWARNreg(RExC_parse - 1,
7942 "Using just the first characters returned by \\N{}");
7945 RExC_size += STR_SZ(len);
7948 RExC_emit += STR_SZ(len);
7951 RExC_parse = endbrace + 1;
7953 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7954 with malformed in t/re/pat_advanced.t */
7956 Set_Node_Cur_Length(ret); /* MJD */
7957 nextchar(pRExC_state);
7967 * It returns the code point in utf8 for the value in *encp.
7968 * value: a code value in the source encoding
7969 * encp: a pointer to an Encode object
7971 * If the result from Encode is not a single character,
7972 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7975 S_reg_recode(pTHX_ const char value, SV **encp)
7978 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7979 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7980 const STRLEN newlen = SvCUR(sv);
7981 UV uv = UNICODE_REPLACEMENT;
7983 PERL_ARGS_ASSERT_REG_RECODE;
7987 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7990 if (!newlen || numlen != newlen) {
7991 uv = UNICODE_REPLACEMENT;
7999 - regatom - the lowest level
8001 Try to identify anything special at the start of the pattern. If there
8002 is, then handle it as required. This may involve generating a single regop,
8003 such as for an assertion; or it may involve recursing, such as to
8004 handle a () structure.
8006 If the string doesn't start with something special then we gobble up
8007 as much literal text as we can.
8009 Once we have been able to handle whatever type of thing started the
8010 sequence, we return.
8012 Note: we have to be careful with escapes, as they can be both literal
8013 and special, and in the case of \10 and friends can either, depending
8014 on context. Specifically there are two separate switches for handling
8015 escape sequences, with the one for handling literal escapes requiring
8016 a dummy entry for all of the special escapes that are actually handled
8021 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8024 register regnode *ret = NULL;
8026 char *parse_start = RExC_parse;
8028 GET_RE_DEBUG_FLAGS_DECL;
8029 DEBUG_PARSE("atom");
8030 *flagp = WORST; /* Tentatively. */
8032 PERL_ARGS_ASSERT_REGATOM;
8035 switch ((U8)*RExC_parse) {
8037 RExC_seen_zerolen++;
8038 nextchar(pRExC_state);
8039 if (RExC_flags & RXf_PMf_MULTILINE)
8040 ret = reg_node(pRExC_state, MBOL);
8041 else if (RExC_flags & RXf_PMf_SINGLELINE)
8042 ret = reg_node(pRExC_state, SBOL);
8044 ret = reg_node(pRExC_state, BOL);
8045 Set_Node_Length(ret, 1); /* MJD */
8048 nextchar(pRExC_state);
8050 RExC_seen_zerolen++;
8051 if (RExC_flags & RXf_PMf_MULTILINE)
8052 ret = reg_node(pRExC_state, MEOL);
8053 else if (RExC_flags & RXf_PMf_SINGLELINE)
8054 ret = reg_node(pRExC_state, SEOL);
8056 ret = reg_node(pRExC_state, EOL);
8057 Set_Node_Length(ret, 1); /* MJD */
8060 nextchar(pRExC_state);
8061 if (RExC_flags & RXf_PMf_SINGLELINE)
8062 ret = reg_node(pRExC_state, SANY);
8064 ret = reg_node(pRExC_state, REG_ANY);
8065 *flagp |= HASWIDTH|SIMPLE;
8067 Set_Node_Length(ret, 1); /* MJD */
8071 char * const oregcomp_parse = ++RExC_parse;
8072 ret = regclass(pRExC_state,depth+1);
8073 if (*RExC_parse != ']') {
8074 RExC_parse = oregcomp_parse;
8075 vFAIL("Unmatched [");
8077 nextchar(pRExC_state);
8078 *flagp |= HASWIDTH|SIMPLE;
8079 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8083 nextchar(pRExC_state);
8084 ret = reg(pRExC_state, 1, &flags,depth+1);
8086 if (flags & TRYAGAIN) {
8087 if (RExC_parse == RExC_end) {
8088 /* Make parent create an empty node if needed. */
8096 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8100 if (flags & TRYAGAIN) {
8104 vFAIL("Internal urp");
8105 /* Supposed to be caught earlier. */
8108 if (!regcurly(RExC_parse)) {
8117 vFAIL("Quantifier follows nothing");
8119 case LATIN_SMALL_LETTER_SHARP_S:
8120 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8121 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8122 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
8123 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
8124 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
8129 len=0; /* silence a spurious compiler warning */
8130 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
8131 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
8132 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
8133 ret = reganode(pRExC_state, FOLDCHAR, cp);
8134 Set_Node_Length(ret, 1); /* MJD */
8135 nextchar(pRExC_state); /* kill whitespace under /x */
8143 This switch handles escape sequences that resolve to some kind
8144 of special regop and not to literal text. Escape sequnces that
8145 resolve to literal text are handled below in the switch marked
8148 Every entry in this switch *must* have a corresponding entry
8149 in the literal escape switch. However, the opposite is not
8150 required, as the default for this switch is to jump to the
8151 literal text handling code.
8153 switch ((U8)*++RExC_parse) {
8154 case LATIN_SMALL_LETTER_SHARP_S:
8155 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8156 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8158 /* Special Escapes */
8160 RExC_seen_zerolen++;
8161 ret = reg_node(pRExC_state, SBOL);
8163 goto finish_meta_pat;
8165 ret = reg_node(pRExC_state, GPOS);
8166 RExC_seen |= REG_SEEN_GPOS;
8168 goto finish_meta_pat;
8170 RExC_seen_zerolen++;
8171 ret = reg_node(pRExC_state, KEEPS);
8173 /* XXX:dmq : disabling in-place substitution seems to
8174 * be necessary here to avoid cases of memory corruption, as
8175 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8177 RExC_seen |= REG_SEEN_LOOKBEHIND;
8178 goto finish_meta_pat;
8180 ret = reg_node(pRExC_state, SEOL);
8182 RExC_seen_zerolen++; /* Do not optimize RE away */
8183 goto finish_meta_pat;
8185 ret = reg_node(pRExC_state, EOS);
8187 RExC_seen_zerolen++; /* Do not optimize RE away */
8188 goto finish_meta_pat;
8190 ret = reg_node(pRExC_state, CANY);
8191 RExC_seen |= REG_SEEN_CANY;
8192 *flagp |= HASWIDTH|SIMPLE;
8193 goto finish_meta_pat;
8195 ret = reg_node(pRExC_state, CLUMP);
8197 goto finish_meta_pat;
8199 switch (get_regex_charset(RExC_flags)) {
8200 case REGEX_LOCALE_CHARSET:
8203 case REGEX_UNICODE_CHARSET:
8206 case REGEX_ASCII_RESTRICTED_CHARSET:
8207 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8210 case REGEX_DEPENDS_CHARSET:
8216 ret = reg_node(pRExC_state, op);
8217 *flagp |= HASWIDTH|SIMPLE;
8218 goto finish_meta_pat;
8220 switch (get_regex_charset(RExC_flags)) {
8221 case REGEX_LOCALE_CHARSET:
8224 case REGEX_UNICODE_CHARSET:
8227 case REGEX_ASCII_RESTRICTED_CHARSET:
8228 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8231 case REGEX_DEPENDS_CHARSET:
8237 ret = reg_node(pRExC_state, op);
8238 *flagp |= HASWIDTH|SIMPLE;
8239 goto finish_meta_pat;
8241 RExC_seen_zerolen++;
8242 RExC_seen |= REG_SEEN_LOOKBEHIND;
8243 switch (get_regex_charset(RExC_flags)) {
8244 case REGEX_LOCALE_CHARSET:
8247 case REGEX_UNICODE_CHARSET:
8250 case REGEX_ASCII_RESTRICTED_CHARSET:
8251 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8254 case REGEX_DEPENDS_CHARSET:
8260 ret = reg_node(pRExC_state, op);
8261 FLAGS(ret) = get_regex_charset(RExC_flags);
8263 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8264 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8266 goto finish_meta_pat;
8268 RExC_seen_zerolen++;
8269 RExC_seen |= REG_SEEN_LOOKBEHIND;
8270 switch (get_regex_charset(RExC_flags)) {
8271 case REGEX_LOCALE_CHARSET:
8274 case REGEX_UNICODE_CHARSET:
8277 case REGEX_ASCII_RESTRICTED_CHARSET:
8278 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8281 case REGEX_DEPENDS_CHARSET:
8287 ret = reg_node(pRExC_state, op);
8288 FLAGS(ret) = get_regex_charset(RExC_flags);
8290 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8291 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8293 goto finish_meta_pat;
8295 switch (get_regex_charset(RExC_flags)) {
8296 case REGEX_LOCALE_CHARSET:
8299 case REGEX_UNICODE_CHARSET:
8302 case REGEX_ASCII_RESTRICTED_CHARSET:
8303 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8306 case REGEX_DEPENDS_CHARSET:
8312 ret = reg_node(pRExC_state, op);
8313 *flagp |= HASWIDTH|SIMPLE;
8314 goto finish_meta_pat;
8316 switch (get_regex_charset(RExC_flags)) {
8317 case REGEX_LOCALE_CHARSET:
8320 case REGEX_UNICODE_CHARSET:
8323 case REGEX_ASCII_RESTRICTED_CHARSET:
8324 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8327 case REGEX_DEPENDS_CHARSET:
8333 ret = reg_node(pRExC_state, op);
8334 *flagp |= HASWIDTH|SIMPLE;
8335 goto finish_meta_pat;
8337 switch (get_regex_charset(RExC_flags)) {
8338 case REGEX_LOCALE_CHARSET:
8341 case REGEX_ASCII_RESTRICTED_CHARSET:
8342 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8345 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8346 case REGEX_UNICODE_CHARSET:
8352 ret = reg_node(pRExC_state, op);
8353 *flagp |= HASWIDTH|SIMPLE;
8354 goto finish_meta_pat;
8356 switch (get_regex_charset(RExC_flags)) {
8357 case REGEX_LOCALE_CHARSET:
8360 case REGEX_ASCII_RESTRICTED_CHARSET:
8361 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8364 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8365 case REGEX_UNICODE_CHARSET:
8371 ret = reg_node(pRExC_state, op);
8372 *flagp |= HASWIDTH|SIMPLE;
8373 goto finish_meta_pat;
8375 ret = reg_node(pRExC_state, LNBREAK);
8376 *flagp |= HASWIDTH|SIMPLE;
8377 goto finish_meta_pat;
8379 ret = reg_node(pRExC_state, HORIZWS);
8380 *flagp |= HASWIDTH|SIMPLE;
8381 goto finish_meta_pat;
8383 ret = reg_node(pRExC_state, NHORIZWS);
8384 *flagp |= HASWIDTH|SIMPLE;
8385 goto finish_meta_pat;
8387 ret = reg_node(pRExC_state, VERTWS);
8388 *flagp |= HASWIDTH|SIMPLE;
8389 goto finish_meta_pat;
8391 ret = reg_node(pRExC_state, NVERTWS);
8392 *flagp |= HASWIDTH|SIMPLE;
8394 nextchar(pRExC_state);
8395 Set_Node_Length(ret, 2); /* MJD */
8400 char* const oldregxend = RExC_end;
8402 char* parse_start = RExC_parse - 2;
8405 if (RExC_parse[1] == '{') {
8406 /* a lovely hack--pretend we saw [\pX] instead */
8407 RExC_end = strchr(RExC_parse, '}');
8409 const U8 c = (U8)*RExC_parse;
8411 RExC_end = oldregxend;
8412 vFAIL2("Missing right brace on \\%c{}", c);
8417 RExC_end = RExC_parse + 2;
8418 if (RExC_end > oldregxend)
8419 RExC_end = oldregxend;
8423 ret = regclass(pRExC_state,depth+1);
8425 RExC_end = oldregxend;
8428 Set_Node_Offset(ret, parse_start + 2);
8429 Set_Node_Cur_Length(ret);
8430 nextchar(pRExC_state);
8431 *flagp |= HASWIDTH|SIMPLE;
8435 /* Handle \N and \N{NAME} here and not below because it can be
8436 multicharacter. join_exact() will join them up later on.
8437 Also this makes sure that things like /\N{BLAH}+/ and
8438 \N{BLAH} being multi char Just Happen. dmq*/
8440 ret= reg_namedseq(pRExC_state, NULL, flagp);
8442 case 'k': /* Handle \k<NAME> and \k'NAME' */
8445 char ch= RExC_parse[1];
8446 if (ch != '<' && ch != '\'' && ch != '{') {
8448 vFAIL2("Sequence %.2s... not terminated",parse_start);
8450 /* this pretty much dupes the code for (?P=...) in reg(), if
8451 you change this make sure you change that */
8452 char* name_start = (RExC_parse += 2);
8454 SV *sv_dat = reg_scan_name(pRExC_state,
8455 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8456 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8457 if (RExC_parse == name_start || *RExC_parse != ch)
8458 vFAIL2("Sequence %.3s... not terminated",parse_start);
8461 num = add_data( pRExC_state, 1, "S" );
8462 RExC_rxi->data->data[num]=(void*)sv_dat;
8463 SvREFCNT_inc_simple_void(sv_dat);
8467 ret = reganode(pRExC_state,
8470 : (MORE_ASCII_RESTRICTED)
8472 : (AT_LEAST_UNI_SEMANTICS)
8480 /* override incorrect value set in reganode MJD */
8481 Set_Node_Offset(ret, parse_start+1);
8482 Set_Node_Cur_Length(ret); /* MJD */
8483 nextchar(pRExC_state);
8489 case '1': case '2': case '3': case '4':
8490 case '5': case '6': case '7': case '8': case '9':
8493 bool isg = *RExC_parse == 'g';
8498 if (*RExC_parse == '{') {
8502 if (*RExC_parse == '-') {
8506 if (hasbrace && !isDIGIT(*RExC_parse)) {
8507 if (isrel) RExC_parse--;
8509 goto parse_named_seq;
8511 num = atoi(RExC_parse);
8512 if (isg && num == 0)
8513 vFAIL("Reference to invalid group 0");
8515 num = RExC_npar - num;
8517 vFAIL("Reference to nonexistent or unclosed group");
8519 if (!isg && num > 9 && num >= RExC_npar)
8522 char * const parse_start = RExC_parse - 1; /* MJD */
8523 while (isDIGIT(*RExC_parse))
8525 if (parse_start == RExC_parse - 1)
8526 vFAIL("Unterminated \\g... pattern");
8528 if (*RExC_parse != '}')
8529 vFAIL("Unterminated \\g{...} pattern");
8533 if (num > (I32)RExC_rx->nparens)
8534 vFAIL("Reference to nonexistent group");
8537 ret = reganode(pRExC_state,
8540 : (MORE_ASCII_RESTRICTED)
8542 : (AT_LEAST_UNI_SEMANTICS)
8550 /* override incorrect value set in reganode MJD */
8551 Set_Node_Offset(ret, parse_start+1);
8552 Set_Node_Cur_Length(ret); /* MJD */
8554 nextchar(pRExC_state);
8559 if (RExC_parse >= RExC_end)
8560 FAIL("Trailing \\");
8563 /* Do not generate "unrecognized" warnings here, we fall
8564 back into the quick-grab loop below */
8571 if (RExC_flags & RXf_PMf_EXTENDED) {
8572 if ( reg_skipcomment( pRExC_state ) )
8579 register STRLEN len;
8584 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8585 regnode * orig_emit;
8587 parse_start = RExC_parse - 1;
8593 orig_emit = RExC_emit; /* Save the original output node position in
8594 case we need to output a different node
8596 ret = reg_node(pRExC_state,
8597 (U8) ((! FOLD) ? EXACT
8600 : (MORE_ASCII_RESTRICTED)
8602 : (AT_LEAST_UNI_SEMANTICS)
8607 for (len = 0, p = RExC_parse - 1;
8608 len < 127 && p < RExC_end;
8611 char * const oldp = p;
8613 if (RExC_flags & RXf_PMf_EXTENDED)
8614 p = regwhite( pRExC_state, p );
8616 case LATIN_SMALL_LETTER_SHARP_S:
8617 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8618 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8619 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8620 goto normal_default;
8630 /* Literal Escapes Switch
8632 This switch is meant to handle escape sequences that
8633 resolve to a literal character.
8635 Every escape sequence that represents something
8636 else, like an assertion or a char class, is handled
8637 in the switch marked 'Special Escapes' above in this
8638 routine, but also has an entry here as anything that
8639 isn't explicitly mentioned here will be treated as
8640 an unescaped equivalent literal.
8644 /* These are all the special escapes. */
8645 case LATIN_SMALL_LETTER_SHARP_S:
8646 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8647 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8648 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8649 goto normal_default;
8650 case 'A': /* Start assertion */
8651 case 'b': case 'B': /* Word-boundary assertion*/
8652 case 'C': /* Single char !DANGEROUS! */
8653 case 'd': case 'D': /* digit class */
8654 case 'g': case 'G': /* generic-backref, pos assertion */
8655 case 'h': case 'H': /* HORIZWS */
8656 case 'k': case 'K': /* named backref, keep marker */
8657 case 'N': /* named char sequence */
8658 case 'p': case 'P': /* Unicode property */
8659 case 'R': /* LNBREAK */
8660 case 's': case 'S': /* space class */
8661 case 'v': case 'V': /* VERTWS */
8662 case 'w': case 'W': /* word class */
8663 case 'X': /* eXtended Unicode "combining character sequence" */
8664 case 'z': case 'Z': /* End of line/string assertion */
8668 /* Anything after here is an escape that resolves to a
8669 literal. (Except digits, which may or may not)
8688 ender = ASCII_TO_NATIVE('\033');
8692 ender = ASCII_TO_NATIVE('\007');
8697 STRLEN brace_len = len;
8699 const char* error_msg;
8701 bool valid = grok_bslash_o(p,
8708 RExC_parse = p; /* going to die anyway; point
8709 to exact spot of failure */
8716 if (PL_encoding && ender < 0x100) {
8717 goto recode_encoding;
8726 char* const e = strchr(p, '}');
8730 vFAIL("Missing right brace on \\x{}");
8733 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8734 | PERL_SCAN_DISALLOW_PREFIX;
8735 STRLEN numlen = e - p - 1;
8736 ender = grok_hex(p + 1, &numlen, &flags, NULL);
8743 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8745 ender = grok_hex(p, &numlen, &flags, NULL);
8748 if (PL_encoding && ender < 0x100)
8749 goto recode_encoding;
8753 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8755 case '0': case '1': case '2': case '3':case '4':
8756 case '5': case '6': case '7': case '8':case '9':
8758 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8760 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8762 ender = grok_oct(p, &numlen, &flags, NULL);
8772 if (PL_encoding && ender < 0x100)
8773 goto recode_encoding;
8777 SV* enc = PL_encoding;
8778 ender = reg_recode((const char)(U8)ender, &enc);
8779 if (!enc && SIZE_ONLY)
8780 ckWARNreg(p, "Invalid escape in the specified encoding");
8786 FAIL("Trailing \\");
8789 if (!SIZE_ONLY&& isALPHA(*p)) {
8790 /* Include any { following the alpha to emphasize
8791 * that it could be part of an escape at some point
8793 int len = (*(p + 1) == '{') ? 2 : 1;
8794 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8796 goto normal_default;
8801 if (UTF8_IS_START(*p) && UTF) {
8803 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8804 &numlen, UTF8_ALLOW_DEFAULT);
8810 } /* End of switch on the literal */
8812 /* Certain characters are problematic because their folded
8813 * length is so different from their original length that it
8814 * isn't handleable by the optimizer. They are therefore not
8815 * placed in an EXACTish node; and are here handled specially.
8816 * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8817 * putting it in a special node keeps regexec from having to
8818 * deal with a non-utf8 multi-char fold */
8820 && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
8821 && is_TRICKYFOLD_cp(ender))
8823 /* If is in middle of outputting characters into an
8824 * EXACTish node, go output what we have so far, and
8825 * position the parse so that this will be called again
8833 /* Here we are ready to output our tricky fold
8834 * character. What's done is to pretend it's in a
8835 * [bracketed] class, and let the code that deals with
8836 * those handle it, as that code has all the
8837 * intelligence necessary. First save the current
8838 * parse state, get rid of the already allocated EXACT
8839 * node that the ANYOFV node will replace, and point
8840 * the parse to a buffer which we fill with the
8841 * character we want the regclass code to think is
8843 char* const oldregxend = RExC_end;
8845 RExC_emit = orig_emit;
8846 RExC_parse = tmpbuf;
8848 tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
8849 tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
8850 RExC_end = RExC_parse + 2;
8853 tmpbuf[0] = (char) ender;
8854 RExC_end = RExC_parse + 1;
8857 ret = regclass(pRExC_state,depth+1);
8859 /* Here, have parsed the buffer. Reset the parse to
8860 * the actual input, and return */
8861 RExC_end = oldregxend;
8864 Set_Node_Offset(ret, RExC_parse);
8865 Set_Node_Cur_Length(ret);
8866 nextchar(pRExC_state);
8867 *flagp |= HASWIDTH|SIMPLE;
8872 if ( RExC_flags & RXf_PMf_EXTENDED)
8873 p = regwhite( pRExC_state, p );
8875 /* Prime the casefolded buffer. Locale rules, which apply
8876 * only to code points < 256, aren't known until execution,
8877 * so for them, just output the original character using
8879 if (LOC && ender < 256) {
8880 if (UNI_IS_INVARIANT(ender)) {
8881 *tmpbuf = (U8) ender;
8884 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8885 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8889 else if (isASCII(ender)) { /* Note: Here can't also be LOC
8891 ender = toLOWER(ender);
8892 *tmpbuf = (U8) ender;
8895 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8897 /* Locale and /aa require more selectivity about the
8898 * fold, so are handled below. Otherwise, here, just
8900 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8903 /* Under locale rules or /aa we are not to mix,
8904 * respectively, ords < 256 or ASCII with non-. So
8905 * reject folds that mix them, using only the
8906 * non-folded code point. So do the fold to a
8907 * temporary, and inspect each character in it. */
8908 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8910 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8911 U8* e = s + foldlen;
8912 bool fold_ok = TRUE;
8916 || (LOC && (UTF8_IS_INVARIANT(*s)
8917 || UTF8_IS_DOWNGRADEABLE_START(*s))))
8925 Copy(trialbuf, tmpbuf, foldlen, U8);
8929 uvuni_to_utf8(tmpbuf, ender);
8930 foldlen = UNISKIP(ender);
8934 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8939 /* Emit all the Unicode characters. */
8941 for (foldbuf = tmpbuf;
8943 foldlen -= numlen) {
8944 ender = utf8_to_uvchr(foldbuf, &numlen);
8946 const STRLEN unilen = reguni(pRExC_state, ender, s);
8949 /* In EBCDIC the numlen
8950 * and unilen can differ. */
8952 if (numlen >= foldlen)
8956 break; /* "Can't happen." */
8960 const STRLEN unilen = reguni(pRExC_state, ender, s);
8969 REGC((char)ender, s++);
8975 /* Emit all the Unicode characters. */
8977 for (foldbuf = tmpbuf;
8979 foldlen -= numlen) {
8980 ender = utf8_to_uvchr(foldbuf, &numlen);
8982 const STRLEN unilen = reguni(pRExC_state, ender, s);
8985 /* In EBCDIC the numlen
8986 * and unilen can differ. */
8988 if (numlen >= foldlen)
8996 const STRLEN unilen = reguni(pRExC_state, ender, s);
9005 REGC((char)ender, s++);
9007 loopdone: /* Jumped to when encounters something that shouldn't be in
9010 Set_Node_Cur_Length(ret); /* MJD */
9011 nextchar(pRExC_state);
9013 /* len is STRLEN which is unsigned, need to copy to signed */
9016 vFAIL("Internal disaster");
9020 if (len == 1 && UNI_IS_INVARIANT(ender))
9024 RExC_size += STR_SZ(len);
9027 RExC_emit += STR_SZ(len);
9035 /* Jumped to when an unrecognized character set is encountered */
9037 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9042 S_regwhite( RExC_state_t *pRExC_state, char *p )
9044 const char *e = RExC_end;
9046 PERL_ARGS_ASSERT_REGWHITE;
9051 else if (*p == '#') {
9060 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9068 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9069 Character classes ([:foo:]) can also be negated ([:^foo:]).
9070 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9071 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9072 but trigger failures because they are currently unimplemented. */
9074 #define POSIXCC_DONE(c) ((c) == ':')
9075 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9076 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9079 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9082 I32 namedclass = OOB_NAMEDCLASS;
9084 PERL_ARGS_ASSERT_REGPPOSIXCC;
9086 if (value == '[' && RExC_parse + 1 < RExC_end &&
9087 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9088 POSIXCC(UCHARAT(RExC_parse))) {
9089 const char c = UCHARAT(RExC_parse);
9090 char* const s = RExC_parse++;
9092 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9094 if (RExC_parse == RExC_end)
9095 /* Grandfather lone [:, [=, [. */
9098 const char* const t = RExC_parse++; /* skip over the c */
9101 if (UCHARAT(RExC_parse) == ']') {
9102 const char *posixcc = s + 1;
9103 RExC_parse++; /* skip over the ending ] */
9106 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9107 const I32 skip = t - posixcc;
9109 /* Initially switch on the length of the name. */
9112 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9113 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9116 /* Names all of length 5. */
9117 /* alnum alpha ascii blank cntrl digit graph lower
9118 print punct space upper */
9119 /* Offset 4 gives the best switch position. */
9120 switch (posixcc[4]) {
9122 if (memEQ(posixcc, "alph", 4)) /* alpha */
9123 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9126 if (memEQ(posixcc, "spac", 4)) /* space */
9127 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9130 if (memEQ(posixcc, "grap", 4)) /* graph */
9131 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9134 if (memEQ(posixcc, "asci", 4)) /* ascii */
9135 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9138 if (memEQ(posixcc, "blan", 4)) /* blank */
9139 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9142 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9143 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9146 if (memEQ(posixcc, "alnu", 4)) /* alnum */
9147 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9150 if (memEQ(posixcc, "lowe", 4)) /* lower */
9151 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9152 else if (memEQ(posixcc, "uppe", 4)) /* upper */
9153 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9156 if (memEQ(posixcc, "digi", 4)) /* digit */
9157 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9158 else if (memEQ(posixcc, "prin", 4)) /* print */
9159 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9160 else if (memEQ(posixcc, "punc", 4)) /* punct */
9161 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9166 if (memEQ(posixcc, "xdigit", 6))
9167 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9171 if (namedclass == OOB_NAMEDCLASS)
9172 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9174 assert (posixcc[skip] == ':');
9175 assert (posixcc[skip+1] == ']');
9176 } else if (!SIZE_ONLY) {
9177 /* [[=foo=]] and [[.foo.]] are still future. */
9179 /* adjust RExC_parse so the warning shows after
9181 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9183 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9186 /* Maternal grandfather:
9187 * "[:" ending in ":" but not in ":]" */
9197 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9201 PERL_ARGS_ASSERT_CHECKPOSIXCC;
9203 if (POSIXCC(UCHARAT(RExC_parse))) {
9204 const char *s = RExC_parse;
9205 const char c = *s++;
9209 if (*s && c == *s && s[1] == ']') {
9211 "POSIX syntax [%c %c] belongs inside character classes",
9214 /* [[=foo=]] and [[.foo.]] are still future. */
9215 if (POSIXCC_NOTYET(c)) {
9216 /* adjust RExC_parse so the error shows after
9218 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9220 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9226 /* No locale test, and always Unicode semantics */
9227 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
9229 for (value = 0; value < 256; value++) \
9231 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9235 case ANYOF_N##NAME: \
9236 for (value = 0; value < 256; value++) \
9238 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9243 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9244 * there are two tests passed in, to use depending on that. There aren't any
9245 * cases where the label is different from the name, so no need for that
9247 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
9249 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
9250 else if (UNI_SEMANTICS) { \
9251 for (value = 0; value < 256; value++) { \
9252 if (TEST_8(value)) stored += \
9253 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9257 for (value = 0; value < 128; value++) { \
9258 if (TEST_7(UNI_TO_NATIVE(value))) stored += \
9259 set_regclass_bit(pRExC_state, ret, \
9260 (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9266 case ANYOF_N##NAME: \
9267 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
9268 else if (UNI_SEMANTICS) { \
9269 for (value = 0; value < 256; value++) { \
9270 if (! TEST_8(value)) stored += \
9271 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9275 for (value = 0; value < 128; value++) { \
9276 if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
9277 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9279 if (AT_LEAST_ASCII_RESTRICTED) { \
9280 for (value = 128; value < 256; value++) { \
9281 stored += set_regclass_bit( \
9282 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9284 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
9287 /* For a non-ut8 target string with DEPENDS semantics, all above \
9288 * ASCII Latin1 code points match the complement of any of the \
9289 * classes. But in utf8, they have their Unicode semantics, so \
9290 * can't just set them in the bitmap, or else regexec.c will think \
9291 * they matched when they shouldn't. */ \
9292 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
9300 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9303 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9304 * Locale folding is done at run-time, so this function should not be
9305 * called for nodes that are for locales.
9307 * This function sets the bit corresponding to the fold of the input
9308 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
9311 * It also knows about the characters that are in the bitmap that have
9312 * folds that are matchable only outside it, and sets the appropriate lists
9315 * It returns the number of bits that actually changed from 0 to 1 */
9320 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9322 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9325 /* It assumes the bit for 'value' has already been set */
9326 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9327 ANYOF_BITMAP_SET(node, fold);
9330 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9331 /* Certain Latin1 characters have matches outside the bitmap. To get
9332 * here, 'value' is one of those characters. None of these matches is
9333 * valid for ASCII characters under /aa, which have been excluded by
9334 * the 'if' above. The matches fall into three categories:
9335 * 1) They are singly folded-to or -from an above 255 character, as
9336 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9338 * 2) They are part of a multi-char fold with another character in the
9339 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9340 * 3) They are part of a multi-char fold with a character not in the
9341 * bitmap, such as various ligatures.
9342 * We aren't dealing fully with multi-char folds, except we do deal
9343 * with the pattern containing a character that has a multi-char fold
9344 * (not so much the inverse).
9345 * For types 1) and 3), the matches only happen when the target string
9346 * is utf8; that's not true for 2), and we set a flag for it.
9348 * The code below adds to the passed in inversion list the single fold
9349 * closures for 'value'. The values are hard-coded here so that an
9350 * innocent-looking character class, like /[ks]/i won't have to go out
9351 * to disk to find the possible matches. XXX It would be better to
9352 * generate these via regen, in case a new version of the Unicode
9353 * standard adds new mappings, though that is not really likely. */
9358 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9362 /* LATIN SMALL LETTER LONG S */
9363 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9366 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9367 GREEK_SMALL_LETTER_MU);
9368 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9369 GREEK_CAPITAL_LETTER_MU);
9371 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9372 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9374 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9375 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
9376 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9377 PL_fold_latin1[value]);
9380 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9381 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9382 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9384 case LATIN_SMALL_LETTER_SHARP_S:
9385 /* 0x1E9E is LATIN CAPITAL LETTER SHARP S */
9386 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x1E9E);
9388 /* Under /a, /d, and /u, this can match the two chars "ss" */
9389 if (! MORE_ASCII_RESTRICTED) {
9390 add_alternate(alternate_ptr, (U8 *) "ss", 2);
9392 /* And under /u or /a, it can match even if the target is
9394 if (AT_LEAST_UNI_SEMANTICS) {
9395 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9403 /* These all are targets of multi-character folds, which can
9404 * occur with only non-Latin1 characters in the fold, so they
9405 * can match if the target string isn't UTF-8 */
9406 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9414 /* These all are targets of multi-character folds, which occur
9415 * only with a non-Latin1 character as part of the fold, so
9416 * they can't match unless the target string is in UTF-8, so no
9417 * action here is necessary */
9420 /* Use deprecated warning to increase the chances of this
9422 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9426 else if (DEPENDS_SEMANTICS
9428 && PL_fold_latin1[value] != value)
9430 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9431 * folds only when the target string is in UTF-8. We add the fold
9432 * here to the list of things to match outside the bitmap, which
9433 * won't be looked at unless it is UTF8 (or else if something else
9434 * says to look even if not utf8, but those things better not happen
9435 * under DEPENDS semantics. */
9436 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
9443 PERL_STATIC_INLINE U8
9444 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9446 /* This inline function sets a bit in the bitmap if not already set, and if
9447 * appropriate, its fold, returning the number of bits that actually
9448 * changed from 0 to 1 */
9452 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9454 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
9458 ANYOF_BITMAP_SET(node, value);
9461 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
9462 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
9469 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9471 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9472 * alternate list, pointed to by 'alternate_ptr'. This is an array of
9473 * the multi-character folds of characters in the node */
9476 PERL_ARGS_ASSERT_ADD_ALTERNATE;
9478 if (! *alternate_ptr) {
9479 *alternate_ptr = newAV();
9481 sv = newSVpvn_utf8((char*)string, len, TRUE);
9482 av_push(*alternate_ptr, sv);
9487 parse a class specification and produce either an ANYOF node that
9488 matches the pattern or perhaps will be optimized into an EXACTish node
9489 instead. The node contains a bit map for the first 256 characters, with the
9490 corresponding bit set if that character is in the list. For characters
9491 above 255, a range list is used */
9494 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9497 register UV nextvalue;
9498 register IV prevvalue = OOB_UNICODE;
9499 register IV range = 0;
9500 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9501 register regnode *ret;
9504 char *rangebegin = NULL;
9505 bool need_class = 0;
9507 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9508 than just initialized. */
9511 /* code points this node matches that can't be stored in the bitmap */
9512 HV* nonbitmap = NULL;
9514 /* The items that are to match that aren't stored in the bitmap, but are a
9515 * result of things that are stored there. This is the fold closure of
9516 * such a character, either because it has DEPENDS semantics and shouldn't
9517 * be matched unless the target string is utf8, or is a code point that is
9518 * too large for the bit map, as for example, the fold of the MICRO SIGN is
9519 * above 255. This all is solely for performance reasons. By having this
9520 * code know the outside-the-bitmap folds that the bitmapped characters are
9521 * involved with, we don't have to go out to disk to find the list of
9522 * matches, unless the character class includes code points that aren't
9523 * storable in the bit map. That means that a character class with an 's'
9524 * in it, for example, doesn't need to go out to disk to find everything
9525 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
9526 * empty unless there is something whose fold we don't know about, and will
9527 * have to go out to the disk to find. */
9528 HV* l1_fold_invlist = NULL;
9530 /* List of multi-character folds that are matched by this node */
9531 AV* unicode_alternate = NULL;
9533 UV literal_endpoint = 0;
9535 UV stored = 0; /* how many chars stored in the bitmap */
9537 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9538 case we need to change the emitted regop to an EXACT. */
9539 const char * orig_parse = RExC_parse;
9540 GET_RE_DEBUG_FLAGS_DECL;
9542 PERL_ARGS_ASSERT_REGCLASS;
9544 PERL_UNUSED_ARG(depth);
9547 DEBUG_PARSE("clas");
9549 /* Assume we are going to generate an ANYOF node. */
9550 ret = reganode(pRExC_state, ANYOF, 0);
9554 ANYOF_FLAGS(ret) = 0;
9557 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
9561 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9565 RExC_size += ANYOF_SKIP;
9566 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9569 RExC_emit += ANYOF_SKIP;
9571 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9573 ANYOF_BITMAP_ZERO(ret);
9574 listsv = newSVpvs("# comment\n");
9575 initial_listsv_len = SvCUR(listsv);
9578 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9580 if (!SIZE_ONLY && POSIXCC(nextvalue))
9581 checkposixcc(pRExC_state);
9583 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9584 if (UCHARAT(RExC_parse) == ']')
9588 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9592 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9595 rangebegin = RExC_parse;
9597 value = utf8n_to_uvchr((U8*)RExC_parse,
9598 RExC_end - RExC_parse,
9599 &numlen, UTF8_ALLOW_DEFAULT);
9600 RExC_parse += numlen;
9603 value = UCHARAT(RExC_parse++);
9605 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9606 if (value == '[' && POSIXCC(nextvalue))
9607 namedclass = regpposixcc(pRExC_state, value);
9608 else if (value == '\\') {
9610 value = utf8n_to_uvchr((U8*)RExC_parse,
9611 RExC_end - RExC_parse,
9612 &numlen, UTF8_ALLOW_DEFAULT);
9613 RExC_parse += numlen;
9616 value = UCHARAT(RExC_parse++);
9617 /* Some compilers cannot handle switching on 64-bit integer
9618 * values, therefore value cannot be an UV. Yes, this will
9619 * be a problem later if we want switch on Unicode.
9620 * A similar issue a little bit later when switching on
9621 * namedclass. --jhi */
9622 switch ((I32)value) {
9623 case 'w': namedclass = ANYOF_ALNUM; break;
9624 case 'W': namedclass = ANYOF_NALNUM; break;
9625 case 's': namedclass = ANYOF_SPACE; break;
9626 case 'S': namedclass = ANYOF_NSPACE; break;
9627 case 'd': namedclass = ANYOF_DIGIT; break;
9628 case 'D': namedclass = ANYOF_NDIGIT; break;
9629 case 'v': namedclass = ANYOF_VERTWS; break;
9630 case 'V': namedclass = ANYOF_NVERTWS; break;
9631 case 'h': namedclass = ANYOF_HORIZWS; break;
9632 case 'H': namedclass = ANYOF_NHORIZWS; break;
9633 case 'N': /* Handle \N{NAME} in class */
9635 /* We only pay attention to the first char of
9636 multichar strings being returned. I kinda wonder
9637 if this makes sense as it does change the behaviour
9638 from earlier versions, OTOH that behaviour was broken
9640 UV v; /* value is register so we cant & it /grrr */
9641 if (reg_namedseq(pRExC_state, &v, NULL)) {
9651 if (RExC_parse >= RExC_end)
9652 vFAIL2("Empty \\%c{}", (U8)value);
9653 if (*RExC_parse == '{') {
9654 const U8 c = (U8)value;
9655 e = strchr(RExC_parse++, '}');
9657 vFAIL2("Missing right brace on \\%c{}", c);
9658 while (isSPACE(UCHARAT(RExC_parse)))
9660 if (e == RExC_parse)
9661 vFAIL2("Empty \\%c{}", c);
9663 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9671 if (UCHARAT(RExC_parse) == '^') {
9674 value = value == 'p' ? 'P' : 'p'; /* toggle */
9675 while (isSPACE(UCHARAT(RExC_parse))) {
9681 /* Add the property name to the list. If /i matching, give
9682 * a different name which consists of the normal name
9683 * sandwiched between two underscores and '_i'. The design
9684 * is discussed in the commit message for this. */
9685 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9686 (value=='p' ? '+' : '!'),
9695 /* The \p could match something in the Latin1 range, hence
9696 * something that isn't utf8 */
9697 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9698 namedclass = ANYOF_MAX; /* no official name, but it's named */
9700 /* \p means they want Unicode semantics */
9701 RExC_uni_semantics = 1;
9704 case 'n': value = '\n'; break;
9705 case 'r': value = '\r'; break;
9706 case 't': value = '\t'; break;
9707 case 'f': value = '\f'; break;
9708 case 'b': value = '\b'; break;
9709 case 'e': value = ASCII_TO_NATIVE('\033');break;
9710 case 'a': value = ASCII_TO_NATIVE('\007');break;
9712 RExC_parse--; /* function expects to be pointed at the 'o' */
9714 const char* error_msg;
9715 bool valid = grok_bslash_o(RExC_parse,
9720 RExC_parse += numlen;
9725 if (PL_encoding && value < 0x100) {
9726 goto recode_encoding;
9730 if (*RExC_parse == '{') {
9731 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9732 | PERL_SCAN_DISALLOW_PREFIX;
9733 char * const e = strchr(RExC_parse++, '}');
9735 vFAIL("Missing right brace on \\x{}");
9737 numlen = e - RExC_parse;
9738 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9742 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9744 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9745 RExC_parse += numlen;
9747 if (PL_encoding && value < 0x100)
9748 goto recode_encoding;
9751 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9753 case '0': case '1': case '2': case '3': case '4':
9754 case '5': case '6': case '7':
9756 /* Take 1-3 octal digits */
9757 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9759 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9760 RExC_parse += numlen;
9761 if (PL_encoding && value < 0x100)
9762 goto recode_encoding;
9767 SV* enc = PL_encoding;
9768 value = reg_recode((const char)(U8)value, &enc);
9769 if (!enc && SIZE_ONLY)
9770 ckWARNreg(RExC_parse,
9771 "Invalid escape in the specified encoding");
9775 /* Allow \_ to not give an error */
9776 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9777 ckWARN2reg(RExC_parse,
9778 "Unrecognized escape \\%c in character class passed through",
9783 } /* end of \blah */
9789 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9791 /* What matches in a locale is not known until runtime, so need to
9792 * (one time per class) allocate extra space to pass to regexec.
9793 * The space will contain a bit for each named class that is to be
9794 * matched against. This isn't needed for \p{} and pseudo-classes,
9795 * as they are not affected by locale, and hence are dealt with
9797 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9800 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9803 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9804 ANYOF_CLASS_ZERO(ret);
9806 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9809 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
9810 * literal, as is the character that began the false range, i.e.
9811 * the 'a' in the examples */
9815 RExC_parse >= rangebegin ?
9816 RExC_parse - rangebegin : 0;
9817 ckWARN4reg(RExC_parse,
9818 "False [] range \"%*.*s\"",
9822 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9823 if (prevvalue < 256) {
9825 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
9828 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
9832 range = 0; /* this was not a true range */
9838 const char *what = NULL;
9841 /* Possible truncation here but in some 64-bit environments
9842 * the compiler gets heartburn about switch on 64-bit values.
9843 * A similar issue a little earlier when switching on value.
9845 switch ((I32)namedclass) {
9847 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9848 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9849 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9850 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9851 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9852 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9853 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9854 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9855 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9856 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9857 /* \s, \w match all unicode if utf8. */
9858 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9859 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9860 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9861 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9862 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9865 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9867 for (value = 0; value < 128; value++)
9869 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9872 what = NULL; /* Doesn't match outside ascii, so
9873 don't want to add +utf8:: */
9877 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9879 for (value = 128; value < 256; value++)
9881 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9883 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9889 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9891 /* consecutive digits assumed */
9892 for (value = '0'; value <= '9'; value++)
9894 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9901 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9903 /* consecutive digits assumed */
9904 for (value = 0; value < '0'; value++)
9906 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9907 for (value = '9' + 1; value < 256; value++)
9909 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9913 if (AT_LEAST_ASCII_RESTRICTED ) {
9914 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9918 /* this is to handle \p and \P */
9921 vFAIL("Invalid [::] class");
9924 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9925 /* Strings such as "+utf8::isWord\n" */
9926 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9931 } /* end of namedclass \blah */
9934 if (prevvalue > (IV)value) /* b-a */ {
9935 const int w = RExC_parse - rangebegin;
9936 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9937 range = 0; /* not a valid range */
9941 prevvalue = value; /* save the beginning of the range */
9942 if (RExC_parse+1 < RExC_end
9943 && *RExC_parse == '-'
9944 && RExC_parse[1] != ']')
9948 /* a bad range like \w-, [:word:]- ? */
9949 if (namedclass > OOB_NAMEDCLASS) {
9950 if (ckWARN(WARN_REGEXP)) {
9952 RExC_parse >= rangebegin ?
9953 RExC_parse - rangebegin : 0;
9955 "False [] range \"%*.*s\"",
9960 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9962 range = 1; /* yeah, it's a range! */
9963 continue; /* but do it the next time */
9967 /* non-Latin1 code point implies unicode semantics. Must be set in
9968 * pass1 so is there for the whole of pass 2 */
9970 RExC_uni_semantics = 1;
9973 /* now is the next time */
9975 if (prevvalue < 256) {
9976 const IV ceilvalue = value < 256 ? value : 255;
9979 /* In EBCDIC [\x89-\x91] should include
9980 * the \x8e but [i-j] should not. */
9981 if (literal_endpoint == 2 &&
9982 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9983 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9985 if (isLOWER(prevvalue)) {
9986 for (i = prevvalue; i <= ceilvalue; i++)
9987 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9989 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9992 for (i = prevvalue; i <= ceilvalue; i++)
9993 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9995 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10001 for (i = prevvalue; i <= ceilvalue; i++) {
10002 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10006 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
10007 const UV natvalue = NATIVE_TO_UNI(value);
10008 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
10011 literal_endpoint = 0;
10015 range = 0; /* this range (if it was one) is done now */
10022 /****** !SIZE_ONLY AFTER HERE *********/
10024 /* If folding and there are code points above 255, we calculate all
10025 * characters that could fold to or from the ones already on the list */
10026 if (FOLD && nonbitmap) {
10029 HV* fold_intersection;
10032 /* This is a list of all the characters that participate in folds
10033 * (except marks, etc in multi-char folds */
10034 if (! PL_utf8_foldable) {
10035 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
10036 PL_utf8_foldable = _swash_to_invlist(swash);
10039 /* This is a hash that for a particular fold gives all characters
10040 * that are involved in it */
10041 if (! PL_utf8_foldclosures) {
10043 /* If we were unable to find any folds, then we likely won't be
10044 * able to find the closures. So just create an empty list.
10045 * Folding will effectively be restricted to the non-Unicode rules
10046 * hard-coded into Perl. (This case happens legitimately during
10047 * compilation of Perl itself before the Unicode tables are
10049 if (invlist_len(PL_utf8_foldable) == 0) {
10050 PL_utf8_foldclosures = _new_invlist(0);
10052 /* If the folds haven't been read in, call a fold function
10054 if (! PL_utf8_tofold) {
10055 U8 dummy[UTF8_MAXBYTES+1];
10057 to_utf8_fold((U8*) "A", dummy, &dummy_len);
10059 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10063 /* Only the characters in this class that participate in folds need
10064 * be checked. Get the intersection of this class and all the
10065 * possible characters that are foldable. This can quickly narrow
10066 * down a large class */
10067 fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
10069 /* Now look at the foldable characters in this class individually */
10070 fold_list = invlist_array(fold_intersection);
10071 for (i = 0; i < invlist_len(fold_intersection); i++) {
10074 /* The next entry is the beginning of the range that is in the
10076 UV start = fold_list[i++];
10079 /* The next entry is the beginning of the next range, which
10080 * isn't in the class, so the end of the current range is one
10081 * less than that */
10082 UV end = fold_list[i] - 1;
10084 /* Look at every character in the range */
10085 for (j = start; j <= end; j++) {
10088 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10090 const UV f = to_uni_fold(j, foldbuf, &foldlen);
10092 if (foldlen > (STRLEN)UNISKIP(f)) {
10094 /* Any multicharacter foldings (disallowed in
10095 * lookbehind patterns) require the following
10096 * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10097 * E folds into "pq" and F folds into "rst", all other
10098 * characters fold to single characters. We save away
10099 * these multicharacter foldings, to be later saved as
10100 * part of the additional "s" data. */
10101 if (! RExC_in_lookbehind) {
10103 U8* e = foldbuf + foldlen;
10105 /* If any of the folded characters of this are in
10106 * the Latin1 range, tell the regex engine that
10107 * this can match a non-utf8 target string. The
10108 * only multi-byte fold whose source is in the
10109 * Latin1 range (U+00DF) applies only when the
10110 * target string is utf8, or under unicode rules */
10111 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10114 /* Can't mix ascii with non- under /aa */
10115 if (MORE_ASCII_RESTRICTED
10116 && (isASCII(*loc) != isASCII(j)))
10118 goto end_multi_fold;
10120 if (UTF8_IS_INVARIANT(*loc)
10121 || UTF8_IS_DOWNGRADEABLE_START(*loc))
10123 /* Can't mix above and below 256 under
10126 goto end_multi_fold;
10129 |= ANYOF_NONBITMAP_NON_UTF8;
10132 loc += UTF8SKIP(loc);
10136 add_alternate(&unicode_alternate, foldbuf, foldlen);
10141 /* Single character fold. Add everything in its fold
10142 * closure to the list that this node should match */
10145 /* The fold closures data structure is a hash with the
10146 * keys being every character that is folded to, like
10147 * 'k', and the values each an array of everything that
10148 * folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
10149 if ((listp = hv_fetch(PL_utf8_foldclosures,
10150 (char *) foldbuf, foldlen, FALSE)))
10152 AV* list = (AV*) *listp;
10154 for (k = 0; k <= av_len(list); k++) {
10155 SV** c_p = av_fetch(list, k, FALSE);
10158 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10162 /* /aa doesn't allow folds between ASCII and
10163 * non-; /l doesn't allow them between above
10165 if ((MORE_ASCII_RESTRICTED
10166 && (isASCII(c) != isASCII(j)))
10167 || (LOC && ((c < 256) != (j < 256))))
10172 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10173 stored += set_regclass_bit(pRExC_state,
10176 &l1_fold_invlist, &unicode_alternate);
10178 /* It may be that the code point is already
10179 * in this range or already in the bitmap,
10180 * in which case we need do nothing */
10181 else if ((c < start || c > end)
10183 || ! ANYOF_BITMAP_TEST(ret, c)))
10185 nonbitmap = add_cp_to_invlist(nonbitmap, c);
10192 invlist_destroy(fold_intersection);
10195 /* Combine the two lists into one. */
10196 if (l1_fold_invlist) {
10198 nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
10201 nonbitmap = l1_fold_invlist;
10205 /* Here, we have calculated what code points should be in the character
10206 * class. Now we can see about various optimizations. Fold calculation
10207 * needs to take place before inversion. Otherwise /[^k]/i would invert to
10208 * include K, which under /i would match k. */
10210 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
10211 * set the FOLD flag yet, so this this does optimize those. It doesn't
10212 * optimize locale. Doing so perhaps could be done as long as there is
10213 * nothing like \w in it; some thought also would have to be given to the
10214 * interaction with above 0x100 chars */
10216 && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10217 && ! unicode_alternate
10219 && SvCUR(listsv) == initial_listsv_len)
10221 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10222 ANYOF_BITMAP(ret)[value] ^= 0xFF;
10223 stored = 256 - stored;
10225 /* The inversion means that everything above 255 is matched; and at the
10226 * same time we clear the invert flag */
10227 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
10230 /* Folding in the bitmap is taken care of above, but not for locale (for
10231 * which we have to wait to see what folding is in effect at runtime), and
10232 * for things not in the bitmap. Set run-time fold flag for these */
10233 if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
10234 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10237 /* A single character class can be "optimized" into an EXACTish node.
10238 * Note that since we don't currently count how many characters there are
10239 * outside the bitmap, we are XXX missing optimization possibilities for
10240 * them. This optimization can't happen unless this is a truly single
10241 * character class, which means that it can't be an inversion into a
10242 * many-character class, and there must be no possibility of there being
10243 * things outside the bitmap. 'stored' (only) for locales doesn't include
10244 * \w, etc, so have to make a special test that they aren't present
10246 * Similarly A 2-character class of the very special form like [bB] can be
10247 * optimized into an EXACTFish node, but only for non-locales, and for
10248 * characters which only have the two folds; so things like 'fF' and 'Ii'
10249 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10252 && ! unicode_alternate
10253 && SvCUR(listsv) == initial_listsv_len
10254 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
10255 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10256 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10257 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10258 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10259 /* If the latest code point has a fold whose
10260 * bit is set, it must be the only other one */
10261 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10262 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10264 /* Note that the information needed to decide to do this optimization
10265 * is not currently available until the 2nd pass, and that the actually
10266 * used EXACTish node takes less space than the calculated ANYOF node,
10267 * and hence the amount of space calculated in the first pass is larger
10268 * than actually used, so this optimization doesn't gain us any space.
10269 * But an EXACT node is faster than an ANYOF node, and can be combined
10270 * with any adjacent EXACT nodes later by the optimizer for further
10271 * gains. The speed of executing an EXACTF is similar to an ANYOF
10272 * node, so the optimization advantage comes from the ability to join
10273 * it to adjacent EXACT nodes */
10275 const char * cur_parse= RExC_parse;
10277 RExC_emit = (regnode *)orig_emit;
10278 RExC_parse = (char *)orig_parse;
10282 /* A locale node with one point can be folded; all the other cases
10283 * with folding will have two points, since we calculate them above
10285 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10291 } /* else 2 chars in the bit map: the folds of each other */
10292 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10294 /* To join adjacent nodes, they must be the exact EXACTish type.
10295 * Try to use the most likely type, by using EXACTFU if the regex
10296 * calls for them, or is required because the character is
10300 else { /* Otherwise, more likely to be EXACTF type */
10304 ret = reg_node(pRExC_state, op);
10305 RExC_parse = (char *)cur_parse;
10306 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10307 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10308 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10310 RExC_emit += STR_SZ(2);
10313 *STRING(ret)= (char)value;
10315 RExC_emit += STR_SZ(1);
10317 SvREFCNT_dec(listsv);
10322 UV* nonbitmap_array = invlist_array(nonbitmap);
10323 UV nonbitmap_len = invlist_len(nonbitmap);
10326 /* Here have the full list of items to match that aren't in the
10327 * bitmap. Convert to the structure that the rest of the code is
10328 * expecting. XXX That rest of the code should convert to this
10330 for (i = 0; i < nonbitmap_len; i++) {
10332 /* The next entry is the beginning of the range that is in the
10334 UV start = nonbitmap_array[i++];
10337 /* The next entry is the beginning of the next range, which isn't
10338 * in the class, so the end of the current range is one less than
10339 * that. But if there is no next range, it means that the range
10340 * begun by 'start' extends to infinity, which for this platform
10341 * ends at UV_MAX */
10342 if (i == nonbitmap_len) {
10346 end = nonbitmap_array[i] - 1;
10349 if (start == end) {
10350 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10353 /* The \t sets the whole range */
10354 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10359 invlist_destroy(nonbitmap);
10362 if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10363 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10364 SvREFCNT_dec(listsv);
10365 SvREFCNT_dec(unicode_alternate);
10369 AV * const av = newAV();
10371 /* The 0th element stores the character class description
10372 * in its textual form: used later (regexec.c:Perl_regclass_swash())
10373 * to initialize the appropriate swash (which gets stored in
10374 * the 1st element), and also useful for dumping the regnode.
10375 * The 2nd element stores the multicharacter foldings,
10376 * used later (regexec.c:S_reginclass()). */
10377 av_store(av, 0, listsv);
10378 av_store(av, 1, NULL);
10379 av_store(av, 2, MUTABLE_SV(unicode_alternate));
10380 if (unicode_alternate) { /* This node is variable length */
10383 rv = newRV_noinc(MUTABLE_SV(av));
10384 n = add_data(pRExC_state, 1, "s");
10385 RExC_rxi->data->data[n] = (void*)rv;
10393 /* reg_skipcomment()
10395 Absorbs an /x style # comments from the input stream.
10396 Returns true if there is more text remaining in the stream.
10397 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10398 terminates the pattern without including a newline.
10400 Note its the callers responsibility to ensure that we are
10401 actually in /x mode
10406 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10410 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10412 while (RExC_parse < RExC_end)
10413 if (*RExC_parse++ == '\n') {
10418 /* we ran off the end of the pattern without ending
10419 the comment, so we have to add an \n when wrapping */
10420 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10428 Advances the parse position, and optionally absorbs
10429 "whitespace" from the inputstream.
10431 Without /x "whitespace" means (?#...) style comments only,
10432 with /x this means (?#...) and # comments and whitespace proper.
10434 Returns the RExC_parse point from BEFORE the scan occurs.
10436 This is the /x friendly way of saying RExC_parse++.
10440 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10442 char* const retval = RExC_parse++;
10444 PERL_ARGS_ASSERT_NEXTCHAR;
10447 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10448 RExC_parse[2] == '#') {
10449 while (*RExC_parse != ')') {
10450 if (RExC_parse == RExC_end)
10451 FAIL("Sequence (?#... not terminated");
10457 if (RExC_flags & RXf_PMf_EXTENDED) {
10458 if (isSPACE(*RExC_parse)) {
10462 else if (*RExC_parse == '#') {
10463 if ( reg_skipcomment( pRExC_state ) )
10472 - reg_node - emit a node
10474 STATIC regnode * /* Location. */
10475 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10478 register regnode *ptr;
10479 regnode * const ret = RExC_emit;
10480 GET_RE_DEBUG_FLAGS_DECL;
10482 PERL_ARGS_ASSERT_REG_NODE;
10485 SIZE_ALIGN(RExC_size);
10489 if (RExC_emit >= RExC_emit_bound)
10490 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10492 NODE_ALIGN_FILL(ret);
10494 FILL_ADVANCE_NODE(ptr, op);
10495 #ifdef RE_TRACK_PATTERN_OFFSETS
10496 if (RExC_offsets) { /* MJD */
10497 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10498 "reg_node", __LINE__,
10500 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10501 ? "Overwriting end of array!\n" : "OK",
10502 (UV)(RExC_emit - RExC_emit_start),
10503 (UV)(RExC_parse - RExC_start),
10504 (UV)RExC_offsets[0]));
10505 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10513 - reganode - emit a node with an argument
10515 STATIC regnode * /* Location. */
10516 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10519 register regnode *ptr;
10520 regnode * const ret = RExC_emit;
10521 GET_RE_DEBUG_FLAGS_DECL;
10523 PERL_ARGS_ASSERT_REGANODE;
10526 SIZE_ALIGN(RExC_size);
10531 assert(2==regarglen[op]+1);
10533 Anything larger than this has to allocate the extra amount.
10534 If we changed this to be:
10536 RExC_size += (1 + regarglen[op]);
10538 then it wouldn't matter. Its not clear what side effect
10539 might come from that so its not done so far.
10544 if (RExC_emit >= RExC_emit_bound)
10545 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10547 NODE_ALIGN_FILL(ret);
10549 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10550 #ifdef RE_TRACK_PATTERN_OFFSETS
10551 if (RExC_offsets) { /* MJD */
10552 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10556 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10557 "Overwriting end of array!\n" : "OK",
10558 (UV)(RExC_emit - RExC_emit_start),
10559 (UV)(RExC_parse - RExC_start),
10560 (UV)RExC_offsets[0]));
10561 Set_Cur_Node_Offset;
10569 - reguni - emit (if appropriate) a Unicode character
10572 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10576 PERL_ARGS_ASSERT_REGUNI;
10578 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10582 - reginsert - insert an operator in front of already-emitted operand
10584 * Means relocating the operand.
10587 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10590 register regnode *src;
10591 register regnode *dst;
10592 register regnode *place;
10593 const int offset = regarglen[(U8)op];
10594 const int size = NODE_STEP_REGNODE + offset;
10595 GET_RE_DEBUG_FLAGS_DECL;
10597 PERL_ARGS_ASSERT_REGINSERT;
10598 PERL_UNUSED_ARG(depth);
10599 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10600 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10609 if (RExC_open_parens) {
10611 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10612 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10613 if ( RExC_open_parens[paren] >= opnd ) {
10614 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10615 RExC_open_parens[paren] += size;
10617 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10619 if ( RExC_close_parens[paren] >= opnd ) {
10620 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10621 RExC_close_parens[paren] += size;
10623 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10628 while (src > opnd) {
10629 StructCopy(--src, --dst, regnode);
10630 #ifdef RE_TRACK_PATTERN_OFFSETS
10631 if (RExC_offsets) { /* MJD 20010112 */
10632 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10636 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10637 ? "Overwriting end of array!\n" : "OK",
10638 (UV)(src - RExC_emit_start),
10639 (UV)(dst - RExC_emit_start),
10640 (UV)RExC_offsets[0]));
10641 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10642 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10648 place = opnd; /* Op node, where operand used to be. */
10649 #ifdef RE_TRACK_PATTERN_OFFSETS
10650 if (RExC_offsets) { /* MJD */
10651 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10655 (UV)(place - RExC_emit_start) > RExC_offsets[0]
10656 ? "Overwriting end of array!\n" : "OK",
10657 (UV)(place - RExC_emit_start),
10658 (UV)(RExC_parse - RExC_start),
10659 (UV)RExC_offsets[0]));
10660 Set_Node_Offset(place, RExC_parse);
10661 Set_Node_Length(place, 1);
10664 src = NEXTOPER(place);
10665 FILL_ADVANCE_NODE(place, op);
10666 Zero(src, offset, regnode);
10670 - regtail - set the next-pointer at the end of a node chain of p to val.
10671 - SEE ALSO: regtail_study
10673 /* TODO: All three parms should be const */
10675 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10678 register regnode *scan;
10679 GET_RE_DEBUG_FLAGS_DECL;
10681 PERL_ARGS_ASSERT_REGTAIL;
10683 PERL_UNUSED_ARG(depth);
10689 /* Find last node. */
10692 regnode * const temp = regnext(scan);
10694 SV * const mysv=sv_newmortal();
10695 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10696 regprop(RExC_rx, mysv, scan);
10697 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10698 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10699 (temp == NULL ? "->" : ""),
10700 (temp == NULL ? PL_reg_name[OP(val)] : "")
10708 if (reg_off_by_arg[OP(scan)]) {
10709 ARG_SET(scan, val - scan);
10712 NEXT_OFF(scan) = val - scan;
10718 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10719 - Look for optimizable sequences at the same time.
10720 - currently only looks for EXACT chains.
10722 This is experimental code. The idea is to use this routine to perform
10723 in place optimizations on branches and groups as they are constructed,
10724 with the long term intention of removing optimization from study_chunk so
10725 that it is purely analytical.
10727 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10728 to control which is which.
10731 /* TODO: All four parms should be const */
10734 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10737 register regnode *scan;
10739 #ifdef EXPERIMENTAL_INPLACESCAN
10742 GET_RE_DEBUG_FLAGS_DECL;
10744 PERL_ARGS_ASSERT_REGTAIL_STUDY;
10750 /* Find last node. */
10754 regnode * const temp = regnext(scan);
10755 #ifdef EXPERIMENTAL_INPLACESCAN
10756 if (PL_regkind[OP(scan)] == EXACT)
10757 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10761 switch (OP(scan)) {
10767 if( exact == PSEUDO )
10769 else if ( exact != OP(scan) )
10778 SV * const mysv=sv_newmortal();
10779 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10780 regprop(RExC_rx, mysv, scan);
10781 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10782 SvPV_nolen_const(mysv),
10783 REG_NODE_NUM(scan),
10784 PL_reg_name[exact]);
10791 SV * const mysv_val=sv_newmortal();
10792 DEBUG_PARSE_MSG("");
10793 regprop(RExC_rx, mysv_val, val);
10794 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10795 SvPV_nolen_const(mysv_val),
10796 (IV)REG_NODE_NUM(val),
10800 if (reg_off_by_arg[OP(scan)]) {
10801 ARG_SET(scan, val - scan);
10804 NEXT_OFF(scan) = val - scan;
10812 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10816 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10822 for (bit=0; bit<32; bit++) {
10823 if (flags & (1<<bit)) {
10824 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
10827 if (!set++ && lead)
10828 PerlIO_printf(Perl_debug_log, "%s",lead);
10829 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10832 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10833 if (!set++ && lead) {
10834 PerlIO_printf(Perl_debug_log, "%s",lead);
10837 case REGEX_UNICODE_CHARSET:
10838 PerlIO_printf(Perl_debug_log, "UNICODE");
10840 case REGEX_LOCALE_CHARSET:
10841 PerlIO_printf(Perl_debug_log, "LOCALE");
10843 case REGEX_ASCII_RESTRICTED_CHARSET:
10844 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10846 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10847 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10850 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10856 PerlIO_printf(Perl_debug_log, "\n");
10858 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10864 Perl_regdump(pTHX_ const regexp *r)
10868 SV * const sv = sv_newmortal();
10869 SV *dsv= sv_newmortal();
10870 RXi_GET_DECL(r,ri);
10871 GET_RE_DEBUG_FLAGS_DECL;
10873 PERL_ARGS_ASSERT_REGDUMP;
10875 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10877 /* Header fields of interest. */
10878 if (r->anchored_substr) {
10879 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10880 RE_SV_DUMPLEN(r->anchored_substr), 30);
10881 PerlIO_printf(Perl_debug_log,
10882 "anchored %s%s at %"IVdf" ",
10883 s, RE_SV_TAIL(r->anchored_substr),
10884 (IV)r->anchored_offset);
10885 } else if (r->anchored_utf8) {
10886 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10887 RE_SV_DUMPLEN(r->anchored_utf8), 30);
10888 PerlIO_printf(Perl_debug_log,
10889 "anchored utf8 %s%s at %"IVdf" ",
10890 s, RE_SV_TAIL(r->anchored_utf8),
10891 (IV)r->anchored_offset);
10893 if (r->float_substr) {
10894 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10895 RE_SV_DUMPLEN(r->float_substr), 30);
10896 PerlIO_printf(Perl_debug_log,
10897 "floating %s%s at %"IVdf"..%"UVuf" ",
10898 s, RE_SV_TAIL(r->float_substr),
10899 (IV)r->float_min_offset, (UV)r->float_max_offset);
10900 } else if (r->float_utf8) {
10901 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10902 RE_SV_DUMPLEN(r->float_utf8), 30);
10903 PerlIO_printf(Perl_debug_log,
10904 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10905 s, RE_SV_TAIL(r->float_utf8),
10906 (IV)r->float_min_offset, (UV)r->float_max_offset);
10908 if (r->check_substr || r->check_utf8)
10909 PerlIO_printf(Perl_debug_log,
10911 (r->check_substr == r->float_substr
10912 && r->check_utf8 == r->float_utf8
10913 ? "(checking floating" : "(checking anchored"));
10914 if (r->extflags & RXf_NOSCAN)
10915 PerlIO_printf(Perl_debug_log, " noscan");
10916 if (r->extflags & RXf_CHECK_ALL)
10917 PerlIO_printf(Perl_debug_log, " isall");
10918 if (r->check_substr || r->check_utf8)
10919 PerlIO_printf(Perl_debug_log, ") ");
10921 if (ri->regstclass) {
10922 regprop(r, sv, ri->regstclass);
10923 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10925 if (r->extflags & RXf_ANCH) {
10926 PerlIO_printf(Perl_debug_log, "anchored");
10927 if (r->extflags & RXf_ANCH_BOL)
10928 PerlIO_printf(Perl_debug_log, "(BOL)");
10929 if (r->extflags & RXf_ANCH_MBOL)
10930 PerlIO_printf(Perl_debug_log, "(MBOL)");
10931 if (r->extflags & RXf_ANCH_SBOL)
10932 PerlIO_printf(Perl_debug_log, "(SBOL)");
10933 if (r->extflags & RXf_ANCH_GPOS)
10934 PerlIO_printf(Perl_debug_log, "(GPOS)");
10935 PerlIO_putc(Perl_debug_log, ' ');
10937 if (r->extflags & RXf_GPOS_SEEN)
10938 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10939 if (r->intflags & PREGf_SKIP)
10940 PerlIO_printf(Perl_debug_log, "plus ");
10941 if (r->intflags & PREGf_IMPLICIT)
10942 PerlIO_printf(Perl_debug_log, "implicit ");
10943 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10944 if (r->extflags & RXf_EVAL_SEEN)
10945 PerlIO_printf(Perl_debug_log, "with eval ");
10946 PerlIO_printf(Perl_debug_log, "\n");
10947 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10949 PERL_ARGS_ASSERT_REGDUMP;
10950 PERL_UNUSED_CONTEXT;
10951 PERL_UNUSED_ARG(r);
10952 #endif /* DEBUGGING */
10956 - regprop - printable representation of opcode
10958 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10961 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10962 if (flags & ANYOF_INVERT) \
10963 /*make sure the invert info is in each */ \
10964 sv_catpvs(sv, "^"); \
10970 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10975 RXi_GET_DECL(prog,progi);
10976 GET_RE_DEBUG_FLAGS_DECL;
10978 PERL_ARGS_ASSERT_REGPROP;
10982 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
10983 /* It would be nice to FAIL() here, but this may be called from
10984 regexec.c, and it would be hard to supply pRExC_state. */
10985 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
10986 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
10988 k = PL_regkind[OP(o)];
10991 sv_catpvs(sv, " ");
10992 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10993 * is a crude hack but it may be the best for now since
10994 * we have no flag "this EXACTish node was UTF-8"
10996 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
10997 PERL_PV_ESCAPE_UNI_DETECT |
10998 PERL_PV_ESCAPE_NONASCII |
10999 PERL_PV_PRETTY_ELLIPSES |
11000 PERL_PV_PRETTY_LTGT |
11001 PERL_PV_PRETTY_NOCLEAR
11003 } else if (k == TRIE) {
11004 /* print the details of the trie in dumpuntil instead, as
11005 * progi->data isn't available here */
11006 const char op = OP(o);
11007 const U32 n = ARG(o);
11008 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
11009 (reg_ac_data *)progi->data->data[n] :
11011 const reg_trie_data * const trie
11012 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
11014 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
11015 DEBUG_TRIE_COMPILE_r(
11016 Perl_sv_catpvf(aTHX_ sv,
11017 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
11018 (UV)trie->startstate,
11019 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
11020 (UV)trie->wordcount,
11023 (UV)TRIE_CHARCOUNT(trie),
11024 (UV)trie->uniquecharcount
11027 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
11029 int rangestart = -1;
11030 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
11031 sv_catpvs(sv, "[");
11032 for (i = 0; i <= 256; i++) {
11033 if (i < 256 && BITMAP_TEST(bitmap,i)) {
11034 if (rangestart == -1)
11036 } else if (rangestart != -1) {
11037 if (i <= rangestart + 3)
11038 for (; rangestart < i; rangestart++)
11039 put_byte(sv, rangestart);
11041 put_byte(sv, rangestart);
11042 sv_catpvs(sv, "-");
11043 put_byte(sv, i - 1);
11048 sv_catpvs(sv, "]");
11051 } else if (k == CURLY) {
11052 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
11053 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
11054 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
11056 else if (k == WHILEM && o->flags) /* Ordinal/of */
11057 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
11058 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
11059 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
11060 if ( RXp_PAREN_NAMES(prog) ) {
11061 if ( k != REF || (OP(o) < NREF)) {
11062 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
11063 SV **name= av_fetch(list, ARG(o), 0 );
11065 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11068 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
11069 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
11070 I32 *nums=(I32*)SvPVX(sv_dat);
11071 SV **name= av_fetch(list, nums[0], 0 );
11074 for ( n=0; n<SvIVX(sv_dat); n++ ) {
11075 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
11076 (n ? "," : ""), (IV)nums[n]);
11078 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11082 } else if (k == GOSUB)
11083 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
11084 else if (k == VERB) {
11086 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
11087 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
11088 } else if (k == LOGICAL)
11089 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
11090 else if (k == FOLDCHAR)
11091 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
11092 else if (k == ANYOF) {
11093 int i, rangestart = -1;
11094 const U8 flags = ANYOF_FLAGS(o);
11097 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11098 static const char * const anyofs[] = {
11131 if (flags & ANYOF_LOCALE)
11132 sv_catpvs(sv, "{loc}");
11133 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
11134 sv_catpvs(sv, "{i}");
11135 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
11136 if (flags & ANYOF_INVERT)
11137 sv_catpvs(sv, "^");
11139 /* output what the standard cp 0-255 bitmap matches */
11140 for (i = 0; i <= 256; i++) {
11141 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11142 if (rangestart == -1)
11144 } else if (rangestart != -1) {
11145 if (i <= rangestart + 3)
11146 for (; rangestart < i; rangestart++)
11147 put_byte(sv, rangestart);
11149 put_byte(sv, rangestart);
11150 sv_catpvs(sv, "-");
11151 put_byte(sv, i - 1);
11158 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11159 /* output any special charclass tests (used entirely under use locale) */
11160 if (ANYOF_CLASS_TEST_ANY_SET(o))
11161 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11162 if (ANYOF_CLASS_TEST(o,i)) {
11163 sv_catpv(sv, anyofs[i]);
11167 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11169 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11170 sv_catpvs(sv, "{non-utf8-latin1-all}");
11173 /* output information about the unicode matching */
11174 if (flags & ANYOF_UNICODE_ALL)
11175 sv_catpvs(sv, "{unicode_all}");
11176 else if (ANYOF_NONBITMAP(o))
11177 sv_catpvs(sv, "{unicode}");
11178 if (flags & ANYOF_NONBITMAP_NON_UTF8)
11179 sv_catpvs(sv, "{outside bitmap}");
11181 if (ANYOF_NONBITMAP(o)) {
11183 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11187 U8 s[UTF8_MAXBYTES_CASE+1];
11189 for (i = 0; i <= 256; i++) { /* just the first 256 */
11190 uvchr_to_utf8(s, i);
11192 if (i < 256 && swash_fetch(sw, s, TRUE)) {
11193 if (rangestart == -1)
11195 } else if (rangestart != -1) {
11196 if (i <= rangestart + 3)
11197 for (; rangestart < i; rangestart++) {
11198 const U8 * const e = uvchr_to_utf8(s,rangestart);
11200 for(p = s; p < e; p++)
11204 const U8 *e = uvchr_to_utf8(s,rangestart);
11206 for (p = s; p < e; p++)
11208 sv_catpvs(sv, "-");
11209 e = uvchr_to_utf8(s, i-1);
11210 for (p = s; p < e; p++)
11217 sv_catpvs(sv, "..."); /* et cetera */
11221 char *s = savesvpv(lv);
11222 char * const origs = s;
11224 while (*s && *s != '\n')
11228 const char * const t = ++s;
11246 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11248 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11249 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11251 PERL_UNUSED_CONTEXT;
11252 PERL_UNUSED_ARG(sv);
11253 PERL_UNUSED_ARG(o);
11254 PERL_UNUSED_ARG(prog);
11255 #endif /* DEBUGGING */
11259 Perl_re_intuit_string(pTHX_ REGEXP * const r)
11260 { /* Assume that RE_INTUIT is set */
11262 struct regexp *const prog = (struct regexp *)SvANY(r);
11263 GET_RE_DEBUG_FLAGS_DECL;
11265 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11266 PERL_UNUSED_CONTEXT;
11270 const char * const s = SvPV_nolen_const(prog->check_substr
11271 ? prog->check_substr : prog->check_utf8);
11273 if (!PL_colorset) reginitcolors();
11274 PerlIO_printf(Perl_debug_log,
11275 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11277 prog->check_substr ? "" : "utf8 ",
11278 PL_colors[5],PL_colors[0],
11281 (strlen(s) > 60 ? "..." : ""));
11284 return prog->check_substr ? prog->check_substr : prog->check_utf8;
11290 handles refcounting and freeing the perl core regexp structure. When
11291 it is necessary to actually free the structure the first thing it
11292 does is call the 'free' method of the regexp_engine associated to
11293 the regexp, allowing the handling of the void *pprivate; member
11294 first. (This routine is not overridable by extensions, which is why
11295 the extensions free is called first.)
11297 See regdupe and regdupe_internal if you change anything here.
11299 #ifndef PERL_IN_XSUB_RE
11301 Perl_pregfree(pTHX_ REGEXP *r)
11307 Perl_pregfree2(pTHX_ REGEXP *rx)
11310 struct regexp *const r = (struct regexp *)SvANY(rx);
11311 GET_RE_DEBUG_FLAGS_DECL;
11313 PERL_ARGS_ASSERT_PREGFREE2;
11315 if (r->mother_re) {
11316 ReREFCNT_dec(r->mother_re);
11318 CALLREGFREE_PVT(rx); /* free the private data */
11319 SvREFCNT_dec(RXp_PAREN_NAMES(r));
11322 SvREFCNT_dec(r->anchored_substr);
11323 SvREFCNT_dec(r->anchored_utf8);
11324 SvREFCNT_dec(r->float_substr);
11325 SvREFCNT_dec(r->float_utf8);
11326 Safefree(r->substrs);
11328 RX_MATCH_COPY_FREE(rx);
11329 #ifdef PERL_OLD_COPY_ON_WRITE
11330 SvREFCNT_dec(r->saved_copy);
11337 This is a hacky workaround to the structural issue of match results
11338 being stored in the regexp structure which is in turn stored in
11339 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11340 could be PL_curpm in multiple contexts, and could require multiple
11341 result sets being associated with the pattern simultaneously, such
11342 as when doing a recursive match with (??{$qr})
11344 The solution is to make a lightweight copy of the regexp structure
11345 when a qr// is returned from the code executed by (??{$qr}) this
11346 lightweight copy doesn't actually own any of its data except for
11347 the starp/end and the actual regexp structure itself.
11353 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11355 struct regexp *ret;
11356 struct regexp *const r = (struct regexp *)SvANY(rx);
11357 register const I32 npar = r->nparens+1;
11359 PERL_ARGS_ASSERT_REG_TEMP_COPY;
11362 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11363 ret = (struct regexp *)SvANY(ret_x);
11365 (void)ReREFCNT_inc(rx);
11366 /* We can take advantage of the existing "copied buffer" mechanism in SVs
11367 by pointing directly at the buffer, but flagging that the allocated
11368 space in the copy is zero. As we've just done a struct copy, it's now
11369 a case of zero-ing that, rather than copying the current length. */
11370 SvPV_set(ret_x, RX_WRAPPED(rx));
11371 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11372 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11373 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11374 SvLEN_set(ret_x, 0);
11375 SvSTASH_set(ret_x, NULL);
11376 SvMAGIC_set(ret_x, NULL);
11377 Newx(ret->offs, npar, regexp_paren_pair);
11378 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11380 Newx(ret->substrs, 1, struct reg_substr_data);
11381 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11383 SvREFCNT_inc_void(ret->anchored_substr);
11384 SvREFCNT_inc_void(ret->anchored_utf8);
11385 SvREFCNT_inc_void(ret->float_substr);
11386 SvREFCNT_inc_void(ret->float_utf8);
11388 /* check_substr and check_utf8, if non-NULL, point to either their
11389 anchored or float namesakes, and don't hold a second reference. */
11391 RX_MATCH_COPIED_off(ret_x);
11392 #ifdef PERL_OLD_COPY_ON_WRITE
11393 ret->saved_copy = NULL;
11395 ret->mother_re = rx;
11401 /* regfree_internal()
11403 Free the private data in a regexp. This is overloadable by
11404 extensions. Perl takes care of the regexp structure in pregfree(),
11405 this covers the *pprivate pointer which technically perl doesn't
11406 know about, however of course we have to handle the
11407 regexp_internal structure when no extension is in use.
11409 Note this is called before freeing anything in the regexp
11414 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11417 struct regexp *const r = (struct regexp *)SvANY(rx);
11418 RXi_GET_DECL(r,ri);
11419 GET_RE_DEBUG_FLAGS_DECL;
11421 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11427 SV *dsv= sv_newmortal();
11428 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11429 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11430 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11431 PL_colors[4],PL_colors[5],s);
11434 #ifdef RE_TRACK_PATTERN_OFFSETS
11436 Safefree(ri->u.offsets); /* 20010421 MJD */
11439 int n = ri->data->count;
11440 PAD* new_comppad = NULL;
11445 /* If you add a ->what type here, update the comment in regcomp.h */
11446 switch (ri->data->what[n]) {
11451 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11454 Safefree(ri->data->data[n]);
11457 new_comppad = MUTABLE_AV(ri->data->data[n]);
11460 if (new_comppad == NULL)
11461 Perl_croak(aTHX_ "panic: pregfree comppad");
11462 PAD_SAVE_LOCAL(old_comppad,
11463 /* Watch out for global destruction's random ordering. */
11464 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11467 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11470 op_free((OP_4tree*)ri->data->data[n]);
11472 PAD_RESTORE_LOCAL(old_comppad);
11473 SvREFCNT_dec(MUTABLE_SV(new_comppad));
11474 new_comppad = NULL;
11479 { /* Aho Corasick add-on structure for a trie node.
11480 Used in stclass optimization only */
11482 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11484 refcount = --aho->refcount;
11487 PerlMemShared_free(aho->states);
11488 PerlMemShared_free(aho->fail);
11489 /* do this last!!!! */
11490 PerlMemShared_free(ri->data->data[n]);
11491 PerlMemShared_free(ri->regstclass);
11497 /* trie structure. */
11499 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11501 refcount = --trie->refcount;
11504 PerlMemShared_free(trie->charmap);
11505 PerlMemShared_free(trie->states);
11506 PerlMemShared_free(trie->trans);
11508 PerlMemShared_free(trie->bitmap);
11510 PerlMemShared_free(trie->jump);
11511 PerlMemShared_free(trie->wordinfo);
11512 /* do this last!!!! */
11513 PerlMemShared_free(ri->data->data[n]);
11518 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11521 Safefree(ri->data->what);
11522 Safefree(ri->data);
11528 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11529 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11530 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
11533 re_dup - duplicate a regexp.
11535 This routine is expected to clone a given regexp structure. It is only
11536 compiled under USE_ITHREADS.
11538 After all of the core data stored in struct regexp is duplicated
11539 the regexp_engine.dupe method is used to copy any private data
11540 stored in the *pprivate pointer. This allows extensions to handle
11541 any duplication it needs to do.
11543 See pregfree() and regfree_internal() if you change anything here.
11545 #if defined(USE_ITHREADS)
11546 #ifndef PERL_IN_XSUB_RE
11548 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11552 const struct regexp *r = (const struct regexp *)SvANY(sstr);
11553 struct regexp *ret = (struct regexp *)SvANY(dstr);
11555 PERL_ARGS_ASSERT_RE_DUP_GUTS;
11557 npar = r->nparens+1;
11558 Newx(ret->offs, npar, regexp_paren_pair);
11559 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11561 /* no need to copy these */
11562 Newx(ret->swap, npar, regexp_paren_pair);
11565 if (ret->substrs) {
11566 /* Do it this way to avoid reading from *r after the StructCopy().
11567 That way, if any of the sv_dup_inc()s dislodge *r from the L1
11568 cache, it doesn't matter. */
11569 const bool anchored = r->check_substr
11570 ? r->check_substr == r->anchored_substr
11571 : r->check_utf8 == r->anchored_utf8;
11572 Newx(ret->substrs, 1, struct reg_substr_data);
11573 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11575 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11576 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11577 ret->float_substr = sv_dup_inc(ret->float_substr, param);
11578 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11580 /* check_substr and check_utf8, if non-NULL, point to either their
11581 anchored or float namesakes, and don't hold a second reference. */
11583 if (ret->check_substr) {
11585 assert(r->check_utf8 == r->anchored_utf8);
11586 ret->check_substr = ret->anchored_substr;
11587 ret->check_utf8 = ret->anchored_utf8;
11589 assert(r->check_substr == r->float_substr);
11590 assert(r->check_utf8 == r->float_utf8);
11591 ret->check_substr = ret->float_substr;
11592 ret->check_utf8 = ret->float_utf8;
11594 } else if (ret->check_utf8) {
11596 ret->check_utf8 = ret->anchored_utf8;
11598 ret->check_utf8 = ret->float_utf8;
11603 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11606 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11608 if (RX_MATCH_COPIED(dstr))
11609 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
11611 ret->subbeg = NULL;
11612 #ifdef PERL_OLD_COPY_ON_WRITE
11613 ret->saved_copy = NULL;
11616 if (ret->mother_re) {
11617 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11618 /* Our storage points directly to our mother regexp, but that's
11619 1: a buffer in a different thread
11620 2: something we no longer hold a reference on
11621 so we need to copy it locally. */
11622 /* Note we need to sue SvCUR() on our mother_re, because it, in
11623 turn, may well be pointing to its own mother_re. */
11624 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11625 SvCUR(ret->mother_re)+1));
11626 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11628 ret->mother_re = NULL;
11632 #endif /* PERL_IN_XSUB_RE */
11637 This is the internal complement to regdupe() which is used to copy
11638 the structure pointed to by the *pprivate pointer in the regexp.
11639 This is the core version of the extension overridable cloning hook.
11640 The regexp structure being duplicated will be copied by perl prior
11641 to this and will be provided as the regexp *r argument, however
11642 with the /old/ structures pprivate pointer value. Thus this routine
11643 may override any copying normally done by perl.
11645 It returns a pointer to the new regexp_internal structure.
11649 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11652 struct regexp *const r = (struct regexp *)SvANY(rx);
11653 regexp_internal *reti;
11655 RXi_GET_DECL(r,ri);
11657 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11659 npar = r->nparens+1;
11662 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11663 Copy(ri->program, reti->program, len+1, regnode);
11666 reti->regstclass = NULL;
11669 struct reg_data *d;
11670 const int count = ri->data->count;
11673 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11674 char, struct reg_data);
11675 Newx(d->what, count, U8);
11678 for (i = 0; i < count; i++) {
11679 d->what[i] = ri->data->what[i];
11680 switch (d->what[i]) {
11681 /* legal options are one of: sSfpontTua
11682 see also regcomp.h and pregfree() */
11683 case 'a': /* actually an AV, but the dup function is identical. */
11686 case 'p': /* actually an AV, but the dup function is identical. */
11687 case 'u': /* actually an HV, but the dup function is identical. */
11688 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11691 /* This is cheating. */
11692 Newx(d->data[i], 1, struct regnode_charclass_class);
11693 StructCopy(ri->data->data[i], d->data[i],
11694 struct regnode_charclass_class);
11695 reti->regstclass = (regnode*)d->data[i];
11698 /* Compiled op trees are readonly and in shared memory,
11699 and can thus be shared without duplication. */
11701 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11705 /* Trie stclasses are readonly and can thus be shared
11706 * without duplication. We free the stclass in pregfree
11707 * when the corresponding reg_ac_data struct is freed.
11709 reti->regstclass= ri->regstclass;
11713 ((reg_trie_data*)ri->data->data[i])->refcount++;
11717 d->data[i] = ri->data->data[i];
11720 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11729 reti->name_list_idx = ri->name_list_idx;
11731 #ifdef RE_TRACK_PATTERN_OFFSETS
11732 if (ri->u.offsets) {
11733 Newx(reti->u.offsets, 2*len+1, U32);
11734 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11737 SetProgLen(reti,len);
11740 return (void*)reti;
11743 #endif /* USE_ITHREADS */
11745 #ifndef PERL_IN_XSUB_RE
11748 - regnext - dig the "next" pointer out of a node
11751 Perl_regnext(pTHX_ register regnode *p)
11754 register I32 offset;
11759 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
11760 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11763 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11772 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11775 STRLEN l1 = strlen(pat1);
11776 STRLEN l2 = strlen(pat2);
11779 const char *message;
11781 PERL_ARGS_ASSERT_RE_CROAK2;
11787 Copy(pat1, buf, l1 , char);
11788 Copy(pat2, buf + l1, l2 , char);
11789 buf[l1 + l2] = '\n';
11790 buf[l1 + l2 + 1] = '\0';
11792 /* ANSI variant takes additional second argument */
11793 va_start(args, pat2);
11797 msv = vmess(buf, &args);
11799 message = SvPV_const(msv,l1);
11802 Copy(message, buf, l1 , char);
11803 buf[l1-1] = '\0'; /* Overwrite \n */
11804 Perl_croak(aTHX_ "%s", buf);
11807 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
11809 #ifndef PERL_IN_XSUB_RE
11811 Perl_save_re_context(pTHX)
11815 struct re_save_state *state;
11817 SAVEVPTR(PL_curcop);
11818 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11820 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11821 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11822 SSPUSHUV(SAVEt_RE_STATE);
11824 Copy(&PL_reg_state, state, 1, struct re_save_state);
11826 PL_reg_start_tmp = 0;
11827 PL_reg_start_tmpl = 0;
11828 PL_reg_oldsaved = NULL;
11829 PL_reg_oldsavedlen = 0;
11830 PL_reg_maxiter = 0;
11831 PL_reg_leftiter = 0;
11832 PL_reg_poscache = NULL;
11833 PL_reg_poscache_size = 0;
11834 #ifdef PERL_OLD_COPY_ON_WRITE
11838 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11840 const REGEXP * const rx = PM_GETRE(PL_curpm);
11843 for (i = 1; i <= RX_NPARENS(rx); i++) {
11844 char digits[TYPE_CHARS(long)];
11845 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11846 GV *const *const gvp
11847 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11850 GV * const gv = *gvp;
11851 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11861 clear_re(pTHX_ void *r)
11864 ReREFCNT_dec((REGEXP *)r);
11870 S_put_byte(pTHX_ SV *sv, int c)
11872 PERL_ARGS_ASSERT_PUT_BYTE;
11874 /* Our definition of isPRINT() ignores locales, so only bytes that are
11875 not part of UTF-8 are considered printable. I assume that the same
11876 holds for UTF-EBCDIC.
11877 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11878 which Wikipedia says:
11880 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11881 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11882 identical, to the ASCII delete (DEL) or rubout control character.
11883 ) So the old condition can be simplified to !isPRINT(c) */
11886 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11889 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11893 const char string = c;
11894 if (c == '-' || c == ']' || c == '\\' || c == '^')
11895 sv_catpvs(sv, "\\");
11896 sv_catpvn(sv, &string, 1);
11901 #define CLEAR_OPTSTART \
11902 if (optstart) STMT_START { \
11903 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11907 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11909 STATIC const regnode *
11910 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11911 const regnode *last, const regnode *plast,
11912 SV* sv, I32 indent, U32 depth)
11915 register U8 op = PSEUDO; /* Arbitrary non-END op. */
11916 register const regnode *next;
11917 const regnode *optstart= NULL;
11919 RXi_GET_DECL(r,ri);
11920 GET_RE_DEBUG_FLAGS_DECL;
11922 PERL_ARGS_ASSERT_DUMPUNTIL;
11924 #ifdef DEBUG_DUMPUNTIL
11925 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11926 last ? last-start : 0,plast ? plast-start : 0);
11929 if (plast && plast < last)
11932 while (PL_regkind[op] != END && (!last || node < last)) {
11933 /* While that wasn't END last time... */
11936 if (op == CLOSE || op == WHILEM)
11938 next = regnext((regnode *)node);
11941 if (OP(node) == OPTIMIZED) {
11942 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11949 regprop(r, sv, node);
11950 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11951 (int)(2*indent + 1), "", SvPVX_const(sv));
11953 if (OP(node) != OPTIMIZED) {
11954 if (next == NULL) /* Next ptr. */
11955 PerlIO_printf(Perl_debug_log, " (0)");
11956 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11957 PerlIO_printf(Perl_debug_log, " (FAIL)");
11959 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11960 (void)PerlIO_putc(Perl_debug_log, '\n');
11964 if (PL_regkind[(U8)op] == BRANCHJ) {
11967 register const regnode *nnode = (OP(next) == LONGJMP
11968 ? regnext((regnode *)next)
11970 if (last && nnode > last)
11972 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11975 else if (PL_regkind[(U8)op] == BRANCH) {
11977 DUMPUNTIL(NEXTOPER(node), next);
11979 else if ( PL_regkind[(U8)op] == TRIE ) {
11980 const regnode *this_trie = node;
11981 const char op = OP(node);
11982 const U32 n = ARG(node);
11983 const reg_ac_data * const ac = op>=AHOCORASICK ?
11984 (reg_ac_data *)ri->data->data[n] :
11986 const reg_trie_data * const trie =
11987 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
11989 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
11991 const regnode *nextbranch= NULL;
11994 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
11995 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
11997 PerlIO_printf(Perl_debug_log, "%*s%s ",
11998 (int)(2*(indent+3)), "",
11999 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
12000 PL_colors[0], PL_colors[1],
12001 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
12002 PERL_PV_PRETTY_ELLIPSES |
12003 PERL_PV_PRETTY_LTGT
12008 U16 dist= trie->jump[word_idx+1];
12009 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
12010 (UV)((dist ? this_trie + dist : next) - start));
12013 nextbranch= this_trie + trie->jump[0];
12014 DUMPUNTIL(this_trie + dist, nextbranch);
12016 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
12017 nextbranch= regnext((regnode *)nextbranch);
12019 PerlIO_printf(Perl_debug_log, "\n");
12022 if (last && next > last)
12027 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
12028 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
12029 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
12031 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
12033 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
12035 else if ( op == PLUS || op == STAR) {
12036 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
12038 else if (PL_regkind[(U8)op] == ANYOF) {
12039 /* arglen 1 + class block */
12040 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
12041 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
12042 node = NEXTOPER(node);
12044 else if (PL_regkind[(U8)op] == EXACT) {
12045 /* Literal string, where present. */
12046 node += NODE_SZ_STR(node) - 1;
12047 node = NEXTOPER(node);
12050 node = NEXTOPER(node);
12051 node += regarglen[(U8)op];
12053 if (op == CURLYX || op == OPEN)
12057 #ifdef DEBUG_DUMPUNTIL
12058 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
12063 #endif /* DEBUGGING */
12067 * c-indentation-style: bsd
12068 * c-basic-offset: 4
12069 * indent-tabs-mode: t
12072 * ex: set ts=8 sts=4 sw=4 noet: