5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
89 #ifndef PERL_IN_XSUB_RE
90 # include "charclass_invlists.h"
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
100 # if defined(BUGGY_MSC6)
101 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
102 # pragma optimize("a",off)
103 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
104 # pragma optimize("w",on )
105 # endif /* BUGGY_MSC6 */
109 #define STATIC static
112 typedef struct RExC_state_t {
113 U32 flags; /* are we folding, multilining? */
114 char *precomp; /* uncompiled string. */
115 REGEXP *rx_sv; /* The SV that is the regexp. */
116 regexp *rx; /* perl core regexp structure */
117 regexp_internal *rxi; /* internal data for regexp object pprivate field */
118 char *start; /* Start of input for compile */
119 char *end; /* End of input for compile */
120 char *parse; /* Input-scan pointer. */
121 I32 whilem_seen; /* number of WHILEM in this expr */
122 regnode *emit_start; /* Start of emitted-code area */
123 regnode *emit_bound; /* First regnode outside of the allocated space */
124 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
125 I32 naughty; /* How bad is this pattern? */
126 I32 sawback; /* Did we see \1, ...? */
128 I32 size; /* Code size. */
129 I32 npar; /* Capture buffer count, (OPEN). */
130 I32 cpar; /* Capture buffer count, (CLOSE). */
131 I32 nestroot; /* root parens we are in - used by accept */
135 regnode **open_parens; /* pointers to open parens */
136 regnode **close_parens; /* pointers to close parens */
137 regnode *opend; /* END node in program */
138 I32 utf8; /* whether the pattern is utf8 or not */
139 I32 orig_utf8; /* whether the pattern was originally in utf8 */
140 /* XXX use this for future optimisation of case
141 * where pattern must be upgraded to utf8. */
142 I32 uni_semantics; /* If a d charset modifier should use unicode
143 rules, even if the pattern is not in
145 HV *paren_names; /* Paren names */
147 regnode **recurse; /* Recurse regops */
148 I32 recurse_count; /* Number of recurse regops */
151 I32 override_recoding;
153 char *starttry; /* -Dr: where regtry was called. */
154 #define RExC_starttry (pRExC_state->starttry)
157 const char *lastparse;
159 AV *paren_name_list; /* idx -> name */
160 #define RExC_lastparse (pRExC_state->lastparse)
161 #define RExC_lastnum (pRExC_state->lastnum)
162 #define RExC_paren_name_list (pRExC_state->paren_name_list)
166 #define RExC_flags (pRExC_state->flags)
167 #define RExC_precomp (pRExC_state->precomp)
168 #define RExC_rx_sv (pRExC_state->rx_sv)
169 #define RExC_rx (pRExC_state->rx)
170 #define RExC_rxi (pRExC_state->rxi)
171 #define RExC_start (pRExC_state->start)
172 #define RExC_end (pRExC_state->end)
173 #define RExC_parse (pRExC_state->parse)
174 #define RExC_whilem_seen (pRExC_state->whilem_seen)
175 #ifdef RE_TRACK_PATTERN_OFFSETS
176 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
178 #define RExC_emit (pRExC_state->emit)
179 #define RExC_emit_start (pRExC_state->emit_start)
180 #define RExC_emit_bound (pRExC_state->emit_bound)
181 #define RExC_naughty (pRExC_state->naughty)
182 #define RExC_sawback (pRExC_state->sawback)
183 #define RExC_seen (pRExC_state->seen)
184 #define RExC_size (pRExC_state->size)
185 #define RExC_npar (pRExC_state->npar)
186 #define RExC_nestroot (pRExC_state->nestroot)
187 #define RExC_extralen (pRExC_state->extralen)
188 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
189 #define RExC_seen_evals (pRExC_state->seen_evals)
190 #define RExC_utf8 (pRExC_state->utf8)
191 #define RExC_uni_semantics (pRExC_state->uni_semantics)
192 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
193 #define RExC_open_parens (pRExC_state->open_parens)
194 #define RExC_close_parens (pRExC_state->close_parens)
195 #define RExC_opend (pRExC_state->opend)
196 #define RExC_paren_names (pRExC_state->paren_names)
197 #define RExC_recurse (pRExC_state->recurse)
198 #define RExC_recurse_count (pRExC_state->recurse_count)
199 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
200 #define RExC_contains_locale (pRExC_state->contains_locale)
201 #define RExC_override_recoding (pRExC_state->override_recoding)
204 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
205 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
206 ((*s) == '{' && regcurly(s)))
209 #undef SPSTART /* dratted cpp namespace... */
212 * Flags to be passed up and down.
214 #define WORST 0 /* Worst case. */
215 #define HASWIDTH 0x01 /* Known to match non-null strings. */
217 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
218 * character, and if utf8, must be invariant. Note that this is not the same
219 * thing as REGNODE_SIMPLE */
221 #define SPSTART 0x04 /* Starts with * or +. */
222 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
223 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
225 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
227 /* whether trie related optimizations are enabled */
228 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
229 #define TRIE_STUDY_OPT
230 #define FULL_TRIE_STUDY
236 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
237 #define PBITVAL(paren) (1 << ((paren) & 7))
238 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
239 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
240 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
242 /* If not already in utf8, do a longjmp back to the beginning */
243 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
244 #define REQUIRE_UTF8 STMT_START { \
245 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
248 /* About scan_data_t.
250 During optimisation we recurse through the regexp program performing
251 various inplace (keyhole style) optimisations. In addition study_chunk
252 and scan_commit populate this data structure with information about
253 what strings MUST appear in the pattern. We look for the longest
254 string that must appear at a fixed location, and we look for the
255 longest string that may appear at a floating location. So for instance
260 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
261 strings (because they follow a .* construct). study_chunk will identify
262 both FOO and BAR as being the longest fixed and floating strings respectively.
264 The strings can be composites, for instance
268 will result in a composite fixed substring 'foo'.
270 For each string some basic information is maintained:
272 - offset or min_offset
273 This is the position the string must appear at, or not before.
274 It also implicitly (when combined with minlenp) tells us how many
275 characters must match before the string we are searching for.
276 Likewise when combined with minlenp and the length of the string it
277 tells us how many characters must appear after the string we have
281 Only used for floating strings. This is the rightmost point that
282 the string can appear at. If set to I32 max it indicates that the
283 string can occur infinitely far to the right.
286 A pointer to the minimum length of the pattern that the string
287 was found inside. This is important as in the case of positive
288 lookahead or positive lookbehind we can have multiple patterns
293 The minimum length of the pattern overall is 3, the minimum length
294 of the lookahead part is 3, but the minimum length of the part that
295 will actually match is 1. So 'FOO's minimum length is 3, but the
296 minimum length for the F is 1. This is important as the minimum length
297 is used to determine offsets in front of and behind the string being
298 looked for. Since strings can be composites this is the length of the
299 pattern at the time it was committed with a scan_commit. Note that
300 the length is calculated by study_chunk, so that the minimum lengths
301 are not known until the full pattern has been compiled, thus the
302 pointer to the value.
306 In the case of lookbehind the string being searched for can be
307 offset past the start point of the final matching string.
308 If this value was just blithely removed from the min_offset it would
309 invalidate some of the calculations for how many chars must match
310 before or after (as they are derived from min_offset and minlen and
311 the length of the string being searched for).
312 When the final pattern is compiled and the data is moved from the
313 scan_data_t structure into the regexp structure the information
314 about lookbehind is factored in, with the information that would
315 have been lost precalculated in the end_shift field for the
318 The fields pos_min and pos_delta are used to store the minimum offset
319 and the delta to the maximum offset at the current point in the pattern.
323 typedef struct scan_data_t {
324 /*I32 len_min; unused */
325 /*I32 len_delta; unused */
329 I32 last_end; /* min value, <0 unless valid. */
332 SV **longest; /* Either &l_fixed, or &l_float. */
333 SV *longest_fixed; /* longest fixed string found in pattern */
334 I32 offset_fixed; /* offset where it starts */
335 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
336 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
337 SV *longest_float; /* longest floating string found in pattern */
338 I32 offset_float_min; /* earliest point in string it can appear */
339 I32 offset_float_max; /* latest point in string it can appear */
340 I32 *minlen_float; /* pointer to the minlen relevant to the string */
341 I32 lookbehind_float; /* is the position of the string modified by LB */
345 struct regnode_charclass_class *start_class;
349 * Forward declarations for pregcomp()'s friends.
352 static const scan_data_t zero_scan_data =
353 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
355 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
356 #define SF_BEFORE_SEOL 0x0001
357 #define SF_BEFORE_MEOL 0x0002
358 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
359 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
362 # define SF_FIX_SHIFT_EOL (0+2)
363 # define SF_FL_SHIFT_EOL (0+4)
365 # define SF_FIX_SHIFT_EOL (+2)
366 # define SF_FL_SHIFT_EOL (+4)
369 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
370 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
372 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
373 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
374 #define SF_IS_INF 0x0040
375 #define SF_HAS_PAR 0x0080
376 #define SF_IN_PAR 0x0100
377 #define SF_HAS_EVAL 0x0200
378 #define SCF_DO_SUBSTR 0x0400
379 #define SCF_DO_STCLASS_AND 0x0800
380 #define SCF_DO_STCLASS_OR 0x1000
381 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
382 #define SCF_WHILEM_VISITED_POS 0x2000
384 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
385 #define SCF_SEEN_ACCEPT 0x8000
387 #define UTF cBOOL(RExC_utf8)
389 /* The enums for all these are ordered so things work out correctly */
390 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
391 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
392 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
393 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
394 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
395 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
396 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
398 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
400 #define OOB_UNICODE 12345678
401 #define OOB_NAMEDCLASS -1
403 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
404 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
407 /* length of regex to show in messages that don't mark a position within */
408 #define RegexLengthToShowInErrorMessages 127
411 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
412 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
413 * op/pragma/warn/regcomp.
415 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
416 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
418 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
421 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
422 * arg. Show regex, up to a maximum length. If it's too long, chop and add
425 #define _FAIL(code) STMT_START { \
426 const char *ellipses = ""; \
427 IV len = RExC_end - RExC_precomp; \
430 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
431 if (len > RegexLengthToShowInErrorMessages) { \
432 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
433 len = RegexLengthToShowInErrorMessages - 10; \
439 #define FAIL(msg) _FAIL( \
440 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
441 msg, (int)len, RExC_precomp, ellipses))
443 #define FAIL2(msg,arg) _FAIL( \
444 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
445 arg, (int)len, RExC_precomp, ellipses))
448 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
450 #define Simple_vFAIL(m) STMT_START { \
451 const IV offset = RExC_parse - RExC_precomp; \
452 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
453 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
457 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
459 #define vFAIL(m) STMT_START { \
461 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
466 * Like Simple_vFAIL(), but accepts two arguments.
468 #define Simple_vFAIL2(m,a1) STMT_START { \
469 const IV offset = RExC_parse - RExC_precomp; \
470 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
471 (int)offset, RExC_precomp, RExC_precomp + offset); \
475 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
477 #define vFAIL2(m,a1) STMT_START { \
479 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
480 Simple_vFAIL2(m, a1); \
485 * Like Simple_vFAIL(), but accepts three arguments.
487 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
488 const IV offset = RExC_parse - RExC_precomp; \
489 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
494 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
496 #define vFAIL3(m,a1,a2) STMT_START { \
498 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
499 Simple_vFAIL3(m, a1, a2); \
503 * Like Simple_vFAIL(), but accepts four arguments.
505 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
506 const IV offset = RExC_parse - RExC_precomp; \
507 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
508 (int)offset, RExC_precomp, RExC_precomp + offset); \
511 #define ckWARNreg(loc,m) STMT_START { \
512 const IV offset = loc - RExC_precomp; \
513 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
514 (int)offset, RExC_precomp, RExC_precomp + offset); \
517 #define ckWARNregdep(loc,m) STMT_START { \
518 const IV offset = loc - RExC_precomp; \
519 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
521 (int)offset, RExC_precomp, RExC_precomp + offset); \
524 #define ckWARN2regdep(loc,m, a1) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
528 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
531 #define ckWARN2reg(loc, m, a1) STMT_START { \
532 const IV offset = loc - RExC_precomp; \
533 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
534 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
537 #define vWARN3(loc, m, a1, a2) STMT_START { \
538 const IV offset = loc - RExC_precomp; \
539 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
540 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
543 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
544 const IV offset = loc - RExC_precomp; \
545 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
546 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
549 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
550 const IV offset = loc - RExC_precomp; \
551 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
552 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
555 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
556 const IV offset = loc - RExC_precomp; \
557 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
558 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
561 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
562 const IV offset = loc - RExC_precomp; \
563 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
564 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
568 /* Allow for side effects in s */
569 #define REGC(c,s) STMT_START { \
570 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
573 /* Macros for recording node offsets. 20001227 mjd@plover.com
574 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
575 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
576 * Element 0 holds the number n.
577 * Position is 1 indexed.
579 #ifndef RE_TRACK_PATTERN_OFFSETS
580 #define Set_Node_Offset_To_R(node,byte)
581 #define Set_Node_Offset(node,byte)
582 #define Set_Cur_Node_Offset
583 #define Set_Node_Length_To_R(node,len)
584 #define Set_Node_Length(node,len)
585 #define Set_Node_Cur_Length(node)
586 #define Node_Offset(n)
587 #define Node_Length(n)
588 #define Set_Node_Offset_Length(node,offset,len)
589 #define ProgLen(ri) ri->u.proglen
590 #define SetProgLen(ri,x) ri->u.proglen = x
592 #define ProgLen(ri) ri->u.offsets[0]
593 #define SetProgLen(ri,x) ri->u.offsets[0] = x
594 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
596 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
597 __LINE__, (int)(node), (int)(byte))); \
599 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
601 RExC_offsets[2*(node)-1] = (byte); \
606 #define Set_Node_Offset(node,byte) \
607 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
608 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
610 #define Set_Node_Length_To_R(node,len) STMT_START { \
612 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
613 __LINE__, (int)(node), (int)(len))); \
615 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
617 RExC_offsets[2*(node)] = (len); \
622 #define Set_Node_Length(node,len) \
623 Set_Node_Length_To_R((node)-RExC_emit_start, len)
624 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
625 #define Set_Node_Cur_Length(node) \
626 Set_Node_Length(node, RExC_parse - parse_start)
628 /* Get offsets and lengths */
629 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
630 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
632 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
633 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
634 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
638 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
639 #define EXPERIMENTAL_INPLACESCAN
640 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
642 #define DEBUG_STUDYDATA(str,data,depth) \
643 DEBUG_OPTIMISE_MORE_r(if(data){ \
644 PerlIO_printf(Perl_debug_log, \
645 "%*s" str "Pos:%"IVdf"/%"IVdf \
646 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
647 (int)(depth)*2, "", \
648 (IV)((data)->pos_min), \
649 (IV)((data)->pos_delta), \
650 (UV)((data)->flags), \
651 (IV)((data)->whilem_c), \
652 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
653 is_inf ? "INF " : "" \
655 if ((data)->last_found) \
656 PerlIO_printf(Perl_debug_log, \
657 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
658 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
659 SvPVX_const((data)->last_found), \
660 (IV)((data)->last_end), \
661 (IV)((data)->last_start_min), \
662 (IV)((data)->last_start_max), \
663 ((data)->longest && \
664 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
665 SvPVX_const((data)->longest_fixed), \
666 (IV)((data)->offset_fixed), \
667 ((data)->longest && \
668 (data)->longest==&((data)->longest_float)) ? "*" : "", \
669 SvPVX_const((data)->longest_float), \
670 (IV)((data)->offset_float_min), \
671 (IV)((data)->offset_float_max) \
673 PerlIO_printf(Perl_debug_log,"\n"); \
676 static void clear_re(pTHX_ void *r);
678 /* Mark that we cannot extend a found fixed substring at this point.
679 Update the longest found anchored substring and the longest found
680 floating substrings if needed. */
683 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
685 const STRLEN l = CHR_SVLEN(data->last_found);
686 const STRLEN old_l = CHR_SVLEN(*data->longest);
687 GET_RE_DEBUG_FLAGS_DECL;
689 PERL_ARGS_ASSERT_SCAN_COMMIT;
691 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
692 SvSetMagicSV(*data->longest, data->last_found);
693 if (*data->longest == data->longest_fixed) {
694 data->offset_fixed = l ? data->last_start_min : data->pos_min;
695 if (data->flags & SF_BEFORE_EOL)
697 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
699 data->flags &= ~SF_FIX_BEFORE_EOL;
700 data->minlen_fixed=minlenp;
701 data->lookbehind_fixed=0;
703 else { /* *data->longest == data->longest_float */
704 data->offset_float_min = l ? data->last_start_min : data->pos_min;
705 data->offset_float_max = (l
706 ? data->last_start_max
707 : data->pos_min + data->pos_delta);
708 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
709 data->offset_float_max = I32_MAX;
710 if (data->flags & SF_BEFORE_EOL)
712 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
714 data->flags &= ~SF_FL_BEFORE_EOL;
715 data->minlen_float=minlenp;
716 data->lookbehind_float=0;
719 SvCUR_set(data->last_found, 0);
721 SV * const sv = data->last_found;
722 if (SvUTF8(sv) && SvMAGICAL(sv)) {
723 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
729 data->flags &= ~SF_BEFORE_EOL;
730 DEBUG_STUDYDATA("commit: ",data,0);
733 /* Can match anything (initialization) */
735 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
737 PERL_ARGS_ASSERT_CL_ANYTHING;
739 ANYOF_BITMAP_SETALL(cl);
740 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
741 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
743 /* If any portion of the regex is to operate under locale rules,
744 * initialization includes it. The reason this isn't done for all regexes
745 * is that the optimizer was written under the assumption that locale was
746 * all-or-nothing. Given the complexity and lack of documentation in the
747 * optimizer, and that there are inadequate test cases for locale, so many
748 * parts of it may not work properly, it is safest to avoid locale unless
750 if (RExC_contains_locale) {
751 ANYOF_CLASS_SETALL(cl); /* /l uses class */
752 cl->flags |= ANYOF_LOCALE;
755 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
759 /* Can match anything (initialization) */
761 S_cl_is_anything(const struct regnode_charclass_class *cl)
765 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
767 for (value = 0; value <= ANYOF_MAX; value += 2)
768 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
770 if (!(cl->flags & ANYOF_UNICODE_ALL))
772 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
777 /* Can match anything (initialization) */
779 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
781 PERL_ARGS_ASSERT_CL_INIT;
783 Zero(cl, 1, struct regnode_charclass_class);
785 cl_anything(pRExC_state, cl);
786 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
789 /* These two functions currently do the exact same thing */
790 #define cl_init_zero S_cl_init
792 /* 'AND' a given class with another one. Can create false positives. 'cl'
793 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
794 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
796 S_cl_and(struct regnode_charclass_class *cl,
797 const struct regnode_charclass_class *and_with)
799 PERL_ARGS_ASSERT_CL_AND;
801 assert(and_with->type == ANYOF);
803 /* I (khw) am not sure all these restrictions are necessary XXX */
804 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
805 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
806 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
807 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
808 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
811 if (and_with->flags & ANYOF_INVERT)
812 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
813 cl->bitmap[i] &= ~and_with->bitmap[i];
815 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
816 cl->bitmap[i] &= and_with->bitmap[i];
817 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
819 if (and_with->flags & ANYOF_INVERT) {
821 /* Here, the and'ed node is inverted. Get the AND of the flags that
822 * aren't affected by the inversion. Those that are affected are
823 * handled individually below */
824 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
825 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
826 cl->flags |= affected_flags;
828 /* We currently don't know how to deal with things that aren't in the
829 * bitmap, but we know that the intersection is no greater than what
830 * is already in cl, so let there be false positives that get sorted
831 * out after the synthetic start class succeeds, and the node is
832 * matched for real. */
834 /* The inversion of these two flags indicate that the resulting
835 * intersection doesn't have them */
836 if (and_with->flags & ANYOF_UNICODE_ALL) {
837 cl->flags &= ~ANYOF_UNICODE_ALL;
839 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
840 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
843 else { /* and'd node is not inverted */
844 U8 outside_bitmap_but_not_utf8; /* Temp variable */
846 if (! ANYOF_NONBITMAP(and_with)) {
848 /* Here 'and_with' doesn't match anything outside the bitmap
849 * (except possibly ANYOF_UNICODE_ALL), which means the
850 * intersection can't either, except for ANYOF_UNICODE_ALL, in
851 * which case we don't know what the intersection is, but it's no
852 * greater than what cl already has, so can just leave it alone,
853 * with possible false positives */
854 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
855 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
856 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
859 else if (! ANYOF_NONBITMAP(cl)) {
861 /* Here, 'and_with' does match something outside the bitmap, and cl
862 * doesn't have a list of things to match outside the bitmap. If
863 * cl can match all code points above 255, the intersection will
864 * be those above-255 code points that 'and_with' matches. If cl
865 * can't match all Unicode code points, it means that it can't
866 * match anything outside the bitmap (since the 'if' that got us
867 * into this block tested for that), so we leave the bitmap empty.
869 if (cl->flags & ANYOF_UNICODE_ALL) {
870 ARG_SET(cl, ARG(and_with));
872 /* and_with's ARG may match things that don't require UTF8.
873 * And now cl's will too, in spite of this being an 'and'. See
874 * the comments below about the kludge */
875 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
879 /* Here, both 'and_with' and cl match something outside the
880 * bitmap. Currently we do not do the intersection, so just match
881 * whatever cl had at the beginning. */
885 /* Take the intersection of the two sets of flags. However, the
886 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
887 * kludge around the fact that this flag is not treated like the others
888 * which are initialized in cl_anything(). The way the optimizer works
889 * is that the synthetic start class (SSC) is initialized to match
890 * anything, and then the first time a real node is encountered, its
891 * values are AND'd with the SSC's with the result being the values of
892 * the real node. However, there are paths through the optimizer where
893 * the AND never gets called, so those initialized bits are set
894 * inappropriately, which is not usually a big deal, as they just cause
895 * false positives in the SSC, which will just mean a probably
896 * imperceptible slow down in execution. However this bit has a
897 * higher false positive consequence in that it can cause utf8.pm,
898 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
899 * bigger slowdown and also causes significant extra memory to be used.
900 * In order to prevent this, the code now takes a different tack. The
901 * bit isn't set unless some part of the regular expression needs it,
902 * but once set it won't get cleared. This means that these extra
903 * modules won't get loaded unless there was some path through the
904 * pattern that would have required them anyway, and so any false
905 * positives that occur by not ANDing them out when they could be
906 * aren't as severe as they would be if we treated this bit like all
908 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
909 & ANYOF_NONBITMAP_NON_UTF8;
910 cl->flags &= and_with->flags;
911 cl->flags |= outside_bitmap_but_not_utf8;
915 /* 'OR' a given class with another one. Can create false positives. 'cl'
916 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
917 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
919 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
921 PERL_ARGS_ASSERT_CL_OR;
923 if (or_with->flags & ANYOF_INVERT) {
925 /* Here, the or'd node is to be inverted. This means we take the
926 * complement of everything not in the bitmap, but currently we don't
927 * know what that is, so give up and match anything */
928 if (ANYOF_NONBITMAP(or_with)) {
929 cl_anything(pRExC_state, cl);
932 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
933 * <= (B1 | !B2) | (CL1 | !CL2)
934 * which is wasteful if CL2 is small, but we ignore CL2:
935 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
936 * XXXX Can we handle case-fold? Unclear:
937 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
938 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
940 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
941 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
942 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
945 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
946 cl->bitmap[i] |= ~or_with->bitmap[i];
947 } /* XXXX: logic is complicated otherwise */
949 cl_anything(pRExC_state, cl);
952 /* And, we can just take the union of the flags that aren't affected
953 * by the inversion */
954 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
956 /* For the remaining flags:
957 ANYOF_UNICODE_ALL and inverted means to not match anything above
958 255, which means that the union with cl should just be
959 what cl has in it, so can ignore this flag
960 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
961 is 127-255 to match them, but then invert that, so the
962 union with cl should just be what cl has in it, so can
965 } else { /* 'or_with' is not inverted */
966 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
967 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
968 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
969 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
972 /* OR char bitmap and class bitmap separately */
973 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
974 cl->bitmap[i] |= or_with->bitmap[i];
975 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
976 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
977 cl->classflags[i] |= or_with->classflags[i];
978 cl->flags |= ANYOF_CLASS;
981 else { /* XXXX: logic is complicated, leave it along for a moment. */
982 cl_anything(pRExC_state, cl);
985 if (ANYOF_NONBITMAP(or_with)) {
987 /* Use the added node's outside-the-bit-map match if there isn't a
988 * conflict. If there is a conflict (both nodes match something
989 * outside the bitmap, but what they match outside is not the same
990 * pointer, and hence not easily compared until XXX we extend
991 * inversion lists this far), give up and allow the start class to
992 * match everything outside the bitmap. If that stuff is all above
993 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
994 if (! ANYOF_NONBITMAP(cl)) {
995 ARG_SET(cl, ARG(or_with));
997 else if (ARG(cl) != ARG(or_with)) {
999 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1000 cl_anything(pRExC_state, cl);
1003 cl->flags |= ANYOF_UNICODE_ALL;
1008 /* Take the union */
1009 cl->flags |= or_with->flags;
1013 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1014 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1015 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1016 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1021 dump_trie(trie,widecharmap,revcharmap)
1022 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1023 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1025 These routines dump out a trie in a somewhat readable format.
1026 The _interim_ variants are used for debugging the interim
1027 tables that are used to generate the final compressed
1028 representation which is what dump_trie expects.
1030 Part of the reason for their existence is to provide a form
1031 of documentation as to how the different representations function.
1036 Dumps the final compressed table form of the trie to Perl_debug_log.
1037 Used for debugging make_trie().
1041 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1042 AV *revcharmap, U32 depth)
1045 SV *sv=sv_newmortal();
1046 int colwidth= widecharmap ? 6 : 4;
1048 GET_RE_DEBUG_FLAGS_DECL;
1050 PERL_ARGS_ASSERT_DUMP_TRIE;
1052 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1053 (int)depth * 2 + 2,"",
1054 "Match","Base","Ofs" );
1056 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1057 SV ** const tmp = av_fetch( revcharmap, state, 0);
1059 PerlIO_printf( Perl_debug_log, "%*s",
1061 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1062 PL_colors[0], PL_colors[1],
1063 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1064 PERL_PV_ESCAPE_FIRSTCHAR
1069 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1070 (int)depth * 2 + 2,"");
1072 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1073 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1074 PerlIO_printf( Perl_debug_log, "\n");
1076 for( state = 1 ; state < trie->statecount ; state++ ) {
1077 const U32 base = trie->states[ state ].trans.base;
1079 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1081 if ( trie->states[ state ].wordnum ) {
1082 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1084 PerlIO_printf( Perl_debug_log, "%6s", "" );
1087 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1092 while( ( base + ofs < trie->uniquecharcount ) ||
1093 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1094 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1097 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1099 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1100 if ( ( base + ofs >= trie->uniquecharcount ) &&
1101 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1102 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1104 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1106 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1108 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1112 PerlIO_printf( Perl_debug_log, "]");
1115 PerlIO_printf( Perl_debug_log, "\n" );
1117 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1118 for (word=1; word <= trie->wordcount; word++) {
1119 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1120 (int)word, (int)(trie->wordinfo[word].prev),
1121 (int)(trie->wordinfo[word].len));
1123 PerlIO_printf(Perl_debug_log, "\n" );
1126 Dumps a fully constructed but uncompressed trie in list form.
1127 List tries normally only are used for construction when the number of
1128 possible chars (trie->uniquecharcount) is very high.
1129 Used for debugging make_trie().
1132 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1133 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1137 SV *sv=sv_newmortal();
1138 int colwidth= widecharmap ? 6 : 4;
1139 GET_RE_DEBUG_FLAGS_DECL;
1141 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1143 /* print out the table precompression. */
1144 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1145 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1146 "------:-----+-----------------\n" );
1148 for( state=1 ; state < next_alloc ; state ++ ) {
1151 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1152 (int)depth * 2 + 2,"", (UV)state );
1153 if ( ! trie->states[ state ].wordnum ) {
1154 PerlIO_printf( Perl_debug_log, "%5s| ","");
1156 PerlIO_printf( Perl_debug_log, "W%4x| ",
1157 trie->states[ state ].wordnum
1160 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1161 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1163 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1165 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1166 PL_colors[0], PL_colors[1],
1167 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1168 PERL_PV_ESCAPE_FIRSTCHAR
1170 TRIE_LIST_ITEM(state,charid).forid,
1171 (UV)TRIE_LIST_ITEM(state,charid).newstate
1174 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1175 (int)((depth * 2) + 14), "");
1178 PerlIO_printf( Perl_debug_log, "\n");
1183 Dumps a fully constructed but uncompressed trie in table form.
1184 This is the normal DFA style state transition table, with a few
1185 twists to facilitate compression later.
1186 Used for debugging make_trie().
1189 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1190 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1195 SV *sv=sv_newmortal();
1196 int colwidth= widecharmap ? 6 : 4;
1197 GET_RE_DEBUG_FLAGS_DECL;
1199 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1202 print out the table precompression so that we can do a visual check
1203 that they are identical.
1206 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1208 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1209 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1211 PerlIO_printf( Perl_debug_log, "%*s",
1213 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1214 PL_colors[0], PL_colors[1],
1215 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1216 PERL_PV_ESCAPE_FIRSTCHAR
1222 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1224 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1225 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1228 PerlIO_printf( Perl_debug_log, "\n" );
1230 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1232 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1233 (int)depth * 2 + 2,"",
1234 (UV)TRIE_NODENUM( state ) );
1236 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1237 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1239 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1241 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1243 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1244 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1246 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1247 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1255 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1256 startbranch: the first branch in the whole branch sequence
1257 first : start branch of sequence of branch-exact nodes.
1258 May be the same as startbranch
1259 last : Thing following the last branch.
1260 May be the same as tail.
1261 tail : item following the branch sequence
1262 count : words in the sequence
1263 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1264 depth : indent depth
1266 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1268 A trie is an N'ary tree where the branches are determined by digital
1269 decomposition of the key. IE, at the root node you look up the 1st character and
1270 follow that branch repeat until you find the end of the branches. Nodes can be
1271 marked as "accepting" meaning they represent a complete word. Eg:
1275 would convert into the following structure. Numbers represent states, letters
1276 following numbers represent valid transitions on the letter from that state, if
1277 the number is in square brackets it represents an accepting state, otherwise it
1278 will be in parenthesis.
1280 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1284 (1) +-i->(6)-+-s->[7]
1286 +-s->(3)-+-h->(4)-+-e->[5]
1288 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1290 This shows that when matching against the string 'hers' we will begin at state 1
1291 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1292 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1293 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1294 single traverse. We store a mapping from accepting to state to which word was
1295 matched, and then when we have multiple possibilities we try to complete the
1296 rest of the regex in the order in which they occured in the alternation.
1298 The only prior NFA like behaviour that would be changed by the TRIE support is
1299 the silent ignoring of duplicate alternations which are of the form:
1301 / (DUPE|DUPE) X? (?{ ... }) Y /x
1303 Thus EVAL blocks following a trie may be called a different number of times with
1304 and without the optimisation. With the optimisations dupes will be silently
1305 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1306 the following demonstrates:
1308 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1310 which prints out 'word' three times, but
1312 'words'=~/(word|word|word)(?{ print $1 })S/
1314 which doesnt print it out at all. This is due to other optimisations kicking in.
1316 Example of what happens on a structural level:
1318 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1320 1: CURLYM[1] {1,32767}(18)
1331 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1332 and should turn into:
1334 1: CURLYM[1] {1,32767}(18)
1336 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1344 Cases where tail != last would be like /(?foo|bar)baz/:
1354 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1355 and would end up looking like:
1358 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1365 d = uvuni_to_utf8_flags(d, uv, 0);
1367 is the recommended Unicode-aware way of saying
1372 #define TRIE_STORE_REVCHAR(val) \
1375 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1376 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1377 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1378 SvCUR_set(zlopp, kapow - flrbbbbb); \
1381 av_push(revcharmap, zlopp); \
1383 char ooooff = (char)val; \
1384 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1388 #define TRIE_READ_CHAR STMT_START { \
1391 /* if it is UTF then it is either already folded, or does not need folding */ \
1392 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1394 else if (folder == PL_fold_latin1) { \
1395 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1396 if ( foldlen > 0 ) { \
1397 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1403 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1404 skiplen = UNISKIP(uvc); \
1405 foldlen -= skiplen; \
1406 scan = foldbuf + skiplen; \
1409 /* raw data, will be folded later if needed */ \
1417 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1418 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1419 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1420 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1422 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1423 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1424 TRIE_LIST_CUR( state )++; \
1427 #define TRIE_LIST_NEW(state) STMT_START { \
1428 Newxz( trie->states[ state ].trans.list, \
1429 4, reg_trie_trans_le ); \
1430 TRIE_LIST_CUR( state ) = 1; \
1431 TRIE_LIST_LEN( state ) = 4; \
1434 #define TRIE_HANDLE_WORD(state) STMT_START { \
1435 U16 dupe= trie->states[ state ].wordnum; \
1436 regnode * const noper_next = regnext( noper ); \
1439 /* store the word for dumping */ \
1441 if (OP(noper) != NOTHING) \
1442 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1444 tmp = newSVpvn_utf8( "", 0, UTF ); \
1445 av_push( trie_words, tmp ); \
1449 trie->wordinfo[curword].prev = 0; \
1450 trie->wordinfo[curword].len = wordlen; \
1451 trie->wordinfo[curword].accept = state; \
1453 if ( noper_next < tail ) { \
1455 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1456 trie->jump[curword] = (U16)(noper_next - convert); \
1458 jumper = noper_next; \
1460 nextbranch= regnext(cur); \
1464 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1465 /* chain, so that when the bits of chain are later */\
1466 /* linked together, the dups appear in the chain */\
1467 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1468 trie->wordinfo[dupe].prev = curword; \
1470 /* we haven't inserted this word yet. */ \
1471 trie->states[ state ].wordnum = curword; \
1476 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1477 ( ( base + charid >= ucharcount \
1478 && base + charid < ubound \
1479 && state == trie->trans[ base - ucharcount + charid ].check \
1480 && trie->trans[ base - ucharcount + charid ].next ) \
1481 ? trie->trans[ base - ucharcount + charid ].next \
1482 : ( state==1 ? special : 0 ) \
1486 #define MADE_JUMP_TRIE 2
1487 #define MADE_EXACT_TRIE 4
1490 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1493 /* first pass, loop through and scan words */
1494 reg_trie_data *trie;
1495 HV *widecharmap = NULL;
1496 AV *revcharmap = newAV();
1498 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1503 regnode *jumper = NULL;
1504 regnode *nextbranch = NULL;
1505 regnode *convert = NULL;
1506 U32 *prev_states; /* temp array mapping each state to previous one */
1507 /* we just use folder as a flag in utf8 */
1508 const U8 * folder = NULL;
1511 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1512 AV *trie_words = NULL;
1513 /* along with revcharmap, this only used during construction but both are
1514 * useful during debugging so we store them in the struct when debugging.
1517 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1518 STRLEN trie_charcount=0;
1520 SV *re_trie_maxbuff;
1521 GET_RE_DEBUG_FLAGS_DECL;
1523 PERL_ARGS_ASSERT_MAKE_TRIE;
1525 PERL_UNUSED_ARG(depth);
1532 case EXACTFU_TRICKYFOLD:
1533 case EXACTFU: folder = PL_fold_latin1; break;
1534 case EXACTF: folder = PL_fold; break;
1535 case EXACTFL: folder = PL_fold_locale; break;
1536 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1539 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1541 trie->startstate = 1;
1542 trie->wordcount = word_count;
1543 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1544 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1546 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1547 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1548 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1551 trie_words = newAV();
1554 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1555 if (!SvIOK(re_trie_maxbuff)) {
1556 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1559 PerlIO_printf( Perl_debug_log,
1560 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1561 (int)depth * 2 + 2, "",
1562 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1563 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1567 /* Find the node we are going to overwrite */
1568 if ( first == startbranch && OP( last ) != BRANCH ) {
1569 /* whole branch chain */
1572 /* branch sub-chain */
1573 convert = NEXTOPER( first );
1576 /* -- First loop and Setup --
1578 We first traverse the branches and scan each word to determine if it
1579 contains widechars, and how many unique chars there are, this is
1580 important as we have to build a table with at least as many columns as we
1583 We use an array of integers to represent the character codes 0..255
1584 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1585 native representation of the character value as the key and IV's for the
1588 *TODO* If we keep track of how many times each character is used we can
1589 remap the columns so that the table compression later on is more
1590 efficient in terms of memory by ensuring the most common value is in the
1591 middle and the least common are on the outside. IMO this would be better
1592 than a most to least common mapping as theres a decent chance the most
1593 common letter will share a node with the least common, meaning the node
1594 will not be compressible. With a middle is most common approach the worst
1595 case is when we have the least common nodes twice.
1599 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1600 regnode * const noper = NEXTOPER( cur );
1601 const U8 *uc = (U8*)STRING( noper );
1602 const U8 * const e = uc + STR_LEN( noper );
1604 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1606 const U8 *scan = (U8*)NULL;
1607 U32 wordlen = 0; /* required init */
1609 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1611 if (OP(noper) == NOTHING) {
1615 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1616 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1617 regardless of encoding */
1618 if (OP( noper ) == EXACTFU_SS) {
1619 /* false positives are ok, so just set this */
1620 TRIE_BITMAP_SET(trie,0xDF);
1623 for ( ; uc < e ; uc += len ) {
1624 TRIE_CHARCOUNT(trie)++;
1629 U8 folded= folder[ (U8) uvc ];
1630 if ( !trie->charmap[ folded ] ) {
1631 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1632 TRIE_STORE_REVCHAR( folded );
1635 if ( !trie->charmap[ uvc ] ) {
1636 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1637 TRIE_STORE_REVCHAR( uvc );
1640 /* store the codepoint in the bitmap, and its folded
1642 TRIE_BITMAP_SET(trie, uvc);
1644 /* store the folded codepoint */
1645 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1648 /* store first byte of utf8 representation of
1649 variant codepoints */
1650 if (! UNI_IS_INVARIANT(uvc)) {
1651 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1654 set_bit = 0; /* We've done our bit :-) */
1659 widecharmap = newHV();
1661 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1664 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1666 if ( !SvTRUE( *svpp ) ) {
1667 sv_setiv( *svpp, ++trie->uniquecharcount );
1668 TRIE_STORE_REVCHAR(uvc);
1672 if( cur == first ) {
1673 trie->minlen = chars;
1674 trie->maxlen = chars;
1675 } else if (chars < trie->minlen) {
1676 trie->minlen = chars;
1677 } else if (chars > trie->maxlen) {
1678 trie->maxlen = chars;
1680 if (OP( noper ) == EXACTFU_SS) {
1681 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1682 if (trie->minlen > 1)
1685 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1686 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1687 * - We assume that any such sequence might match a 2 byte string */
1688 if (trie->minlen > 2 )
1692 } /* end first pass */
1693 DEBUG_TRIE_COMPILE_r(
1694 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1695 (int)depth * 2 + 2,"",
1696 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1697 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1698 (int)trie->minlen, (int)trie->maxlen )
1702 We now know what we are dealing with in terms of unique chars and
1703 string sizes so we can calculate how much memory a naive
1704 representation using a flat table will take. If it's over a reasonable
1705 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1706 conservative but potentially much slower representation using an array
1709 At the end we convert both representations into the same compressed
1710 form that will be used in regexec.c for matching with. The latter
1711 is a form that cannot be used to construct with but has memory
1712 properties similar to the list form and access properties similar
1713 to the table form making it both suitable for fast searches and
1714 small enough that its feasable to store for the duration of a program.
1716 See the comment in the code where the compressed table is produced
1717 inplace from the flat tabe representation for an explanation of how
1718 the compression works.
1723 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1726 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1728 Second Pass -- Array Of Lists Representation
1730 Each state will be represented by a list of charid:state records
1731 (reg_trie_trans_le) the first such element holds the CUR and LEN
1732 points of the allocated array. (See defines above).
1734 We build the initial structure using the lists, and then convert
1735 it into the compressed table form which allows faster lookups
1736 (but cant be modified once converted).
1739 STRLEN transcount = 1;
1741 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1742 "%*sCompiling trie using list compiler\n",
1743 (int)depth * 2 + 2, ""));
1745 trie->states = (reg_trie_state *)
1746 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1747 sizeof(reg_trie_state) );
1751 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1753 regnode * const noper = NEXTOPER( cur );
1754 U8 *uc = (U8*)STRING( noper );
1755 const U8 * const e = uc + STR_LEN( noper );
1756 U32 state = 1; /* required init */
1757 U16 charid = 0; /* sanity init */
1758 U8 *scan = (U8*)NULL; /* sanity init */
1759 STRLEN foldlen = 0; /* required init */
1760 U32 wordlen = 0; /* required init */
1761 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1764 if (OP(noper) != NOTHING) {
1765 for ( ; uc < e ; uc += len ) {
1770 charid = trie->charmap[ uvc ];
1772 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1776 charid=(U16)SvIV( *svpp );
1779 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1786 if ( !trie->states[ state ].trans.list ) {
1787 TRIE_LIST_NEW( state );
1789 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1790 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1791 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1796 newstate = next_alloc++;
1797 prev_states[newstate] = state;
1798 TRIE_LIST_PUSH( state, charid, newstate );
1803 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1807 TRIE_HANDLE_WORD(state);
1809 } /* end second pass */
1811 /* next alloc is the NEXT state to be allocated */
1812 trie->statecount = next_alloc;
1813 trie->states = (reg_trie_state *)
1814 PerlMemShared_realloc( trie->states,
1816 * sizeof(reg_trie_state) );
1818 /* and now dump it out before we compress it */
1819 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1820 revcharmap, next_alloc,
1824 trie->trans = (reg_trie_trans *)
1825 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1832 for( state=1 ; state < next_alloc ; state ++ ) {
1836 DEBUG_TRIE_COMPILE_MORE_r(
1837 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1841 if (trie->states[state].trans.list) {
1842 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1846 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1847 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1848 if ( forid < minid ) {
1850 } else if ( forid > maxid ) {
1854 if ( transcount < tp + maxid - minid + 1) {
1856 trie->trans = (reg_trie_trans *)
1857 PerlMemShared_realloc( trie->trans,
1859 * sizeof(reg_trie_trans) );
1860 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1862 base = trie->uniquecharcount + tp - minid;
1863 if ( maxid == minid ) {
1865 for ( ; zp < tp ; zp++ ) {
1866 if ( ! trie->trans[ zp ].next ) {
1867 base = trie->uniquecharcount + zp - minid;
1868 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1869 trie->trans[ zp ].check = state;
1875 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1876 trie->trans[ tp ].check = state;
1881 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1882 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1883 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1884 trie->trans[ tid ].check = state;
1886 tp += ( maxid - minid + 1 );
1888 Safefree(trie->states[ state ].trans.list);
1891 DEBUG_TRIE_COMPILE_MORE_r(
1892 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1895 trie->states[ state ].trans.base=base;
1897 trie->lasttrans = tp + 1;
1901 Second Pass -- Flat Table Representation.
1903 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1904 We know that we will need Charcount+1 trans at most to store the data
1905 (one row per char at worst case) So we preallocate both structures
1906 assuming worst case.
1908 We then construct the trie using only the .next slots of the entry
1911 We use the .check field of the first entry of the node temporarily to
1912 make compression both faster and easier by keeping track of how many non
1913 zero fields are in the node.
1915 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1918 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1919 number representing the first entry of the node, and state as a
1920 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1921 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1922 are 2 entrys per node. eg:
1930 The table is internally in the right hand, idx form. However as we also
1931 have to deal with the states array which is indexed by nodenum we have to
1932 use TRIE_NODENUM() to convert.
1935 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1936 "%*sCompiling trie using table compiler\n",
1937 (int)depth * 2 + 2, ""));
1939 trie->trans = (reg_trie_trans *)
1940 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1941 * trie->uniquecharcount + 1,
1942 sizeof(reg_trie_trans) );
1943 trie->states = (reg_trie_state *)
1944 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1945 sizeof(reg_trie_state) );
1946 next_alloc = trie->uniquecharcount + 1;
1949 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1951 regnode * const noper = NEXTOPER( cur );
1952 const U8 *uc = (U8*)STRING( noper );
1953 const U8 * const e = uc + STR_LEN( noper );
1955 U32 state = 1; /* required init */
1957 U16 charid = 0; /* sanity init */
1958 U32 accept_state = 0; /* sanity init */
1959 U8 *scan = (U8*)NULL; /* sanity init */
1961 STRLEN foldlen = 0; /* required init */
1962 U32 wordlen = 0; /* required init */
1964 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1967 if ( OP(noper) != NOTHING ) {
1968 for ( ; uc < e ; uc += len ) {
1973 charid = trie->charmap[ uvc ];
1975 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1976 charid = svpp ? (U16)SvIV(*svpp) : 0;
1980 if ( !trie->trans[ state + charid ].next ) {
1981 trie->trans[ state + charid ].next = next_alloc;
1982 trie->trans[ state ].check++;
1983 prev_states[TRIE_NODENUM(next_alloc)]
1984 = TRIE_NODENUM(state);
1985 next_alloc += trie->uniquecharcount;
1987 state = trie->trans[ state + charid ].next;
1989 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1991 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1994 accept_state = TRIE_NODENUM( state );
1995 TRIE_HANDLE_WORD(accept_state);
1997 } /* end second pass */
1999 /* and now dump it out before we compress it */
2000 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2002 next_alloc, depth+1));
2006 * Inplace compress the table.*
2008 For sparse data sets the table constructed by the trie algorithm will
2009 be mostly 0/FAIL transitions or to put it another way mostly empty.
2010 (Note that leaf nodes will not contain any transitions.)
2012 This algorithm compresses the tables by eliminating most such
2013 transitions, at the cost of a modest bit of extra work during lookup:
2015 - Each states[] entry contains a .base field which indicates the
2016 index in the state[] array wheres its transition data is stored.
2018 - If .base is 0 there are no valid transitions from that node.
2020 - If .base is nonzero then charid is added to it to find an entry in
2023 -If trans[states[state].base+charid].check!=state then the
2024 transition is taken to be a 0/Fail transition. Thus if there are fail
2025 transitions at the front of the node then the .base offset will point
2026 somewhere inside the previous nodes data (or maybe even into a node
2027 even earlier), but the .check field determines if the transition is
2031 The following process inplace converts the table to the compressed
2032 table: We first do not compress the root node 1,and mark all its
2033 .check pointers as 1 and set its .base pointer as 1 as well. This
2034 allows us to do a DFA construction from the compressed table later,
2035 and ensures that any .base pointers we calculate later are greater
2038 - We set 'pos' to indicate the first entry of the second node.
2040 - We then iterate over the columns of the node, finding the first and
2041 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2042 and set the .check pointers accordingly, and advance pos
2043 appropriately and repreat for the next node. Note that when we copy
2044 the next pointers we have to convert them from the original
2045 NODEIDX form to NODENUM form as the former is not valid post
2048 - If a node has no transitions used we mark its base as 0 and do not
2049 advance the pos pointer.
2051 - If a node only has one transition we use a second pointer into the
2052 structure to fill in allocated fail transitions from other states.
2053 This pointer is independent of the main pointer and scans forward
2054 looking for null transitions that are allocated to a state. When it
2055 finds one it writes the single transition into the "hole". If the
2056 pointer doesnt find one the single transition is appended as normal.
2058 - Once compressed we can Renew/realloc the structures to release the
2061 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2062 specifically Fig 3.47 and the associated pseudocode.
2066 const U32 laststate = TRIE_NODENUM( next_alloc );
2069 trie->statecount = laststate;
2071 for ( state = 1 ; state < laststate ; state++ ) {
2073 const U32 stateidx = TRIE_NODEIDX( state );
2074 const U32 o_used = trie->trans[ stateidx ].check;
2075 U32 used = trie->trans[ stateidx ].check;
2076 trie->trans[ stateidx ].check = 0;
2078 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2079 if ( flag || trie->trans[ stateidx + charid ].next ) {
2080 if ( trie->trans[ stateidx + charid ].next ) {
2082 for ( ; zp < pos ; zp++ ) {
2083 if ( ! trie->trans[ zp ].next ) {
2087 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2088 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2089 trie->trans[ zp ].check = state;
2090 if ( ++zp > pos ) pos = zp;
2097 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2099 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2100 trie->trans[ pos ].check = state;
2105 trie->lasttrans = pos + 1;
2106 trie->states = (reg_trie_state *)
2107 PerlMemShared_realloc( trie->states, laststate
2108 * sizeof(reg_trie_state) );
2109 DEBUG_TRIE_COMPILE_MORE_r(
2110 PerlIO_printf( Perl_debug_log,
2111 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2112 (int)depth * 2 + 2,"",
2113 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2116 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2119 } /* end table compress */
2121 DEBUG_TRIE_COMPILE_MORE_r(
2122 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2123 (int)depth * 2 + 2, "",
2124 (UV)trie->statecount,
2125 (UV)trie->lasttrans)
2127 /* resize the trans array to remove unused space */
2128 trie->trans = (reg_trie_trans *)
2129 PerlMemShared_realloc( trie->trans, trie->lasttrans
2130 * sizeof(reg_trie_trans) );
2132 { /* Modify the program and insert the new TRIE node */
2133 U8 nodetype =(U8)(flags & 0xFF);
2137 regnode *optimize = NULL;
2138 #ifdef RE_TRACK_PATTERN_OFFSETS
2141 U32 mjd_nodelen = 0;
2142 #endif /* RE_TRACK_PATTERN_OFFSETS */
2143 #endif /* DEBUGGING */
2145 This means we convert either the first branch or the first Exact,
2146 depending on whether the thing following (in 'last') is a branch
2147 or not and whther first is the startbranch (ie is it a sub part of
2148 the alternation or is it the whole thing.)
2149 Assuming its a sub part we convert the EXACT otherwise we convert
2150 the whole branch sequence, including the first.
2152 /* Find the node we are going to overwrite */
2153 if ( first != startbranch || OP( last ) == BRANCH ) {
2154 /* branch sub-chain */
2155 NEXT_OFF( first ) = (U16)(last - first);
2156 #ifdef RE_TRACK_PATTERN_OFFSETS
2158 mjd_offset= Node_Offset((convert));
2159 mjd_nodelen= Node_Length((convert));
2162 /* whole branch chain */
2164 #ifdef RE_TRACK_PATTERN_OFFSETS
2167 const regnode *nop = NEXTOPER( convert );
2168 mjd_offset= Node_Offset((nop));
2169 mjd_nodelen= Node_Length((nop));
2173 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2174 (int)depth * 2 + 2, "",
2175 (UV)mjd_offset, (UV)mjd_nodelen)
2178 /* But first we check to see if there is a common prefix we can
2179 split out as an EXACT and put in front of the TRIE node. */
2180 trie->startstate= 1;
2181 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2183 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2187 const U32 base = trie->states[ state ].trans.base;
2189 if ( trie->states[state].wordnum )
2192 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2193 if ( ( base + ofs >= trie->uniquecharcount ) &&
2194 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2195 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2197 if ( ++count > 1 ) {
2198 SV **tmp = av_fetch( revcharmap, ofs, 0);
2199 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2200 if ( state == 1 ) break;
2202 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2204 PerlIO_printf(Perl_debug_log,
2205 "%*sNew Start State=%"UVuf" Class: [",
2206 (int)depth * 2 + 2, "",
2209 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2210 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2212 TRIE_BITMAP_SET(trie,*ch);
2214 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2216 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2220 TRIE_BITMAP_SET(trie,*ch);
2222 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2223 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2229 SV **tmp = av_fetch( revcharmap, idx, 0);
2231 char *ch = SvPV( *tmp, len );
2233 SV *sv=sv_newmortal();
2234 PerlIO_printf( Perl_debug_log,
2235 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2236 (int)depth * 2 + 2, "",
2238 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2239 PL_colors[0], PL_colors[1],
2240 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2241 PERL_PV_ESCAPE_FIRSTCHAR
2246 OP( convert ) = nodetype;
2247 str=STRING(convert);
2250 STR_LEN(convert) += len;
2256 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2261 trie->prefixlen = (state-1);
2263 regnode *n = convert+NODE_SZ_STR(convert);
2264 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2265 trie->startstate = state;
2266 trie->minlen -= (state - 1);
2267 trie->maxlen -= (state - 1);
2269 /* At least the UNICOS C compiler choked on this
2270 * being argument to DEBUG_r(), so let's just have
2273 #ifdef PERL_EXT_RE_BUILD
2279 regnode *fix = convert;
2280 U32 word = trie->wordcount;
2282 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2283 while( ++fix < n ) {
2284 Set_Node_Offset_Length(fix, 0, 0);
2287 SV ** const tmp = av_fetch( trie_words, word, 0 );
2289 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2290 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2292 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2300 NEXT_OFF(convert) = (U16)(tail - convert);
2301 DEBUG_r(optimize= n);
2307 if ( trie->maxlen ) {
2308 NEXT_OFF( convert ) = (U16)(tail - convert);
2309 ARG_SET( convert, data_slot );
2310 /* Store the offset to the first unabsorbed branch in
2311 jump[0], which is otherwise unused by the jump logic.
2312 We use this when dumping a trie and during optimisation. */
2314 trie->jump[0] = (U16)(nextbranch - convert);
2316 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2317 * and there is a bitmap
2318 * and the first "jump target" node we found leaves enough room
2319 * then convert the TRIE node into a TRIEC node, with the bitmap
2320 * embedded inline in the opcode - this is hypothetically faster.
2322 if ( !trie->states[trie->startstate].wordnum
2324 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2326 OP( convert ) = TRIEC;
2327 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2328 PerlMemShared_free(trie->bitmap);
2331 OP( convert ) = TRIE;
2333 /* store the type in the flags */
2334 convert->flags = nodetype;
2338 + regarglen[ OP( convert ) ];
2340 /* XXX We really should free up the resource in trie now,
2341 as we won't use them - (which resources?) dmq */
2343 /* needed for dumping*/
2344 DEBUG_r(if (optimize) {
2345 regnode *opt = convert;
2347 while ( ++opt < optimize) {
2348 Set_Node_Offset_Length(opt,0,0);
2351 Try to clean up some of the debris left after the
2354 while( optimize < jumper ) {
2355 mjd_nodelen += Node_Length((optimize));
2356 OP( optimize ) = OPTIMIZED;
2357 Set_Node_Offset_Length(optimize,0,0);
2360 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2362 } /* end node insert */
2363 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2365 /* Finish populating the prev field of the wordinfo array. Walk back
2366 * from each accept state until we find another accept state, and if
2367 * so, point the first word's .prev field at the second word. If the
2368 * second already has a .prev field set, stop now. This will be the
2369 * case either if we've already processed that word's accept state,
2370 * or that state had multiple words, and the overspill words were
2371 * already linked up earlier.
2378 for (word=1; word <= trie->wordcount; word++) {
2380 if (trie->wordinfo[word].prev)
2382 state = trie->wordinfo[word].accept;
2384 state = prev_states[state];
2387 prev = trie->states[state].wordnum;
2391 trie->wordinfo[word].prev = prev;
2393 Safefree(prev_states);
2397 /* and now dump out the compressed format */
2398 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2400 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2402 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2403 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2405 SvREFCNT_dec(revcharmap);
2409 : trie->startstate>1
2415 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2417 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2419 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2420 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2423 We find the fail state for each state in the trie, this state is the longest proper
2424 suffix of the current state's 'word' that is also a proper prefix of another word in our
2425 trie. State 1 represents the word '' and is thus the default fail state. This allows
2426 the DFA not to have to restart after its tried and failed a word at a given point, it
2427 simply continues as though it had been matching the other word in the first place.
2429 'abcdgu'=~/abcdefg|cdgu/
2430 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2431 fail, which would bring us to the state representing 'd' in the second word where we would
2432 try 'g' and succeed, proceeding to match 'cdgu'.
2434 /* add a fail transition */
2435 const U32 trie_offset = ARG(source);
2436 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2438 const U32 ucharcount = trie->uniquecharcount;
2439 const U32 numstates = trie->statecount;
2440 const U32 ubound = trie->lasttrans + ucharcount;
2444 U32 base = trie->states[ 1 ].trans.base;
2447 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2448 GET_RE_DEBUG_FLAGS_DECL;
2450 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2452 PERL_UNUSED_ARG(depth);
2456 ARG_SET( stclass, data_slot );
2457 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2458 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2459 aho->trie=trie_offset;
2460 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2461 Copy( trie->states, aho->states, numstates, reg_trie_state );
2462 Newxz( q, numstates, U32);
2463 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2466 /* initialize fail[0..1] to be 1 so that we always have
2467 a valid final fail state */
2468 fail[ 0 ] = fail[ 1 ] = 1;
2470 for ( charid = 0; charid < ucharcount ; charid++ ) {
2471 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2473 q[ q_write ] = newstate;
2474 /* set to point at the root */
2475 fail[ q[ q_write++ ] ]=1;
2478 while ( q_read < q_write) {
2479 const U32 cur = q[ q_read++ % numstates ];
2480 base = trie->states[ cur ].trans.base;
2482 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2483 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2485 U32 fail_state = cur;
2488 fail_state = fail[ fail_state ];
2489 fail_base = aho->states[ fail_state ].trans.base;
2490 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2492 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2493 fail[ ch_state ] = fail_state;
2494 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2496 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2498 q[ q_write++ % numstates] = ch_state;
2502 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2503 when we fail in state 1, this allows us to use the
2504 charclass scan to find a valid start char. This is based on the principle
2505 that theres a good chance the string being searched contains lots of stuff
2506 that cant be a start char.
2508 fail[ 0 ] = fail[ 1 ] = 0;
2509 DEBUG_TRIE_COMPILE_r({
2510 PerlIO_printf(Perl_debug_log,
2511 "%*sStclass Failtable (%"UVuf" states): 0",
2512 (int)(depth * 2), "", (UV)numstates
2514 for( q_read=1; q_read<numstates; q_read++ ) {
2515 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2517 PerlIO_printf(Perl_debug_log, "\n");
2520 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2525 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2526 * These need to be revisited when a newer toolchain becomes available.
2528 #if defined(__sparc64__) && defined(__GNUC__)
2529 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2530 # undef SPARC64_GCC_WORKAROUND
2531 # define SPARC64_GCC_WORKAROUND 1
2535 #define DEBUG_PEEP(str,scan,depth) \
2536 DEBUG_OPTIMISE_r({if (scan){ \
2537 SV * const mysv=sv_newmortal(); \
2538 regnode *Next = regnext(scan); \
2539 regprop(RExC_rx, mysv, scan); \
2540 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2541 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2542 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2546 /* The below joins as many adjacent EXACTish nodes as possible into a single
2547 * one, and looks for problematic sequences of characters whose folds vs.
2548 * non-folds have sufficiently different lengths, that the optimizer would be
2549 * fooled into rejecting legitimate matches of them, and the trie construction
2550 * code can't cope with them. The joining is only done if:
2551 * 1) there is room in the current conglomerated node to entirely contain the
2553 * 2) they are the exact same node type
2555 * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2556 * these get optimized out
2558 * If there are problematic code sequences, *min_subtract is set to the delta
2559 * that the minimum size of the node can be less than its actual size. And,
2560 * the node type of the result is changed to reflect that it contains these
2563 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2564 * and contains LATIN SMALL LETTER SHARP S
2566 * This is as good a place as any to discuss the design of handling these
2567 * problematic sequences. It's been wrong in Perl for a very long time. There
2568 * are three code points in Unicode whose folded lengths differ so much from
2569 * the un-folded lengths that it causes problems for the optimizer and trie
2570 * construction. Why only these are problematic, and not others where lengths
2571 * also differ is something I (khw) do not understand. New versions of Unicode
2572 * might add more such code points. Hopefully the logic in fold_grind.t that
2573 * figures out what to test (in part by verifying that each size-combination
2574 * gets tested) will catch any that do come along, so they can be added to the
2575 * special handling below. The chances of new ones are actually rather small,
2576 * as most, if not all, of the world's scripts that have casefolding have
2577 * already been encoded by Unicode. Also, a number of Unicode's decisions were
2578 * made to allow compatibility with pre-existing standards, and almost all of
2579 * those have already been dealt with. These would otherwise be the most
2580 * likely candidates for generating further tricky sequences. In other words,
2581 * Unicode by itself is unlikely to add new ones unless it is for compatibility
2582 * with pre-existing standards, and there aren't many of those left.
2584 * The previous designs for dealing with these involved assigning a special
2585 * node for them. This approach doesn't work, as evidenced by this example:
2586 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2587 * Both these fold to "sss", but if the pattern is parsed to create a node of
2588 * that would match just the \xDF, it won't be able to handle the case where a
2589 * successful match would have to cross the node's boundary. The new approach
2590 * that hopefully generally solves the problem generates an EXACTFU_SS node
2593 * There are a number of components to the approach (a lot of work for just
2594 * three code points!):
2595 * 1) This routine examines each EXACTFish node that could contain the
2596 * problematic sequences. It returns in *min_subtract how much to
2597 * subtract from the the actual length of the string to get a real minimum
2598 * for one that could match it. This number is usually 0 except for the
2599 * problematic sequences. This delta is used by the caller to adjust the
2600 * min length of the match, and the delta between min and max, so that the
2601 * optimizer doesn't reject these possibilities based on size constraints.
2602 * 2) These sequences are not currently correctly handled by the trie code
2603 * either, so it changes the joined node type to ops that are not handled
2604 * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
2605 * 3) This is sufficient for the two Greek sequences (described below), but
2606 * the one involving the Sharp s (\xDF) needs more. The node type
2607 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2608 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2609 * case where there is a possible fold length change. That means that a
2610 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2611 * itself with length changes, and so can be processed faster. regexec.c
2612 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2613 * is pre-folded by regcomp.c. This saves effort in regex matching.
2614 * However, probably mostly for historical reasons, the pre-folding isn't
2615 * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2616 * nodes, as what they fold to isn't known until runtime.) The fold
2617 * possibilities for the non-UTF8 patterns are quite simple, except for
2618 * the sharp s. All the ones that don't involve a UTF-8 target string
2619 * are members of a fold-pair, and arrays are set up for all of them
2620 * that quickly find the other member of the pair. It might actually
2621 * be faster to pre-fold these, but it isn't currently done, except for
2622 * the sharp s. Code elsewhere in this file makes sure that it gets
2623 * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
2624 * issues described in the next item.
2625 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2626 * 'ss' or not is not knowable at compile time. It will match iff the
2627 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2628 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2629 * it can't be folded to "ss" at compile time, unlike EXACTFU does as
2630 * described in item 3). An assumption that the optimizer part of
2631 * regexec.c (probably unwittingly) makes is that a character in the
2632 * pattern corresponds to at most a single character in the target string.
2633 * (And I do mean character, and not byte here, unlike other parts of the
2634 * documentation that have never been updated to account for multibyte
2635 * Unicode.) This assumption is wrong only in this case, as all other
2636 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2637 * virtue of having this file pre-fold UTF-8 patterns. I'm
2638 * reluctant to try to change this assumption, so instead the code punts.
2639 * This routine examines EXACTF nodes for the sharp s, and returns a
2640 * boolean indicating whether or not the node is an EXACTF node that
2641 * contains a sharp s. When it is true, the caller sets a flag that later
2642 * causes the optimizer in this file to not set values for the floating
2643 * and fixed string lengths, and thus avoids the optimizer code in
2644 * regexec.c that makes the invalid assumption. Thus, there is no
2645 * optimization based on string lengths for EXACTF nodes that contain the
2646 * sharp s. This only happens for /id rules (which means the pattern
2650 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2651 if (PL_regkind[OP(scan)] == EXACT) \
2652 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2655 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2656 /* Merge several consecutive EXACTish nodes into one. */
2657 regnode *n = regnext(scan);
2659 regnode *next = scan + NODE_SZ_STR(scan);
2663 regnode *stop = scan;
2664 GET_RE_DEBUG_FLAGS_DECL;
2666 PERL_UNUSED_ARG(depth);
2669 PERL_ARGS_ASSERT_JOIN_EXACT;
2670 #ifndef EXPERIMENTAL_INPLACESCAN
2671 PERL_UNUSED_ARG(flags);
2672 PERL_UNUSED_ARG(val);
2674 DEBUG_PEEP("join",scan,depth);
2676 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2677 * EXACT ones that are mergeable to the current one. */
2679 && (PL_regkind[OP(n)] == NOTHING
2680 || (stringok && OP(n) == OP(scan)))
2682 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2685 if (OP(n) == TAIL || n > next)
2687 if (PL_regkind[OP(n)] == NOTHING) {
2688 DEBUG_PEEP("skip:",n,depth);
2689 NEXT_OFF(scan) += NEXT_OFF(n);
2690 next = n + NODE_STEP_REGNODE;
2697 else if (stringok) {
2698 const unsigned int oldl = STR_LEN(scan);
2699 regnode * const nnext = regnext(n);
2701 if (oldl + STR_LEN(n) > U8_MAX)
2704 DEBUG_PEEP("merg",n,depth);
2707 NEXT_OFF(scan) += NEXT_OFF(n);
2708 STR_LEN(scan) += STR_LEN(n);
2709 next = n + NODE_SZ_STR(n);
2710 /* Now we can overwrite *n : */
2711 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2719 #ifdef EXPERIMENTAL_INPLACESCAN
2720 if (flags && !NEXT_OFF(n)) {
2721 DEBUG_PEEP("atch", val, depth);
2722 if (reg_off_by_arg[OP(n)]) {
2723 ARG_SET(n, val - n);
2726 NEXT_OFF(n) = val - n;
2734 *has_exactf_sharp_s = FALSE;
2736 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2737 * can now analyze for sequences of problematic code points. (Prior to
2738 * this final joining, sequences could have been split over boundaries, and
2739 * hence missed). The sequences only happen in folding, hence for any
2740 * non-EXACT EXACTish node */
2741 if (OP(scan) != EXACT) {
2743 U8 * s0 = (U8*) STRING(scan);
2744 U8 * const s_end = s0 + STR_LEN(scan);
2746 /* The below is perhaps overboard, but this allows us to save a test
2747 * each time through the loop at the expense of a mask. This is
2748 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2749 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2750 * This uses an exclusive 'or' to find that bit and then inverts it to
2751 * form a mask, with just a single 0, in the bit position where 'S' and
2753 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2754 const U8 s_masked = 's' & S_or_s_mask;
2756 /* One pass is made over the node's string looking for all the
2757 * possibilities. to avoid some tests in the loop, there are two main
2758 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2762 /* There are two problematic Greek code points in Unicode
2765 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2766 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2772 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2773 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2775 * This means that in case-insensitive matching (or "loose
2776 * matching", as Unicode calls it), an EXACTF of length six (the
2777 * UTF-8 encoded byte length of the above casefolded versions) can
2778 * match a target string of length two (the byte length of UTF-8
2779 * encoded U+0390 or U+03B0). This would rather mess up the
2780 * minimum length computation. (there are other code points that
2781 * also fold to these two sequences, but the delta is smaller)
2783 * If these sequences are found, the minimum length is decreased by
2784 * four (six minus two).
2786 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2787 * LETTER SHARP S. We decrease the min length by 1 for each
2788 * occurrence of 'ss' found */
2790 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2791 # define U390_first_byte 0xb4
2792 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2793 # define U3B0_first_byte 0xb5
2794 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2796 # define U390_first_byte 0xce
2797 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2798 # define U3B0_first_byte 0xcf
2799 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2801 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2802 yields a net of 0 */
2803 /* Examine the string for one of the problematic sequences */
2805 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2806 * sequence we are looking for is 2 */
2810 /* Look for the first byte in each problematic sequence */
2812 /* We don't have to worry about other things that fold to
2813 * 's' (such as the long s, U+017F), as all above-latin1
2814 * code points have been pre-folded */
2818 /* Current character is an 's' or 'S'. If next one is
2819 * as well, we have the dreaded sequence */
2820 if (((*(s+1) & S_or_s_mask) == s_masked)
2821 /* These two node types don't have special handling
2823 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2826 OP(scan) = EXACTFU_SS;
2827 s++; /* No need to look at this character again */
2831 case U390_first_byte:
2832 if (s_end - s >= len
2834 /* The 1's are because are skipping comparing the
2836 && memEQ(s + 1, U390_tail, len - 1))
2838 goto greek_sequence;
2842 case U3B0_first_byte:
2843 if (! (s_end - s >= len
2844 && memEQ(s + 1, U3B0_tail, len - 1)))
2851 /* This can't currently be handled by trie's, so change
2852 * the node type to indicate this. If EXACTFA and
2853 * EXACTFL were ever to be handled by trie's, this
2854 * would have to be changed. If this node has already
2855 * been changed to EXACTFU_SS in this loop, leave it as
2856 * is. (I (khw) think it doesn't matter in regexec.c
2857 * for UTF patterns, but no need to change it */
2858 if (OP(scan) == EXACTFU) {
2859 OP(scan) = EXACTFU_TRICKYFOLD;
2861 s += 6; /* We already know what this sequence is. Skip
2867 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2869 /* Here, the pattern is not UTF-8. We need to look only for the
2870 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2871 * in the final position. Otherwise we can stop looking 1 byte
2872 * earlier because have to find both the first and second 's' */
2873 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2875 for (s = s0; s < upper; s++) {
2880 && ((*(s+1) & S_or_s_mask) == s_masked))
2884 /* EXACTF nodes need to know that the minimum
2885 * length changed so that a sharp s in the string
2886 * can match this ss in the pattern, but they
2887 * remain EXACTF nodes, as they are not trie'able,
2888 * so don't have to invent a new node type to
2889 * exclude them from the trie code */
2890 if (OP(scan) != EXACTF) {
2891 OP(scan) = EXACTFU_SS;
2896 case LATIN_SMALL_LETTER_SHARP_S:
2897 if (OP(scan) == EXACTF) {
2898 *has_exactf_sharp_s = TRUE;
2907 /* Allow dumping but overwriting the collection of skipped
2908 * ops and/or strings with fake optimized ops */
2909 n = scan + NODE_SZ_STR(scan);
2917 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2921 /* REx optimizer. Converts nodes into quicker variants "in place".
2922 Finds fixed substrings. */
2924 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2925 to the position after last scanned or to NULL. */
2927 #define INIT_AND_WITHP \
2928 assert(!and_withp); \
2929 Newx(and_withp,1,struct regnode_charclass_class); \
2930 SAVEFREEPV(and_withp)
2932 /* this is a chain of data about sub patterns we are processing that
2933 need to be handled separately/specially in study_chunk. Its so
2934 we can simulate recursion without losing state. */
2936 typedef struct scan_frame {
2937 regnode *last; /* last node to process in this frame */
2938 regnode *next; /* next node to process when last is reached */
2939 struct scan_frame *prev; /*previous frame*/
2940 I32 stop; /* what stopparen do we use */
2944 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2946 #define CASE_SYNST_FNC(nAmE) \
2948 if (flags & SCF_DO_STCLASS_AND) { \
2949 for (value = 0; value < 256; value++) \
2950 if (!is_ ## nAmE ## _cp(value)) \
2951 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2954 for (value = 0; value < 256; value++) \
2955 if (is_ ## nAmE ## _cp(value)) \
2956 ANYOF_BITMAP_SET(data->start_class, value); \
2960 if (flags & SCF_DO_STCLASS_AND) { \
2961 for (value = 0; value < 256; value++) \
2962 if (is_ ## nAmE ## _cp(value)) \
2963 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2966 for (value = 0; value < 256; value++) \
2967 if (!is_ ## nAmE ## _cp(value)) \
2968 ANYOF_BITMAP_SET(data->start_class, value); \
2975 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2976 I32 *minlenp, I32 *deltap,
2981 struct regnode_charclass_class *and_withp,
2982 U32 flags, U32 depth)
2983 /* scanp: Start here (read-write). */
2984 /* deltap: Write maxlen-minlen here. */
2985 /* last: Stop before this one. */
2986 /* data: string data about the pattern */
2987 /* stopparen: treat close N as END */
2988 /* recursed: which subroutines have we recursed into */
2989 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2992 I32 min = 0, pars = 0, code;
2993 regnode *scan = *scanp, *next;
2995 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2996 int is_inf_internal = 0; /* The studied chunk is infinite */
2997 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2998 scan_data_t data_fake;
2999 SV *re_trie_maxbuff = NULL;
3000 regnode *first_non_open = scan;
3001 I32 stopmin = I32_MAX;
3002 scan_frame *frame = NULL;
3003 GET_RE_DEBUG_FLAGS_DECL;
3005 PERL_ARGS_ASSERT_STUDY_CHUNK;
3008 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3012 while (first_non_open && OP(first_non_open) == OPEN)
3013 first_non_open=regnext(first_non_open);
3018 while ( scan && OP(scan) != END && scan < last ){
3019 UV min_subtract = 0; /* How much to subtract from the minimum node
3020 length to get a real minimum (because the
3021 folded version may be shorter) */
3022 bool has_exactf_sharp_s = FALSE;
3023 /* Peephole optimizer: */
3024 DEBUG_STUDYDATA("Peep:", data,depth);
3025 DEBUG_PEEP("Peep",scan,depth);
3027 /* Its not clear to khw or hv why this is done here, and not in the
3028 * clauses that deal with EXACT nodes. khw's guess is that it's
3029 * because of a previous design */
3030 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3032 /* Follow the next-chain of the current node and optimize
3033 away all the NOTHINGs from it. */
3034 if (OP(scan) != CURLYX) {
3035 const int max = (reg_off_by_arg[OP(scan)]
3037 /* I32 may be smaller than U16 on CRAYs! */
3038 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3039 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3043 /* Skip NOTHING and LONGJMP. */
3044 while ((n = regnext(n))
3045 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3046 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3047 && off + noff < max)
3049 if (reg_off_by_arg[OP(scan)])
3052 NEXT_OFF(scan) = off;
3057 /* The principal pseudo-switch. Cannot be a switch, since we
3058 look into several different things. */
3059 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3060 || OP(scan) == IFTHEN) {
3061 next = regnext(scan);
3063 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3065 if (OP(next) == code || code == IFTHEN) {
3066 /* NOTE - There is similar code to this block below for handling
3067 TRIE nodes on a re-study. If you change stuff here check there
3069 I32 max1 = 0, min1 = I32_MAX, num = 0;
3070 struct regnode_charclass_class accum;
3071 regnode * const startbranch=scan;
3073 if (flags & SCF_DO_SUBSTR)
3074 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3075 if (flags & SCF_DO_STCLASS)
3076 cl_init_zero(pRExC_state, &accum);
3078 while (OP(scan) == code) {
3079 I32 deltanext, minnext, f = 0, fake;
3080 struct regnode_charclass_class this_class;
3083 data_fake.flags = 0;
3085 data_fake.whilem_c = data->whilem_c;
3086 data_fake.last_closep = data->last_closep;
3089 data_fake.last_closep = &fake;
3091 data_fake.pos_delta = delta;
3092 next = regnext(scan);
3093 scan = NEXTOPER(scan);
3095 scan = NEXTOPER(scan);
3096 if (flags & SCF_DO_STCLASS) {
3097 cl_init(pRExC_state, &this_class);
3098 data_fake.start_class = &this_class;
3099 f = SCF_DO_STCLASS_AND;
3101 if (flags & SCF_WHILEM_VISITED_POS)
3102 f |= SCF_WHILEM_VISITED_POS;
3104 /* we suppose the run is continuous, last=next...*/
3105 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3107 stopparen, recursed, NULL, f,depth+1);
3110 if (max1 < minnext + deltanext)
3111 max1 = minnext + deltanext;
3112 if (deltanext == I32_MAX)
3113 is_inf = is_inf_internal = 1;
3115 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3117 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3118 if ( stopmin > minnext)
3119 stopmin = min + min1;
3120 flags &= ~SCF_DO_SUBSTR;
3122 data->flags |= SCF_SEEN_ACCEPT;
3125 if (data_fake.flags & SF_HAS_EVAL)
3126 data->flags |= SF_HAS_EVAL;
3127 data->whilem_c = data_fake.whilem_c;
3129 if (flags & SCF_DO_STCLASS)
3130 cl_or(pRExC_state, &accum, &this_class);
3132 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3134 if (flags & SCF_DO_SUBSTR) {
3135 data->pos_min += min1;
3136 data->pos_delta += max1 - min1;
3137 if (max1 != min1 || is_inf)
3138 data->longest = &(data->longest_float);
3141 delta += max1 - min1;
3142 if (flags & SCF_DO_STCLASS_OR) {
3143 cl_or(pRExC_state, data->start_class, &accum);
3145 cl_and(data->start_class, and_withp);
3146 flags &= ~SCF_DO_STCLASS;
3149 else if (flags & SCF_DO_STCLASS_AND) {
3151 cl_and(data->start_class, &accum);
3152 flags &= ~SCF_DO_STCLASS;
3155 /* Switch to OR mode: cache the old value of
3156 * data->start_class */
3158 StructCopy(data->start_class, and_withp,
3159 struct regnode_charclass_class);
3160 flags &= ~SCF_DO_STCLASS_AND;
3161 StructCopy(&accum, data->start_class,
3162 struct regnode_charclass_class);
3163 flags |= SCF_DO_STCLASS_OR;
3164 data->start_class->flags |= ANYOF_EOS;
3168 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3171 Assuming this was/is a branch we are dealing with: 'scan' now
3172 points at the item that follows the branch sequence, whatever
3173 it is. We now start at the beginning of the sequence and look
3180 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3182 If we can find such a subsequence we need to turn the first
3183 element into a trie and then add the subsequent branch exact
3184 strings to the trie.
3188 1. patterns where the whole set of branches can be converted.
3190 2. patterns where only a subset can be converted.
3192 In case 1 we can replace the whole set with a single regop
3193 for the trie. In case 2 we need to keep the start and end
3196 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3197 becomes BRANCH TRIE; BRANCH X;
3199 There is an additional case, that being where there is a
3200 common prefix, which gets split out into an EXACT like node
3201 preceding the TRIE node.
3203 If x(1..n)==tail then we can do a simple trie, if not we make
3204 a "jump" trie, such that when we match the appropriate word
3205 we "jump" to the appropriate tail node. Essentially we turn
3206 a nested if into a case structure of sorts.
3211 if (!re_trie_maxbuff) {
3212 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3213 if (!SvIOK(re_trie_maxbuff))
3214 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3216 if ( SvIV(re_trie_maxbuff)>=0 ) {
3218 regnode *first = (regnode *)NULL;
3219 regnode *last = (regnode *)NULL;
3220 regnode *tail = scan;
3225 SV * const mysv = sv_newmortal(); /* for dumping */
3227 /* var tail is used because there may be a TAIL
3228 regop in the way. Ie, the exacts will point to the
3229 thing following the TAIL, but the last branch will
3230 point at the TAIL. So we advance tail. If we
3231 have nested (?:) we may have to move through several
3235 while ( OP( tail ) == TAIL ) {
3236 /* this is the TAIL generated by (?:) */
3237 tail = regnext( tail );
3242 regprop(RExC_rx, mysv, tail );
3243 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3244 (int)depth * 2 + 2, "",
3245 "Looking for TRIE'able sequences. Tail node is: ",
3246 SvPV_nolen_const( mysv )
3252 Step through the branches
3253 cur represents each branch,
3254 noper is the first thing to be matched as part of that branch
3255 noper_next is the regnext() of that node.
3257 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3258 via a "jump trie" but we also support building with NOJUMPTRIE,
3259 which restricts the trie logic to structures like /FOO|BAR/.
3261 If noper is a trieable nodetype then the branch is a possible optimization
3262 target. If we are building under NOJUMPTRIE then we require that noper_next
3263 is the same as scan (our current position in the regex program).
3265 Once we have two or more consecutive such branches we can create a
3266 trie of the EXACT's contents and stitch it in place into the program.
3268 If the sequence represents all of the branches in the alternation we
3269 replace the entire thing with a single TRIE node.
3271 Otherwise when it is a subsequence we need to stitch it in place and
3272 replace only the relevant branches. This means the first branch has
3273 to remain as it is used by the alternation logic, and its next pointer,
3274 and needs to be repointed at the item on the branch chain following
3275 the last branch we have optimized away.
3277 This could be either a BRANCH, in which case the subsequence is internal,
3278 or it could be the item following the branch sequence in which case the
3279 subsequence is at the end (which does not necessarily mean the first node
3280 is the start of the alternation).
3282 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3285 ----------------+-----------
3289 EXACTFU_SS | EXACTFU
3290 EXACTFU_TRICKYFOLD | EXACTFU
3295 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3296 ( EXACT == (X) ) ? EXACT : \
3297 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3300 /* dont use tail as the end marker for this traverse */
3301 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3302 regnode * const noper = NEXTOPER( cur );
3303 U8 noper_type = OP( noper );
3304 U8 noper_trietype = TRIE_TYPE( noper_type );
3305 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3306 regnode * const noper_next = regnext( noper );
3310 regprop(RExC_rx, mysv, cur);
3311 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3312 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3314 regprop(RExC_rx, mysv, noper);
3315 PerlIO_printf( Perl_debug_log, " -> %s",
3316 SvPV_nolen_const(mysv));
3319 regprop(RExC_rx, mysv, noper_next );
3320 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3321 SvPV_nolen_const(mysv));
3323 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3324 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3327 /* Is noper a trieable nodetype that can be merged with the
3328 * current trie (if there is one)? */
3332 /* XXX: Currently we cannot allow a NOTHING node to be the first element
3333 * of a TRIEABLE sequence, Otherwise we will overwrite the regop following
3334 * the NOTHING with the TRIE regop later on. This is because a NOTHING node
3335 * is only one regnode wide, and a TRIE is two regnodes. An example of a
3336 * problematic pattern is: "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/
3337 * At a later point of time we can somewhat workaround this by handling
3338 * NOTHING -> EXACT sequences as generated by /(?:)A|(?:)B/ type patterns,
3339 * as we can effectively ignore the NOTHING regop in that case.
3340 * This clause, which allows NOTHING to start a sequence is left commented
3341 * out as a reference.
3344 ( noper_trietype == NOTHING)
3345 || ( trietype == NOTHING )
3347 ( noper_trietype == NOTHING && trietype )
3348 || ( trietype == noper_trietype )
3351 && noper_next == tail
3355 /* Handle mergable triable node
3356 * Either we are the first node in a new trieable sequence,
3357 * in which case we do some bookkeeping, otherwise we update
3358 * the end pointer. */
3362 trietype = noper_trietype;
3364 if ( trietype == NOTHING )
3365 trietype = noper_trietype;
3368 } /* end handle mergable triable node */
3370 /* handle unmergable node -
3371 * noper may either be a triable node which can not be tried
3372 * together with the current trie, or a non triable node */
3374 /* If last is set and trietype is not NOTHING then we have found
3375 * at least two triable branch sequences in a row of a similar
3376 * trietype so we can turn them into a trie. If/when we
3377 * allow NOTHING to start a trie sequence this condition will be
3378 * required, and it isn't expensive so we leave it in for now. */
3379 if ( trietype != NOTHING )
3380 make_trie( pRExC_state,
3381 startbranch, first, cur, tail, count,
3382 trietype, depth+1 );
3383 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3387 && noper_next == tail
3390 /* noper is triable, so we can start a new trie sequence */
3393 trietype = noper_trietype;
3395 /* if we already saw a first but the current node is not triable then we have
3396 * to reset the first information. */
3401 } /* end handle unmergable node */
3402 } /* loop over branches */
3404 regprop(RExC_rx, mysv, cur);
3405 PerlIO_printf( Perl_debug_log,
3406 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3407 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3410 if ( last && trietype != NOTHING ) {
3411 /* the last branch of the sequence was part of a trie,
3412 * so we have to construct it here outside of the loop
3414 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3415 #ifdef TRIE_STUDY_OPT
3416 if ( ((made == MADE_EXACT_TRIE &&
3417 startbranch == first)
3418 || ( first_non_open == first )) &&
3420 flags |= SCF_TRIE_RESTUDY;
3421 if ( startbranch == first
3424 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3428 } /* end if ( last) */
3429 } /* TRIE_MAXBUF is non zero */
3434 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3435 scan = NEXTOPER(NEXTOPER(scan));
3436 } else /* single branch is optimized. */
3437 scan = NEXTOPER(scan);
3439 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3440 scan_frame *newframe = NULL;
3445 if (OP(scan) != SUSPEND) {
3446 /* set the pointer */
3447 if (OP(scan) == GOSUB) {
3449 RExC_recurse[ARG2L(scan)] = scan;
3450 start = RExC_open_parens[paren-1];
3451 end = RExC_close_parens[paren-1];
3454 start = RExC_rxi->program + 1;
3458 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3459 SAVEFREEPV(recursed);
3461 if (!PAREN_TEST(recursed,paren+1)) {
3462 PAREN_SET(recursed,paren+1);
3463 Newx(newframe,1,scan_frame);
3465 if (flags & SCF_DO_SUBSTR) {
3466 SCAN_COMMIT(pRExC_state,data,minlenp);
3467 data->longest = &(data->longest_float);
3469 is_inf = is_inf_internal = 1;
3470 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3471 cl_anything(pRExC_state, data->start_class);
3472 flags &= ~SCF_DO_STCLASS;
3475 Newx(newframe,1,scan_frame);
3478 end = regnext(scan);
3483 SAVEFREEPV(newframe);
3484 newframe->next = regnext(scan);
3485 newframe->last = last;
3486 newframe->stop = stopparen;
3487 newframe->prev = frame;
3497 else if (OP(scan) == EXACT) {
3498 I32 l = STR_LEN(scan);
3501 const U8 * const s = (U8*)STRING(scan);
3502 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3503 l = utf8_length(s, s + l);
3505 uc = *((U8*)STRING(scan));
3508 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3509 /* The code below prefers earlier match for fixed
3510 offset, later match for variable offset. */
3511 if (data->last_end == -1) { /* Update the start info. */
3512 data->last_start_min = data->pos_min;
3513 data->last_start_max = is_inf
3514 ? I32_MAX : data->pos_min + data->pos_delta;
3516 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3518 SvUTF8_on(data->last_found);
3520 SV * const sv = data->last_found;
3521 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3522 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3523 if (mg && mg->mg_len >= 0)
3524 mg->mg_len += utf8_length((U8*)STRING(scan),
3525 (U8*)STRING(scan)+STR_LEN(scan));
3527 data->last_end = data->pos_min + l;
3528 data->pos_min += l; /* As in the first entry. */
3529 data->flags &= ~SF_BEFORE_EOL;
3531 if (flags & SCF_DO_STCLASS_AND) {
3532 /* Check whether it is compatible with what we know already! */
3536 /* If compatible, we or it in below. It is compatible if is
3537 * in the bitmp and either 1) its bit or its fold is set, or 2)
3538 * it's for a locale. Even if there isn't unicode semantics
3539 * here, at runtime there may be because of matching against a
3540 * utf8 string, so accept a possible false positive for
3541 * latin1-range folds */
3543 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3544 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3545 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3546 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3551 ANYOF_CLASS_ZERO(data->start_class);
3552 ANYOF_BITMAP_ZERO(data->start_class);
3554 ANYOF_BITMAP_SET(data->start_class, uc);
3555 else if (uc >= 0x100) {
3558 /* Some Unicode code points fold to the Latin1 range; as
3559 * XXX temporary code, instead of figuring out if this is
3560 * one, just assume it is and set all the start class bits
3561 * that could be some such above 255 code point's fold
3562 * which will generate fals positives. As the code
3563 * elsewhere that does compute the fold settles down, it
3564 * can be extracted out and re-used here */
3565 for (i = 0; i < 256; i++){
3566 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3567 ANYOF_BITMAP_SET(data->start_class, i);
3571 data->start_class->flags &= ~ANYOF_EOS;
3573 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3575 else if (flags & SCF_DO_STCLASS_OR) {
3576 /* false positive possible if the class is case-folded */
3578 ANYOF_BITMAP_SET(data->start_class, uc);
3580 data->start_class->flags |= ANYOF_UNICODE_ALL;
3581 data->start_class->flags &= ~ANYOF_EOS;
3582 cl_and(data->start_class, and_withp);
3584 flags &= ~SCF_DO_STCLASS;
3586 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3587 I32 l = STR_LEN(scan);
3588 UV uc = *((U8*)STRING(scan));
3590 /* Search for fixed substrings supports EXACT only. */
3591 if (flags & SCF_DO_SUBSTR) {
3593 SCAN_COMMIT(pRExC_state, data, minlenp);
3596 const U8 * const s = (U8 *)STRING(scan);
3597 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3598 l = utf8_length(s, s + l);
3600 else if (has_exactf_sharp_s) {
3601 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3603 min += l - min_subtract;
3607 delta += min_subtract;
3608 if (flags & SCF_DO_SUBSTR) {
3609 data->pos_min += l - min_subtract;
3610 if (data->pos_min < 0) {
3613 data->pos_delta += min_subtract;
3615 data->longest = &(data->longest_float);
3618 if (flags & SCF_DO_STCLASS_AND) {
3619 /* Check whether it is compatible with what we know already! */
3622 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3623 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3624 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3628 ANYOF_CLASS_ZERO(data->start_class);
3629 ANYOF_BITMAP_ZERO(data->start_class);
3631 ANYOF_BITMAP_SET(data->start_class, uc);
3632 data->start_class->flags &= ~ANYOF_EOS;
3633 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3634 if (OP(scan) == EXACTFL) {
3635 /* XXX This set is probably no longer necessary, and
3636 * probably wrong as LOCALE now is on in the initial
3638 data->start_class->flags |= ANYOF_LOCALE;
3642 /* Also set the other member of the fold pair. In case
3643 * that unicode semantics is called for at runtime, use
3644 * the full latin1 fold. (Can't do this for locale,
3645 * because not known until runtime) */
3646 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3648 /* All other (EXACTFL handled above) folds except under
3649 * /iaa that include s, S, and sharp_s also may include
3651 if (OP(scan) != EXACTFA) {
3652 if (uc == 's' || uc == 'S') {
3653 ANYOF_BITMAP_SET(data->start_class,
3654 LATIN_SMALL_LETTER_SHARP_S);
3656 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3657 ANYOF_BITMAP_SET(data->start_class, 's');
3658 ANYOF_BITMAP_SET(data->start_class, 'S');
3663 else if (uc >= 0x100) {
3665 for (i = 0; i < 256; i++){
3666 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3667 ANYOF_BITMAP_SET(data->start_class, i);
3672 else if (flags & SCF_DO_STCLASS_OR) {
3673 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3674 /* false positive possible if the class is case-folded.
3675 Assume that the locale settings are the same... */
3677 ANYOF_BITMAP_SET(data->start_class, uc);
3678 if (OP(scan) != EXACTFL) {
3680 /* And set the other member of the fold pair, but
3681 * can't do that in locale because not known until
3683 ANYOF_BITMAP_SET(data->start_class,
3684 PL_fold_latin1[uc]);
3686 /* All folds except under /iaa that include s, S,
3687 * and sharp_s also may include the others */
3688 if (OP(scan) != EXACTFA) {
3689 if (uc == 's' || uc == 'S') {
3690 ANYOF_BITMAP_SET(data->start_class,
3691 LATIN_SMALL_LETTER_SHARP_S);
3693 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3694 ANYOF_BITMAP_SET(data->start_class, 's');
3695 ANYOF_BITMAP_SET(data->start_class, 'S');
3700 data->start_class->flags &= ~ANYOF_EOS;
3702 cl_and(data->start_class, and_withp);
3704 flags &= ~SCF_DO_STCLASS;
3706 else if (REGNODE_VARIES(OP(scan))) {
3707 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3708 I32 f = flags, pos_before = 0;
3709 regnode * const oscan = scan;
3710 struct regnode_charclass_class this_class;
3711 struct regnode_charclass_class *oclass = NULL;
3712 I32 next_is_eval = 0;
3714 switch (PL_regkind[OP(scan)]) {
3715 case WHILEM: /* End of (?:...)* . */
3716 scan = NEXTOPER(scan);
3719 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3720 next = NEXTOPER(scan);
3721 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3723 maxcount = REG_INFTY;
3724 next = regnext(scan);
3725 scan = NEXTOPER(scan);
3729 if (flags & SCF_DO_SUBSTR)
3734 if (flags & SCF_DO_STCLASS) {
3736 maxcount = REG_INFTY;
3737 next = regnext(scan);
3738 scan = NEXTOPER(scan);
3741 is_inf = is_inf_internal = 1;
3742 scan = regnext(scan);
3743 if (flags & SCF_DO_SUBSTR) {
3744 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3745 data->longest = &(data->longest_float);
3747 goto optimize_curly_tail;
3749 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3750 && (scan->flags == stopparen))
3755 mincount = ARG1(scan);
3756 maxcount = ARG2(scan);
3758 next = regnext(scan);
3759 if (OP(scan) == CURLYX) {
3760 I32 lp = (data ? *(data->last_closep) : 0);
3761 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3763 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3764 next_is_eval = (OP(scan) == EVAL);
3766 if (flags & SCF_DO_SUBSTR) {
3767 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3768 pos_before = data->pos_min;
3772 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3774 data->flags |= SF_IS_INF;
3776 if (flags & SCF_DO_STCLASS) {
3777 cl_init(pRExC_state, &this_class);
3778 oclass = data->start_class;
3779 data->start_class = &this_class;
3780 f |= SCF_DO_STCLASS_AND;
3781 f &= ~SCF_DO_STCLASS_OR;
3783 /* Exclude from super-linear cache processing any {n,m}
3784 regops for which the combination of input pos and regex
3785 pos is not enough information to determine if a match
3788 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3789 regex pos at the \s*, the prospects for a match depend not
3790 only on the input position but also on how many (bar\s*)
3791 repeats into the {4,8} we are. */
3792 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3793 f &= ~SCF_WHILEM_VISITED_POS;
3795 /* This will finish on WHILEM, setting scan, or on NULL: */
3796 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3797 last, data, stopparen, recursed, NULL,
3799 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3801 if (flags & SCF_DO_STCLASS)
3802 data->start_class = oclass;
3803 if (mincount == 0 || minnext == 0) {
3804 if (flags & SCF_DO_STCLASS_OR) {
3805 cl_or(pRExC_state, data->start_class, &this_class);
3807 else if (flags & SCF_DO_STCLASS_AND) {
3808 /* Switch to OR mode: cache the old value of
3809 * data->start_class */
3811 StructCopy(data->start_class, and_withp,
3812 struct regnode_charclass_class);
3813 flags &= ~SCF_DO_STCLASS_AND;
3814 StructCopy(&this_class, data->start_class,
3815 struct regnode_charclass_class);
3816 flags |= SCF_DO_STCLASS_OR;
3817 data->start_class->flags |= ANYOF_EOS;
3819 } else { /* Non-zero len */
3820 if (flags & SCF_DO_STCLASS_OR) {
3821 cl_or(pRExC_state, data->start_class, &this_class);
3822 cl_and(data->start_class, and_withp);
3824 else if (flags & SCF_DO_STCLASS_AND)
3825 cl_and(data->start_class, &this_class);
3826 flags &= ~SCF_DO_STCLASS;
3828 if (!scan) /* It was not CURLYX, but CURLY. */
3830 if ( /* ? quantifier ok, except for (?{ ... }) */
3831 (next_is_eval || !(mincount == 0 && maxcount == 1))
3832 && (minnext == 0) && (deltanext == 0)
3833 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3834 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3836 ckWARNreg(RExC_parse,
3837 "Quantifier unexpected on zero-length expression");
3840 min += minnext * mincount;
3841 is_inf_internal |= ((maxcount == REG_INFTY
3842 && (minnext + deltanext) > 0)
3843 || deltanext == I32_MAX);
3844 is_inf |= is_inf_internal;
3845 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3847 /* Try powerful optimization CURLYX => CURLYN. */
3848 if ( OP(oscan) == CURLYX && data
3849 && data->flags & SF_IN_PAR
3850 && !(data->flags & SF_HAS_EVAL)
3851 && !deltanext && minnext == 1 ) {
3852 /* Try to optimize to CURLYN. */
3853 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3854 regnode * const nxt1 = nxt;
3861 if (!REGNODE_SIMPLE(OP(nxt))
3862 && !(PL_regkind[OP(nxt)] == EXACT
3863 && STR_LEN(nxt) == 1))
3869 if (OP(nxt) != CLOSE)
3871 if (RExC_open_parens) {
3872 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3873 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3875 /* Now we know that nxt2 is the only contents: */
3876 oscan->flags = (U8)ARG(nxt);
3878 OP(nxt1) = NOTHING; /* was OPEN. */
3881 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3882 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3883 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3884 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3885 OP(nxt + 1) = OPTIMIZED; /* was count. */
3886 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3891 /* Try optimization CURLYX => CURLYM. */
3892 if ( OP(oscan) == CURLYX && data
3893 && !(data->flags & SF_HAS_PAR)
3894 && !(data->flags & SF_HAS_EVAL)
3895 && !deltanext /* atom is fixed width */
3896 && minnext != 0 /* CURLYM can't handle zero width */
3898 /* XXXX How to optimize if data == 0? */
3899 /* Optimize to a simpler form. */
3900 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3904 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3905 && (OP(nxt2) != WHILEM))
3907 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3908 /* Need to optimize away parenths. */
3909 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3910 /* Set the parenth number. */
3911 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3913 oscan->flags = (U8)ARG(nxt);
3914 if (RExC_open_parens) {
3915 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3916 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3918 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3919 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3922 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3923 OP(nxt + 1) = OPTIMIZED; /* was count. */
3924 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3925 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3928 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3929 regnode *nnxt = regnext(nxt1);
3931 if (reg_off_by_arg[OP(nxt1)])
3932 ARG_SET(nxt1, nxt2 - nxt1);
3933 else if (nxt2 - nxt1 < U16_MAX)
3934 NEXT_OFF(nxt1) = nxt2 - nxt1;
3936 OP(nxt) = NOTHING; /* Cannot beautify */
3941 /* Optimize again: */
3942 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3943 NULL, stopparen, recursed, NULL, 0,depth+1);
3948 else if ((OP(oscan) == CURLYX)
3949 && (flags & SCF_WHILEM_VISITED_POS)
3950 /* See the comment on a similar expression above.
3951 However, this time it's not a subexpression
3952 we care about, but the expression itself. */
3953 && (maxcount == REG_INFTY)
3954 && data && ++data->whilem_c < 16) {
3955 /* This stays as CURLYX, we can put the count/of pair. */
3956 /* Find WHILEM (as in regexec.c) */
3957 regnode *nxt = oscan + NEXT_OFF(oscan);
3959 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3961 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3962 | (RExC_whilem_seen << 4)); /* On WHILEM */
3964 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3966 if (flags & SCF_DO_SUBSTR) {
3967 SV *last_str = NULL;
3968 int counted = mincount != 0;
3970 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3971 #if defined(SPARC64_GCC_WORKAROUND)
3974 const char *s = NULL;
3977 if (pos_before >= data->last_start_min)
3980 b = data->last_start_min;
3983 s = SvPV_const(data->last_found, l);
3984 old = b - data->last_start_min;
3987 I32 b = pos_before >= data->last_start_min
3988 ? pos_before : data->last_start_min;
3990 const char * const s = SvPV_const(data->last_found, l);
3991 I32 old = b - data->last_start_min;
3995 old = utf8_hop((U8*)s, old) - (U8*)s;
3997 /* Get the added string: */
3998 last_str = newSVpvn_utf8(s + old, l, UTF);
3999 if (deltanext == 0 && pos_before == b) {
4000 /* What was added is a constant string */
4002 SvGROW(last_str, (mincount * l) + 1);
4003 repeatcpy(SvPVX(last_str) + l,
4004 SvPVX_const(last_str), l, mincount - 1);
4005 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4006 /* Add additional parts. */
4007 SvCUR_set(data->last_found,
4008 SvCUR(data->last_found) - l);
4009 sv_catsv(data->last_found, last_str);
4011 SV * sv = data->last_found;
4013 SvUTF8(sv) && SvMAGICAL(sv) ?
4014 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4015 if (mg && mg->mg_len >= 0)
4016 mg->mg_len += CHR_SVLEN(last_str) - l;
4018 data->last_end += l * (mincount - 1);
4021 /* start offset must point into the last copy */
4022 data->last_start_min += minnext * (mincount - 1);
4023 data->last_start_max += is_inf ? I32_MAX
4024 : (maxcount - 1) * (minnext + data->pos_delta);
4027 /* It is counted once already... */
4028 data->pos_min += minnext * (mincount - counted);
4029 data->pos_delta += - counted * deltanext +
4030 (minnext + deltanext) * maxcount - minnext * mincount;
4031 if (mincount != maxcount) {
4032 /* Cannot extend fixed substrings found inside
4034 SCAN_COMMIT(pRExC_state,data,minlenp);
4035 if (mincount && last_str) {
4036 SV * const sv = data->last_found;
4037 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4038 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4042 sv_setsv(sv, last_str);
4043 data->last_end = data->pos_min;
4044 data->last_start_min =
4045 data->pos_min - CHR_SVLEN(last_str);
4046 data->last_start_max = is_inf
4048 : data->pos_min + data->pos_delta
4049 - CHR_SVLEN(last_str);
4051 data->longest = &(data->longest_float);
4053 SvREFCNT_dec(last_str);
4055 if (data && (fl & SF_HAS_EVAL))
4056 data->flags |= SF_HAS_EVAL;
4057 optimize_curly_tail:
4058 if (OP(oscan) != CURLYX) {
4059 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4061 NEXT_OFF(oscan) += NEXT_OFF(next);
4064 default: /* REF, ANYOFV, and CLUMP only? */
4065 if (flags & SCF_DO_SUBSTR) {
4066 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4067 data->longest = &(data->longest_float);
4069 is_inf = is_inf_internal = 1;
4070 if (flags & SCF_DO_STCLASS_OR)
4071 cl_anything(pRExC_state, data->start_class);
4072 flags &= ~SCF_DO_STCLASS;
4076 else if (OP(scan) == LNBREAK) {
4077 if (flags & SCF_DO_STCLASS) {
4079 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4080 if (flags & SCF_DO_STCLASS_AND) {
4081 for (value = 0; value < 256; value++)
4082 if (!is_VERTWS_cp(value))
4083 ANYOF_BITMAP_CLEAR(data->start_class, value);
4086 for (value = 0; value < 256; value++)
4087 if (is_VERTWS_cp(value))
4088 ANYOF_BITMAP_SET(data->start_class, value);
4090 if (flags & SCF_DO_STCLASS_OR)
4091 cl_and(data->start_class, and_withp);
4092 flags &= ~SCF_DO_STCLASS;
4096 if (flags & SCF_DO_SUBSTR) {
4097 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4099 data->pos_delta += 1;
4100 data->longest = &(data->longest_float);
4103 else if (REGNODE_SIMPLE(OP(scan))) {
4106 if (flags & SCF_DO_SUBSTR) {
4107 SCAN_COMMIT(pRExC_state,data,minlenp);
4111 if (flags & SCF_DO_STCLASS) {
4112 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4114 /* Some of the logic below assumes that switching
4115 locale on will only add false positives. */
4116 switch (PL_regkind[OP(scan)]) {
4120 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4121 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4122 cl_anything(pRExC_state, data->start_class);
4125 if (OP(scan) == SANY)
4127 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4128 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4129 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4130 cl_anything(pRExC_state, data->start_class);
4132 if (flags & SCF_DO_STCLASS_AND || !value)
4133 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4136 if (flags & SCF_DO_STCLASS_AND)
4137 cl_and(data->start_class,
4138 (struct regnode_charclass_class*)scan);
4140 cl_or(pRExC_state, data->start_class,
4141 (struct regnode_charclass_class*)scan);
4144 if (flags & SCF_DO_STCLASS_AND) {
4145 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4146 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4147 if (OP(scan) == ALNUMU) {
4148 for (value = 0; value < 256; value++) {
4149 if (!isWORDCHAR_L1(value)) {
4150 ANYOF_BITMAP_CLEAR(data->start_class, value);
4154 for (value = 0; value < 256; value++) {
4155 if (!isALNUM(value)) {
4156 ANYOF_BITMAP_CLEAR(data->start_class, value);
4163 if (data->start_class->flags & ANYOF_LOCALE)
4164 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4166 /* Even if under locale, set the bits for non-locale
4167 * in case it isn't a true locale-node. This will
4168 * create false positives if it truly is locale */
4169 if (OP(scan) == ALNUMU) {
4170 for (value = 0; value < 256; value++) {
4171 if (isWORDCHAR_L1(value)) {
4172 ANYOF_BITMAP_SET(data->start_class, value);
4176 for (value = 0; value < 256; value++) {
4177 if (isALNUM(value)) {
4178 ANYOF_BITMAP_SET(data->start_class, value);
4185 if (flags & SCF_DO_STCLASS_AND) {
4186 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4187 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4188 if (OP(scan) == NALNUMU) {
4189 for (value = 0; value < 256; value++) {
4190 if (isWORDCHAR_L1(value)) {
4191 ANYOF_BITMAP_CLEAR(data->start_class, value);
4195 for (value = 0; value < 256; value++) {
4196 if (isALNUM(value)) {
4197 ANYOF_BITMAP_CLEAR(data->start_class, value);
4204 if (data->start_class->flags & ANYOF_LOCALE)
4205 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4207 /* Even if under locale, set the bits for non-locale in
4208 * case it isn't a true locale-node. This will create
4209 * false positives if it truly is locale */
4210 if (OP(scan) == NALNUMU) {
4211 for (value = 0; value < 256; value++) {
4212 if (! isWORDCHAR_L1(value)) {
4213 ANYOF_BITMAP_SET(data->start_class, value);
4217 for (value = 0; value < 256; value++) {
4218 if (! isALNUM(value)) {
4219 ANYOF_BITMAP_SET(data->start_class, value);
4226 if (flags & SCF_DO_STCLASS_AND) {
4227 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4228 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4229 if (OP(scan) == SPACEU) {
4230 for (value = 0; value < 256; value++) {
4231 if (!isSPACE_L1(value)) {
4232 ANYOF_BITMAP_CLEAR(data->start_class, value);
4236 for (value = 0; value < 256; value++) {
4237 if (!isSPACE(value)) {
4238 ANYOF_BITMAP_CLEAR(data->start_class, value);
4245 if (data->start_class->flags & ANYOF_LOCALE) {
4246 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4248 if (OP(scan) == SPACEU) {
4249 for (value = 0; value < 256; value++) {
4250 if (isSPACE_L1(value)) {
4251 ANYOF_BITMAP_SET(data->start_class, value);
4255 for (value = 0; value < 256; value++) {
4256 if (isSPACE(value)) {
4257 ANYOF_BITMAP_SET(data->start_class, value);
4264 if (flags & SCF_DO_STCLASS_AND) {
4265 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4266 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4267 if (OP(scan) == NSPACEU) {
4268 for (value = 0; value < 256; value++) {
4269 if (isSPACE_L1(value)) {
4270 ANYOF_BITMAP_CLEAR(data->start_class, value);
4274 for (value = 0; value < 256; value++) {
4275 if (isSPACE(value)) {
4276 ANYOF_BITMAP_CLEAR(data->start_class, value);
4283 if (data->start_class->flags & ANYOF_LOCALE)
4284 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4285 if (OP(scan) == NSPACEU) {
4286 for (value = 0; value < 256; value++) {
4287 if (!isSPACE_L1(value)) {
4288 ANYOF_BITMAP_SET(data->start_class, value);
4293 for (value = 0; value < 256; value++) {
4294 if (!isSPACE(value)) {
4295 ANYOF_BITMAP_SET(data->start_class, value);
4302 if (flags & SCF_DO_STCLASS_AND) {
4303 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4304 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4305 for (value = 0; value < 256; value++)
4306 if (!isDIGIT(value))
4307 ANYOF_BITMAP_CLEAR(data->start_class, value);
4311 if (data->start_class->flags & ANYOF_LOCALE)
4312 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4313 for (value = 0; value < 256; value++)
4315 ANYOF_BITMAP_SET(data->start_class, value);
4319 if (flags & SCF_DO_STCLASS_AND) {
4320 if (!(data->start_class->flags & ANYOF_LOCALE))
4321 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4322 for (value = 0; value < 256; value++)
4324 ANYOF_BITMAP_CLEAR(data->start_class, value);
4327 if (data->start_class->flags & ANYOF_LOCALE)
4328 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4329 for (value = 0; value < 256; value++)
4330 if (!isDIGIT(value))
4331 ANYOF_BITMAP_SET(data->start_class, value);
4334 CASE_SYNST_FNC(VERTWS);
4335 CASE_SYNST_FNC(HORIZWS);
4338 if (flags & SCF_DO_STCLASS_OR)
4339 cl_and(data->start_class, and_withp);
4340 flags &= ~SCF_DO_STCLASS;
4343 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4344 data->flags |= (OP(scan) == MEOL
4348 else if ( PL_regkind[OP(scan)] == BRANCHJ
4349 /* Lookbehind, or need to calculate parens/evals/stclass: */
4350 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4351 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4352 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4353 || OP(scan) == UNLESSM )
4355 /* Negative Lookahead/lookbehind
4356 In this case we can't do fixed string optimisation.
4359 I32 deltanext, minnext, fake = 0;
4361 struct regnode_charclass_class intrnl;
4364 data_fake.flags = 0;
4366 data_fake.whilem_c = data->whilem_c;
4367 data_fake.last_closep = data->last_closep;
4370 data_fake.last_closep = &fake;
4371 data_fake.pos_delta = delta;
4372 if ( flags & SCF_DO_STCLASS && !scan->flags
4373 && OP(scan) == IFMATCH ) { /* Lookahead */
4374 cl_init(pRExC_state, &intrnl);
4375 data_fake.start_class = &intrnl;
4376 f |= SCF_DO_STCLASS_AND;
4378 if (flags & SCF_WHILEM_VISITED_POS)
4379 f |= SCF_WHILEM_VISITED_POS;
4380 next = regnext(scan);
4381 nscan = NEXTOPER(NEXTOPER(scan));
4382 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4383 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4386 FAIL("Variable length lookbehind not implemented");
4388 else if (minnext > (I32)U8_MAX) {
4389 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4391 scan->flags = (U8)minnext;
4394 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4396 if (data_fake.flags & SF_HAS_EVAL)
4397 data->flags |= SF_HAS_EVAL;
4398 data->whilem_c = data_fake.whilem_c;
4400 if (f & SCF_DO_STCLASS_AND) {
4401 if (flags & SCF_DO_STCLASS_OR) {
4402 /* OR before, AND after: ideally we would recurse with
4403 * data_fake to get the AND applied by study of the
4404 * remainder of the pattern, and then derecurse;
4405 * *** HACK *** for now just treat as "no information".
4406 * See [perl #56690].
4408 cl_init(pRExC_state, data->start_class);
4410 /* AND before and after: combine and continue */
4411 const int was = (data->start_class->flags & ANYOF_EOS);
4413 cl_and(data->start_class, &intrnl);
4415 data->start_class->flags |= ANYOF_EOS;
4419 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4421 /* Positive Lookahead/lookbehind
4422 In this case we can do fixed string optimisation,
4423 but we must be careful about it. Note in the case of
4424 lookbehind the positions will be offset by the minimum
4425 length of the pattern, something we won't know about
4426 until after the recurse.
4428 I32 deltanext, fake = 0;
4430 struct regnode_charclass_class intrnl;
4432 /* We use SAVEFREEPV so that when the full compile
4433 is finished perl will clean up the allocated
4434 minlens when it's all done. This way we don't
4435 have to worry about freeing them when we know
4436 they wont be used, which would be a pain.
4439 Newx( minnextp, 1, I32 );
4440 SAVEFREEPV(minnextp);
4443 StructCopy(data, &data_fake, scan_data_t);
4444 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4447 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4448 data_fake.last_found=newSVsv(data->last_found);
4452 data_fake.last_closep = &fake;
4453 data_fake.flags = 0;
4454 data_fake.pos_delta = delta;
4456 data_fake.flags |= SF_IS_INF;
4457 if ( flags & SCF_DO_STCLASS && !scan->flags
4458 && OP(scan) == IFMATCH ) { /* Lookahead */
4459 cl_init(pRExC_state, &intrnl);
4460 data_fake.start_class = &intrnl;
4461 f |= SCF_DO_STCLASS_AND;
4463 if (flags & SCF_WHILEM_VISITED_POS)
4464 f |= SCF_WHILEM_VISITED_POS;
4465 next = regnext(scan);
4466 nscan = NEXTOPER(NEXTOPER(scan));
4468 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4469 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4472 FAIL("Variable length lookbehind not implemented");
4474 else if (*minnextp > (I32)U8_MAX) {
4475 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4477 scan->flags = (U8)*minnextp;
4482 if (f & SCF_DO_STCLASS_AND) {
4483 const int was = (data->start_class->flags & ANYOF_EOS);
4485 cl_and(data->start_class, &intrnl);
4487 data->start_class->flags |= ANYOF_EOS;
4490 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4492 if (data_fake.flags & SF_HAS_EVAL)
4493 data->flags |= SF_HAS_EVAL;
4494 data->whilem_c = data_fake.whilem_c;
4495 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4496 if (RExC_rx->minlen<*minnextp)
4497 RExC_rx->minlen=*minnextp;
4498 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4499 SvREFCNT_dec(data_fake.last_found);
4501 if ( data_fake.minlen_fixed != minlenp )
4503 data->offset_fixed= data_fake.offset_fixed;
4504 data->minlen_fixed= data_fake.minlen_fixed;
4505 data->lookbehind_fixed+= scan->flags;
4507 if ( data_fake.minlen_float != minlenp )
4509 data->minlen_float= data_fake.minlen_float;
4510 data->offset_float_min=data_fake.offset_float_min;
4511 data->offset_float_max=data_fake.offset_float_max;
4512 data->lookbehind_float+= scan->flags;
4521 else if (OP(scan) == OPEN) {
4522 if (stopparen != (I32)ARG(scan))
4525 else if (OP(scan) == CLOSE) {
4526 if (stopparen == (I32)ARG(scan)) {
4529 if ((I32)ARG(scan) == is_par) {
4530 next = regnext(scan);
4532 if ( next && (OP(next) != WHILEM) && next < last)
4533 is_par = 0; /* Disable optimization */
4536 *(data->last_closep) = ARG(scan);
4538 else if (OP(scan) == EVAL) {
4540 data->flags |= SF_HAS_EVAL;
4542 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4543 if (flags & SCF_DO_SUBSTR) {
4544 SCAN_COMMIT(pRExC_state,data,minlenp);
4545 flags &= ~SCF_DO_SUBSTR;
4547 if (data && OP(scan)==ACCEPT) {
4548 data->flags |= SCF_SEEN_ACCEPT;
4553 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4555 if (flags & SCF_DO_SUBSTR) {
4556 SCAN_COMMIT(pRExC_state,data,minlenp);
4557 data->longest = &(data->longest_float);
4559 is_inf = is_inf_internal = 1;
4560 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4561 cl_anything(pRExC_state, data->start_class);
4562 flags &= ~SCF_DO_STCLASS;
4564 else if (OP(scan) == GPOS) {
4565 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4566 !(delta || is_inf || (data && data->pos_delta)))
4568 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4569 RExC_rx->extflags |= RXf_ANCH_GPOS;
4570 if (RExC_rx->gofs < (U32)min)
4571 RExC_rx->gofs = min;
4573 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4577 #ifdef TRIE_STUDY_OPT
4578 #ifdef FULL_TRIE_STUDY
4579 else if (PL_regkind[OP(scan)] == TRIE) {
4580 /* NOTE - There is similar code to this block above for handling
4581 BRANCH nodes on the initial study. If you change stuff here
4583 regnode *trie_node= scan;
4584 regnode *tail= regnext(scan);
4585 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4586 I32 max1 = 0, min1 = I32_MAX;
4587 struct regnode_charclass_class accum;
4589 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4590 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4591 if (flags & SCF_DO_STCLASS)
4592 cl_init_zero(pRExC_state, &accum);
4598 const regnode *nextbranch= NULL;
4601 for ( word=1 ; word <= trie->wordcount ; word++)
4603 I32 deltanext=0, minnext=0, f = 0, fake;
4604 struct regnode_charclass_class this_class;
4606 data_fake.flags = 0;
4608 data_fake.whilem_c = data->whilem_c;
4609 data_fake.last_closep = data->last_closep;
4612 data_fake.last_closep = &fake;
4613 data_fake.pos_delta = delta;
4614 if (flags & SCF_DO_STCLASS) {
4615 cl_init(pRExC_state, &this_class);
4616 data_fake.start_class = &this_class;
4617 f = SCF_DO_STCLASS_AND;
4619 if (flags & SCF_WHILEM_VISITED_POS)
4620 f |= SCF_WHILEM_VISITED_POS;
4622 if (trie->jump[word]) {
4624 nextbranch = trie_node + trie->jump[0];
4625 scan= trie_node + trie->jump[word];
4626 /* We go from the jump point to the branch that follows
4627 it. Note this means we need the vestigal unused branches
4628 even though they arent otherwise used.
4630 minnext = study_chunk(pRExC_state, &scan, minlenp,
4631 &deltanext, (regnode *)nextbranch, &data_fake,
4632 stopparen, recursed, NULL, f,depth+1);
4634 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4635 nextbranch= regnext((regnode*)nextbranch);
4637 if (min1 > (I32)(minnext + trie->minlen))
4638 min1 = minnext + trie->minlen;
4639 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4640 max1 = minnext + deltanext + trie->maxlen;
4641 if (deltanext == I32_MAX)
4642 is_inf = is_inf_internal = 1;
4644 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4646 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4647 if ( stopmin > min + min1)
4648 stopmin = min + min1;
4649 flags &= ~SCF_DO_SUBSTR;
4651 data->flags |= SCF_SEEN_ACCEPT;
4654 if (data_fake.flags & SF_HAS_EVAL)
4655 data->flags |= SF_HAS_EVAL;
4656 data->whilem_c = data_fake.whilem_c;
4658 if (flags & SCF_DO_STCLASS)
4659 cl_or(pRExC_state, &accum, &this_class);
4662 if (flags & SCF_DO_SUBSTR) {
4663 data->pos_min += min1;
4664 data->pos_delta += max1 - min1;
4665 if (max1 != min1 || is_inf)
4666 data->longest = &(data->longest_float);
4669 delta += max1 - min1;
4670 if (flags & SCF_DO_STCLASS_OR) {
4671 cl_or(pRExC_state, data->start_class, &accum);
4673 cl_and(data->start_class, and_withp);
4674 flags &= ~SCF_DO_STCLASS;
4677 else if (flags & SCF_DO_STCLASS_AND) {
4679 cl_and(data->start_class, &accum);
4680 flags &= ~SCF_DO_STCLASS;
4683 /* Switch to OR mode: cache the old value of
4684 * data->start_class */
4686 StructCopy(data->start_class, and_withp,
4687 struct regnode_charclass_class);
4688 flags &= ~SCF_DO_STCLASS_AND;
4689 StructCopy(&accum, data->start_class,
4690 struct regnode_charclass_class);
4691 flags |= SCF_DO_STCLASS_OR;
4692 data->start_class->flags |= ANYOF_EOS;
4699 else if (PL_regkind[OP(scan)] == TRIE) {
4700 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4703 min += trie->minlen;
4704 delta += (trie->maxlen - trie->minlen);
4705 flags &= ~SCF_DO_STCLASS; /* xxx */
4706 if (flags & SCF_DO_SUBSTR) {
4707 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4708 data->pos_min += trie->minlen;
4709 data->pos_delta += (trie->maxlen - trie->minlen);
4710 if (trie->maxlen != trie->minlen)
4711 data->longest = &(data->longest_float);
4713 if (trie->jump) /* no more substrings -- for now /grr*/
4714 flags &= ~SCF_DO_SUBSTR;
4716 #endif /* old or new */
4717 #endif /* TRIE_STUDY_OPT */
4719 /* Else: zero-length, ignore. */
4720 scan = regnext(scan);
4725 stopparen = frame->stop;
4726 frame = frame->prev;
4727 goto fake_study_recurse;
4732 DEBUG_STUDYDATA("pre-fin:",data,depth);
4735 *deltap = is_inf_internal ? I32_MAX : delta;
4736 if (flags & SCF_DO_SUBSTR && is_inf)
4737 data->pos_delta = I32_MAX - data->pos_min;
4738 if (is_par > (I32)U8_MAX)
4740 if (is_par && pars==1 && data) {
4741 data->flags |= SF_IN_PAR;
4742 data->flags &= ~SF_HAS_PAR;
4744 else if (pars && data) {
4745 data->flags |= SF_HAS_PAR;
4746 data->flags &= ~SF_IN_PAR;
4748 if (flags & SCF_DO_STCLASS_OR)
4749 cl_and(data->start_class, and_withp);
4750 if (flags & SCF_TRIE_RESTUDY)
4751 data->flags |= SCF_TRIE_RESTUDY;
4753 DEBUG_STUDYDATA("post-fin:",data,depth);
4755 return min < stopmin ? min : stopmin;
4759 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4761 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4763 PERL_ARGS_ASSERT_ADD_DATA;
4765 Renewc(RExC_rxi->data,
4766 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4767 char, struct reg_data);
4769 Renew(RExC_rxi->data->what, count + n, U8);
4771 Newx(RExC_rxi->data->what, n, U8);
4772 RExC_rxi->data->count = count + n;
4773 Copy(s, RExC_rxi->data->what + count, n, U8);
4777 /*XXX: todo make this not included in a non debugging perl */
4778 #ifndef PERL_IN_XSUB_RE
4780 Perl_reginitcolors(pTHX)
4783 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4785 char *t = savepv(s);
4789 t = strchr(t, '\t');
4795 PL_colors[i] = t = (char *)"";
4800 PL_colors[i++] = (char *)"";
4807 #ifdef TRIE_STUDY_OPT
4808 #define CHECK_RESTUDY_GOTO \
4810 (data.flags & SCF_TRIE_RESTUDY) \
4814 #define CHECK_RESTUDY_GOTO
4818 - pregcomp - compile a regular expression into internal code
4820 * We can't allocate space until we know how big the compiled form will be,
4821 * but we can't compile it (and thus know how big it is) until we've got a
4822 * place to put the code. So we cheat: we compile it twice, once with code
4823 * generation turned off and size counting turned on, and once "for real".
4824 * This also means that we don't allocate space until we are sure that the
4825 * thing really will compile successfully, and we never have to move the
4826 * code and thus invalidate pointers into it. (Note that it has to be in
4827 * one piece because free() must be able to free it all.) [NB: not true in perl]
4829 * Beware that the optimization-preparation code in here knows about some
4830 * of the structure of the compiled regexp. [I'll say.]
4835 #ifndef PERL_IN_XSUB_RE
4836 #define RE_ENGINE_PTR &PL_core_reg_engine
4838 extern const struct regexp_engine my_reg_engine;
4839 #define RE_ENGINE_PTR &my_reg_engine
4842 #ifndef PERL_IN_XSUB_RE
4844 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4847 HV * const table = GvHV(PL_hintgv);
4849 PERL_ARGS_ASSERT_PREGCOMP;
4851 /* Dispatch a request to compile a regexp to correct
4854 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4855 GET_RE_DEBUG_FLAGS_DECL;
4856 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4857 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4859 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4862 return CALLREGCOMP_ENG(eng, pattern, flags);
4865 return Perl_re_compile(aTHX_ pattern, flags);
4870 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4875 register regexp_internal *ri;
4884 /* these are all flags - maybe they should be turned
4885 * into a single int with different bit masks */
4886 I32 sawlookahead = 0;
4889 bool used_setjump = FALSE;
4890 regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4895 RExC_state_t RExC_state;
4896 RExC_state_t * const pRExC_state = &RExC_state;
4897 #ifdef TRIE_STUDY_OPT
4899 RExC_state_t copyRExC_state;
4901 GET_RE_DEBUG_FLAGS_DECL;
4903 PERL_ARGS_ASSERT_RE_COMPILE;
4905 DEBUG_r(if (!PL_colorset) reginitcolors());
4907 #ifndef PERL_IN_XSUB_RE
4908 /* Initialize these here instead of as-needed, as is quick and avoids
4909 * having to test them each time otherwise */
4910 if (! PL_AboveLatin1) {
4911 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
4912 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
4913 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
4915 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4916 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4918 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
4919 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
4921 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
4922 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
4924 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
4926 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
4927 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
4929 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
4931 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
4932 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
4934 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4935 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4937 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
4938 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
4940 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
4941 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
4943 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
4944 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
4946 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
4947 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
4949 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
4950 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
4952 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
4953 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
4955 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
4957 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
4958 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
4960 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
4961 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
4965 exp = SvPV(pattern, plen);
4967 if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4968 RExC_utf8 = RExC_orig_utf8 = 0;
4971 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4973 RExC_uni_semantics = 0;
4974 RExC_contains_locale = 0;
4976 /****************** LONG JUMP TARGET HERE***********************/
4977 /* Longjmp back to here if have to switch in midstream to utf8 */
4978 if (! RExC_orig_utf8) {
4979 JMPENV_PUSH(jump_ret);
4980 used_setjump = TRUE;
4983 if (jump_ret == 0) { /* First time through */
4987 SV *dsv= sv_newmortal();
4988 RE_PV_QUOTED_DECL(s, RExC_utf8,
4989 dsv, exp, plen, 60);
4990 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4991 PL_colors[4],PL_colors[5],s);
4994 else { /* longjumped back */
4997 /* If the cause for the longjmp was other than changing to utf8, pop
4998 * our own setjmp, and longjmp to the correct handler */
4999 if (jump_ret != UTF8_LONGJMP) {
5001 JMPENV_JUMP(jump_ret);
5006 /* It's possible to write a regexp in ascii that represents Unicode
5007 codepoints outside of the byte range, such as via \x{100}. If we
5008 detect such a sequence we have to convert the entire pattern to utf8
5009 and then recompile, as our sizing calculation will have been based
5010 on 1 byte == 1 character, but we will need to use utf8 to encode
5011 at least some part of the pattern, and therefore must convert the whole
5014 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5015 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5016 exp = (char*)Perl_bytes_to_utf8(aTHX_
5017 (U8*)SvPV_nomg(pattern, plen),
5020 RExC_orig_utf8 = RExC_utf8 = 1;
5024 #ifdef TRIE_STUDY_OPT
5028 pm_flags = orig_pm_flags;
5030 if (initial_charset == REGEX_LOCALE_CHARSET) {
5031 RExC_contains_locale = 1;
5033 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5035 /* Set to use unicode semantics if the pattern is in utf8 and has the
5036 * 'depends' charset specified, as it means unicode when utf8 */
5037 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5041 RExC_flags = pm_flags;
5045 RExC_in_lookbehind = 0;
5046 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5047 RExC_seen_evals = 0;
5049 RExC_override_recoding = 0;
5051 /* First pass: determine size, legality. */
5059 RExC_emit = &PL_regdummy;
5060 RExC_whilem_seen = 0;
5061 RExC_open_parens = NULL;
5062 RExC_close_parens = NULL;
5064 RExC_paren_names = NULL;
5066 RExC_paren_name_list = NULL;
5068 RExC_recurse = NULL;
5069 RExC_recurse_count = 0;
5071 #if 0 /* REGC() is (currently) a NOP at the first pass.
5072 * Clever compilers notice this and complain. --jhi */
5073 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5076 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5078 RExC_lastparse=NULL;
5080 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5081 RExC_precomp = NULL;
5085 /* Here, finished first pass. Get rid of any added setjmp */
5091 PerlIO_printf(Perl_debug_log,
5092 "Required size %"IVdf" nodes\n"
5093 "Starting second pass (creation)\n",
5096 RExC_lastparse=NULL;
5099 /* The first pass could have found things that force Unicode semantics */
5100 if ((RExC_utf8 || RExC_uni_semantics)
5101 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5103 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5106 /* Small enough for pointer-storage convention?
5107 If extralen==0, this means that we will not need long jumps. */
5108 if (RExC_size >= 0x10000L && RExC_extralen)
5109 RExC_size += RExC_extralen;
5112 if (RExC_whilem_seen > 15)
5113 RExC_whilem_seen = 15;
5115 /* Allocate space and zero-initialize. Note, the two step process
5116 of zeroing when in debug mode, thus anything assigned has to
5117 happen after that */
5118 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5119 r = (struct regexp*)SvANY(rx);
5120 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5121 char, regexp_internal);
5122 if ( r == NULL || ri == NULL )
5123 FAIL("Regexp out of space");
5125 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5126 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5128 /* bulk initialize base fields with 0. */
5129 Zero(ri, sizeof(regexp_internal), char);
5132 /* non-zero initialization begins here */
5134 r->engine= RE_ENGINE_PTR;
5135 r->extflags = pm_flags;
5137 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5138 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5140 /* The caret is output if there are any defaults: if not all the STD
5141 * flags are set, or if no character set specifier is needed */
5143 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5145 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5146 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5147 >> RXf_PMf_STD_PMMOD_SHIFT);
5148 const char *fptr = STD_PAT_MODS; /*"msix"*/
5150 /* Allocate for the worst case, which is all the std flags are turned
5151 * on. If more precision is desired, we could do a population count of
5152 * the flags set. This could be done with a small lookup table, or by
5153 * shifting, masking and adding, or even, when available, assembly
5154 * language for a machine-language population count.
5155 * We never output a minus, as all those are defaults, so are
5156 * covered by the caret */
5157 const STRLEN wraplen = plen + has_p + has_runon
5158 + has_default /* If needs a caret */
5160 /* If needs a character set specifier */
5161 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5162 + (sizeof(STD_PAT_MODS) - 1)
5163 + (sizeof("(?:)") - 1);
5165 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5167 SvFLAGS(rx) |= SvUTF8(pattern);
5170 /* If a default, cover it using the caret */
5172 *p++= DEFAULT_PAT_MOD;
5176 const char* const name = get_regex_charset_name(r->extflags, &len);
5177 Copy(name, p, len, char);
5181 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5184 while((ch = *fptr++)) {
5192 Copy(RExC_precomp, p, plen, char);
5193 assert ((RX_WRAPPED(rx) - p) < 16);
5194 r->pre_prefix = p - RX_WRAPPED(rx);
5200 SvCUR_set(rx, p - SvPVX_const(rx));
5204 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5206 if (RExC_seen & REG_SEEN_RECURSE) {
5207 Newxz(RExC_open_parens, RExC_npar,regnode *);
5208 SAVEFREEPV(RExC_open_parens);
5209 Newxz(RExC_close_parens,RExC_npar,regnode *);
5210 SAVEFREEPV(RExC_close_parens);
5213 /* Useful during FAIL. */
5214 #ifdef RE_TRACK_PATTERN_OFFSETS
5215 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5216 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5217 "%s %"UVuf" bytes for offset annotations.\n",
5218 ri->u.offsets ? "Got" : "Couldn't get",
5219 (UV)((2*RExC_size+1) * sizeof(U32))));
5221 SetProgLen(ri,RExC_size);
5225 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5227 /* Second pass: emit code. */
5228 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
5233 RExC_emit_start = ri->program;
5234 RExC_emit = ri->program;
5235 RExC_emit_bound = ri->program + RExC_size + 1;
5237 /* Store the count of eval-groups for security checks: */
5238 RExC_rx->seen_evals = RExC_seen_evals;
5239 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5240 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5244 /* XXXX To minimize changes to RE engine we always allocate
5245 3-units-long substrs field. */
5246 Newx(r->substrs, 1, struct reg_substr_data);
5247 if (RExC_recurse_count) {
5248 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5249 SAVEFREEPV(RExC_recurse);
5253 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5254 Zero(r->substrs, 1, struct reg_substr_data);
5256 #ifdef TRIE_STUDY_OPT
5258 StructCopy(&zero_scan_data, &data, scan_data_t);
5259 copyRExC_state = RExC_state;
5262 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5264 RExC_state = copyRExC_state;
5265 if (seen & REG_TOP_LEVEL_BRANCHES)
5266 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5268 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5269 if (data.last_found) {
5270 SvREFCNT_dec(data.longest_fixed);
5271 SvREFCNT_dec(data.longest_float);
5272 SvREFCNT_dec(data.last_found);
5274 StructCopy(&zero_scan_data, &data, scan_data_t);
5277 StructCopy(&zero_scan_data, &data, scan_data_t);
5280 /* Dig out information for optimizations. */
5281 r->extflags = RExC_flags; /* was pm_op */
5282 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5285 SvUTF8_on(rx); /* Unicode in it? */
5286 ri->regstclass = NULL;
5287 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5288 r->intflags |= PREGf_NAUGHTY;
5289 scan = ri->program + 1; /* First BRANCH. */
5291 /* testing for BRANCH here tells us whether there is "must appear"
5292 data in the pattern. If there is then we can use it for optimisations */
5293 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5295 STRLEN longest_float_length, longest_fixed_length;
5296 struct regnode_charclass_class ch_class; /* pointed to by data */
5298 I32 last_close = 0; /* pointed to by data */
5299 regnode *first= scan;
5300 regnode *first_next= regnext(first);
5302 * Skip introductions and multiplicators >= 1
5303 * so that we can extract the 'meat' of the pattern that must
5304 * match in the large if() sequence following.
5305 * NOTE that EXACT is NOT covered here, as it is normally
5306 * picked up by the optimiser separately.
5308 * This is unfortunate as the optimiser isnt handling lookahead
5309 * properly currently.
5312 while ((OP(first) == OPEN && (sawopen = 1)) ||
5313 /* An OR of *one* alternative - should not happen now. */
5314 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5315 /* for now we can't handle lookbehind IFMATCH*/
5316 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5317 (OP(first) == PLUS) ||
5318 (OP(first) == MINMOD) ||
5319 /* An {n,m} with n>0 */
5320 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5321 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5324 * the only op that could be a regnode is PLUS, all the rest
5325 * will be regnode_1 or regnode_2.
5328 if (OP(first) == PLUS)
5331 first += regarglen[OP(first)];
5333 first = NEXTOPER(first);
5334 first_next= regnext(first);
5337 /* Starting-point info. */
5339 DEBUG_PEEP("first:",first,0);
5340 /* Ignore EXACT as we deal with it later. */
5341 if (PL_regkind[OP(first)] == EXACT) {
5342 if (OP(first) == EXACT)
5343 NOOP; /* Empty, get anchored substr later. */
5345 ri->regstclass = first;
5348 else if (PL_regkind[OP(first)] == TRIE &&
5349 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5352 /* this can happen only on restudy */
5353 if ( OP(first) == TRIE ) {
5354 struct regnode_1 *trieop = (struct regnode_1 *)
5355 PerlMemShared_calloc(1, sizeof(struct regnode_1));
5356 StructCopy(first,trieop,struct regnode_1);
5357 trie_op=(regnode *)trieop;
5359 struct regnode_charclass *trieop = (struct regnode_charclass *)
5360 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5361 StructCopy(first,trieop,struct regnode_charclass);
5362 trie_op=(regnode *)trieop;
5365 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5366 ri->regstclass = trie_op;
5369 else if (REGNODE_SIMPLE(OP(first)))
5370 ri->regstclass = first;
5371 else if (PL_regkind[OP(first)] == BOUND ||
5372 PL_regkind[OP(first)] == NBOUND)
5373 ri->regstclass = first;
5374 else if (PL_regkind[OP(first)] == BOL) {
5375 r->extflags |= (OP(first) == MBOL
5377 : (OP(first) == SBOL
5380 first = NEXTOPER(first);
5383 else if (OP(first) == GPOS) {
5384 r->extflags |= RXf_ANCH_GPOS;
5385 first = NEXTOPER(first);
5388 else if ((!sawopen || !RExC_sawback) &&
5389 (OP(first) == STAR &&
5390 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5391 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5393 /* turn .* into ^.* with an implied $*=1 */
5395 (OP(NEXTOPER(first)) == REG_ANY)
5398 r->extflags |= type;
5399 r->intflags |= PREGf_IMPLICIT;
5400 first = NEXTOPER(first);
5403 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5404 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5405 /* x+ must match at the 1st pos of run of x's */
5406 r->intflags |= PREGf_SKIP;
5408 /* Scan is after the zeroth branch, first is atomic matcher. */
5409 #ifdef TRIE_STUDY_OPT
5412 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5413 (IV)(first - scan + 1))
5417 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5418 (IV)(first - scan + 1))
5424 * If there's something expensive in the r.e., find the
5425 * longest literal string that must appear and make it the
5426 * regmust. Resolve ties in favor of later strings, since
5427 * the regstart check works with the beginning of the r.e.
5428 * and avoiding duplication strengthens checking. Not a
5429 * strong reason, but sufficient in the absence of others.
5430 * [Now we resolve ties in favor of the earlier string if
5431 * it happens that c_offset_min has been invalidated, since the
5432 * earlier string may buy us something the later one won't.]
5435 data.longest_fixed = newSVpvs("");
5436 data.longest_float = newSVpvs("");
5437 data.last_found = newSVpvs("");
5438 data.longest = &(data.longest_fixed);
5440 if (!ri->regstclass) {
5441 cl_init(pRExC_state, &ch_class);
5442 data.start_class = &ch_class;
5443 stclass_flag = SCF_DO_STCLASS_AND;
5444 } else /* XXXX Check for BOUND? */
5446 data.last_closep = &last_close;
5448 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5449 &data, -1, NULL, NULL,
5450 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5456 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5457 && data.last_start_min == 0 && data.last_end > 0
5458 && !RExC_seen_zerolen
5459 && !(RExC_seen & REG_SEEN_VERBARG)
5460 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5461 r->extflags |= RXf_CHECK_ALL;
5462 scan_commit(pRExC_state, &data,&minlen,0);
5463 SvREFCNT_dec(data.last_found);
5465 /* Note that code very similar to this but for anchored string
5466 follows immediately below, changes may need to be made to both.
5469 longest_float_length = CHR_SVLEN(data.longest_float);
5470 if (longest_float_length
5471 || (data.flags & SF_FL_BEFORE_EOL
5472 && (!(data.flags & SF_FL_BEFORE_MEOL)
5473 || (RExC_flags & RXf_PMf_MULTILINE))))
5477 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5478 if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5479 || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
5480 && data.offset_fixed == data.offset_float_min
5481 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5482 goto remove_float; /* As in (a)+. */
5484 /* copy the information about the longest float from the reg_scan_data
5485 over to the program. */
5486 if (SvUTF8(data.longest_float)) {
5487 r->float_utf8 = data.longest_float;
5488 r->float_substr = NULL;
5490 r->float_substr = data.longest_float;
5491 r->float_utf8 = NULL;
5493 /* float_end_shift is how many chars that must be matched that
5494 follow this item. We calculate it ahead of time as once the
5495 lookbehind offset is added in we lose the ability to correctly
5497 ml = data.minlen_float ? *(data.minlen_float)
5498 : (I32)longest_float_length;
5499 r->float_end_shift = ml - data.offset_float_min
5500 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5501 + data.lookbehind_float;
5502 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5503 r->float_max_offset = data.offset_float_max;
5504 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5505 r->float_max_offset -= data.lookbehind_float;
5507 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5508 && (!(data.flags & SF_FL_BEFORE_MEOL)
5509 || (RExC_flags & RXf_PMf_MULTILINE)));
5510 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5514 r->float_substr = r->float_utf8 = NULL;
5515 SvREFCNT_dec(data.longest_float);
5516 longest_float_length = 0;
5519 /* Note that code very similar to this but for floating string
5520 is immediately above, changes may need to be made to both.
5523 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5525 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5526 if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5527 && (longest_fixed_length
5528 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5529 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5530 || (RExC_flags & RXf_PMf_MULTILINE)))) )
5534 /* copy the information about the longest fixed
5535 from the reg_scan_data over to the program. */
5536 if (SvUTF8(data.longest_fixed)) {
5537 r->anchored_utf8 = data.longest_fixed;
5538 r->anchored_substr = NULL;
5540 r->anchored_substr = data.longest_fixed;
5541 r->anchored_utf8 = NULL;
5543 /* fixed_end_shift is how many chars that must be matched that
5544 follow this item. We calculate it ahead of time as once the
5545 lookbehind offset is added in we lose the ability to correctly
5547 ml = data.minlen_fixed ? *(data.minlen_fixed)
5548 : (I32)longest_fixed_length;
5549 r->anchored_end_shift = ml - data.offset_fixed
5550 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5551 + data.lookbehind_fixed;
5552 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5554 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5555 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5556 || (RExC_flags & RXf_PMf_MULTILINE)));
5557 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5560 r->anchored_substr = r->anchored_utf8 = NULL;
5561 SvREFCNT_dec(data.longest_fixed);
5562 longest_fixed_length = 0;
5565 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5566 ri->regstclass = NULL;
5568 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5570 && !(data.start_class->flags & ANYOF_EOS)
5571 && !cl_is_anything(data.start_class))
5573 const U32 n = add_data(pRExC_state, 1, "f");
5574 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5576 Newx(RExC_rxi->data->data[n], 1,
5577 struct regnode_charclass_class);
5578 StructCopy(data.start_class,
5579 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5580 struct regnode_charclass_class);
5581 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5582 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5583 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5584 regprop(r, sv, (regnode*)data.start_class);
5585 PerlIO_printf(Perl_debug_log,
5586 "synthetic stclass \"%s\".\n",
5587 SvPVX_const(sv));});
5590 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5591 if (longest_fixed_length > longest_float_length) {
5592 r->check_end_shift = r->anchored_end_shift;
5593 r->check_substr = r->anchored_substr;
5594 r->check_utf8 = r->anchored_utf8;
5595 r->check_offset_min = r->check_offset_max = r->anchored_offset;
5596 if (r->extflags & RXf_ANCH_SINGLE)
5597 r->extflags |= RXf_NOSCAN;
5600 r->check_end_shift = r->float_end_shift;
5601 r->check_substr = r->float_substr;
5602 r->check_utf8 = r->float_utf8;
5603 r->check_offset_min = r->float_min_offset;
5604 r->check_offset_max = r->float_max_offset;
5606 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5607 This should be changed ASAP! */
5608 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5609 r->extflags |= RXf_USE_INTUIT;
5610 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5611 r->extflags |= RXf_INTUIT_TAIL;
5613 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5614 if ( (STRLEN)minlen < longest_float_length )
5615 minlen= longest_float_length;
5616 if ( (STRLEN)minlen < longest_fixed_length )
5617 minlen= longest_fixed_length;
5621 /* Several toplevels. Best we can is to set minlen. */
5623 struct regnode_charclass_class ch_class;
5626 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5628 scan = ri->program + 1;
5629 cl_init(pRExC_state, &ch_class);
5630 data.start_class = &ch_class;
5631 data.last_closep = &last_close;
5634 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5635 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5639 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5640 = r->float_substr = r->float_utf8 = NULL;
5642 if (!(data.start_class->flags & ANYOF_EOS)
5643 && !cl_is_anything(data.start_class))
5645 const U32 n = add_data(pRExC_state, 1, "f");
5646 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5648 Newx(RExC_rxi->data->data[n], 1,
5649 struct regnode_charclass_class);
5650 StructCopy(data.start_class,
5651 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5652 struct regnode_charclass_class);
5653 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5654 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5655 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5656 regprop(r, sv, (regnode*)data.start_class);
5657 PerlIO_printf(Perl_debug_log,
5658 "synthetic stclass \"%s\".\n",
5659 SvPVX_const(sv));});
5663 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5664 the "real" pattern. */
5666 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5667 (IV)minlen, (IV)r->minlen);
5669 r->minlenret = minlen;
5670 if (r->minlen < minlen)
5673 if (RExC_seen & REG_SEEN_GPOS)
5674 r->extflags |= RXf_GPOS_SEEN;
5675 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5676 r->extflags |= RXf_LOOKBEHIND_SEEN;
5677 if (RExC_seen & REG_SEEN_EVAL)
5678 r->extflags |= RXf_EVAL_SEEN;
5679 if (RExC_seen & REG_SEEN_CANY)
5680 r->extflags |= RXf_CANY_SEEN;
5681 if (RExC_seen & REG_SEEN_VERBARG)
5682 r->intflags |= PREGf_VERBARG_SEEN;
5683 if (RExC_seen & REG_SEEN_CUTGROUP)
5684 r->intflags |= PREGf_CUTGROUP_SEEN;
5685 if (RExC_paren_names)
5686 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5688 RXp_PAREN_NAMES(r) = NULL;
5690 #ifdef STUPID_PATTERN_CHECKS
5691 if (RX_PRELEN(rx) == 0)
5692 r->extflags |= RXf_NULL;
5693 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5694 /* XXX: this should happen BEFORE we compile */
5695 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5696 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5697 r->extflags |= RXf_WHITE;
5698 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5699 r->extflags |= RXf_START_ONLY;
5701 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5702 /* XXX: this should happen BEFORE we compile */
5703 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5705 regnode *first = ri->program + 1;
5708 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5709 r->extflags |= RXf_NULL;
5710 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5711 r->extflags |= RXf_START_ONLY;
5712 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5713 && OP(regnext(first)) == END)
5714 r->extflags |= RXf_WHITE;
5718 if (RExC_paren_names) {
5719 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5720 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5723 ri->name_list_idx = 0;
5725 if (RExC_recurse_count) {
5726 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5727 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5728 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5731 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5732 /* assume we don't need to swap parens around before we match */
5735 PerlIO_printf(Perl_debug_log,"Final program:\n");
5738 #ifdef RE_TRACK_PATTERN_OFFSETS
5739 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5740 const U32 len = ri->u.offsets[0];
5742 GET_RE_DEBUG_FLAGS_DECL;
5743 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5744 for (i = 1; i <= len; i++) {
5745 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5746 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5747 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5749 PerlIO_printf(Perl_debug_log, "\n");
5755 #undef RE_ENGINE_PTR
5759 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5762 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5764 PERL_UNUSED_ARG(value);
5766 if (flags & RXapif_FETCH) {
5767 return reg_named_buff_fetch(rx, key, flags);
5768 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5769 Perl_croak_no_modify(aTHX);
5771 } else if (flags & RXapif_EXISTS) {
5772 return reg_named_buff_exists(rx, key, flags)
5775 } else if (flags & RXapif_REGNAMES) {
5776 return reg_named_buff_all(rx, flags);
5777 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5778 return reg_named_buff_scalar(rx, flags);
5780 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5786 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5789 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5790 PERL_UNUSED_ARG(lastkey);
5792 if (flags & RXapif_FIRSTKEY)
5793 return reg_named_buff_firstkey(rx, flags);
5794 else if (flags & RXapif_NEXTKEY)
5795 return reg_named_buff_nextkey(rx, flags);
5797 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5803 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5806 AV *retarray = NULL;
5808 struct regexp *const rx = (struct regexp *)SvANY(r);
5810 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5812 if (flags & RXapif_ALL)
5815 if (rx && RXp_PAREN_NAMES(rx)) {
5816 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5819 SV* sv_dat=HeVAL(he_str);
5820 I32 *nums=(I32*)SvPVX(sv_dat);
5821 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5822 if ((I32)(rx->nparens) >= nums[i]
5823 && rx->offs[nums[i]].start != -1
5824 && rx->offs[nums[i]].end != -1)
5827 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5832 ret = newSVsv(&PL_sv_undef);
5835 av_push(retarray, ret);
5838 return newRV_noinc(MUTABLE_SV(retarray));
5845 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5848 struct regexp *const rx = (struct regexp *)SvANY(r);
5850 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5852 if (rx && RXp_PAREN_NAMES(rx)) {
5853 if (flags & RXapif_ALL) {
5854 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5856 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5870 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5872 struct regexp *const rx = (struct regexp *)SvANY(r);
5874 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5876 if ( rx && RXp_PAREN_NAMES(rx) ) {
5877 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5879 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5886 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5888 struct regexp *const rx = (struct regexp *)SvANY(r);
5889 GET_RE_DEBUG_FLAGS_DECL;
5891 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5893 if (rx && RXp_PAREN_NAMES(rx)) {
5894 HV *hv = RXp_PAREN_NAMES(rx);
5896 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5899 SV* sv_dat = HeVAL(temphe);
5900 I32 *nums = (I32*)SvPVX(sv_dat);
5901 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5902 if ((I32)(rx->lastparen) >= nums[i] &&
5903 rx->offs[nums[i]].start != -1 &&
5904 rx->offs[nums[i]].end != -1)
5910 if (parno || flags & RXapif_ALL) {
5911 return newSVhek(HeKEY_hek(temphe));
5919 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5924 struct regexp *const rx = (struct regexp *)SvANY(r);
5926 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5928 if (rx && RXp_PAREN_NAMES(rx)) {
5929 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5930 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5931 } else if (flags & RXapif_ONE) {
5932 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5933 av = MUTABLE_AV(SvRV(ret));
5934 length = av_len(av);
5936 return newSViv(length + 1);
5938 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5942 return &PL_sv_undef;
5946 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5948 struct regexp *const rx = (struct regexp *)SvANY(r);
5951 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5953 if (rx && RXp_PAREN_NAMES(rx)) {
5954 HV *hv= RXp_PAREN_NAMES(rx);
5956 (void)hv_iterinit(hv);
5957 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5960 SV* sv_dat = HeVAL(temphe);
5961 I32 *nums = (I32*)SvPVX(sv_dat);
5962 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5963 if ((I32)(rx->lastparen) >= nums[i] &&
5964 rx->offs[nums[i]].start != -1 &&
5965 rx->offs[nums[i]].end != -1)
5971 if (parno || flags & RXapif_ALL) {
5972 av_push(av, newSVhek(HeKEY_hek(temphe)));
5977 return newRV_noinc(MUTABLE_SV(av));
5981 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5984 struct regexp *const rx = (struct regexp *)SvANY(r);
5989 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5992 sv_setsv(sv,&PL_sv_undef);
5996 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5998 i = rx->offs[0].start;
6002 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6004 s = rx->subbeg + rx->offs[0].end;
6005 i = rx->sublen - rx->offs[0].end;
6008 if ( 0 <= paren && paren <= (I32)rx->nparens &&
6009 (s1 = rx->offs[paren].start) != -1 &&
6010 (t1 = rx->offs[paren].end) != -1)
6014 s = rx->subbeg + s1;
6016 sv_setsv(sv,&PL_sv_undef);
6019 assert(rx->sublen >= (s - rx->subbeg) + i );
6021 const int oldtainted = PL_tainted;
6023 sv_setpvn(sv, s, i);
6024 PL_tainted = oldtainted;
6025 if ( (rx->extflags & RXf_CANY_SEEN)
6026 ? (RXp_MATCH_UTF8(rx)
6027 && (!i || is_utf8_string((U8*)s, i)))
6028 : (RXp_MATCH_UTF8(rx)) )
6035 if (RXp_MATCH_TAINTED(rx)) {
6036 if (SvTYPE(sv) >= SVt_PVMG) {
6037 MAGIC* const mg = SvMAGIC(sv);
6040 SvMAGIC_set(sv, mg->mg_moremagic);
6042 if ((mgt = SvMAGIC(sv))) {
6043 mg->mg_moremagic = mgt;
6044 SvMAGIC_set(sv, mg);
6054 sv_setsv(sv,&PL_sv_undef);
6060 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6061 SV const * const value)
6063 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6065 PERL_UNUSED_ARG(rx);
6066 PERL_UNUSED_ARG(paren);
6067 PERL_UNUSED_ARG(value);
6070 Perl_croak_no_modify(aTHX);
6074 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6077 struct regexp *const rx = (struct regexp *)SvANY(r);
6081 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6083 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6085 /* $` / ${^PREMATCH} */
6086 case RX_BUFF_IDX_PREMATCH:
6087 if (rx->offs[0].start != -1) {
6088 i = rx->offs[0].start;
6096 /* $' / ${^POSTMATCH} */
6097 case RX_BUFF_IDX_POSTMATCH:
6098 if (rx->offs[0].end != -1) {
6099 i = rx->sublen - rx->offs[0].end;
6101 s1 = rx->offs[0].end;
6107 /* $& / ${^MATCH}, $1, $2, ... */
6109 if (paren <= (I32)rx->nparens &&
6110 (s1 = rx->offs[paren].start) != -1 &&
6111 (t1 = rx->offs[paren].end) != -1)
6116 if (ckWARN(WARN_UNINITIALIZED))
6117 report_uninit((const SV *)sv);
6122 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6123 const char * const s = rx->subbeg + s1;
6128 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6135 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6137 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6138 PERL_UNUSED_ARG(rx);
6142 return newSVpvs("Regexp");
6145 /* Scans the name of a named buffer from the pattern.
6146 * If flags is REG_RSN_RETURN_NULL returns null.
6147 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6148 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6149 * to the parsed name as looked up in the RExC_paren_names hash.
6150 * If there is an error throws a vFAIL().. type exception.
6153 #define REG_RSN_RETURN_NULL 0
6154 #define REG_RSN_RETURN_NAME 1
6155 #define REG_RSN_RETURN_DATA 2
6158 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6160 char *name_start = RExC_parse;
6162 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6164 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6165 /* skip IDFIRST by using do...while */
6168 RExC_parse += UTF8SKIP(RExC_parse);
6169 } while (isALNUM_utf8((U8*)RExC_parse));
6173 } while (isALNUM(*RExC_parse));
6178 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6179 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6180 if ( flags == REG_RSN_RETURN_NAME)
6182 else if (flags==REG_RSN_RETURN_DATA) {
6185 if ( ! sv_name ) /* should not happen*/
6186 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6187 if (RExC_paren_names)
6188 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6190 sv_dat = HeVAL(he_str);
6192 vFAIL("Reference to nonexistent named group");
6196 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6197 (unsigned long) flags);
6204 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6205 int rem=(int)(RExC_end - RExC_parse); \
6214 if (RExC_lastparse!=RExC_parse) \
6215 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6218 iscut ? "..." : "<" \
6221 PerlIO_printf(Perl_debug_log,"%16s",""); \
6224 num = RExC_size + 1; \
6226 num=REG_NODE_NUM(RExC_emit); \
6227 if (RExC_lastnum!=num) \
6228 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6230 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6231 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6232 (int)((depth*2)), "", \
6236 RExC_lastparse=RExC_parse; \
6241 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6242 DEBUG_PARSE_MSG((funcname)); \
6243 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6245 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6246 DEBUG_PARSE_MSG((funcname)); \
6247 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6250 /* This section of code defines the inversion list object and its methods. The
6251 * interfaces are highly subject to change, so as much as possible is static to
6252 * this file. An inversion list is here implemented as a malloc'd C UV array
6253 * with some added info that is placed as UVs at the beginning in a header
6254 * portion. An inversion list for Unicode is an array of code points, sorted
6255 * by ordinal number. The zeroth element is the first code point in the list.
6256 * The 1th element is the first element beyond that not in the list. In other
6257 * words, the first range is
6258 * invlist[0]..(invlist[1]-1)
6259 * The other ranges follow. Thus every element whose index is divisible by two
6260 * marks the beginning of a range that is in the list, and every element not
6261 * divisible by two marks the beginning of a range not in the list. A single
6262 * element inversion list that contains the single code point N generally
6263 * consists of two elements
6266 * (The exception is when N is the highest representable value on the
6267 * machine, in which case the list containing just it would be a single
6268 * element, itself. By extension, if the last range in the list extends to
6269 * infinity, then the first element of that range will be in the inversion list
6270 * at a position that is divisible by two, and is the final element in the
6272 * Taking the complement (inverting) an inversion list is quite simple, if the
6273 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6274 * This implementation reserves an element at the beginning of each inversion list
6275 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6276 * beginning of the list is either that element if 0, or the next one if 1.
6278 * More about inversion lists can be found in "Unicode Demystified"
6279 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6280 * More will be coming when functionality is added later.
6282 * The inversion list data structure is currently implemented as an SV pointing
6283 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6284 * array of UV whose memory management is automatically handled by the existing
6285 * facilities for SV's.
6287 * Some of the methods should always be private to the implementation, and some
6288 * should eventually be made public */
6290 #define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
6291 #define INVLIST_ITER_OFFSET 1 /* Current iteration position */
6293 /* This is a combination of a version and data structure type, so that one
6294 * being passed in can be validated to be an inversion list of the correct
6295 * vintage. When the structure of the header is changed, a new random number
6296 * in the range 2**31-1 should be generated and the new() method changed to
6297 * insert that at this location. Then, if an auxiliary program doesn't change
6298 * correspondingly, it will be discovered immediately */
6299 #define INVLIST_VERSION_ID_OFFSET 2
6300 #define INVLIST_VERSION_ID 1064334010
6302 /* For safety, when adding new elements, remember to #undef them at the end of
6303 * the inversion list code section */
6305 #define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
6306 /* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
6307 * contains the code point U+00000, and begins here. If 1, the inversion list
6308 * doesn't contain U+0000, and it begins at the next UV in the array.
6309 * Inverting an inversion list consists of adding or removing the 0 at the
6310 * beginning of it. By reserving a space for that 0, inversion can be made
6313 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6315 /* Internally things are UVs */
6316 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6317 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6319 #define INVLIST_INITIAL_LEN 10
6321 PERL_STATIC_INLINE UV*
6322 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6324 /* Returns a pointer to the first element in the inversion list's array.
6325 * This is called upon initialization of an inversion list. Where the
6326 * array begins depends on whether the list has the code point U+0000
6327 * in it or not. The other parameter tells it whether the code that
6328 * follows this call is about to put a 0 in the inversion list or not.
6329 * The first element is either the element with 0, if 0, or the next one,
6332 UV* zero = get_invlist_zero_addr(invlist);
6334 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6337 assert(! *get_invlist_len_addr(invlist));
6339 /* 1^1 = 0; 1^0 = 1 */
6340 *zero = 1 ^ will_have_0;
6341 return zero + *zero;
6344 PERL_STATIC_INLINE UV*
6345 S_invlist_array(pTHX_ SV* const invlist)
6347 /* Returns the pointer to the inversion list's array. Every time the
6348 * length changes, this needs to be called in case malloc or realloc moved
6351 PERL_ARGS_ASSERT_INVLIST_ARRAY;
6353 /* Must not be empty. If these fail, you probably didn't check for <len>
6354 * being non-zero before trying to get the array */
6355 assert(*get_invlist_len_addr(invlist));
6356 assert(*get_invlist_zero_addr(invlist) == 0
6357 || *get_invlist_zero_addr(invlist) == 1);
6359 /* The array begins either at the element reserved for zero if the
6360 * list contains 0 (that element will be set to 0), or otherwise the next
6361 * element (in which case the reserved element will be set to 1). */
6362 return (UV *) (get_invlist_zero_addr(invlist)
6363 + *get_invlist_zero_addr(invlist));
6366 PERL_STATIC_INLINE UV*
6367 S_get_invlist_len_addr(pTHX_ SV* invlist)
6369 /* Return the address of the UV that contains the current number
6370 * of used elements in the inversion list */
6372 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6374 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6377 PERL_STATIC_INLINE UV
6378 S_invlist_len(pTHX_ SV* const invlist)
6380 /* Returns the current number of elements stored in the inversion list's
6383 PERL_ARGS_ASSERT_INVLIST_LEN;
6385 return *get_invlist_len_addr(invlist);
6388 PERL_STATIC_INLINE void
6389 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6391 /* Sets the current number of elements stored in the inversion list */
6393 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6395 *get_invlist_len_addr(invlist) = len;
6397 assert(len <= SvLEN(invlist));
6399 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6400 /* If the list contains U+0000, that element is part of the header,
6401 * and should not be counted as part of the array. It will contain
6402 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
6404 * SvCUR_set(invlist,
6405 * TO_INTERNAL_SIZE(len
6406 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
6407 * But, this is only valid if len is not 0. The consequences of not doing
6408 * this is that the memory allocation code may think that 1 more UV is
6409 * being used than actually is, and so might do an unnecessary grow. That
6410 * seems worth not bothering to make this the precise amount.
6412 * Note that when inverting, SvCUR shouldn't change */
6415 PERL_STATIC_INLINE UV
6416 S_invlist_max(pTHX_ SV* const invlist)
6418 /* Returns the maximum number of elements storable in the inversion list's
6419 * array, without having to realloc() */
6421 PERL_ARGS_ASSERT_INVLIST_MAX;
6423 return FROM_INTERNAL_SIZE(SvLEN(invlist));
6426 PERL_STATIC_INLINE UV*
6427 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6429 /* Return the address of the UV that is reserved to hold 0 if the inversion
6430 * list contains 0. This has to be the last element of the heading, as the
6431 * list proper starts with either it if 0, or the next element if not.
6432 * (But we force it to contain either 0 or 1) */
6434 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6436 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6439 #ifndef PERL_IN_XSUB_RE
6441 Perl__new_invlist(pTHX_ IV initial_size)
6444 /* Return a pointer to a newly constructed inversion list, with enough
6445 * space to store 'initial_size' elements. If that number is negative, a
6446 * system default is used instead */
6450 if (initial_size < 0) {
6451 initial_size = INVLIST_INITIAL_LEN;
6454 /* Allocate the initial space */
6455 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6456 invlist_set_len(new_list, 0);
6458 /* Force iterinit() to be used to get iteration to work */
6459 *get_invlist_iter_addr(new_list) = UV_MAX;
6461 /* This should force a segfault if a method doesn't initialize this
6463 *get_invlist_zero_addr(new_list) = UV_MAX;
6465 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6466 #if HEADER_LENGTH != 4
6467 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6475 S__new_invlist_C_array(pTHX_ UV* list)
6477 /* Return a pointer to a newly constructed inversion list, initialized to
6478 * point to <list>, which has to be in the exact correct inversion list
6479 * form, including internal fields. Thus this is a dangerous routine that
6480 * should not be used in the wrong hands */
6482 SV* invlist = newSV_type(SVt_PV);
6484 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6486 SvPV_set(invlist, (char *) list);
6487 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
6488 shouldn't touch it */
6489 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6491 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6492 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6499 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6501 /* Grow the maximum size of an inversion list */
6503 PERL_ARGS_ASSERT_INVLIST_EXTEND;
6505 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6508 PERL_STATIC_INLINE void
6509 S_invlist_trim(pTHX_ SV* const invlist)
6511 PERL_ARGS_ASSERT_INVLIST_TRIM;
6513 /* Change the length of the inversion list to how many entries it currently
6516 SvPV_shrink_to_cur((SV *) invlist);
6519 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6521 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6522 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6524 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6527 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6529 /* Subject to change or removal. Append the range from 'start' to 'end' at
6530 * the end of the inversion list. The range must be above any existing
6534 UV max = invlist_max(invlist);
6535 UV len = invlist_len(invlist);
6537 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6539 if (len == 0) { /* Empty lists must be initialized */
6540 array = _invlist_array_init(invlist, start == 0);
6543 /* Here, the existing list is non-empty. The current max entry in the
6544 * list is generally the first value not in the set, except when the
6545 * set extends to the end of permissible values, in which case it is
6546 * the first entry in that final set, and so this call is an attempt to
6547 * append out-of-order */
6549 UV final_element = len - 1;
6550 array = invlist_array(invlist);
6551 if (array[final_element] > start
6552 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6554 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6555 array[final_element], start,
6556 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6559 /* Here, it is a legal append. If the new range begins with the first
6560 * value not in the set, it is extending the set, so the new first
6561 * value not in the set is one greater than the newly extended range.
6563 if (array[final_element] == start) {
6564 if (end != UV_MAX) {
6565 array[final_element] = end + 1;
6568 /* But if the end is the maximum representable on the machine,
6569 * just let the range that this would extend to have no end */
6570 invlist_set_len(invlist, len - 1);
6576 /* Here the new range doesn't extend any existing set. Add it */
6578 len += 2; /* Includes an element each for the start and end of range */
6580 /* If overflows the existing space, extend, which may cause the array to be
6583 invlist_extend(invlist, len);
6584 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
6585 failure in invlist_array() */
6586 array = invlist_array(invlist);
6589 invlist_set_len(invlist, len);
6592 /* The next item on the list starts the range, the one after that is
6593 * one past the new range. */
6594 array[len - 2] = start;
6595 if (end != UV_MAX) {
6596 array[len - 1] = end + 1;
6599 /* But if the end is the maximum representable on the machine, just let
6600 * the range have no end */
6601 invlist_set_len(invlist, len - 1);
6605 #ifndef PERL_IN_XSUB_RE
6608 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6610 /* Searches the inversion list for the entry that contains the input code
6611 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
6612 * return value is the index into the list's array of the range that
6616 IV high = invlist_len(invlist);
6617 const UV * const array = invlist_array(invlist);
6619 PERL_ARGS_ASSERT_INVLIST_SEARCH;
6621 /* If list is empty or the code point is before the first element, return
6623 if (high == 0 || cp < array[0]) {
6627 /* Binary search. What we are looking for is <i> such that
6628 * array[i] <= cp < array[i+1]
6629 * The loop below converges on the i+1. */
6630 while (low < high) {
6631 IV mid = (low + high) / 2;
6632 if (array[mid] <= cp) {
6635 /* We could do this extra test to exit the loop early.
6636 if (cp < array[low]) {
6641 else { /* cp < array[mid] */
6650 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6652 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6653 * but is used when the swash has an inversion list. This makes this much
6654 * faster, as it uses a binary search instead of a linear one. This is
6655 * intimately tied to that function, and perhaps should be in utf8.c,
6656 * except it is intimately tied to inversion lists as well. It assumes
6657 * that <swatch> is all 0's on input */
6660 const IV len = invlist_len(invlist);
6664 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6666 if (len == 0) { /* Empty inversion list */
6670 array = invlist_array(invlist);
6672 /* Find which element it is */
6673 i = invlist_search(invlist, start);
6675 /* We populate from <start> to <end> */
6676 while (current < end) {
6679 /* The inversion list gives the results for every possible code point
6680 * after the first one in the list. Only those ranges whose index is
6681 * even are ones that the inversion list matches. For the odd ones,
6682 * and if the initial code point is not in the list, we have to skip
6683 * forward to the next element */
6684 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6686 if (i >= len) { /* Finished if beyond the end of the array */
6690 if (current >= end) { /* Finished if beyond the end of what we
6695 assert(current >= start);
6697 /* The current range ends one below the next one, except don't go past
6700 upper = (i < len && array[i] < end) ? array[i] : end;
6702 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
6703 * for each code point in it */
6704 for (; current < upper; current++) {
6705 const STRLEN offset = (STRLEN)(current - start);
6706 swatch[offset >> 3] |= 1 << (offset & 7);
6709 /* Quit if at the end of the list */
6712 /* But first, have to deal with the highest possible code point on
6713 * the platform. The previous code assumes that <end> is one
6714 * beyond where we want to populate, but that is impossible at the
6715 * platform's infinity, so have to handle it specially */
6716 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6718 const STRLEN offset = (STRLEN)(end - start);
6719 swatch[offset >> 3] |= 1 << (offset & 7);
6724 /* Advance to the next range, which will be for code points not in the
6734 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
6736 /* Take the union of two inversion lists and point <output> to it. *output
6737 * should be defined upon input, and if it points to one of the two lists,
6738 * the reference count to that list will be decremented. The first list,
6739 * <a>, may be NULL, in which case a copy of the second list is returned.
6740 * If <complement_b> is TRUE, the union is taken of the complement
6741 * (inversion) of <b> instead of b itself.
6743 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6744 * Richard Gillam, published by Addison-Wesley, and explained at some
6745 * length there. The preface says to incorporate its examples into your
6746 * code at your own risk.
6748 * The algorithm is like a merge sort.
6750 * XXX A potential performance improvement is to keep track as we go along
6751 * if only one of the inputs contributes to the result, meaning the other
6752 * is a subset of that one. In that case, we can skip the final copy and
6753 * return the larger of the input lists, but then outside code might need
6754 * to keep track of whether to free the input list or not */
6756 UV* array_a; /* a's array */
6758 UV len_a; /* length of a's array */
6761 SV* u; /* the resulting union */
6765 UV i_a = 0; /* current index into a's array */
6769 /* running count, as explained in the algorithm source book; items are
6770 * stopped accumulating and are output when the count changes to/from 0.
6771 * The count is incremented when we start a range that's in the set, and
6772 * decremented when we start a range that's not in the set. So its range
6773 * is 0 to 2. Only when the count is zero is something not in the set.
6777 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
6780 /* If either one is empty, the union is the other one */
6781 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
6788 *output = invlist_clone(b);
6790 _invlist_invert(*output);
6792 } /* else *output already = b; */
6795 else if ((len_b = invlist_len(b)) == 0) {
6800 /* The complement of an empty list is a list that has everything in it,
6801 * so the union with <a> includes everything too */
6806 *output = _new_invlist(1);
6807 _append_range_to_invlist(*output, 0, UV_MAX);
6809 else if (*output != a) {
6810 *output = invlist_clone(a);
6812 /* else *output already = a; */
6816 /* Here both lists exist and are non-empty */
6817 array_a = invlist_array(a);
6818 array_b = invlist_array(b);
6820 /* If are to take the union of 'a' with the complement of b, set it
6821 * up so are looking at b's complement. */
6824 /* To complement, we invert: if the first element is 0, remove it. To
6825 * do this, we just pretend the array starts one later, and clear the
6826 * flag as we don't have to do anything else later */
6827 if (array_b[0] == 0) {
6830 complement_b = FALSE;
6834 /* But if the first element is not zero, we unshift a 0 before the
6835 * array. The data structure reserves a space for that 0 (which
6836 * should be a '1' right now), so physical shifting is unneeded,
6837 * but temporarily change that element to 0. Before exiting the
6838 * routine, we must restore the element to '1' */
6845 /* Size the union for the worst case: that the sets are completely
6847 u = _new_invlist(len_a + len_b);
6849 /* Will contain U+0000 if either component does */
6850 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6851 || (len_b > 0 && array_b[0] == 0));
6853 /* Go through each list item by item, stopping when exhausted one of
6855 while (i_a < len_a && i_b < len_b) {
6856 UV cp; /* The element to potentially add to the union's array */
6857 bool cp_in_set; /* is it in the the input list's set or not */
6859 /* We need to take one or the other of the two inputs for the union.
6860 * Since we are merging two sorted lists, we take the smaller of the
6861 * next items. In case of a tie, we take the one that is in its set
6862 * first. If we took one not in the set first, it would decrement the
6863 * count, possibly to 0 which would cause it to be output as ending the
6864 * range, and the next time through we would take the same number, and
6865 * output it again as beginning the next range. By doing it the
6866 * opposite way, there is no possibility that the count will be
6867 * momentarily decremented to 0, and thus the two adjoining ranges will
6868 * be seamlessly merged. (In a tie and both are in the set or both not
6869 * in the set, it doesn't matter which we take first.) */
6870 if (array_a[i_a] < array_b[i_b]
6871 || (array_a[i_a] == array_b[i_b]
6872 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6874 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6878 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6882 /* Here, have chosen which of the two inputs to look at. Only output
6883 * if the running count changes to/from 0, which marks the
6884 * beginning/end of a range in that's in the set */
6887 array_u[i_u++] = cp;
6894 array_u[i_u++] = cp;
6899 /* Here, we are finished going through at least one of the lists, which
6900 * means there is something remaining in at most one. We check if the list
6901 * that hasn't been exhausted is positioned such that we are in the middle
6902 * of a range in its set or not. (i_a and i_b point to the element beyond
6903 * the one we care about.) If in the set, we decrement 'count'; if 0, there
6904 * is potentially more to output.
6905 * There are four cases:
6906 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6907 * in the union is entirely from the non-exhausted set.
6908 * 2) Both were in their sets, count is 2. Nothing further should
6909 * be output, as everything that remains will be in the exhausted
6910 * list's set, hence in the union; decrementing to 1 but not 0 insures
6912 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6913 * Nothing further should be output because the union includes
6914 * everything from the exhausted set. Not decrementing ensures that.
6915 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6916 * decrementing to 0 insures that we look at the remainder of the
6917 * non-exhausted set */
6918 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6919 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6924 /* The final length is what we've output so far, plus what else is about to
6925 * be output. (If 'count' is non-zero, then the input list we exhausted
6926 * has everything remaining up to the machine's limit in its set, and hence
6927 * in the union, so there will be no further output. */
6930 /* At most one of the subexpressions will be non-zero */
6931 len_u += (len_a - i_a) + (len_b - i_b);
6934 /* Set result to final length, which can change the pointer to array_u, so
6936 if (len_u != invlist_len(u)) {
6937 invlist_set_len(u, len_u);
6939 array_u = invlist_array(u);
6942 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6943 * the other) ended with everything above it not in its set. That means
6944 * that the remaining part of the union is precisely the same as the
6945 * non-exhausted list, so can just copy it unchanged. (If both list were
6946 * exhausted at the same time, then the operations below will be both 0.)
6949 IV copy_count; /* At most one will have a non-zero copy count */
6950 if ((copy_count = len_a - i_a) > 0) {
6951 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6953 else if ((copy_count = len_b - i_b) > 0) {
6954 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6958 /* We may be removing a reference to one of the inputs */
6959 if (a == *output || b == *output) {
6960 SvREFCNT_dec(*output);
6963 /* If we've changed b, restore it */
6973 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
6975 /* Take the intersection of two inversion lists and point <i> to it. *i
6976 * should be defined upon input, and if it points to one of the two lists,
6977 * the reference count to that list will be decremented.
6978 * If <complement_b> is TRUE, the result will be the intersection of <a>
6979 * and the complement (or inversion) of <b> instead of <b> directly.
6981 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6982 * Richard Gillam, published by Addison-Wesley, and explained at some
6983 * length there. The preface says to incorporate its examples into your
6984 * code at your own risk. In fact, it had bugs
6986 * The algorithm is like a merge sort, and is essentially the same as the
6990 UV* array_a; /* a's array */
6992 UV len_a; /* length of a's array */
6995 SV* r; /* the resulting intersection */
6999 UV i_a = 0; /* current index into a's array */
7003 /* running count, as explained in the algorithm source book; items are
7004 * stopped accumulating and are output when the count changes to/from 2.
7005 * The count is incremented when we start a range that's in the set, and
7006 * decremented when we start a range that's not in the set. So its range
7007 * is 0 to 2. Only when the count is 2 is something in the intersection.
7011 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7014 /* Special case if either one is empty */
7015 len_a = invlist_len(a);
7016 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7018 if (len_a != 0 && complement_b) {
7020 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7021 * be empty. Here, also we are using 'b's complement, which hence
7022 * must be every possible code point. Thus the intersection is
7025 *i = invlist_clone(a);
7031 /* else *i is already 'a' */
7035 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7036 * intersection must be empty */
7043 *i = _new_invlist(0);
7047 /* Here both lists exist and are non-empty */
7048 array_a = invlist_array(a);
7049 array_b = invlist_array(b);
7051 /* If are to take the intersection of 'a' with the complement of b, set it
7052 * up so are looking at b's complement. */
7055 /* To complement, we invert: if the first element is 0, remove it. To
7056 * do this, we just pretend the array starts one later, and clear the
7057 * flag as we don't have to do anything else later */
7058 if (array_b[0] == 0) {
7061 complement_b = FALSE;
7065 /* But if the first element is not zero, we unshift a 0 before the
7066 * array. The data structure reserves a space for that 0 (which
7067 * should be a '1' right now), so physical shifting is unneeded,
7068 * but temporarily change that element to 0. Before exiting the
7069 * routine, we must restore the element to '1' */
7076 /* Size the intersection for the worst case: that the intersection ends up
7077 * fragmenting everything to be completely disjoint */
7078 r= _new_invlist(len_a + len_b);
7080 /* Will contain U+0000 iff both components do */
7081 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7082 && len_b > 0 && array_b[0] == 0);
7084 /* Go through each list item by item, stopping when exhausted one of
7086 while (i_a < len_a && i_b < len_b) {
7087 UV cp; /* The element to potentially add to the intersection's
7089 bool cp_in_set; /* Is it in the input list's set or not */
7091 /* We need to take one or the other of the two inputs for the
7092 * intersection. Since we are merging two sorted lists, we take the
7093 * smaller of the next items. In case of a tie, we take the one that
7094 * is not in its set first (a difference from the union algorithm). If
7095 * we took one in the set first, it would increment the count, possibly
7096 * to 2 which would cause it to be output as starting a range in the
7097 * intersection, and the next time through we would take that same
7098 * number, and output it again as ending the set. By doing it the
7099 * opposite of this, there is no possibility that the count will be
7100 * momentarily incremented to 2. (In a tie and both are in the set or
7101 * both not in the set, it doesn't matter which we take first.) */
7102 if (array_a[i_a] < array_b[i_b]
7103 || (array_a[i_a] == array_b[i_b]
7104 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7106 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7110 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7114 /* Here, have chosen which of the two inputs to look at. Only output
7115 * if the running count changes to/from 2, which marks the
7116 * beginning/end of a range that's in the intersection */
7120 array_r[i_r++] = cp;
7125 array_r[i_r++] = cp;
7131 /* Here, we are finished going through at least one of the lists, which
7132 * means there is something remaining in at most one. We check if the list
7133 * that has been exhausted is positioned such that we are in the middle
7134 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7135 * the ones we care about.) There are four cases:
7136 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7137 * nothing left in the intersection.
7138 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7139 * above 2. What should be output is exactly that which is in the
7140 * non-exhausted set, as everything it has is also in the intersection
7141 * set, and everything it doesn't have can't be in the intersection
7142 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7143 * gets incremented to 2. Like the previous case, the intersection is
7144 * everything that remains in the non-exhausted set.
7145 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7146 * remains 1. And the intersection has nothing more. */
7147 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7148 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7153 /* The final length is what we've output so far plus what else is in the
7154 * intersection. At most one of the subexpressions below will be non-zero */
7157 len_r += (len_a - i_a) + (len_b - i_b);
7160 /* Set result to final length, which can change the pointer to array_r, so
7162 if (len_r != invlist_len(r)) {
7163 invlist_set_len(r, len_r);
7165 array_r = invlist_array(r);
7168 /* Finish outputting any remaining */
7169 if (count >= 2) { /* At most one will have a non-zero copy count */
7171 if ((copy_count = len_a - i_a) > 0) {
7172 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7174 else if ((copy_count = len_b - i_b) > 0) {
7175 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7179 /* We may be removing a reference to one of the inputs */
7180 if (a == *i || b == *i) {
7184 /* If we've changed b, restore it */
7194 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7196 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7197 * set. A pointer to the inversion list is returned. This may actually be
7198 * a new list, in which case the passed in one has been destroyed. The
7199 * passed in inversion list can be NULL, in which case a new one is created
7200 * with just the one range in it */
7205 if (invlist == NULL) {
7206 invlist = _new_invlist(2);
7210 len = invlist_len(invlist);
7213 /* If comes after the final entry, can just append it to the end */
7215 || start >= invlist_array(invlist)
7216 [invlist_len(invlist) - 1])
7218 _append_range_to_invlist(invlist, start, end);
7222 /* Here, can't just append things, create and return a new inversion list
7223 * which is the union of this range and the existing inversion list */
7224 range_invlist = _new_invlist(2);
7225 _append_range_to_invlist(range_invlist, start, end);
7227 _invlist_union(invlist, range_invlist, &invlist);
7229 /* The temporary can be freed */
7230 SvREFCNT_dec(range_invlist);
7237 PERL_STATIC_INLINE SV*
7238 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7239 return _add_range_to_invlist(invlist, cp, cp);
7242 #ifndef PERL_IN_XSUB_RE
7244 Perl__invlist_invert(pTHX_ SV* const invlist)
7246 /* Complement the input inversion list. This adds a 0 if the list didn't
7247 * have a zero; removes it otherwise. As described above, the data
7248 * structure is set up so that this is very efficient */
7250 UV* len_pos = get_invlist_len_addr(invlist);
7252 PERL_ARGS_ASSERT__INVLIST_INVERT;
7254 /* The inverse of matching nothing is matching everything */
7255 if (*len_pos == 0) {
7256 _append_range_to_invlist(invlist, 0, UV_MAX);
7260 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7261 * zero element was a 0, so it is being removed, so the length decrements
7262 * by 1; and vice-versa. SvCUR is unaffected */
7263 if (*get_invlist_zero_addr(invlist) ^= 1) {
7272 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7274 /* Complement the input inversion list (which must be a Unicode property,
7275 * all of which don't match above the Unicode maximum code point.) And
7276 * Perl has chosen to not have the inversion match above that either. This
7277 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7283 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7285 _invlist_invert(invlist);
7287 len = invlist_len(invlist);
7289 if (len != 0) { /* If empty do nothing */
7290 array = invlist_array(invlist);
7291 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7292 /* Add 0x110000. First, grow if necessary */
7294 if (invlist_max(invlist) < len) {
7295 invlist_extend(invlist, len);
7296 array = invlist_array(invlist);
7298 invlist_set_len(invlist, len);
7299 array[len - 1] = PERL_UNICODE_MAX + 1;
7301 else { /* Remove the 0x110000 */
7302 invlist_set_len(invlist, len - 1);
7310 PERL_STATIC_INLINE SV*
7311 S_invlist_clone(pTHX_ SV* const invlist)
7314 /* Return a new inversion list that is a copy of the input one, which is
7317 /* Need to allocate extra space to accommodate Perl's addition of a
7318 * trailing NUL to SvPV's, since it thinks they are always strings */
7319 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7320 STRLEN length = SvCUR(invlist);
7322 PERL_ARGS_ASSERT_INVLIST_CLONE;
7324 SvCUR_set(new_invlist, length); /* This isn't done automatically */
7325 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7330 PERL_STATIC_INLINE UV*
7331 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7333 /* Return the address of the UV that contains the current iteration
7336 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7338 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7341 PERL_STATIC_INLINE UV*
7342 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7344 /* Return the address of the UV that contains the version id. */
7346 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7348 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7351 PERL_STATIC_INLINE void
7352 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
7354 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7356 *get_invlist_iter_addr(invlist) = 0;
7360 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7362 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7363 * This call sets in <*start> and <*end>, the next range in <invlist>.
7364 * Returns <TRUE> if successful and the next call will return the next
7365 * range; <FALSE> if was already at the end of the list. If the latter,
7366 * <*start> and <*end> are unchanged, and the next call to this function
7367 * will start over at the beginning of the list */
7369 UV* pos = get_invlist_iter_addr(invlist);
7370 UV len = invlist_len(invlist);
7373 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7376 *pos = UV_MAX; /* Force iternit() to be required next time */
7380 array = invlist_array(invlist);
7382 *start = array[(*pos)++];
7388 *end = array[(*pos)++] - 1;
7394 #ifndef PERL_IN_XSUB_RE
7396 Perl__invlist_contents(pTHX_ SV* const invlist)
7398 /* Get the contents of an inversion list into a string SV so that they can
7399 * be printed out. It uses the format traditionally done for debug tracing
7403 SV* output = newSVpvs("\n");
7405 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7407 invlist_iterinit(invlist);
7408 while (invlist_iternext(invlist, &start, &end)) {
7409 if (end == UV_MAX) {
7410 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7412 else if (end != start) {
7413 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7417 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7427 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7429 /* Dumps out the ranges in an inversion list. The string 'header'
7430 * if present is output on a line before the first range */
7434 if (header && strlen(header)) {
7435 PerlIO_printf(Perl_debug_log, "%s\n", header);
7437 invlist_iterinit(invlist);
7438 while (invlist_iternext(invlist, &start, &end)) {
7439 if (end == UV_MAX) {
7440 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7443 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7449 #undef HEADER_LENGTH
7450 #undef INVLIST_INITIAL_LENGTH
7451 #undef TO_INTERNAL_SIZE
7452 #undef FROM_INTERNAL_SIZE
7453 #undef INVLIST_LEN_OFFSET
7454 #undef INVLIST_ZERO_OFFSET
7455 #undef INVLIST_ITER_OFFSET
7456 #undef INVLIST_VERSION_ID
7458 /* End of inversion list object */
7461 - reg - regular expression, i.e. main body or parenthesized thing
7463 * Caller must absorb opening parenthesis.
7465 * Combining parenthesis handling with the base level of regular expression
7466 * is a trifle forced, but the need to tie the tails of the branches to what
7467 * follows makes it hard to avoid.
7469 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7471 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7473 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7477 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7478 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7481 register regnode *ret; /* Will be the head of the group. */
7482 register regnode *br;
7483 register regnode *lastbr;
7484 register regnode *ender = NULL;
7485 register I32 parno = 0;
7487 U32 oregflags = RExC_flags;
7488 bool have_branch = 0;
7490 I32 freeze_paren = 0;
7491 I32 after_freeze = 0;
7493 /* for (?g), (?gc), and (?o) warnings; warning
7494 about (?c) will warn about (?g) -- japhy */
7496 #define WASTED_O 0x01
7497 #define WASTED_G 0x02
7498 #define WASTED_C 0x04
7499 #define WASTED_GC (0x02|0x04)
7500 I32 wastedflags = 0x00;
7502 char * parse_start = RExC_parse; /* MJD */
7503 char * const oregcomp_parse = RExC_parse;
7505 GET_RE_DEBUG_FLAGS_DECL;
7507 PERL_ARGS_ASSERT_REG;
7508 DEBUG_PARSE("reg ");
7510 *flagp = 0; /* Tentatively. */
7513 /* Make an OPEN node, if parenthesized. */
7515 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7516 char *start_verb = RExC_parse;
7517 STRLEN verb_len = 0;
7518 char *start_arg = NULL;
7519 unsigned char op = 0;
7521 int internal_argval = 0; /* internal_argval is only useful if !argok */
7522 while ( *RExC_parse && *RExC_parse != ')' ) {
7523 if ( *RExC_parse == ':' ) {
7524 start_arg = RExC_parse + 1;
7530 verb_len = RExC_parse - start_verb;
7533 while ( *RExC_parse && *RExC_parse != ')' )
7535 if ( *RExC_parse != ')' )
7536 vFAIL("Unterminated verb pattern argument");
7537 if ( RExC_parse == start_arg )
7540 if ( *RExC_parse != ')' )
7541 vFAIL("Unterminated verb pattern");
7544 switch ( *start_verb ) {
7545 case 'A': /* (*ACCEPT) */
7546 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7548 internal_argval = RExC_nestroot;
7551 case 'C': /* (*COMMIT) */
7552 if ( memEQs(start_verb,verb_len,"COMMIT") )
7555 case 'F': /* (*FAIL) */
7556 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7561 case ':': /* (*:NAME) */
7562 case 'M': /* (*MARK:NAME) */
7563 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7568 case 'P': /* (*PRUNE) */
7569 if ( memEQs(start_verb,verb_len,"PRUNE") )
7572 case 'S': /* (*SKIP) */
7573 if ( memEQs(start_verb,verb_len,"SKIP") )
7576 case 'T': /* (*THEN) */
7577 /* [19:06] <TimToady> :: is then */
7578 if ( memEQs(start_verb,verb_len,"THEN") ) {
7580 RExC_seen |= REG_SEEN_CUTGROUP;
7586 vFAIL3("Unknown verb pattern '%.*s'",
7587 verb_len, start_verb);
7590 if ( start_arg && internal_argval ) {
7591 vFAIL3("Verb pattern '%.*s' may not have an argument",
7592 verb_len, start_verb);
7593 } else if ( argok < 0 && !start_arg ) {
7594 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7595 verb_len, start_verb);
7597 ret = reganode(pRExC_state, op, internal_argval);
7598 if ( ! internal_argval && ! SIZE_ONLY ) {
7600 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7601 ARG(ret) = add_data( pRExC_state, 1, "S" );
7602 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
7609 if (!internal_argval)
7610 RExC_seen |= REG_SEEN_VERBARG;
7611 } else if ( start_arg ) {
7612 vFAIL3("Verb pattern '%.*s' may not have an argument",
7613 verb_len, start_verb);
7615 ret = reg_node(pRExC_state, op);
7617 nextchar(pRExC_state);
7620 if (*RExC_parse == '?') { /* (?...) */
7621 bool is_logical = 0;
7622 const char * const seqstart = RExC_parse;
7623 bool has_use_defaults = FALSE;
7626 paren = *RExC_parse++;
7627 ret = NULL; /* For look-ahead/behind. */
7630 case 'P': /* (?P...) variants for those used to PCRE/Python */
7631 paren = *RExC_parse++;
7632 if ( paren == '<') /* (?P<...>) named capture */
7634 else if (paren == '>') { /* (?P>name) named recursion */
7635 goto named_recursion;
7637 else if (paren == '=') { /* (?P=...) named backref */
7638 /* this pretty much dupes the code for \k<NAME> in regatom(), if
7639 you change this make sure you change that */
7640 char* name_start = RExC_parse;
7642 SV *sv_dat = reg_scan_name(pRExC_state,
7643 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7644 if (RExC_parse == name_start || *RExC_parse != ')')
7645 vFAIL2("Sequence %.3s... not terminated",parse_start);
7648 num = add_data( pRExC_state, 1, "S" );
7649 RExC_rxi->data->data[num]=(void*)sv_dat;
7650 SvREFCNT_inc_simple_void(sv_dat);
7653 ret = reganode(pRExC_state,
7656 : (MORE_ASCII_RESTRICTED)
7658 : (AT_LEAST_UNI_SEMANTICS)
7666 Set_Node_Offset(ret, parse_start+1);
7667 Set_Node_Cur_Length(ret); /* MJD */
7669 nextchar(pRExC_state);
7673 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7675 case '<': /* (?<...) */
7676 if (*RExC_parse == '!')
7678 else if (*RExC_parse != '=')
7684 case '\'': /* (?'...') */
7685 name_start= RExC_parse;
7686 svname = reg_scan_name(pRExC_state,
7687 SIZE_ONLY ? /* reverse test from the others */
7688 REG_RSN_RETURN_NAME :
7689 REG_RSN_RETURN_NULL);
7690 if (RExC_parse == name_start) {
7692 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7695 if (*RExC_parse != paren)
7696 vFAIL2("Sequence (?%c... not terminated",
7697 paren=='>' ? '<' : paren);
7701 if (!svname) /* shouldn't happen */
7703 "panic: reg_scan_name returned NULL");
7704 if (!RExC_paren_names) {
7705 RExC_paren_names= newHV();
7706 sv_2mortal(MUTABLE_SV(RExC_paren_names));
7708 RExC_paren_name_list= newAV();
7709 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
7712 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
7714 sv_dat = HeVAL(he_str);
7716 /* croak baby croak */
7718 "panic: paren_name hash element allocation failed");
7719 } else if ( SvPOK(sv_dat) ) {
7720 /* (?|...) can mean we have dupes so scan to check
7721 its already been stored. Maybe a flag indicating
7722 we are inside such a construct would be useful,
7723 but the arrays are likely to be quite small, so
7724 for now we punt -- dmq */
7725 IV count = SvIV(sv_dat);
7726 I32 *pv = (I32*)SvPVX(sv_dat);
7728 for ( i = 0 ; i < count ; i++ ) {
7729 if ( pv[i] == RExC_npar ) {
7735 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7736 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7737 pv[count] = RExC_npar;
7738 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
7741 (void)SvUPGRADE(sv_dat,SVt_PVNV);
7742 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7744 SvIV_set(sv_dat, 1);
7747 /* Yes this does cause a memory leak in debugging Perls */
7748 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7749 SvREFCNT_dec(svname);
7752 /*sv_dump(sv_dat);*/
7754 nextchar(pRExC_state);
7756 goto capturing_parens;
7758 RExC_seen |= REG_SEEN_LOOKBEHIND;
7759 RExC_in_lookbehind++;
7761 case '=': /* (?=...) */
7762 RExC_seen_zerolen++;
7764 case '!': /* (?!...) */
7765 RExC_seen_zerolen++;
7766 if (*RExC_parse == ')') {
7767 ret=reg_node(pRExC_state, OPFAIL);
7768 nextchar(pRExC_state);
7772 case '|': /* (?|...) */
7773 /* branch reset, behave like a (?:...) except that
7774 buffers in alternations share the same numbers */
7776 after_freeze = freeze_paren = RExC_npar;
7778 case ':': /* (?:...) */
7779 case '>': /* (?>...) */
7781 case '$': /* (?$...) */
7782 case '@': /* (?@...) */
7783 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
7785 case '#': /* (?#...) */
7786 while (*RExC_parse && *RExC_parse != ')')
7788 if (*RExC_parse != ')')
7789 FAIL("Sequence (?#... not terminated");
7790 nextchar(pRExC_state);
7793 case '0' : /* (?0) */
7794 case 'R' : /* (?R) */
7795 if (*RExC_parse != ')')
7796 FAIL("Sequence (?R) not terminated");
7797 ret = reg_node(pRExC_state, GOSTART);
7798 *flagp |= POSTPONED;
7799 nextchar(pRExC_state);
7802 { /* named and numeric backreferences */
7804 case '&': /* (?&NAME) */
7805 parse_start = RExC_parse - 1;
7808 SV *sv_dat = reg_scan_name(pRExC_state,
7809 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7810 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7812 goto gen_recurse_regop;
7815 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7817 vFAIL("Illegal pattern");
7819 goto parse_recursion;
7821 case '-': /* (?-1) */
7822 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7823 RExC_parse--; /* rewind to let it be handled later */
7827 case '1': case '2': case '3': case '4': /* (?1) */
7828 case '5': case '6': case '7': case '8': case '9':
7831 num = atoi(RExC_parse);
7832 parse_start = RExC_parse - 1; /* MJD */
7833 if (*RExC_parse == '-')
7835 while (isDIGIT(*RExC_parse))
7837 if (*RExC_parse!=')')
7838 vFAIL("Expecting close bracket");
7841 if ( paren == '-' ) {
7843 Diagram of capture buffer numbering.
7844 Top line is the normal capture buffer numbers
7845 Bottom line is the negative indexing as from
7849 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7853 num = RExC_npar + num;
7856 vFAIL("Reference to nonexistent group");
7858 } else if ( paren == '+' ) {
7859 num = RExC_npar + num - 1;
7862 ret = reganode(pRExC_state, GOSUB, num);
7864 if (num > (I32)RExC_rx->nparens) {
7866 vFAIL("Reference to nonexistent group");
7868 ARG2L_SET( ret, RExC_recurse_count++);
7870 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7871 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
7875 RExC_seen |= REG_SEEN_RECURSE;
7876 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
7877 Set_Node_Offset(ret, parse_start); /* MJD */
7879 *flagp |= POSTPONED;
7880 nextchar(pRExC_state);
7882 } /* named and numeric backreferences */
7885 case '?': /* (??...) */
7887 if (*RExC_parse != '{') {
7889 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7892 *flagp |= POSTPONED;
7893 paren = *RExC_parse++;
7895 case '{': /* (?{...}) */
7900 char *s = RExC_parse;
7902 RExC_seen_zerolen++;
7903 RExC_seen |= REG_SEEN_EVAL;
7904 while (count && (c = *RExC_parse)) {
7915 if (*RExC_parse != ')') {
7917 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7921 OP_4tree *sop, *rop;
7922 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
7925 Perl_save_re_context(aTHX);
7926 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
7927 sop->op_private |= OPpREFCOUNTED;
7928 /* re_dup will OpREFCNT_inc */
7929 OpREFCNT_set(sop, 1);
7932 n = add_data(pRExC_state, 3, "nop");
7933 RExC_rxi->data->data[n] = (void*)rop;
7934 RExC_rxi->data->data[n+1] = (void*)sop;
7935 RExC_rxi->data->data[n+2] = (void*)pad;
7938 else { /* First pass */
7939 if (PL_reginterp_cnt < ++RExC_seen_evals
7941 /* No compiled RE interpolated, has runtime
7942 components ===> unsafe. */
7943 FAIL("Eval-group not allowed at runtime, use re 'eval'");
7944 if (PL_tainting && PL_tainted)
7945 FAIL("Eval-group in insecure regular expression");
7946 #if PERL_VERSION > 8
7947 if (IN_PERL_COMPILETIME)
7952 nextchar(pRExC_state);
7954 ret = reg_node(pRExC_state, LOGICAL);
7957 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
7958 /* deal with the length of this later - MJD */
7961 ret = reganode(pRExC_state, EVAL, n);
7962 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7963 Set_Node_Offset(ret, parse_start);
7966 case '(': /* (?(?{...})...) and (?(?=...)...) */
7969 if (RExC_parse[0] == '?') { /* (?(?...)) */
7970 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7971 || RExC_parse[1] == '<'
7972 || RExC_parse[1] == '{') { /* Lookahead or eval. */
7975 ret = reg_node(pRExC_state, LOGICAL);
7978 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
7982 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
7983 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
7985 char ch = RExC_parse[0] == '<' ? '>' : '\'';
7986 char *name_start= RExC_parse++;
7988 SV *sv_dat=reg_scan_name(pRExC_state,
7989 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7990 if (RExC_parse == name_start || *RExC_parse != ch)
7991 vFAIL2("Sequence (?(%c... not terminated",
7992 (ch == '>' ? '<' : ch));
7995 num = add_data( pRExC_state, 1, "S" );
7996 RExC_rxi->data->data[num]=(void*)sv_dat;
7997 SvREFCNT_inc_simple_void(sv_dat);
7999 ret = reganode(pRExC_state,NGROUPP,num);
8000 goto insert_if_check_paren;
8002 else if (RExC_parse[0] == 'D' &&
8003 RExC_parse[1] == 'E' &&
8004 RExC_parse[2] == 'F' &&
8005 RExC_parse[3] == 'I' &&
8006 RExC_parse[4] == 'N' &&
8007 RExC_parse[5] == 'E')
8009 ret = reganode(pRExC_state,DEFINEP,0);
8012 goto insert_if_check_paren;
8014 else if (RExC_parse[0] == 'R') {
8017 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8018 parno = atoi(RExC_parse++);
8019 while (isDIGIT(*RExC_parse))
8021 } else if (RExC_parse[0] == '&') {
8024 sv_dat = reg_scan_name(pRExC_state,
8025 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8026 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8028 ret = reganode(pRExC_state,INSUBP,parno);
8029 goto insert_if_check_paren;
8031 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8034 parno = atoi(RExC_parse++);
8036 while (isDIGIT(*RExC_parse))
8038 ret = reganode(pRExC_state, GROUPP, parno);
8040 insert_if_check_paren:
8041 if ((c = *nextchar(pRExC_state)) != ')')
8042 vFAIL("Switch condition not recognized");
8044 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8045 br = regbranch(pRExC_state, &flags, 1,depth+1);
8047 br = reganode(pRExC_state, LONGJMP, 0);
8049 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8050 c = *nextchar(pRExC_state);
8055 vFAIL("(?(DEFINE)....) does not allow branches");
8056 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8057 regbranch(pRExC_state, &flags, 1,depth+1);
8058 REGTAIL(pRExC_state, ret, lastbr);
8061 c = *nextchar(pRExC_state);
8066 vFAIL("Switch (?(condition)... contains too many branches");
8067 ender = reg_node(pRExC_state, TAIL);
8068 REGTAIL(pRExC_state, br, ender);
8070 REGTAIL(pRExC_state, lastbr, ender);
8071 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8074 REGTAIL(pRExC_state, ret, ender);
8075 RExC_size++; /* XXX WHY do we need this?!!
8076 For large programs it seems to be required
8077 but I can't figure out why. -- dmq*/
8081 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8085 RExC_parse--; /* for vFAIL to print correctly */
8086 vFAIL("Sequence (? incomplete");
8088 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8090 has_use_defaults = TRUE;
8091 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8092 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8093 ? REGEX_UNICODE_CHARSET
8094 : REGEX_DEPENDS_CHARSET);
8098 parse_flags: /* (?i) */
8100 U32 posflags = 0, negflags = 0;
8101 U32 *flagsp = &posflags;
8102 char has_charset_modifier = '\0';
8103 regex_charset cs = get_regex_charset(RExC_flags);
8104 if (cs == REGEX_DEPENDS_CHARSET
8105 && (RExC_utf8 || RExC_uni_semantics))
8107 cs = REGEX_UNICODE_CHARSET;
8110 while (*RExC_parse) {
8111 /* && strchr("iogcmsx", *RExC_parse) */
8112 /* (?g), (?gc) and (?o) are useless here
8113 and must be globally applied -- japhy */
8114 switch (*RExC_parse) {
8115 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8116 case LOCALE_PAT_MOD:
8117 if (has_charset_modifier) {
8118 goto excess_modifier;
8120 else if (flagsp == &negflags) {
8123 cs = REGEX_LOCALE_CHARSET;
8124 has_charset_modifier = LOCALE_PAT_MOD;
8125 RExC_contains_locale = 1;
8127 case UNICODE_PAT_MOD:
8128 if (has_charset_modifier) {
8129 goto excess_modifier;
8131 else if (flagsp == &negflags) {
8134 cs = REGEX_UNICODE_CHARSET;
8135 has_charset_modifier = UNICODE_PAT_MOD;
8137 case ASCII_RESTRICT_PAT_MOD:
8138 if (flagsp == &negflags) {
8141 if (has_charset_modifier) {
8142 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8143 goto excess_modifier;
8145 /* Doubled modifier implies more restricted */
8146 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8149 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8151 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8153 case DEPENDS_PAT_MOD:
8154 if (has_use_defaults) {
8155 goto fail_modifiers;
8157 else if (flagsp == &negflags) {
8160 else if (has_charset_modifier) {
8161 goto excess_modifier;
8164 /* The dual charset means unicode semantics if the
8165 * pattern (or target, not known until runtime) are
8166 * utf8, or something in the pattern indicates unicode
8168 cs = (RExC_utf8 || RExC_uni_semantics)
8169 ? REGEX_UNICODE_CHARSET
8170 : REGEX_DEPENDS_CHARSET;
8171 has_charset_modifier = DEPENDS_PAT_MOD;
8175 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8176 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8178 else if (has_charset_modifier == *(RExC_parse - 1)) {
8179 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8182 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8187 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8189 case ONCE_PAT_MOD: /* 'o' */
8190 case GLOBAL_PAT_MOD: /* 'g' */
8191 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8192 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8193 if (! (wastedflags & wflagbit) ) {
8194 wastedflags |= wflagbit;
8197 "Useless (%s%c) - %suse /%c modifier",
8198 flagsp == &negflags ? "?-" : "?",
8200 flagsp == &negflags ? "don't " : "",
8207 case CONTINUE_PAT_MOD: /* 'c' */
8208 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8209 if (! (wastedflags & WASTED_C) ) {
8210 wastedflags |= WASTED_GC;
8213 "Useless (%sc) - %suse /gc modifier",
8214 flagsp == &negflags ? "?-" : "?",
8215 flagsp == &negflags ? "don't " : ""
8220 case KEEPCOPY_PAT_MOD: /* 'p' */
8221 if (flagsp == &negflags) {
8223 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8225 *flagsp |= RXf_PMf_KEEPCOPY;
8229 /* A flag is a default iff it is following a minus, so
8230 * if there is a minus, it means will be trying to
8231 * re-specify a default which is an error */
8232 if (has_use_defaults || flagsp == &negflags) {
8235 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8239 wastedflags = 0; /* reset so (?g-c) warns twice */
8245 RExC_flags |= posflags;
8246 RExC_flags &= ~negflags;
8247 set_regex_charset(&RExC_flags, cs);
8249 oregflags |= posflags;
8250 oregflags &= ~negflags;
8251 set_regex_charset(&oregflags, cs);
8253 nextchar(pRExC_state);
8264 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8269 }} /* one for the default block, one for the switch */
8276 ret = reganode(pRExC_state, OPEN, parno);
8279 RExC_nestroot = parno;
8280 if (RExC_seen & REG_SEEN_RECURSE
8281 && !RExC_open_parens[parno-1])
8283 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8284 "Setting open paren #%"IVdf" to %d\n",
8285 (IV)parno, REG_NODE_NUM(ret)));
8286 RExC_open_parens[parno-1]= ret;
8289 Set_Node_Length(ret, 1); /* MJD */
8290 Set_Node_Offset(ret, RExC_parse); /* MJD */
8298 /* Pick up the branches, linking them together. */
8299 parse_start = RExC_parse; /* MJD */
8300 br = regbranch(pRExC_state, &flags, 1,depth+1);
8302 /* branch_len = (paren != 0); */
8306 if (*RExC_parse == '|') {
8307 if (!SIZE_ONLY && RExC_extralen) {
8308 reginsert(pRExC_state, BRANCHJ, br, depth+1);
8311 reginsert(pRExC_state, BRANCH, br, depth+1);
8312 Set_Node_Length(br, paren != 0);
8313 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8317 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
8319 else if (paren == ':') {
8320 *flagp |= flags&SIMPLE;
8322 if (is_open) { /* Starts with OPEN. */
8323 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
8325 else if (paren != '?') /* Not Conditional */
8327 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8329 while (*RExC_parse == '|') {
8330 if (!SIZE_ONLY && RExC_extralen) {
8331 ender = reganode(pRExC_state, LONGJMP,0);
8332 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8335 RExC_extralen += 2; /* Account for LONGJMP. */
8336 nextchar(pRExC_state);
8338 if (RExC_npar > after_freeze)
8339 after_freeze = RExC_npar;
8340 RExC_npar = freeze_paren;
8342 br = regbranch(pRExC_state, &flags, 0, depth+1);
8346 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
8348 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8351 if (have_branch || paren != ':') {
8352 /* Make a closing node, and hook it on the end. */
8355 ender = reg_node(pRExC_state, TAIL);
8358 ender = reganode(pRExC_state, CLOSE, parno);
8359 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8360 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8361 "Setting close paren #%"IVdf" to %d\n",
8362 (IV)parno, REG_NODE_NUM(ender)));
8363 RExC_close_parens[parno-1]= ender;
8364 if (RExC_nestroot == parno)
8367 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8368 Set_Node_Length(ender,1); /* MJD */
8374 *flagp &= ~HASWIDTH;
8377 ender = reg_node(pRExC_state, SUCCEED);
8380 ender = reg_node(pRExC_state, END);
8382 assert(!RExC_opend); /* there can only be one! */
8387 REGTAIL(pRExC_state, lastbr, ender);
8389 if (have_branch && !SIZE_ONLY) {
8391 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8393 /* Hook the tails of the branches to the closing node. */
8394 for (br = ret; br; br = regnext(br)) {
8395 const U8 op = PL_regkind[OP(br)];
8397 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8399 else if (op == BRANCHJ) {
8400 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8408 static const char parens[] = "=!<,>";
8410 if (paren && (p = strchr(parens, paren))) {
8411 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8412 int flag = (p - parens) > 1;
8415 node = SUSPEND, flag = 0;
8416 reginsert(pRExC_state, node,ret, depth+1);
8417 Set_Node_Cur_Length(ret);
8418 Set_Node_Offset(ret, parse_start + 1);
8420 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8424 /* Check for proper termination. */
8426 RExC_flags = oregflags;
8427 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8428 RExC_parse = oregcomp_parse;
8429 vFAIL("Unmatched (");
8432 else if (!paren && RExC_parse < RExC_end) {
8433 if (*RExC_parse == ')') {
8435 vFAIL("Unmatched )");
8438 FAIL("Junk on end of regexp"); /* "Can't happen". */
8442 if (RExC_in_lookbehind) {
8443 RExC_in_lookbehind--;
8445 if (after_freeze > RExC_npar)
8446 RExC_npar = after_freeze;
8451 - regbranch - one alternative of an | operator
8453 * Implements the concatenation operator.
8456 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8459 register regnode *ret;
8460 register regnode *chain = NULL;
8461 register regnode *latest;
8462 I32 flags = 0, c = 0;
8463 GET_RE_DEBUG_FLAGS_DECL;
8465 PERL_ARGS_ASSERT_REGBRANCH;
8467 DEBUG_PARSE("brnc");
8472 if (!SIZE_ONLY && RExC_extralen)
8473 ret = reganode(pRExC_state, BRANCHJ,0);
8475 ret = reg_node(pRExC_state, BRANCH);
8476 Set_Node_Length(ret, 1);
8480 if (!first && SIZE_ONLY)
8481 RExC_extralen += 1; /* BRANCHJ */
8483 *flagp = WORST; /* Tentatively. */
8486 nextchar(pRExC_state);
8487 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8489 latest = regpiece(pRExC_state, &flags,depth+1);
8490 if (latest == NULL) {
8491 if (flags & TRYAGAIN)
8495 else if (ret == NULL)
8497 *flagp |= flags&(HASWIDTH|POSTPONED);
8498 if (chain == NULL) /* First piece. */
8499 *flagp |= flags&SPSTART;
8502 REGTAIL(pRExC_state, chain, latest);
8507 if (chain == NULL) { /* Loop ran zero times. */
8508 chain = reg_node(pRExC_state, NOTHING);
8513 *flagp |= flags&SIMPLE;
8520 - regpiece - something followed by possible [*+?]
8522 * Note that the branching code sequences used for ? and the general cases
8523 * of * and + are somewhat optimized: they use the same NOTHING node as
8524 * both the endmarker for their branch list and the body of the last branch.
8525 * It might seem that this node could be dispensed with entirely, but the
8526 * endmarker role is not redundant.
8529 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8532 register regnode *ret;
8534 register char *next;
8536 const char * const origparse = RExC_parse;
8538 I32 max = REG_INFTY;
8539 #ifdef RE_TRACK_PATTERN_OFFSETS
8542 const char *maxpos = NULL;
8543 GET_RE_DEBUG_FLAGS_DECL;
8545 PERL_ARGS_ASSERT_REGPIECE;
8547 DEBUG_PARSE("piec");
8549 ret = regatom(pRExC_state, &flags,depth+1);
8551 if (flags & TRYAGAIN)
8558 if (op == '{' && regcurly(RExC_parse)) {
8560 #ifdef RE_TRACK_PATTERN_OFFSETS
8561 parse_start = RExC_parse; /* MJD */
8563 next = RExC_parse + 1;
8564 while (isDIGIT(*next) || *next == ',') {
8573 if (*next == '}') { /* got one */
8577 min = atoi(RExC_parse);
8581 maxpos = RExC_parse;
8583 if (!max && *maxpos != '0')
8584 max = REG_INFTY; /* meaning "infinity" */
8585 else if (max >= REG_INFTY)
8586 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
8588 nextchar(pRExC_state);
8591 if ((flags&SIMPLE)) {
8592 RExC_naughty += 2 + RExC_naughty / 2;
8593 reginsert(pRExC_state, CURLY, ret, depth+1);
8594 Set_Node_Offset(ret, parse_start+1); /* MJD */
8595 Set_Node_Cur_Length(ret);
8598 regnode * const w = reg_node(pRExC_state, WHILEM);
8601 REGTAIL(pRExC_state, ret, w);
8602 if (!SIZE_ONLY && RExC_extralen) {
8603 reginsert(pRExC_state, LONGJMP,ret, depth+1);
8604 reginsert(pRExC_state, NOTHING,ret, depth+1);
8605 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
8607 reginsert(pRExC_state, CURLYX,ret, depth+1);
8609 Set_Node_Offset(ret, parse_start+1);
8610 Set_Node_Length(ret,
8611 op == '{' ? (RExC_parse - parse_start) : 1);
8613 if (!SIZE_ONLY && RExC_extralen)
8614 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
8615 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
8617 RExC_whilem_seen++, RExC_extralen += 3;
8618 RExC_naughty += 4 + RExC_naughty; /* compound interest */
8627 vFAIL("Can't do {n,m} with n > m");
8629 ARG1_SET(ret, (U16)min);
8630 ARG2_SET(ret, (U16)max);
8642 #if 0 /* Now runtime fix should be reliable. */
8644 /* if this is reinstated, don't forget to put this back into perldiag:
8646 =item Regexp *+ operand could be empty at {#} in regex m/%s/
8648 (F) The part of the regexp subject to either the * or + quantifier
8649 could match an empty string. The {#} shows in the regular
8650 expression about where the problem was discovered.
8654 if (!(flags&HASWIDTH) && op != '?')
8655 vFAIL("Regexp *+ operand could be empty");
8658 #ifdef RE_TRACK_PATTERN_OFFSETS
8659 parse_start = RExC_parse;
8661 nextchar(pRExC_state);
8663 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
8665 if (op == '*' && (flags&SIMPLE)) {
8666 reginsert(pRExC_state, STAR, ret, depth+1);
8670 else if (op == '*') {
8674 else if (op == '+' && (flags&SIMPLE)) {
8675 reginsert(pRExC_state, PLUS, ret, depth+1);
8679 else if (op == '+') {
8683 else if (op == '?') {
8688 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8689 ckWARN3reg(RExC_parse,
8690 "%.*s matches null string many times",
8691 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8695 if (RExC_parse < RExC_end && *RExC_parse == '?') {
8696 nextchar(pRExC_state);
8697 reginsert(pRExC_state, MINMOD, ret, depth+1);
8698 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
8700 #ifndef REG_ALLOW_MINMOD_SUSPEND
8703 if (RExC_parse < RExC_end && *RExC_parse == '+') {
8705 nextchar(pRExC_state);
8706 ender = reg_node(pRExC_state, SUCCEED);
8707 REGTAIL(pRExC_state, ret, ender);
8708 reginsert(pRExC_state, SUSPEND, ret, depth+1);
8710 ender = reg_node(pRExC_state, TAIL);
8711 REGTAIL(pRExC_state, ret, ender);
8715 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
8717 vFAIL("Nested quantifiers");
8724 /* reg_namedseq(pRExC_state,UVp, UV depth)
8726 This is expected to be called by a parser routine that has
8727 recognized '\N' and needs to handle the rest. RExC_parse is
8728 expected to point at the first char following the N at the time
8731 The \N may be inside (indicated by valuep not being NULL) or outside a
8734 \N may begin either a named sequence, or if outside a character class, mean
8735 to match a non-newline. For non single-quoted regexes, the tokenizer has
8736 attempted to decide which, and in the case of a named sequence converted it
8737 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8738 where c1... are the characters in the sequence. For single-quoted regexes,
8739 the tokenizer passes the \N sequence through unchanged; this code will not
8740 attempt to determine this nor expand those. The net effect is that if the
8741 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8742 signals that this \N occurrence means to match a non-newline.
8744 Only the \N{U+...} form should occur in a character class, for the same
8745 reason that '.' inside a character class means to just match a period: it
8746 just doesn't make sense.
8748 If valuep is non-null then it is assumed that we are parsing inside
8749 of a charclass definition and the first codepoint in the resolved
8750 string is returned via *valuep and the routine will return NULL.
8751 In this mode if a multichar string is returned from the charnames
8752 handler, a warning will be issued, and only the first char in the
8753 sequence will be examined. If the string returned is zero length
8754 then the value of *valuep is undefined and NON-NULL will
8755 be returned to indicate failure. (This will NOT be a valid pointer
8758 If valuep is null then it is assumed that we are parsing normal text and a
8759 new EXACT node is inserted into the program containing the resolved string,
8760 and a pointer to the new node is returned. But if the string is zero length
8761 a NOTHING node is emitted instead.
8763 On success RExC_parse is set to the char following the endbrace.
8764 Parsing failures will generate a fatal error via vFAIL(...)
8767 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
8769 char * endbrace; /* '}' following the name */
8770 regnode *ret = NULL;
8773 GET_RE_DEBUG_FLAGS_DECL;
8775 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
8779 /* The [^\n] meaning of \N ignores spaces and comments under the /x
8780 * modifier. The other meaning does not */
8781 p = (RExC_flags & RXf_PMf_EXTENDED)
8782 ? regwhite( pRExC_state, RExC_parse )
8785 /* Disambiguate between \N meaning a named character versus \N meaning
8786 * [^\n]. The former is assumed when it can't be the latter. */
8787 if (*p != '{' || regcurly(p)) {
8790 /* no bare \N in a charclass */
8791 vFAIL("\\N in a character class must be a named character: \\N{...}");
8793 nextchar(pRExC_state);
8794 ret = reg_node(pRExC_state, REG_ANY);
8795 *flagp |= HASWIDTH|SIMPLE;
8798 Set_Node_Length(ret, 1); /* MJD */
8802 /* Here, we have decided it should be a named sequence */
8804 /* The test above made sure that the next real character is a '{', but
8805 * under the /x modifier, it could be separated by space (or a comment and
8806 * \n) and this is not allowed (for consistency with \x{...} and the
8807 * tokenizer handling of \N{NAME}). */
8808 if (*RExC_parse != '{') {
8809 vFAIL("Missing braces on \\N{}");
8812 RExC_parse++; /* Skip past the '{' */
8814 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8815 || ! (endbrace == RExC_parse /* nothing between the {} */
8816 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
8817 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8819 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
8820 vFAIL("\\N{NAME} must be resolved by the lexer");
8823 if (endbrace == RExC_parse) { /* empty: \N{} */
8825 RExC_parse = endbrace + 1;
8826 return reg_node(pRExC_state,NOTHING);
8830 ckWARNreg(RExC_parse,
8831 "Ignoring zero length \\N{} in character class"
8833 RExC_parse = endbrace + 1;
8836 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
8839 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
8840 RExC_parse += 2; /* Skip past the 'U+' */
8842 if (valuep) { /* In a bracketed char class */
8843 /* We only pay attention to the first char of
8844 multichar strings being returned. I kinda wonder
8845 if this makes sense as it does change the behaviour
8846 from earlier versions, OTOH that behaviour was broken
8847 as well. XXX Solution is to recharacterize as
8848 [rest-of-class]|multi1|multi2... */
8850 STRLEN length_of_hex;
8851 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8852 | PERL_SCAN_DISALLOW_PREFIX
8853 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8855 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8856 if (endchar < endbrace) {
8857 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8860 length_of_hex = (STRLEN)(endchar - RExC_parse);
8861 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8863 /* The tokenizer should have guaranteed validity, but it's possible to
8864 * bypass it by using single quoting, so check */
8865 if (length_of_hex == 0
8866 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8868 RExC_parse += length_of_hex; /* Includes all the valid */
8869 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
8870 ? UTF8SKIP(RExC_parse)
8872 /* Guard against malformed utf8 */
8873 if (RExC_parse >= endchar) RExC_parse = endchar;
8874 vFAIL("Invalid hexadecimal number in \\N{U+...}");
8877 RExC_parse = endbrace + 1;
8878 if (endchar == endbrace) return NULL;
8880 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
8882 else { /* Not a char class */
8884 /* What is done here is to convert this to a sub-pattern of the form
8885 * (?:\x{char1}\x{char2}...)
8886 * and then call reg recursively. That way, it retains its atomicness,
8887 * while not having to worry about special handling that some code
8888 * points may have. toke.c has converted the original Unicode values
8889 * to native, so that we can just pass on the hex values unchanged. We
8890 * do have to set a flag to keep recoding from happening in the
8893 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8895 char *endchar; /* Points to '.' or '}' ending cur char in the input
8897 char *orig_end = RExC_end;
8899 while (RExC_parse < endbrace) {
8901 /* Code points are separated by dots. If none, there is only one
8902 * code point, and is terminated by the brace */
8903 endchar = RExC_parse + strcspn(RExC_parse, ".}");
8905 /* Convert to notation the rest of the code understands */
8906 sv_catpv(substitute_parse, "\\x{");
8907 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8908 sv_catpv(substitute_parse, "}");
8910 /* Point to the beginning of the next character in the sequence. */
8911 RExC_parse = endchar + 1;
8913 sv_catpv(substitute_parse, ")");
8915 RExC_parse = SvPV(substitute_parse, len);
8917 /* Don't allow empty number */
8919 vFAIL("Invalid hexadecimal number in \\N{U+...}");
8921 RExC_end = RExC_parse + len;
8923 /* The values are Unicode, and therefore not subject to recoding */
8924 RExC_override_recoding = 1;
8926 ret = reg(pRExC_state, 1, flagp, depth+1);
8928 RExC_parse = endbrace;
8929 RExC_end = orig_end;
8930 RExC_override_recoding = 0;
8932 nextchar(pRExC_state);
8942 * It returns the code point in utf8 for the value in *encp.
8943 * value: a code value in the source encoding
8944 * encp: a pointer to an Encode object
8946 * If the result from Encode is not a single character,
8947 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8950 S_reg_recode(pTHX_ const char value, SV **encp)
8953 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
8954 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
8955 const STRLEN newlen = SvCUR(sv);
8956 UV uv = UNICODE_REPLACEMENT;
8958 PERL_ARGS_ASSERT_REG_RECODE;
8962 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8965 if (!newlen || numlen != newlen) {
8966 uv = UNICODE_REPLACEMENT;
8974 - regatom - the lowest level
8976 Try to identify anything special at the start of the pattern. If there
8977 is, then handle it as required. This may involve generating a single regop,
8978 such as for an assertion; or it may involve recursing, such as to
8979 handle a () structure.
8981 If the string doesn't start with something special then we gobble up
8982 as much literal text as we can.
8984 Once we have been able to handle whatever type of thing started the
8985 sequence, we return.
8987 Note: we have to be careful with escapes, as they can be both literal
8988 and special, and in the case of \10 and friends can either, depending
8989 on context. Specifically there are two separate switches for handling
8990 escape sequences, with the one for handling literal escapes requiring
8991 a dummy entry for all of the special escapes that are actually handled
8996 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8999 register regnode *ret = NULL;
9001 char *parse_start = RExC_parse;
9003 GET_RE_DEBUG_FLAGS_DECL;
9004 DEBUG_PARSE("atom");
9005 *flagp = WORST; /* Tentatively. */
9007 PERL_ARGS_ASSERT_REGATOM;
9010 switch ((U8)*RExC_parse) {
9012 RExC_seen_zerolen++;
9013 nextchar(pRExC_state);
9014 if (RExC_flags & RXf_PMf_MULTILINE)
9015 ret = reg_node(pRExC_state, MBOL);
9016 else if (RExC_flags & RXf_PMf_SINGLELINE)
9017 ret = reg_node(pRExC_state, SBOL);
9019 ret = reg_node(pRExC_state, BOL);
9020 Set_Node_Length(ret, 1); /* MJD */
9023 nextchar(pRExC_state);
9025 RExC_seen_zerolen++;
9026 if (RExC_flags & RXf_PMf_MULTILINE)
9027 ret = reg_node(pRExC_state, MEOL);
9028 else if (RExC_flags & RXf_PMf_SINGLELINE)
9029 ret = reg_node(pRExC_state, SEOL);
9031 ret = reg_node(pRExC_state, EOL);
9032 Set_Node_Length(ret, 1); /* MJD */
9035 nextchar(pRExC_state);
9036 if (RExC_flags & RXf_PMf_SINGLELINE)
9037 ret = reg_node(pRExC_state, SANY);
9039 ret = reg_node(pRExC_state, REG_ANY);
9040 *flagp |= HASWIDTH|SIMPLE;
9042 Set_Node_Length(ret, 1); /* MJD */
9046 char * const oregcomp_parse = ++RExC_parse;
9047 ret = regclass(pRExC_state,depth+1);
9048 if (*RExC_parse != ']') {
9049 RExC_parse = oregcomp_parse;
9050 vFAIL("Unmatched [");
9052 nextchar(pRExC_state);
9053 *flagp |= HASWIDTH|SIMPLE;
9054 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
9058 nextchar(pRExC_state);
9059 ret = reg(pRExC_state, 1, &flags,depth+1);
9061 if (flags & TRYAGAIN) {
9062 if (RExC_parse == RExC_end) {
9063 /* Make parent create an empty node if needed. */
9071 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9075 if (flags & TRYAGAIN) {
9079 vFAIL("Internal urp");
9080 /* Supposed to be caught earlier. */
9086 vFAIL("Quantifier follows nothing");
9091 This switch handles escape sequences that resolve to some kind
9092 of special regop and not to literal text. Escape sequnces that
9093 resolve to literal text are handled below in the switch marked
9096 Every entry in this switch *must* have a corresponding entry
9097 in the literal escape switch. However, the opposite is not
9098 required, as the default for this switch is to jump to the
9099 literal text handling code.
9101 switch ((U8)*++RExC_parse) {
9102 /* Special Escapes */
9104 RExC_seen_zerolen++;
9105 ret = reg_node(pRExC_state, SBOL);
9107 goto finish_meta_pat;
9109 ret = reg_node(pRExC_state, GPOS);
9110 RExC_seen |= REG_SEEN_GPOS;
9112 goto finish_meta_pat;
9114 RExC_seen_zerolen++;
9115 ret = reg_node(pRExC_state, KEEPS);
9117 /* XXX:dmq : disabling in-place substitution seems to
9118 * be necessary here to avoid cases of memory corruption, as
9119 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9121 RExC_seen |= REG_SEEN_LOOKBEHIND;
9122 goto finish_meta_pat;
9124 ret = reg_node(pRExC_state, SEOL);
9126 RExC_seen_zerolen++; /* Do not optimize RE away */
9127 goto finish_meta_pat;
9129 ret = reg_node(pRExC_state, EOS);
9131 RExC_seen_zerolen++; /* Do not optimize RE away */
9132 goto finish_meta_pat;
9134 ret = reg_node(pRExC_state, CANY);
9135 RExC_seen |= REG_SEEN_CANY;
9136 *flagp |= HASWIDTH|SIMPLE;
9137 goto finish_meta_pat;
9139 ret = reg_node(pRExC_state, CLUMP);
9141 goto finish_meta_pat;
9143 switch (get_regex_charset(RExC_flags)) {
9144 case REGEX_LOCALE_CHARSET:
9147 case REGEX_UNICODE_CHARSET:
9150 case REGEX_ASCII_RESTRICTED_CHARSET:
9151 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9154 case REGEX_DEPENDS_CHARSET:
9160 ret = reg_node(pRExC_state, op);
9161 *flagp |= HASWIDTH|SIMPLE;
9162 goto finish_meta_pat;
9164 switch (get_regex_charset(RExC_flags)) {
9165 case REGEX_LOCALE_CHARSET:
9168 case REGEX_UNICODE_CHARSET:
9171 case REGEX_ASCII_RESTRICTED_CHARSET:
9172 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9175 case REGEX_DEPENDS_CHARSET:
9181 ret = reg_node(pRExC_state, op);
9182 *flagp |= HASWIDTH|SIMPLE;
9183 goto finish_meta_pat;
9185 RExC_seen_zerolen++;
9186 RExC_seen |= REG_SEEN_LOOKBEHIND;
9187 switch (get_regex_charset(RExC_flags)) {
9188 case REGEX_LOCALE_CHARSET:
9191 case REGEX_UNICODE_CHARSET:
9194 case REGEX_ASCII_RESTRICTED_CHARSET:
9195 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9198 case REGEX_DEPENDS_CHARSET:
9204 ret = reg_node(pRExC_state, op);
9205 FLAGS(ret) = get_regex_charset(RExC_flags);
9207 goto finish_meta_pat;
9209 RExC_seen_zerolen++;
9210 RExC_seen |= REG_SEEN_LOOKBEHIND;
9211 switch (get_regex_charset(RExC_flags)) {
9212 case REGEX_LOCALE_CHARSET:
9215 case REGEX_UNICODE_CHARSET:
9218 case REGEX_ASCII_RESTRICTED_CHARSET:
9219 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9222 case REGEX_DEPENDS_CHARSET:
9228 ret = reg_node(pRExC_state, op);
9229 FLAGS(ret) = get_regex_charset(RExC_flags);
9231 goto finish_meta_pat;
9233 switch (get_regex_charset(RExC_flags)) {
9234 case REGEX_LOCALE_CHARSET:
9237 case REGEX_UNICODE_CHARSET:
9240 case REGEX_ASCII_RESTRICTED_CHARSET:
9241 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9244 case REGEX_DEPENDS_CHARSET:
9250 ret = reg_node(pRExC_state, op);
9251 *flagp |= HASWIDTH|SIMPLE;
9252 goto finish_meta_pat;
9254 switch (get_regex_charset(RExC_flags)) {
9255 case REGEX_LOCALE_CHARSET:
9258 case REGEX_UNICODE_CHARSET:
9261 case REGEX_ASCII_RESTRICTED_CHARSET:
9262 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9265 case REGEX_DEPENDS_CHARSET:
9271 ret = reg_node(pRExC_state, op);
9272 *flagp |= HASWIDTH|SIMPLE;
9273 goto finish_meta_pat;
9275 switch (get_regex_charset(RExC_flags)) {
9276 case REGEX_LOCALE_CHARSET:
9279 case REGEX_ASCII_RESTRICTED_CHARSET:
9280 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9283 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9284 case REGEX_UNICODE_CHARSET:
9290 ret = reg_node(pRExC_state, op);
9291 *flagp |= HASWIDTH|SIMPLE;
9292 goto finish_meta_pat;
9294 switch (get_regex_charset(RExC_flags)) {
9295 case REGEX_LOCALE_CHARSET:
9298 case REGEX_ASCII_RESTRICTED_CHARSET:
9299 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9302 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9303 case REGEX_UNICODE_CHARSET:
9309 ret = reg_node(pRExC_state, op);
9310 *flagp |= HASWIDTH|SIMPLE;
9311 goto finish_meta_pat;
9313 ret = reg_node(pRExC_state, LNBREAK);
9314 *flagp |= HASWIDTH|SIMPLE;
9315 goto finish_meta_pat;
9317 ret = reg_node(pRExC_state, HORIZWS);
9318 *flagp |= HASWIDTH|SIMPLE;
9319 goto finish_meta_pat;
9321 ret = reg_node(pRExC_state, NHORIZWS);
9322 *flagp |= HASWIDTH|SIMPLE;
9323 goto finish_meta_pat;
9325 ret = reg_node(pRExC_state, VERTWS);
9326 *flagp |= HASWIDTH|SIMPLE;
9327 goto finish_meta_pat;
9329 ret = reg_node(pRExC_state, NVERTWS);
9330 *flagp |= HASWIDTH|SIMPLE;
9332 nextchar(pRExC_state);
9333 Set_Node_Length(ret, 2); /* MJD */
9338 char* const oldregxend = RExC_end;
9340 char* parse_start = RExC_parse - 2;
9343 if (RExC_parse[1] == '{') {
9344 /* a lovely hack--pretend we saw [\pX] instead */
9345 RExC_end = strchr(RExC_parse, '}');
9347 const U8 c = (U8)*RExC_parse;
9349 RExC_end = oldregxend;
9350 vFAIL2("Missing right brace on \\%c{}", c);
9355 RExC_end = RExC_parse + 2;
9356 if (RExC_end > oldregxend)
9357 RExC_end = oldregxend;
9361 ret = regclass(pRExC_state,depth+1);
9363 RExC_end = oldregxend;
9366 Set_Node_Offset(ret, parse_start + 2);
9367 Set_Node_Cur_Length(ret);
9368 nextchar(pRExC_state);
9369 *flagp |= HASWIDTH|SIMPLE;
9373 /* Handle \N and \N{NAME} here and not below because it can be
9374 multicharacter. join_exact() will join them up later on.
9375 Also this makes sure that things like /\N{BLAH}+/ and
9376 \N{BLAH} being multi char Just Happen. dmq*/
9378 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9380 case 'k': /* Handle \k<NAME> and \k'NAME' */
9383 char ch= RExC_parse[1];
9384 if (ch != '<' && ch != '\'' && ch != '{') {
9386 vFAIL2("Sequence %.2s... not terminated",parse_start);
9388 /* this pretty much dupes the code for (?P=...) in reg(), if
9389 you change this make sure you change that */
9390 char* name_start = (RExC_parse += 2);
9392 SV *sv_dat = reg_scan_name(pRExC_state,
9393 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9394 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9395 if (RExC_parse == name_start || *RExC_parse != ch)
9396 vFAIL2("Sequence %.3s... not terminated",parse_start);
9399 num = add_data( pRExC_state, 1, "S" );
9400 RExC_rxi->data->data[num]=(void*)sv_dat;
9401 SvREFCNT_inc_simple_void(sv_dat);
9405 ret = reganode(pRExC_state,
9408 : (MORE_ASCII_RESTRICTED)
9410 : (AT_LEAST_UNI_SEMANTICS)
9418 /* override incorrect value set in reganode MJD */
9419 Set_Node_Offset(ret, parse_start+1);
9420 Set_Node_Cur_Length(ret); /* MJD */
9421 nextchar(pRExC_state);
9427 case '1': case '2': case '3': case '4':
9428 case '5': case '6': case '7': case '8': case '9':
9431 bool isg = *RExC_parse == 'g';
9436 if (*RExC_parse == '{') {
9440 if (*RExC_parse == '-') {
9444 if (hasbrace && !isDIGIT(*RExC_parse)) {
9445 if (isrel) RExC_parse--;
9447 goto parse_named_seq;
9449 num = atoi(RExC_parse);
9450 if (isg && num == 0)
9451 vFAIL("Reference to invalid group 0");
9453 num = RExC_npar - num;
9455 vFAIL("Reference to nonexistent or unclosed group");
9457 if (!isg && num > 9 && num >= RExC_npar)
9460 char * const parse_start = RExC_parse - 1; /* MJD */
9461 while (isDIGIT(*RExC_parse))
9463 if (parse_start == RExC_parse - 1)
9464 vFAIL("Unterminated \\g... pattern");
9466 if (*RExC_parse != '}')
9467 vFAIL("Unterminated \\g{...} pattern");
9471 if (num > (I32)RExC_rx->nparens)
9472 vFAIL("Reference to nonexistent group");
9475 ret = reganode(pRExC_state,
9478 : (MORE_ASCII_RESTRICTED)
9480 : (AT_LEAST_UNI_SEMANTICS)
9488 /* override incorrect value set in reganode MJD */
9489 Set_Node_Offset(ret, parse_start+1);
9490 Set_Node_Cur_Length(ret); /* MJD */
9492 nextchar(pRExC_state);
9497 if (RExC_parse >= RExC_end)
9498 FAIL("Trailing \\");
9501 /* Do not generate "unrecognized" warnings here, we fall
9502 back into the quick-grab loop below */
9509 if (RExC_flags & RXf_PMf_EXTENDED) {
9510 if ( reg_skipcomment( pRExC_state ) )
9517 parse_start = RExC_parse - 1;
9522 register STRLEN len;
9527 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
9530 /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
9531 * it is folded to 'ss' even if not utf8 */
9532 bool is_exactfu_sharp_s;
9535 node_type = ((! FOLD) ? EXACT
9538 : (MORE_ASCII_RESTRICTED)
9540 : (AT_LEAST_UNI_SEMANTICS)
9543 ret = reg_node(pRExC_state, node_type);
9546 /* XXX The node can hold up to 255 bytes, yet this only goes to
9547 * 127. I (khw) do not know why. Keeping it somewhat less than
9548 * 255 allows us to not have to worry about overflow due to
9549 * converting to utf8 and fold expansion, but that value is
9550 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
9551 * split up by this limit into a single one using the real max of
9552 * 255. Even at 127, this breaks under rare circumstances. If
9553 * folding, we do not want to split a node at a character that is a
9554 * non-final in a multi-char fold, as an input string could just
9555 * happen to want to match across the node boundary. The join
9556 * would solve that problem if the join actually happens. But a
9557 * series of more than two nodes in a row each of 127 would cause
9558 * the first join to succeed to get to 254, but then there wouldn't
9559 * be room for the next one, which could at be one of those split
9560 * multi-char folds. I don't know of any fool-proof solution. One
9561 * could back off to end with only a code point that isn't such a
9562 * non-final, but it is possible for there not to be any in the
9564 for (len = 0, p = RExC_parse - 1;
9565 len < 127 && p < RExC_end;
9568 char * const oldp = p;
9570 if (RExC_flags & RXf_PMf_EXTENDED)
9571 p = regwhite( pRExC_state, p );
9582 /* Literal Escapes Switch
9584 This switch is meant to handle escape sequences that
9585 resolve to a literal character.
9587 Every escape sequence that represents something
9588 else, like an assertion or a char class, is handled
9589 in the switch marked 'Special Escapes' above in this
9590 routine, but also has an entry here as anything that
9591 isn't explicitly mentioned here will be treated as
9592 an unescaped equivalent literal.
9596 /* These are all the special escapes. */
9597 case 'A': /* Start assertion */
9598 case 'b': case 'B': /* Word-boundary assertion*/
9599 case 'C': /* Single char !DANGEROUS! */
9600 case 'd': case 'D': /* digit class */
9601 case 'g': case 'G': /* generic-backref, pos assertion */
9602 case 'h': case 'H': /* HORIZWS */
9603 case 'k': case 'K': /* named backref, keep marker */
9604 case 'N': /* named char sequence */
9605 case 'p': case 'P': /* Unicode property */
9606 case 'R': /* LNBREAK */
9607 case 's': case 'S': /* space class */
9608 case 'v': case 'V': /* VERTWS */
9609 case 'w': case 'W': /* word class */
9610 case 'X': /* eXtended Unicode "combining character sequence" */
9611 case 'z': case 'Z': /* End of line/string assertion */
9615 /* Anything after here is an escape that resolves to a
9616 literal. (Except digits, which may or may not)
9635 ender = ASCII_TO_NATIVE('\033');
9639 ender = ASCII_TO_NATIVE('\007');
9644 STRLEN brace_len = len;
9646 const char* error_msg;
9648 bool valid = grok_bslash_o(p,
9655 RExC_parse = p; /* going to die anyway; point
9656 to exact spot of failure */
9663 if (PL_encoding && ender < 0x100) {
9664 goto recode_encoding;
9673 char* const e = strchr(p, '}');
9677 vFAIL("Missing right brace on \\x{}");
9680 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9681 | PERL_SCAN_DISALLOW_PREFIX;
9682 STRLEN numlen = e - p - 1;
9683 ender = grok_hex(p + 1, &numlen, &flags, NULL);
9690 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9692 ender = grok_hex(p, &numlen, &flags, NULL);
9695 if (PL_encoding && ender < 0x100)
9696 goto recode_encoding;
9700 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
9702 case '0': case '1': case '2': case '3':case '4':
9703 case '5': case '6': case '7': case '8':case '9':
9705 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
9707 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9709 ender = grok_oct(p, &numlen, &flags, NULL);
9719 if (PL_encoding && ender < 0x100)
9720 goto recode_encoding;
9723 if (! RExC_override_recoding) {
9724 SV* enc = PL_encoding;
9725 ender = reg_recode((const char)(U8)ender, &enc);
9726 if (!enc && SIZE_ONLY)
9727 ckWARNreg(p, "Invalid escape in the specified encoding");
9733 FAIL("Trailing \\");
9736 if (!SIZE_ONLY&& isALPHA(*p)) {
9737 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
9739 goto normal_default;
9743 /* Currently we don't warn when the lbrace is at the start
9744 * of a construct. This catches it in the middle of a
9745 * literal string, or when its the first thing after
9746 * something like "\b" */
9748 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
9750 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
9755 if (UTF8_IS_START(*p) && UTF) {
9757 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9758 &numlen, UTF8_ALLOW_DEFAULT);
9764 } /* End of switch on the literal */
9766 is_exactfu_sharp_s = (node_type == EXACTFU
9767 && ender == LATIN_SMALL_LETTER_SHARP_S);
9768 if ( RExC_flags & RXf_PMf_EXTENDED)
9769 p = regwhite( pRExC_state, p );
9770 if ((UTF && FOLD) || is_exactfu_sharp_s) {
9771 /* Prime the casefolded buffer. Locale rules, which apply
9772 * only to code points < 256, aren't known until execution,
9773 * so for them, just output the original character using
9774 * utf8. If we start to fold non-UTF patterns, be sure to
9775 * update join_exact() */
9776 if (LOC && ender < 256) {
9777 if (UNI_IS_INVARIANT(ender)) {
9778 *tmpbuf = (U8) ender;
9781 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9782 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9786 else if (isASCII(ender)) { /* Note: Here can't also be LOC
9788 ender = toLOWER(ender);
9789 *tmpbuf = (U8) ender;
9792 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9794 /* Locale and /aa require more selectivity about the
9795 * fold, so are handled below. Otherwise, here, just
9797 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9800 /* Under locale rules or /aa we are not to mix,
9801 * respectively, ords < 256 or ASCII with non-. So
9802 * reject folds that mix them, using only the
9803 * non-folded code point. So do the fold to a
9804 * temporary, and inspect each character in it. */
9805 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9807 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9808 U8* e = s + foldlen;
9809 bool fold_ok = TRUE;
9813 || (LOC && (UTF8_IS_INVARIANT(*s)
9814 || UTF8_IS_DOWNGRADEABLE_START(*s))))
9822 Copy(trialbuf, tmpbuf, foldlen, U8);
9826 uvuni_to_utf8(tmpbuf, ender);
9827 foldlen = UNISKIP(ender);
9831 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
9834 else if (UTF || is_exactfu_sharp_s) {
9836 /* Emit all the Unicode characters. */
9838 for (foldbuf = tmpbuf;
9840 foldlen -= numlen) {
9842 /* tmpbuf has been constructed by us, so we
9843 * know it is valid utf8 */
9844 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
9846 const STRLEN unilen = reguni(pRExC_state, ender, s);
9849 /* In EBCDIC the numlen
9850 * and unilen can differ. */
9852 if (numlen >= foldlen)
9856 break; /* "Can't happen." */
9860 const STRLEN unilen = reguni(pRExC_state, ender, s);
9869 REGC((char)ender, s++);
9873 if (UTF || is_exactfu_sharp_s) {
9875 /* Emit all the Unicode characters. */
9877 for (foldbuf = tmpbuf;
9879 foldlen -= numlen) {
9880 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
9882 const STRLEN unilen = reguni(pRExC_state, ender, s);
9885 /* In EBCDIC the numlen
9886 * and unilen can differ. */
9888 if (numlen >= foldlen)
9896 const STRLEN unilen = reguni(pRExC_state, ender, s);
9905 REGC((char)ender, s++);
9908 loopdone: /* Jumped to when encounters something that shouldn't be in
9911 Set_Node_Cur_Length(ret); /* MJD */
9912 nextchar(pRExC_state);
9914 /* len is STRLEN which is unsigned, need to copy to signed */
9917 vFAIL("Internal disaster");
9921 if (len == 1 && UNI_IS_INVARIANT(ender))
9925 RExC_size += STR_SZ(len);
9928 RExC_emit += STR_SZ(len);
9936 /* Jumped to when an unrecognized character set is encountered */
9938 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9943 S_regwhite( RExC_state_t *pRExC_state, char *p )
9945 const char *e = RExC_end;
9947 PERL_ARGS_ASSERT_REGWHITE;
9952 else if (*p == '#') {
9961 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9969 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9970 Character classes ([:foo:]) can also be negated ([:^foo:]).
9971 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9972 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9973 but trigger failures because they are currently unimplemented. */
9975 #define POSIXCC_DONE(c) ((c) == ':')
9976 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9977 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9980 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9983 I32 namedclass = OOB_NAMEDCLASS;
9985 PERL_ARGS_ASSERT_REGPPOSIXCC;
9987 if (value == '[' && RExC_parse + 1 < RExC_end &&
9988 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9989 POSIXCC(UCHARAT(RExC_parse))) {
9990 const char c = UCHARAT(RExC_parse);
9991 char* const s = RExC_parse++;
9993 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9995 if (RExC_parse == RExC_end)
9996 /* Grandfather lone [:, [=, [. */
9999 const char* const t = RExC_parse++; /* skip over the c */
10002 if (UCHARAT(RExC_parse) == ']') {
10003 const char *posixcc = s + 1;
10004 RExC_parse++; /* skip over the ending ] */
10007 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
10008 const I32 skip = t - posixcc;
10010 /* Initially switch on the length of the name. */
10013 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
10014 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
10017 /* Names all of length 5. */
10018 /* alnum alpha ascii blank cntrl digit graph lower
10019 print punct space upper */
10020 /* Offset 4 gives the best switch position. */
10021 switch (posixcc[4]) {
10023 if (memEQ(posixcc, "alph", 4)) /* alpha */
10024 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
10027 if (memEQ(posixcc, "spac", 4)) /* space */
10028 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
10031 if (memEQ(posixcc, "grap", 4)) /* graph */
10032 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
10035 if (memEQ(posixcc, "asci", 4)) /* ascii */
10036 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
10039 if (memEQ(posixcc, "blan", 4)) /* blank */
10040 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
10043 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10044 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
10047 if (memEQ(posixcc, "alnu", 4)) /* alnum */
10048 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
10051 if (memEQ(posixcc, "lowe", 4)) /* lower */
10052 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10053 else if (memEQ(posixcc, "uppe", 4)) /* upper */
10054 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
10057 if (memEQ(posixcc, "digi", 4)) /* digit */
10058 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10059 else if (memEQ(posixcc, "prin", 4)) /* print */
10060 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10061 else if (memEQ(posixcc, "punc", 4)) /* punct */
10062 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
10067 if (memEQ(posixcc, "xdigit", 6))
10068 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
10072 if (namedclass == OOB_NAMEDCLASS)
10073 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10075 assert (posixcc[skip] == ':');
10076 assert (posixcc[skip+1] == ']');
10077 } else if (!SIZE_ONLY) {
10078 /* [[=foo=]] and [[.foo.]] are still future. */
10080 /* adjust RExC_parse so the warning shows after
10081 the class closes */
10082 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
10084 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10087 /* Maternal grandfather:
10088 * "[:" ending in ":" but not in ":]" */
10098 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10102 PERL_ARGS_ASSERT_CHECKPOSIXCC;
10104 if (POSIXCC(UCHARAT(RExC_parse))) {
10105 const char *s = RExC_parse;
10106 const char c = *s++;
10108 while (isALNUM(*s))
10110 if (*s && c == *s && s[1] == ']') {
10112 "POSIX syntax [%c %c] belongs inside character classes",
10115 /* [[=foo=]] and [[.foo.]] are still future. */
10116 if (POSIXCC_NOTYET(c)) {
10117 /* adjust RExC_parse so the error shows after
10118 the class closes */
10119 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10121 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10127 /* Generate the code to add a full posix character <class> to the bracketed
10128 * character class given by <node>. (<node> is needed only under locale rules)
10129 * destlist is the inversion list for non-locale rules that this class is
10131 * sourcelist is the ASCII-range inversion list to add under /a rules
10132 * Xsourcelist is the full Unicode range list to use otherwise. */
10133 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10135 SV* scratch_list = NULL; \
10137 /* Set this class in the node for runtime matching */ \
10138 ANYOF_CLASS_SET(node, class); \
10140 /* For above Latin1 code points, we use the full Unicode range */ \
10141 _invlist_intersection(PL_AboveLatin1, \
10144 /* And set the output to it, adding instead if there already is an \
10145 * output. Checking if <destlist> is NULL first saves an extra \
10146 * clone. Its reference count will be decremented at the next \
10147 * union, etc, or if this is the only instance, at the end of the \
10149 if (! destlist) { \
10150 destlist = scratch_list; \
10153 _invlist_union(destlist, scratch_list, &destlist); \
10154 SvREFCNT_dec(scratch_list); \
10158 /* For non-locale, just add it to any existing list */ \
10159 _invlist_union(destlist, \
10160 (AT_LEAST_ASCII_RESTRICTED) \
10166 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10168 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10170 SV* scratch_list = NULL; \
10171 ANYOF_CLASS_SET(node, class); \
10172 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
10173 if (! destlist) { \
10174 destlist = scratch_list; \
10177 _invlist_union(destlist, scratch_list, &destlist); \
10178 SvREFCNT_dec(scratch_list); \
10182 _invlist_union_complement_2nd(destlist, \
10183 (AT_LEAST_ASCII_RESTRICTED) \
10187 /* Under /d, everything in the upper half of the Latin1 range \
10188 * matches this complement */ \
10189 if (DEPENDS_SEMANTICS) { \
10190 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10194 /* Generate the code to add a posix character <class> to the bracketed
10195 * character class given by <node>. (<node> is needed only under locale rules)
10196 * destlist is the inversion list for non-locale rules that this class is
10198 * sourcelist is the ASCII-range inversion list to add under /a rules
10199 * l1_sourcelist is the Latin1 range list to use otherwise.
10200 * Xpropertyname is the name to add to <run_time_list> of the property to
10201 * specify the code points above Latin1 that will have to be
10202 * determined at run-time
10203 * run_time_list is a SV* that contains text names of properties that are to
10204 * be computed at run time. This concatenates <Xpropertyname>
10205 * to it, apppropriately
10206 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10208 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10209 l1_sourcelist, Xpropertyname, run_time_list) \
10210 /* First, resolve whether to use the ASCII-only list or the L1 \
10212 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
10213 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
10214 Xpropertyname, run_time_list)
10216 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
10217 Xpropertyname, run_time_list) \
10218 /* If not /a matching, there are going to be code points we will have \
10219 * to defer to runtime to look-up */ \
10220 if (! AT_LEAST_ASCII_RESTRICTED) { \
10221 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10224 ANYOF_CLASS_SET(node, class); \
10227 _invlist_union(destlist, sourcelist, &destlist); \
10230 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
10231 * this and DO_N_POSIX */
10232 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10233 l1_sourcelist, Xpropertyname, run_time_list) \
10234 if (AT_LEAST_ASCII_RESTRICTED) { \
10235 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
10238 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10240 ANYOF_CLASS_SET(node, namedclass); \
10243 SV* scratch_list = NULL; \
10244 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
10245 if (! destlist) { \
10246 destlist = scratch_list; \
10249 _invlist_union(destlist, scratch_list, &destlist); \
10250 SvREFCNT_dec(scratch_list); \
10252 if (DEPENDS_SEMANTICS) { \
10253 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10259 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10262 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10263 * Locale folding is done at run-time, so this function should not be
10264 * called for nodes that are for locales.
10266 * This function sets the bit corresponding to the fold of the input
10267 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
10270 * It also knows about the characters that are in the bitmap that have
10271 * folds that are matchable only outside it, and sets the appropriate lists
10274 * It returns the number of bits that actually changed from 0 to 1 */
10279 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10281 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10284 /* It assumes the bit for 'value' has already been set */
10285 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10286 ANYOF_BITMAP_SET(node, fold);
10289 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10290 /* Certain Latin1 characters have matches outside the bitmap. To get
10291 * here, 'value' is one of those characters. None of these matches is
10292 * valid for ASCII characters under /aa, which have been excluded by
10293 * the 'if' above. The matches fall into three categories:
10294 * 1) They are singly folded-to or -from an above 255 character, as
10295 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10297 * 2) They are part of a multi-char fold with another character in the
10298 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10299 * 3) They are part of a multi-char fold with a character not in the
10300 * bitmap, such as various ligatures.
10301 * We aren't dealing fully with multi-char folds, except we do deal
10302 * with the pattern containing a character that has a multi-char fold
10303 * (not so much the inverse).
10304 * For types 1) and 3), the matches only happen when the target string
10305 * is utf8; that's not true for 2), and we set a flag for it.
10307 * The code below adds to the passed in inversion list the single fold
10308 * closures for 'value'. The values are hard-coded here so that an
10309 * innocent-looking character class, like /[ks]/i won't have to go out
10310 * to disk to find the possible matches. XXX It would be better to
10311 * generate these via regen, in case a new version of the Unicode
10312 * standard adds new mappings, though that is not really likely. */
10317 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10321 /* LATIN SMALL LETTER LONG S */
10322 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10325 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10326 GREEK_SMALL_LETTER_MU);
10327 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10328 GREEK_CAPITAL_LETTER_MU);
10330 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10331 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10332 /* ANGSTROM SIGN */
10333 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10334 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
10335 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10336 PL_fold_latin1[value]);
10339 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10340 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10341 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10343 case LATIN_SMALL_LETTER_SHARP_S:
10344 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10345 LATIN_CAPITAL_LETTER_SHARP_S);
10347 /* Under /a, /d, and /u, this can match the two chars "ss" */
10348 if (! MORE_ASCII_RESTRICTED) {
10349 add_alternate(alternate_ptr, (U8 *) "ss", 2);
10351 /* And under /u or /a, it can match even if the target is
10353 if (AT_LEAST_UNI_SEMANTICS) {
10354 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10358 case 'F': case 'f':
10359 case 'I': case 'i':
10360 case 'L': case 'l':
10361 case 'T': case 't':
10362 case 'A': case 'a':
10363 case 'H': case 'h':
10364 case 'J': case 'j':
10365 case 'N': case 'n':
10366 case 'W': case 'w':
10367 case 'Y': case 'y':
10368 /* These all are targets of multi-character folds from code
10369 * points that require UTF8 to express, so they can't match
10370 * unless the target string is in UTF-8, so no action here is
10371 * necessary, as regexec.c properly handles the general case
10372 * for UTF-8 matching */
10375 /* Use deprecated warning to increase the chances of this
10377 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10381 else if (DEPENDS_SEMANTICS
10382 && ! isASCII(value)
10383 && PL_fold_latin1[value] != value)
10385 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10386 * folds only when the target string is in UTF-8. We add the fold
10387 * here to the list of things to match outside the bitmap, which
10388 * won't be looked at unless it is UTF8 (or else if something else
10389 * says to look even if not utf8, but those things better not happen
10390 * under DEPENDS semantics. */
10391 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10398 PERL_STATIC_INLINE U8
10399 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10401 /* This inline function sets a bit in the bitmap if not already set, and if
10402 * appropriate, its fold, returning the number of bits that actually
10403 * changed from 0 to 1 */
10407 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10409 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
10413 ANYOF_BITMAP_SET(node, value);
10416 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
10417 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10424 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10426 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10427 * alternate list, pointed to by 'alternate_ptr'. This is an array of
10428 * the multi-character folds of characters in the node */
10431 PERL_ARGS_ASSERT_ADD_ALTERNATE;
10433 if (! *alternate_ptr) {
10434 *alternate_ptr = newAV();
10436 sv = newSVpvn_utf8((char*)string, len, TRUE);
10437 av_push(*alternate_ptr, sv);
10442 parse a class specification and produce either an ANYOF node that
10443 matches the pattern or perhaps will be optimized into an EXACTish node
10444 instead. The node contains a bit map for the first 256 characters, with the
10445 corresponding bit set if that character is in the list. For characters
10446 above 255, a range list is used */
10449 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10452 register UV nextvalue;
10453 register IV prevvalue = OOB_UNICODE;
10454 register IV range = 0;
10455 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10456 register regnode *ret;
10459 char *rangebegin = NULL;
10460 bool need_class = 0;
10461 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
10463 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10464 than just initialized. */
10465 SV* properties = NULL; /* Code points that match \p{} \P{} */
10466 UV element_count = 0; /* Number of distinct elements in the class.
10467 Optimizations may be possible if this is tiny */
10470 /* Unicode properties are stored in a swash; this holds the current one
10471 * being parsed. If this swash is the only above-latin1 component of the
10472 * character class, an optimization is to pass it directly on to the
10473 * execution engine. Otherwise, it is set to NULL to indicate that there
10474 * are other things in the class that have to be dealt with at execution
10476 SV* swash = NULL; /* Code points that match \p{} \P{} */
10478 /* Set if a component of this character class is user-defined; just passed
10479 * on to the engine */
10480 UV has_user_defined_property = 0;
10482 /* code points this node matches that can't be stored in the bitmap */
10483 SV* nonbitmap = NULL;
10485 /* The items that are to match that aren't stored in the bitmap, but are a
10486 * result of things that are stored there. This is the fold closure of
10487 * such a character, either because it has DEPENDS semantics and shouldn't
10488 * be matched unless the target string is utf8, or is a code point that is
10489 * too large for the bit map, as for example, the fold of the MICRO SIGN is
10490 * above 255. This all is solely for performance reasons. By having this
10491 * code know the outside-the-bitmap folds that the bitmapped characters are
10492 * involved with, we don't have to go out to disk to find the list of
10493 * matches, unless the character class includes code points that aren't
10494 * storable in the bit map. That means that a character class with an 's'
10495 * in it, for example, doesn't need to go out to disk to find everything
10496 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
10497 * empty unless there is something whose fold we don't know about, and will
10498 * have to go out to the disk to find. */
10499 SV* l1_fold_invlist = NULL;
10501 /* List of multi-character folds that are matched by this node */
10502 AV* unicode_alternate = NULL;
10504 UV literal_endpoint = 0;
10506 UV stored = 0; /* how many chars stored in the bitmap */
10508 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
10509 case we need to change the emitted regop to an EXACT. */
10510 const char * orig_parse = RExC_parse;
10511 GET_RE_DEBUG_FLAGS_DECL;
10513 PERL_ARGS_ASSERT_REGCLASS;
10515 PERL_UNUSED_ARG(depth);
10518 DEBUG_PARSE("clas");
10520 /* Assume we are going to generate an ANYOF node. */
10521 ret = reganode(pRExC_state, ANYOF, 0);
10525 ANYOF_FLAGS(ret) = 0;
10528 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
10532 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
10534 /* We have decided to not allow multi-char folds in inverted character
10535 * classes, due to the confusion that can happen, especially with
10536 * classes that are designed for a non-Unicode world: You have the
10537 * peculiar case that:
10538 "s s" =~ /^[^\xDF]+$/i => Y
10539 "ss" =~ /^[^\xDF]+$/i => N
10541 * See [perl #89750] */
10542 allow_full_fold = FALSE;
10546 RExC_size += ANYOF_SKIP;
10547 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10550 RExC_emit += ANYOF_SKIP;
10552 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
10554 ANYOF_BITMAP_ZERO(ret);
10555 listsv = newSVpvs("# comment\n");
10556 initial_listsv_len = SvCUR(listsv);
10559 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10561 if (!SIZE_ONLY && POSIXCC(nextvalue))
10562 checkposixcc(pRExC_state);
10564 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10565 if (UCHARAT(RExC_parse) == ']')
10566 goto charclassloop;
10569 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
10573 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10576 rangebegin = RExC_parse;
10580 value = utf8n_to_uvchr((U8*)RExC_parse,
10581 RExC_end - RExC_parse,
10582 &numlen, UTF8_ALLOW_DEFAULT);
10583 RExC_parse += numlen;
10586 value = UCHARAT(RExC_parse++);
10588 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10589 if (value == '[' && POSIXCC(nextvalue))
10590 namedclass = regpposixcc(pRExC_state, value);
10591 else if (value == '\\') {
10593 value = utf8n_to_uvchr((U8*)RExC_parse,
10594 RExC_end - RExC_parse,
10595 &numlen, UTF8_ALLOW_DEFAULT);
10596 RExC_parse += numlen;
10599 value = UCHARAT(RExC_parse++);
10600 /* Some compilers cannot handle switching on 64-bit integer
10601 * values, therefore value cannot be an UV. Yes, this will
10602 * be a problem later if we want switch on Unicode.
10603 * A similar issue a little bit later when switching on
10604 * namedclass. --jhi */
10605 switch ((I32)value) {
10606 case 'w': namedclass = ANYOF_ALNUM; break;
10607 case 'W': namedclass = ANYOF_NALNUM; break;
10608 case 's': namedclass = ANYOF_SPACE; break;
10609 case 'S': namedclass = ANYOF_NSPACE; break;
10610 case 'd': namedclass = ANYOF_DIGIT; break;
10611 case 'D': namedclass = ANYOF_NDIGIT; break;
10612 case 'v': namedclass = ANYOF_VERTWS; break;
10613 case 'V': namedclass = ANYOF_NVERTWS; break;
10614 case 'h': namedclass = ANYOF_HORIZWS; break;
10615 case 'H': namedclass = ANYOF_NHORIZWS; break;
10616 case 'N': /* Handle \N{NAME} in class */
10618 /* We only pay attention to the first char of
10619 multichar strings being returned. I kinda wonder
10620 if this makes sense as it does change the behaviour
10621 from earlier versions, OTOH that behaviour was broken
10623 UV v; /* value is register so we cant & it /grrr */
10624 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
10634 if (RExC_parse >= RExC_end)
10635 vFAIL2("Empty \\%c{}", (U8)value);
10636 if (*RExC_parse == '{') {
10637 const U8 c = (U8)value;
10638 e = strchr(RExC_parse++, '}');
10640 vFAIL2("Missing right brace on \\%c{}", c);
10641 while (isSPACE(UCHARAT(RExC_parse)))
10643 if (e == RExC_parse)
10644 vFAIL2("Empty \\%c{}", c);
10645 n = e - RExC_parse;
10646 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10657 if (UCHARAT(RExC_parse) == '^') {
10660 value = value == 'p' ? 'P' : 'p'; /* toggle */
10661 while (isSPACE(UCHARAT(RExC_parse))) {
10666 /* Try to get the definition of the property into
10667 * <invlist>. If /i is in effect, the effective property
10668 * will have its name be <__NAME_i>. The design is
10669 * discussed in commit
10670 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10671 Newx(name, n + sizeof("_i__\n"), char);
10673 sprintf(name, "%s%.*s%s\n",
10674 (FOLD) ? "__" : "",
10680 /* Look up the property name, and get its swash and
10681 * inversion list, if the property is found */
10683 SvREFCNT_dec(swash);
10685 swash = _core_swash_init("utf8", name, &PL_sv_undef,
10688 TRUE, /* this routine will handle
10689 undefined properties */
10690 NULL, FALSE /* No inversion list */
10694 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10696 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10698 || ! (invlist = *invlistsvp))
10701 SvREFCNT_dec(swash);
10705 /* Here didn't find it. It could be a user-defined
10706 * property that will be available at run-time. Add it
10707 * to the list to look up then */
10708 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10709 (value == 'p' ? '+' : '!'),
10711 has_user_defined_property = 1;
10713 /* We don't know yet, so have to assume that the
10714 * property could match something in the Latin1 range,
10715 * hence something that isn't utf8 */
10716 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10720 /* Here, did get the swash and its inversion list. If
10721 * the swash is from a user-defined property, then this
10722 * whole character class should be regarded as such */
10723 SV** user_defined_svp =
10724 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10725 "USER_DEFINED", FALSE);
10726 if (user_defined_svp) {
10727 has_user_defined_property
10728 |= SvUV(*user_defined_svp);
10731 /* Invert if asking for the complement */
10732 if (value == 'P') {
10733 _invlist_union_complement_2nd(properties, invlist, &properties);
10735 /* The swash can't be used as-is, because we've
10736 * inverted things; delay removing it to here after
10737 * have copied its invlist above */
10738 SvREFCNT_dec(swash);
10742 _invlist_union(properties, invlist, &properties);
10747 RExC_parse = e + 1;
10748 namedclass = ANYOF_MAX; /* no official name, but it's named */
10750 /* \p means they want Unicode semantics */
10751 RExC_uni_semantics = 1;
10754 case 'n': value = '\n'; break;
10755 case 'r': value = '\r'; break;
10756 case 't': value = '\t'; break;
10757 case 'f': value = '\f'; break;
10758 case 'b': value = '\b'; break;
10759 case 'e': value = ASCII_TO_NATIVE('\033');break;
10760 case 'a': value = ASCII_TO_NATIVE('\007');break;
10762 RExC_parse--; /* function expects to be pointed at the 'o' */
10764 const char* error_msg;
10765 bool valid = grok_bslash_o(RExC_parse,
10770 RExC_parse += numlen;
10775 if (PL_encoding && value < 0x100) {
10776 goto recode_encoding;
10780 if (*RExC_parse == '{') {
10781 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10782 | PERL_SCAN_DISALLOW_PREFIX;
10783 char * const e = strchr(RExC_parse++, '}');
10785 vFAIL("Missing right brace on \\x{}");
10787 numlen = e - RExC_parse;
10788 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10789 RExC_parse = e + 1;
10792 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10794 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10795 RExC_parse += numlen;
10797 if (PL_encoding && value < 0x100)
10798 goto recode_encoding;
10801 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
10803 case '0': case '1': case '2': case '3': case '4':
10804 case '5': case '6': case '7':
10806 /* Take 1-3 octal digits */
10807 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10809 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10810 RExC_parse += numlen;
10811 if (PL_encoding && value < 0x100)
10812 goto recode_encoding;
10816 if (! RExC_override_recoding) {
10817 SV* enc = PL_encoding;
10818 value = reg_recode((const char)(U8)value, &enc);
10819 if (!enc && SIZE_ONLY)
10820 ckWARNreg(RExC_parse,
10821 "Invalid escape in the specified encoding");
10825 /* Allow \_ to not give an error */
10826 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
10827 ckWARN2reg(RExC_parse,
10828 "Unrecognized escape \\%c in character class passed through",
10833 } /* end of \blah */
10836 literal_endpoint++;
10839 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10841 /* What matches in a locale is not known until runtime, so need to
10842 * (one time per class) allocate extra space to pass to regexec.
10843 * The space will contain a bit for each named class that is to be
10844 * matched against. This isn't needed for \p{} and pseudo-classes,
10845 * as they are not affected by locale, and hence are dealt with
10847 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10850 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10853 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10854 ANYOF_CLASS_ZERO(ret);
10856 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
10859 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
10860 * literal, as is the character that began the false range, i.e.
10861 * the 'a' in the examples */
10865 RExC_parse >= rangebegin ?
10866 RExC_parse - rangebegin : 0;
10867 ckWARN4reg(RExC_parse,
10868 "False [] range \"%*.*s\"",
10872 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10873 if (prevvalue < 256) {
10875 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
10878 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
10882 range = 0; /* this was not a true range */
10887 /* Possible truncation here but in some 64-bit environments
10888 * the compiler gets heartburn about switch on 64-bit values.
10889 * A similar issue a little earlier when switching on value.
10891 switch ((I32)namedclass) {
10893 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
10894 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10895 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10897 case ANYOF_NALNUMC:
10898 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10899 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10902 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10903 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10906 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10907 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10911 ANYOF_CLASS_SET(ret, namedclass);
10914 _invlist_union(properties, PL_ASCII, &properties);
10919 ANYOF_CLASS_SET(ret, namedclass);
10922 _invlist_union_complement_2nd(properties,
10923 PL_ASCII, &properties);
10924 if (DEPENDS_SEMANTICS) {
10925 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
10930 DO_POSIX(ret, namedclass, properties,
10931 PL_PosixBlank, PL_XPosixBlank);
10934 DO_N_POSIX(ret, namedclass, properties,
10935 PL_PosixBlank, PL_XPosixBlank);
10938 DO_POSIX(ret, namedclass, properties,
10939 PL_PosixCntrl, PL_XPosixCntrl);
10942 DO_N_POSIX(ret, namedclass, properties,
10943 PL_PosixCntrl, PL_XPosixCntrl);
10946 /* There are no digits in the Latin1 range outside of
10947 * ASCII, so call the macro that doesn't have to resolve
10949 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
10950 PL_PosixDigit, "XPosixDigit", listsv);
10953 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10954 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10957 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10958 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10961 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10962 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10964 case ANYOF_HORIZWS:
10965 /* For these, we use the nonbitmap, as /d doesn't make a
10966 * difference in what these match. There would be problems
10967 * if these characters had folds other than themselves, as
10968 * nonbitmap is subject to folding. It turns out that \h
10969 * is just a synonym for XPosixBlank */
10970 _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
10972 case ANYOF_NHORIZWS:
10973 _invlist_union_complement_2nd(nonbitmap,
10974 PL_XPosixBlank, &nonbitmap);
10978 { /* These require special handling, as they differ under
10979 folding, matching Cased there (which in the ASCII range
10980 is the same as Alpha */
10986 if (FOLD && ! LOC) {
10987 ascii_source = PL_PosixAlpha;
10988 l1_source = PL_L1Cased;
10992 ascii_source = PL_PosixLower;
10993 l1_source = PL_L1PosixLower;
10994 Xname = "XPosixLower";
10996 if (namedclass == ANYOF_LOWER) {
10997 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10998 ascii_source, l1_source, Xname, listsv);
11001 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11002 properties, ascii_source, l1_source, Xname, listsv);
11007 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11008 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11011 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11012 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11015 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11016 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11019 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11020 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11023 DO_POSIX(ret, namedclass, properties,
11024 PL_PosixSpace, PL_XPosixSpace);
11026 case ANYOF_NPSXSPC:
11027 DO_N_POSIX(ret, namedclass, properties,
11028 PL_PosixSpace, PL_XPosixSpace);
11031 DO_POSIX(ret, namedclass, properties,
11032 PL_PerlSpace, PL_XPerlSpace);
11035 DO_N_POSIX(ret, namedclass, properties,
11036 PL_PerlSpace, PL_XPerlSpace);
11038 case ANYOF_UPPER: /* Same as LOWER, above */
11045 if (FOLD && ! LOC) {
11046 ascii_source = PL_PosixAlpha;
11047 l1_source = PL_L1Cased;
11051 ascii_source = PL_PosixUpper;
11052 l1_source = PL_L1PosixUpper;
11053 Xname = "XPosixUpper";
11055 if (namedclass == ANYOF_UPPER) {
11056 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11057 ascii_source, l1_source, Xname, listsv);
11060 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11061 properties, ascii_source, l1_source, Xname, listsv);
11065 case ANYOF_ALNUM: /* Really is 'Word' */
11066 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11067 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11070 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11071 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11074 /* For these, we use the nonbitmap, as /d doesn't make a
11075 * difference in what these match. There would be problems
11076 * if these characters had folds other than themselves, as
11077 * nonbitmap is subject to folding */
11078 _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
11080 case ANYOF_NVERTWS:
11081 _invlist_union_complement_2nd(nonbitmap,
11082 PL_VertSpace, &nonbitmap);
11085 DO_POSIX(ret, namedclass, properties,
11086 PL_PosixXDigit, PL_XPosixXDigit);
11088 case ANYOF_NXDIGIT:
11089 DO_N_POSIX(ret, namedclass, properties,
11090 PL_PosixXDigit, PL_XPosixXDigit);
11093 /* this is to handle \p and \P */
11096 vFAIL("Invalid [::] class");
11102 } /* end of namedclass \blah */
11105 if (prevvalue > (IV)value) /* b-a */ {
11106 const int w = RExC_parse - rangebegin;
11107 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11108 range = 0; /* not a valid range */
11112 prevvalue = value; /* save the beginning of the range */
11113 if (RExC_parse+1 < RExC_end
11114 && *RExC_parse == '-'
11115 && RExC_parse[1] != ']')
11119 /* a bad range like \w-, [:word:]- ? */
11120 if (namedclass > OOB_NAMEDCLASS) {
11121 if (ckWARN(WARN_REGEXP)) {
11123 RExC_parse >= rangebegin ?
11124 RExC_parse - rangebegin : 0;
11126 "False [] range \"%*.*s\"",
11131 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11133 range = 1; /* yeah, it's a range! */
11134 continue; /* but do it the next time */
11138 /* non-Latin1 code point implies unicode semantics. Must be set in
11139 * pass1 so is there for the whole of pass 2 */
11141 RExC_uni_semantics = 1;
11144 /* now is the next time */
11146 if (prevvalue < 256) {
11147 const IV ceilvalue = value < 256 ? value : 255;
11150 /* In EBCDIC [\x89-\x91] should include
11151 * the \x8e but [i-j] should not. */
11152 if (literal_endpoint == 2 &&
11153 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11154 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11156 if (isLOWER(prevvalue)) {
11157 for (i = prevvalue; i <= ceilvalue; i++)
11158 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11160 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11163 for (i = prevvalue; i <= ceilvalue; i++)
11164 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11166 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11172 for (i = prevvalue; i <= ceilvalue; i++) {
11173 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11177 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
11178 const UV natvalue = NATIVE_TO_UNI(value);
11179 nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11182 literal_endpoint = 0;
11186 range = 0; /* this range (if it was one) is done now */
11193 /****** !SIZE_ONLY AFTER HERE *********/
11195 /* If folding and there are code points above 255, we calculate all
11196 * characters that could fold to or from the ones already on the list */
11197 if (FOLD && nonbitmap) {
11198 UV start, end; /* End points of code point ranges */
11200 SV* fold_intersection = NULL;
11202 /* This is a list of all the characters that participate in folds
11203 * (except marks, etc in multi-char folds */
11204 if (! PL_utf8_foldable) {
11205 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11206 PL_utf8_foldable = _swash_to_invlist(swash);
11207 SvREFCNT_dec(swash);
11210 /* This is a hash that for a particular fold gives all characters
11211 * that are involved in it */
11212 if (! PL_utf8_foldclosures) {
11214 /* If we were unable to find any folds, then we likely won't be
11215 * able to find the closures. So just create an empty list.
11216 * Folding will effectively be restricted to the non-Unicode rules
11217 * hard-coded into Perl. (This case happens legitimately during
11218 * compilation of Perl itself before the Unicode tables are
11220 if (invlist_len(PL_utf8_foldable) == 0) {
11221 PL_utf8_foldclosures = newHV();
11223 /* If the folds haven't been read in, call a fold function
11225 if (! PL_utf8_tofold) {
11226 U8 dummy[UTF8_MAXBYTES+1];
11229 /* This particular string is above \xff in both UTF-8 and
11231 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11232 assert(PL_utf8_tofold); /* Verify that worked */
11234 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11238 /* Only the characters in this class that participate in folds need be
11239 * checked. Get the intersection of this class and all the possible
11240 * characters that are foldable. This can quickly narrow down a large
11242 _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11244 /* Now look at the foldable characters in this class individually */
11245 invlist_iterinit(fold_intersection);
11246 while (invlist_iternext(fold_intersection, &start, &end)) {
11249 /* Look at every character in the range */
11250 for (j = start; j <= end; j++) {
11253 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11256 _to_uni_fold_flags(j, foldbuf, &foldlen,
11257 (allow_full_fold) ? FOLD_FLAGS_FULL : 0);
11259 if (foldlen > (STRLEN)UNISKIP(f)) {
11261 /* Any multicharacter foldings (disallowed in lookbehind
11262 * patterns) require the following transform: [ABCDEF] ->
11263 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11264 * folds into "rst", all other characters fold to single
11265 * characters. We save away these multicharacter foldings,
11266 * to be later saved as part of the additional "s" data. */
11267 if (! RExC_in_lookbehind) {
11269 U8* e = foldbuf + foldlen;
11271 /* If any of the folded characters of this are in the
11272 * Latin1 range, tell the regex engine that this can
11273 * match a non-utf8 target string. The only multi-byte
11274 * fold whose source is in the Latin1 range (U+00DF)
11275 * applies only when the target string is utf8, or
11276 * under unicode rules */
11277 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11280 /* Can't mix ascii with non- under /aa */
11281 if (MORE_ASCII_RESTRICTED
11282 && (isASCII(*loc) != isASCII(j)))
11284 goto end_multi_fold;
11286 if (UTF8_IS_INVARIANT(*loc)
11287 || UTF8_IS_DOWNGRADEABLE_START(*loc))
11289 /* Can't mix above and below 256 under LOC
11292 goto end_multi_fold;
11295 |= ANYOF_NONBITMAP_NON_UTF8;
11298 loc += UTF8SKIP(loc);
11302 add_alternate(&unicode_alternate, foldbuf, foldlen);
11306 /* This is special-cased, as it is the only letter which
11307 * has both a multi-fold and single-fold in Latin1. All
11308 * the other chars that have single and multi-folds are
11309 * always in utf8, and the utf8 folding algorithm catches
11311 if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11312 stored += set_regclass_bit(pRExC_state,
11314 LATIN_SMALL_LETTER_SHARP_S,
11315 &l1_fold_invlist, &unicode_alternate);
11319 /* Single character fold. Add everything in its fold
11320 * closure to the list that this node should match */
11323 /* The fold closures data structure is a hash with the keys
11324 * being every character that is folded to, like 'k', and
11325 * the values each an array of everything that folds to its
11326 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
11327 if ((listp = hv_fetch(PL_utf8_foldclosures,
11328 (char *) foldbuf, foldlen, FALSE)))
11330 AV* list = (AV*) *listp;
11332 for (k = 0; k <= av_len(list); k++) {
11333 SV** c_p = av_fetch(list, k, FALSE);
11336 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11340 /* /aa doesn't allow folds between ASCII and non-;
11341 * /l doesn't allow them between above and below
11343 if ((MORE_ASCII_RESTRICTED
11344 && (isASCII(c) != isASCII(j)))
11345 || (LOC && ((c < 256) != (j < 256))))
11350 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11351 stored += set_regclass_bit(pRExC_state,
11354 &l1_fold_invlist, &unicode_alternate);
11356 /* It may be that the code point is already in
11357 * this range or already in the bitmap, in
11358 * which case we need do nothing */
11359 else if ((c < start || c > end)
11361 || ! ANYOF_BITMAP_TEST(ret, c)))
11363 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11370 SvREFCNT_dec(fold_intersection);
11373 /* Combine the two lists into one. */
11374 if (l1_fold_invlist) {
11376 _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11377 SvREFCNT_dec(l1_fold_invlist);
11380 nonbitmap = l1_fold_invlist;
11384 /* And combine the result (if any) with any inversion list from properties.
11385 * The lists are kept separate up to now because we don't want to fold the
11389 _invlist_union(nonbitmap, properties, &nonbitmap);
11390 SvREFCNT_dec(properties);
11393 nonbitmap = properties;
11397 /* Here, <nonbitmap> contains all the code points we can determine at
11398 * compile time that we haven't put into the bitmap. Go through it, and
11399 * for things that belong in the bitmap, put them there, and delete from
11403 /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11404 * possibly only should match when the target string is UTF-8 */
11405 UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11407 /* This gets set if we actually need to modify things */
11408 bool change_invlist = FALSE;
11412 /* Start looking through <nonbitmap> */
11413 invlist_iterinit(nonbitmap);
11414 while (invlist_iternext(nonbitmap, &start, &end)) {
11418 /* Quit if are above what we should change */
11419 if (start > max_cp_to_set) {
11423 change_invlist = TRUE;
11425 /* Set all the bits in the range, up to the max that we are doing */
11426 high = (end < max_cp_to_set) ? end : max_cp_to_set;
11427 for (i = start; i <= (int) high; i++) {
11428 if (! ANYOF_BITMAP_TEST(ret, i)) {
11429 ANYOF_BITMAP_SET(ret, i);
11437 /* Done with loop; remove any code points that are in the bitmap from
11439 if (change_invlist) {
11440 _invlist_subtract(nonbitmap,
11441 (DEPENDS_SEMANTICS)
11447 /* If have completely emptied it, remove it completely */
11448 if (invlist_len(nonbitmap) == 0) {
11449 SvREFCNT_dec(nonbitmap);
11454 /* Here, we have calculated what code points should be in the character
11455 * class. <nonbitmap> does not overlap the bitmap except possibly in the
11456 * case of DEPENDS rules.
11458 * Now we can see about various optimizations. Fold calculation (which we
11459 * did above) needs to take place before inversion. Otherwise /[^k]/i
11460 * would invert to include K, which under /i would match k, which it
11463 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
11464 * set the FOLD flag yet, so this does optimize those. It doesn't
11465 * optimize locale. Doing so perhaps could be done as long as there is
11466 * nothing like \w in it; some thought also would have to be given to the
11467 * interaction with above 0x100 chars */
11468 if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11470 && ! unicode_alternate
11471 /* In case of /d, there are some things that should match only when in
11472 * not in the bitmap, i.e., they require UTF8 to match. These are
11473 * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11474 * case, they don't require UTF8, so can invert here */
11476 || ! DEPENDS_SEMANTICS
11477 || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11478 && SvCUR(listsv) == initial_listsv_len)
11482 for (i = 0; i < 256; ++i) {
11483 if (ANYOF_BITMAP_TEST(ret, i)) {
11484 ANYOF_BITMAP_CLEAR(ret, i);
11487 ANYOF_BITMAP_SET(ret, i);
11492 /* The inversion means that everything above 255 is matched */
11493 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11496 /* Here, also has things outside the bitmap that may overlap with
11497 * the bitmap. We have to sync them up, so that they get inverted
11498 * in both places. Earlier, we removed all overlaps except in the
11499 * case of /d rules, so no syncing is needed except for this case
11501 SV *remove_list = NULL;
11503 if (DEPENDS_SEMANTICS) {
11506 /* Set the bits that correspond to the ones that aren't in the
11507 * bitmap. Otherwise, when we invert, we'll miss these.
11508 * Earlier, we removed from the nonbitmap all code points
11509 * < 128, so there is no extra work here */
11510 invlist_iterinit(nonbitmap);
11511 while (invlist_iternext(nonbitmap, &start, &end)) {
11512 if (start > 255) { /* The bit map goes to 255 */
11518 for (i = start; i <= (int) end; ++i) {
11519 ANYOF_BITMAP_SET(ret, i);
11526 /* Now invert both the bitmap and the nonbitmap. Anything in the
11527 * bitmap has to also be removed from the non-bitmap, but again,
11528 * there should not be overlap unless is /d rules. */
11529 _invlist_invert(nonbitmap);
11531 /* Any swash can't be used as-is, because we've inverted things */
11533 SvREFCNT_dec(swash);
11537 for (i = 0; i < 256; ++i) {
11538 if (ANYOF_BITMAP_TEST(ret, i)) {
11539 ANYOF_BITMAP_CLEAR(ret, i);
11540 if (DEPENDS_SEMANTICS) {
11541 if (! remove_list) {
11542 remove_list = _new_invlist(2);
11544 remove_list = add_cp_to_invlist(remove_list, i);
11548 ANYOF_BITMAP_SET(ret, i);
11554 /* And do the removal */
11555 if (DEPENDS_SEMANTICS) {
11557 _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11558 SvREFCNT_dec(remove_list);
11562 /* There is no overlap for non-/d, so just delete anything
11564 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
11568 stored = 256 - stored;
11570 /* Clear the invert flag since have just done it here */
11571 ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
11574 /* Folding in the bitmap is taken care of above, but not for locale (for
11575 * which we have to wait to see what folding is in effect at runtime), and
11576 * for some things not in the bitmap (only the upper latin folds in this
11577 * case, as all other single-char folding has been set above). Set
11578 * run-time fold flag for these */
11580 || (DEPENDS_SEMANTICS
11582 && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11583 || unicode_alternate))
11585 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
11588 /* A single character class can be "optimized" into an EXACTish node.
11589 * Note that since we don't currently count how many characters there are
11590 * outside the bitmap, we are XXX missing optimization possibilities for
11591 * them. This optimization can't happen unless this is a truly single
11592 * character class, which means that it can't be an inversion into a
11593 * many-character class, and there must be no possibility of there being
11594 * things outside the bitmap. 'stored' (only) for locales doesn't include
11595 * \w, etc, so have to make a special test that they aren't present
11597 * Similarly A 2-character class of the very special form like [bB] can be
11598 * optimized into an EXACTFish node, but only for non-locales, and for
11599 * characters which only have the two folds; so things like 'fF' and 'Ii'
11600 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11603 && ! unicode_alternate
11604 && SvCUR(listsv) == initial_listsv_len
11605 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
11606 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11607 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11608 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11609 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11610 /* If the latest code point has a fold whose
11611 * bit is set, it must be the only other one */
11612 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
11613 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
11615 /* Note that the information needed to decide to do this optimization
11616 * is not currently available until the 2nd pass, and that the actually
11617 * used EXACTish node takes less space than the calculated ANYOF node,
11618 * and hence the amount of space calculated in the first pass is larger
11619 * than actually used, so this optimization doesn't gain us any space.
11620 * But an EXACT node is faster than an ANYOF node, and can be combined
11621 * with any adjacent EXACT nodes later by the optimizer for further
11622 * gains. The speed of executing an EXACTF is similar to an ANYOF
11623 * node, so the optimization advantage comes from the ability to join
11624 * it to adjacent EXACT nodes */
11626 const char * cur_parse= RExC_parse;
11628 RExC_emit = (regnode *)orig_emit;
11629 RExC_parse = (char *)orig_parse;
11633 /* A locale node with one point can be folded; all the other cases
11634 * with folding will have two points, since we calculate them above
11636 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
11643 else { /* else 2 chars in the bit map: the folds of each other */
11645 /* Use the folded value, which for the cases where we get here,
11646 * is just the lower case of the current one (which may resolve to
11647 * itself, or to the other one */
11648 value = toLOWER_LATIN1(value);
11650 /* To join adjacent nodes, they must be the exact EXACTish type.
11651 * Try to use the most likely type, by using EXACTFA if possible,
11652 * then EXACTFU if the regex calls for it, or is required because
11653 * the character is non-ASCII. (If <value> is ASCII, its fold is
11654 * also ASCII for the cases where we get here.) */
11655 if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11658 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
11661 else { /* Otherwise, more likely to be EXACTF type */
11666 ret = reg_node(pRExC_state, op);
11667 RExC_parse = (char *)cur_parse;
11668 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11669 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11670 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11672 RExC_emit += STR_SZ(2);
11675 *STRING(ret)= (char)value;
11677 RExC_emit += STR_SZ(1);
11679 SvREFCNT_dec(listsv);
11683 /* If there is a swash and more than one element, we can't use the swash in
11684 * the optimization below. */
11685 if (swash && element_count > 1) {
11686 SvREFCNT_dec(swash);
11690 && SvCUR(listsv) == initial_listsv_len
11691 && ! unicode_alternate)
11693 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11694 SvREFCNT_dec(listsv);
11695 SvREFCNT_dec(unicode_alternate);
11698 /* av[0] stores the character class description in its textual form:
11699 * used later (regexec.c:Perl_regclass_swash()) to initialize the
11700 * appropriate swash, and is also useful for dumping the regnode.
11701 * av[1] if NULL, is a placeholder to later contain the swash computed
11702 * from av[0]. But if no further computation need be done, the
11703 * swash is stored there now.
11704 * av[2] stores the multicharacter foldings, used later in
11705 * regexec.c:S_reginclass().
11706 * av[3] stores the nonbitmap inversion list for use in addition or
11707 * instead of av[0]; not used if av[1] isn't NULL
11708 * av[4] is set if any component of the class is from a user-defined
11709 * property; not used if av[1] isn't NULL */
11710 AV * const av = newAV();
11713 av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11717 av_store(av, 1, swash);
11718 SvREFCNT_dec(nonbitmap);
11721 av_store(av, 1, NULL);
11723 av_store(av, 3, nonbitmap);
11724 av_store(av, 4, newSVuv(has_user_defined_property));
11728 /* Store any computed multi-char folds only if we are allowing
11730 if (allow_full_fold) {
11731 av_store(av, 2, MUTABLE_SV(unicode_alternate));
11732 if (unicode_alternate) { /* This node is variable length */
11737 av_store(av, 2, NULL);
11739 rv = newRV_noinc(MUTABLE_SV(av));
11740 n = add_data(pRExC_state, 1, "s");
11741 RExC_rxi->data->data[n] = (void*)rv;
11748 /* reg_skipcomment()
11750 Absorbs an /x style # comments from the input stream.
11751 Returns true if there is more text remaining in the stream.
11752 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11753 terminates the pattern without including a newline.
11755 Note its the callers responsibility to ensure that we are
11756 actually in /x mode
11761 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11765 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11767 while (RExC_parse < RExC_end)
11768 if (*RExC_parse++ == '\n') {
11773 /* we ran off the end of the pattern without ending
11774 the comment, so we have to add an \n when wrapping */
11775 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11783 Advances the parse position, and optionally absorbs
11784 "whitespace" from the inputstream.
11786 Without /x "whitespace" means (?#...) style comments only,
11787 with /x this means (?#...) and # comments and whitespace proper.
11789 Returns the RExC_parse point from BEFORE the scan occurs.
11791 This is the /x friendly way of saying RExC_parse++.
11795 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
11797 char* const retval = RExC_parse++;
11799 PERL_ARGS_ASSERT_NEXTCHAR;
11802 if (RExC_end - RExC_parse >= 3
11803 && *RExC_parse == '('
11804 && RExC_parse[1] == '?'
11805 && RExC_parse[2] == '#')
11807 while (*RExC_parse != ')') {
11808 if (RExC_parse == RExC_end)
11809 FAIL("Sequence (?#... not terminated");
11815 if (RExC_flags & RXf_PMf_EXTENDED) {
11816 if (isSPACE(*RExC_parse)) {
11820 else if (*RExC_parse == '#') {
11821 if ( reg_skipcomment( pRExC_state ) )
11830 - reg_node - emit a node
11832 STATIC regnode * /* Location. */
11833 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
11836 register regnode *ptr;
11837 regnode * const ret = RExC_emit;
11838 GET_RE_DEBUG_FLAGS_DECL;
11840 PERL_ARGS_ASSERT_REG_NODE;
11843 SIZE_ALIGN(RExC_size);
11847 if (RExC_emit >= RExC_emit_bound)
11848 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11849 op, RExC_emit, RExC_emit_bound);
11851 NODE_ALIGN_FILL(ret);
11853 FILL_ADVANCE_NODE(ptr, op);
11854 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
11855 #ifdef RE_TRACK_PATTERN_OFFSETS
11856 if (RExC_offsets) { /* MJD */
11857 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
11858 "reg_node", __LINE__,
11860 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11861 ? "Overwriting end of array!\n" : "OK",
11862 (UV)(RExC_emit - RExC_emit_start),
11863 (UV)(RExC_parse - RExC_start),
11864 (UV)RExC_offsets[0]));
11865 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
11873 - reganode - emit a node with an argument
11875 STATIC regnode * /* Location. */
11876 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
11879 register regnode *ptr;
11880 regnode * const ret = RExC_emit;
11881 GET_RE_DEBUG_FLAGS_DECL;
11883 PERL_ARGS_ASSERT_REGANODE;
11886 SIZE_ALIGN(RExC_size);
11891 assert(2==regarglen[op]+1);
11893 Anything larger than this has to allocate the extra amount.
11894 If we changed this to be:
11896 RExC_size += (1 + regarglen[op]);
11898 then it wouldn't matter. Its not clear what side effect
11899 might come from that so its not done so far.
11904 if (RExC_emit >= RExC_emit_bound)
11905 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11906 op, RExC_emit, RExC_emit_bound);
11908 NODE_ALIGN_FILL(ret);
11910 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
11911 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
11912 #ifdef RE_TRACK_PATTERN_OFFSETS
11913 if (RExC_offsets) { /* MJD */
11914 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11918 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
11919 "Overwriting end of array!\n" : "OK",
11920 (UV)(RExC_emit - RExC_emit_start),
11921 (UV)(RExC_parse - RExC_start),
11922 (UV)RExC_offsets[0]));
11923 Set_Cur_Node_Offset;
11931 - reguni - emit (if appropriate) a Unicode character
11934 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
11938 PERL_ARGS_ASSERT_REGUNI;
11940 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
11944 - reginsert - insert an operator in front of already-emitted operand
11946 * Means relocating the operand.
11949 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
11952 register regnode *src;
11953 register regnode *dst;
11954 register regnode *place;
11955 const int offset = regarglen[(U8)op];
11956 const int size = NODE_STEP_REGNODE + offset;
11957 GET_RE_DEBUG_FLAGS_DECL;
11959 PERL_ARGS_ASSERT_REGINSERT;
11960 PERL_UNUSED_ARG(depth);
11961 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
11962 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
11971 if (RExC_open_parens) {
11973 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
11974 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
11975 if ( RExC_open_parens[paren] >= opnd ) {
11976 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
11977 RExC_open_parens[paren] += size;
11979 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
11981 if ( RExC_close_parens[paren] >= opnd ) {
11982 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
11983 RExC_close_parens[paren] += size;
11985 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
11990 while (src > opnd) {
11991 StructCopy(--src, --dst, regnode);
11992 #ifdef RE_TRACK_PATTERN_OFFSETS
11993 if (RExC_offsets) { /* MJD 20010112 */
11994 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
11998 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
11999 ? "Overwriting end of array!\n" : "OK",
12000 (UV)(src - RExC_emit_start),
12001 (UV)(dst - RExC_emit_start),
12002 (UV)RExC_offsets[0]));
12003 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
12004 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
12010 place = opnd; /* Op node, where operand used to be. */
12011 #ifdef RE_TRACK_PATTERN_OFFSETS
12012 if (RExC_offsets) { /* MJD */
12013 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12017 (UV)(place - RExC_emit_start) > RExC_offsets[0]
12018 ? "Overwriting end of array!\n" : "OK",
12019 (UV)(place - RExC_emit_start),
12020 (UV)(RExC_parse - RExC_start),
12021 (UV)RExC_offsets[0]));
12022 Set_Node_Offset(place, RExC_parse);
12023 Set_Node_Length(place, 1);
12026 src = NEXTOPER(place);
12027 FILL_ADVANCE_NODE(place, op);
12028 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
12029 Zero(src, offset, regnode);
12033 - regtail - set the next-pointer at the end of a node chain of p to val.
12034 - SEE ALSO: regtail_study
12036 /* TODO: All three parms should be const */
12038 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12041 register regnode *scan;
12042 GET_RE_DEBUG_FLAGS_DECL;
12044 PERL_ARGS_ASSERT_REGTAIL;
12046 PERL_UNUSED_ARG(depth);
12052 /* Find last node. */
12055 regnode * const temp = regnext(scan);
12057 SV * const mysv=sv_newmortal();
12058 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12059 regprop(RExC_rx, mysv, scan);
12060 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12061 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12062 (temp == NULL ? "->" : ""),
12063 (temp == NULL ? PL_reg_name[OP(val)] : "")
12071 if (reg_off_by_arg[OP(scan)]) {
12072 ARG_SET(scan, val - scan);
12075 NEXT_OFF(scan) = val - scan;
12081 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12082 - Look for optimizable sequences at the same time.
12083 - currently only looks for EXACT chains.
12085 This is experimental code. The idea is to use this routine to perform
12086 in place optimizations on branches and groups as they are constructed,
12087 with the long term intention of removing optimization from study_chunk so
12088 that it is purely analytical.
12090 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12091 to control which is which.
12094 /* TODO: All four parms should be const */
12097 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12100 register regnode *scan;
12102 #ifdef EXPERIMENTAL_INPLACESCAN
12105 GET_RE_DEBUG_FLAGS_DECL;
12107 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12113 /* Find last node. */
12117 regnode * const temp = regnext(scan);
12118 #ifdef EXPERIMENTAL_INPLACESCAN
12119 if (PL_regkind[OP(scan)] == EXACT) {
12120 bool has_exactf_sharp_s; /* Unexamined in this routine */
12121 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12126 switch (OP(scan)) {
12132 case EXACTFU_TRICKYFOLD:
12134 if( exact == PSEUDO )
12136 else if ( exact != OP(scan) )
12145 SV * const mysv=sv_newmortal();
12146 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12147 regprop(RExC_rx, mysv, scan);
12148 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12149 SvPV_nolen_const(mysv),
12150 REG_NODE_NUM(scan),
12151 PL_reg_name[exact]);
12158 SV * const mysv_val=sv_newmortal();
12159 DEBUG_PARSE_MSG("");
12160 regprop(RExC_rx, mysv_val, val);
12161 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12162 SvPV_nolen_const(mysv_val),
12163 (IV)REG_NODE_NUM(val),
12167 if (reg_off_by_arg[OP(scan)]) {
12168 ARG_SET(scan, val - scan);
12171 NEXT_OFF(scan) = val - scan;
12179 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12183 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12189 for (bit=0; bit<32; bit++) {
12190 if (flags & (1<<bit)) {
12191 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
12194 if (!set++ && lead)
12195 PerlIO_printf(Perl_debug_log, "%s",lead);
12196 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12199 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12200 if (!set++ && lead) {
12201 PerlIO_printf(Perl_debug_log, "%s",lead);
12204 case REGEX_UNICODE_CHARSET:
12205 PerlIO_printf(Perl_debug_log, "UNICODE");
12207 case REGEX_LOCALE_CHARSET:
12208 PerlIO_printf(Perl_debug_log, "LOCALE");
12210 case REGEX_ASCII_RESTRICTED_CHARSET:
12211 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12213 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12214 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12217 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12223 PerlIO_printf(Perl_debug_log, "\n");
12225 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12231 Perl_regdump(pTHX_ const regexp *r)
12235 SV * const sv = sv_newmortal();
12236 SV *dsv= sv_newmortal();
12237 RXi_GET_DECL(r,ri);
12238 GET_RE_DEBUG_FLAGS_DECL;
12240 PERL_ARGS_ASSERT_REGDUMP;
12242 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12244 /* Header fields of interest. */
12245 if (r->anchored_substr) {
12246 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12247 RE_SV_DUMPLEN(r->anchored_substr), 30);
12248 PerlIO_printf(Perl_debug_log,
12249 "anchored %s%s at %"IVdf" ",
12250 s, RE_SV_TAIL(r->anchored_substr),
12251 (IV)r->anchored_offset);
12252 } else if (r->anchored_utf8) {
12253 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12254 RE_SV_DUMPLEN(r->anchored_utf8), 30);
12255 PerlIO_printf(Perl_debug_log,
12256 "anchored utf8 %s%s at %"IVdf" ",
12257 s, RE_SV_TAIL(r->anchored_utf8),
12258 (IV)r->anchored_offset);
12260 if (r->float_substr) {
12261 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12262 RE_SV_DUMPLEN(r->float_substr), 30);
12263 PerlIO_printf(Perl_debug_log,
12264 "floating %s%s at %"IVdf"..%"UVuf" ",
12265 s, RE_SV_TAIL(r->float_substr),
12266 (IV)r->float_min_offset, (UV)r->float_max_offset);
12267 } else if (r->float_utf8) {
12268 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12269 RE_SV_DUMPLEN(r->float_utf8), 30);
12270 PerlIO_printf(Perl_debug_log,
12271 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12272 s, RE_SV_TAIL(r->float_utf8),
12273 (IV)r->float_min_offset, (UV)r->float_max_offset);
12275 if (r->check_substr || r->check_utf8)
12276 PerlIO_printf(Perl_debug_log,
12278 (r->check_substr == r->float_substr
12279 && r->check_utf8 == r->float_utf8
12280 ? "(checking floating" : "(checking anchored"));
12281 if (r->extflags & RXf_NOSCAN)
12282 PerlIO_printf(Perl_debug_log, " noscan");
12283 if (r->extflags & RXf_CHECK_ALL)
12284 PerlIO_printf(Perl_debug_log, " isall");
12285 if (r->check_substr || r->check_utf8)
12286 PerlIO_printf(Perl_debug_log, ") ");
12288 if (ri->regstclass) {
12289 regprop(r, sv, ri->regstclass);
12290 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12292 if (r->extflags & RXf_ANCH) {
12293 PerlIO_printf(Perl_debug_log, "anchored");
12294 if (r->extflags & RXf_ANCH_BOL)
12295 PerlIO_printf(Perl_debug_log, "(BOL)");
12296 if (r->extflags & RXf_ANCH_MBOL)
12297 PerlIO_printf(Perl_debug_log, "(MBOL)");
12298 if (r->extflags & RXf_ANCH_SBOL)
12299 PerlIO_printf(Perl_debug_log, "(SBOL)");
12300 if (r->extflags & RXf_ANCH_GPOS)
12301 PerlIO_printf(Perl_debug_log, "(GPOS)");
12302 PerlIO_putc(Perl_debug_log, ' ');
12304 if (r->extflags & RXf_GPOS_SEEN)
12305 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12306 if (r->intflags & PREGf_SKIP)
12307 PerlIO_printf(Perl_debug_log, "plus ");
12308 if (r->intflags & PREGf_IMPLICIT)
12309 PerlIO_printf(Perl_debug_log, "implicit ");
12310 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12311 if (r->extflags & RXf_EVAL_SEEN)
12312 PerlIO_printf(Perl_debug_log, "with eval ");
12313 PerlIO_printf(Perl_debug_log, "\n");
12314 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12316 PERL_ARGS_ASSERT_REGDUMP;
12317 PERL_UNUSED_CONTEXT;
12318 PERL_UNUSED_ARG(r);
12319 #endif /* DEBUGGING */
12323 - regprop - printable representation of opcode
12325 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12328 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12329 if (flags & ANYOF_INVERT) \
12330 /*make sure the invert info is in each */ \
12331 sv_catpvs(sv, "^"); \
12337 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12342 RXi_GET_DECL(prog,progi);
12343 GET_RE_DEBUG_FLAGS_DECL;
12345 PERL_ARGS_ASSERT_REGPROP;
12349 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
12350 /* It would be nice to FAIL() here, but this may be called from
12351 regexec.c, and it would be hard to supply pRExC_state. */
12352 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12353 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12355 k = PL_regkind[OP(o)];
12358 sv_catpvs(sv, " ");
12359 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12360 * is a crude hack but it may be the best for now since
12361 * we have no flag "this EXACTish node was UTF-8"
12363 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12364 PERL_PV_ESCAPE_UNI_DETECT |
12365 PERL_PV_ESCAPE_NONASCII |
12366 PERL_PV_PRETTY_ELLIPSES |
12367 PERL_PV_PRETTY_LTGT |
12368 PERL_PV_PRETTY_NOCLEAR
12370 } else if (k == TRIE) {
12371 /* print the details of the trie in dumpuntil instead, as
12372 * progi->data isn't available here */
12373 const char op = OP(o);
12374 const U32 n = ARG(o);
12375 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12376 (reg_ac_data *)progi->data->data[n] :
12378 const reg_trie_data * const trie
12379 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12381 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12382 DEBUG_TRIE_COMPILE_r(
12383 Perl_sv_catpvf(aTHX_ sv,
12384 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12385 (UV)trie->startstate,
12386 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12387 (UV)trie->wordcount,
12390 (UV)TRIE_CHARCOUNT(trie),
12391 (UV)trie->uniquecharcount
12394 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12396 int rangestart = -1;
12397 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12398 sv_catpvs(sv, "[");
12399 for (i = 0; i <= 256; i++) {
12400 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12401 if (rangestart == -1)
12403 } else if (rangestart != -1) {
12404 if (i <= rangestart + 3)
12405 for (; rangestart < i; rangestart++)
12406 put_byte(sv, rangestart);
12408 put_byte(sv, rangestart);
12409 sv_catpvs(sv, "-");
12410 put_byte(sv, i - 1);
12415 sv_catpvs(sv, "]");
12418 } else if (k == CURLY) {
12419 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12420 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12421 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12423 else if (k == WHILEM && o->flags) /* Ordinal/of */
12424 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12425 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12426 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
12427 if ( RXp_PAREN_NAMES(prog) ) {
12428 if ( k != REF || (OP(o) < NREF)) {
12429 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12430 SV **name= av_fetch(list, ARG(o), 0 );
12432 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12435 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12436 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12437 I32 *nums=(I32*)SvPVX(sv_dat);
12438 SV **name= av_fetch(list, nums[0], 0 );
12441 for ( n=0; n<SvIVX(sv_dat); n++ ) {
12442 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12443 (n ? "," : ""), (IV)nums[n]);
12445 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12449 } else if (k == GOSUB)
12450 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12451 else if (k == VERB) {
12453 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12454 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12455 } else if (k == LOGICAL)
12456 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
12457 else if (k == ANYOF) {
12458 int i, rangestart = -1;
12459 const U8 flags = ANYOF_FLAGS(o);
12462 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12463 static const char * const anyofs[] = {
12496 if (flags & ANYOF_LOCALE)
12497 sv_catpvs(sv, "{loc}");
12498 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
12499 sv_catpvs(sv, "{i}");
12500 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
12501 if (flags & ANYOF_INVERT)
12502 sv_catpvs(sv, "^");
12504 /* output what the standard cp 0-255 bitmap matches */
12505 for (i = 0; i <= 256; i++) {
12506 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12507 if (rangestart == -1)
12509 } else if (rangestart != -1) {
12510 if (i <= rangestart + 3)
12511 for (; rangestart < i; rangestart++)
12512 put_byte(sv, rangestart);
12514 put_byte(sv, rangestart);
12515 sv_catpvs(sv, "-");
12516 put_byte(sv, i - 1);
12523 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12524 /* output any special charclass tests (used entirely under use locale) */
12525 if (ANYOF_CLASS_TEST_ANY_SET(o))
12526 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
12527 if (ANYOF_CLASS_TEST(o,i)) {
12528 sv_catpv(sv, anyofs[i]);
12532 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12534 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12535 sv_catpvs(sv, "{non-utf8-latin1-all}");
12538 /* output information about the unicode matching */
12539 if (flags & ANYOF_UNICODE_ALL)
12540 sv_catpvs(sv, "{unicode_all}");
12541 else if (ANYOF_NONBITMAP(o))
12542 sv_catpvs(sv, "{unicode}");
12543 if (flags & ANYOF_NONBITMAP_NON_UTF8)
12544 sv_catpvs(sv, "{outside bitmap}");
12546 if (ANYOF_NONBITMAP(o)) {
12547 SV *lv; /* Set if there is something outside the bit map */
12548 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
12549 bool byte_output = FALSE; /* If something in the bitmap has been
12552 if (lv && lv != &PL_sv_undef) {
12554 U8 s[UTF8_MAXBYTES_CASE+1];
12556 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
12557 uvchr_to_utf8(s, i);
12560 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
12564 && swash_fetch(sw, s, TRUE))
12566 if (rangestart == -1)
12568 } else if (rangestart != -1) {
12569 byte_output = TRUE;
12570 if (i <= rangestart + 3)
12571 for (; rangestart < i; rangestart++) {
12572 put_byte(sv, rangestart);
12575 put_byte(sv, rangestart);
12576 sv_catpvs(sv, "-");
12585 char *s = savesvpv(lv);
12586 char * const origs = s;
12588 while (*s && *s != '\n')
12592 const char * const t = ++s;
12595 sv_catpvs(sv, " ");
12601 /* Truncate very long output */
12602 if (s - origs > 256) {
12603 Perl_sv_catpvf(aTHX_ sv,
12605 (int) (s - origs - 1),
12611 else if (*s == '\t') {
12630 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12632 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
12633 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
12635 PERL_UNUSED_CONTEXT;
12636 PERL_UNUSED_ARG(sv);
12637 PERL_UNUSED_ARG(o);
12638 PERL_UNUSED_ARG(prog);
12639 #endif /* DEBUGGING */
12643 Perl_re_intuit_string(pTHX_ REGEXP * const r)
12644 { /* Assume that RE_INTUIT is set */
12646 struct regexp *const prog = (struct regexp *)SvANY(r);
12647 GET_RE_DEBUG_FLAGS_DECL;
12649 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
12650 PERL_UNUSED_CONTEXT;
12654 const char * const s = SvPV_nolen_const(prog->check_substr
12655 ? prog->check_substr : prog->check_utf8);
12657 if (!PL_colorset) reginitcolors();
12658 PerlIO_printf(Perl_debug_log,
12659 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
12661 prog->check_substr ? "" : "utf8 ",
12662 PL_colors[5],PL_colors[0],
12665 (strlen(s) > 60 ? "..." : ""));
12668 return prog->check_substr ? prog->check_substr : prog->check_utf8;
12674 handles refcounting and freeing the perl core regexp structure. When
12675 it is necessary to actually free the structure the first thing it
12676 does is call the 'free' method of the regexp_engine associated to
12677 the regexp, allowing the handling of the void *pprivate; member
12678 first. (This routine is not overridable by extensions, which is why
12679 the extensions free is called first.)
12681 See regdupe and regdupe_internal if you change anything here.
12683 #ifndef PERL_IN_XSUB_RE
12685 Perl_pregfree(pTHX_ REGEXP *r)
12691 Perl_pregfree2(pTHX_ REGEXP *rx)
12694 struct regexp *const r = (struct regexp *)SvANY(rx);
12695 GET_RE_DEBUG_FLAGS_DECL;
12697 PERL_ARGS_ASSERT_PREGFREE2;
12699 if (r->mother_re) {
12700 ReREFCNT_dec(r->mother_re);
12702 CALLREGFREE_PVT(rx); /* free the private data */
12703 SvREFCNT_dec(RXp_PAREN_NAMES(r));
12706 SvREFCNT_dec(r->anchored_substr);
12707 SvREFCNT_dec(r->anchored_utf8);
12708 SvREFCNT_dec(r->float_substr);
12709 SvREFCNT_dec(r->float_utf8);
12710 Safefree(r->substrs);
12712 RX_MATCH_COPY_FREE(rx);
12713 #ifdef PERL_OLD_COPY_ON_WRITE
12714 SvREFCNT_dec(r->saved_copy);
12721 This is a hacky workaround to the structural issue of match results
12722 being stored in the regexp structure which is in turn stored in
12723 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12724 could be PL_curpm in multiple contexts, and could require multiple
12725 result sets being associated with the pattern simultaneously, such
12726 as when doing a recursive match with (??{$qr})
12728 The solution is to make a lightweight copy of the regexp structure
12729 when a qr// is returned from the code executed by (??{$qr}) this
12730 lightweight copy doesn't actually own any of its data except for
12731 the starp/end and the actual regexp structure itself.
12737 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
12739 struct regexp *ret;
12740 struct regexp *const r = (struct regexp *)SvANY(rx);
12741 register const I32 npar = r->nparens+1;
12743 PERL_ARGS_ASSERT_REG_TEMP_COPY;
12746 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12747 ret = (struct regexp *)SvANY(ret_x);
12749 (void)ReREFCNT_inc(rx);
12750 /* We can take advantage of the existing "copied buffer" mechanism in SVs
12751 by pointing directly at the buffer, but flagging that the allocated
12752 space in the copy is zero. As we've just done a struct copy, it's now
12753 a case of zero-ing that, rather than copying the current length. */
12754 SvPV_set(ret_x, RX_WRAPPED(rx));
12755 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
12756 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12757 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
12758 SvLEN_set(ret_x, 0);
12759 SvSTASH_set(ret_x, NULL);
12760 SvMAGIC_set(ret_x, NULL);
12761 Newx(ret->offs, npar, regexp_paren_pair);
12762 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12764 Newx(ret->substrs, 1, struct reg_substr_data);
12765 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12767 SvREFCNT_inc_void(ret->anchored_substr);
12768 SvREFCNT_inc_void(ret->anchored_utf8);
12769 SvREFCNT_inc_void(ret->float_substr);
12770 SvREFCNT_inc_void(ret->float_utf8);
12772 /* check_substr and check_utf8, if non-NULL, point to either their
12773 anchored or float namesakes, and don't hold a second reference. */
12775 RX_MATCH_COPIED_off(ret_x);
12776 #ifdef PERL_OLD_COPY_ON_WRITE
12777 ret->saved_copy = NULL;
12779 ret->mother_re = rx;
12785 /* regfree_internal()
12787 Free the private data in a regexp. This is overloadable by
12788 extensions. Perl takes care of the regexp structure in pregfree(),
12789 this covers the *pprivate pointer which technically perl doesn't
12790 know about, however of course we have to handle the
12791 regexp_internal structure when no extension is in use.
12793 Note this is called before freeing anything in the regexp
12798 Perl_regfree_internal(pTHX_ REGEXP * const rx)
12801 struct regexp *const r = (struct regexp *)SvANY(rx);
12802 RXi_GET_DECL(r,ri);
12803 GET_RE_DEBUG_FLAGS_DECL;
12805 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12811 SV *dsv= sv_newmortal();
12812 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
12813 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
12814 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12815 PL_colors[4],PL_colors[5],s);
12818 #ifdef RE_TRACK_PATTERN_OFFSETS
12820 Safefree(ri->u.offsets); /* 20010421 MJD */
12823 int n = ri->data->count;
12824 PAD* new_comppad = NULL;
12829 /* If you add a ->what type here, update the comment in regcomp.h */
12830 switch (ri->data->what[n]) {
12835 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
12838 Safefree(ri->data->data[n]);
12841 new_comppad = MUTABLE_AV(ri->data->data[n]);
12844 if (new_comppad == NULL)
12845 Perl_croak(aTHX_ "panic: pregfree comppad");
12846 PAD_SAVE_LOCAL(old_comppad,
12847 /* Watch out for global destruction's random ordering. */
12848 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
12851 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
12854 op_free((OP_4tree*)ri->data->data[n]);
12856 PAD_RESTORE_LOCAL(old_comppad);
12857 SvREFCNT_dec(MUTABLE_SV(new_comppad));
12858 new_comppad = NULL;
12863 { /* Aho Corasick add-on structure for a trie node.
12864 Used in stclass optimization only */
12866 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
12868 refcount = --aho->refcount;
12871 PerlMemShared_free(aho->states);
12872 PerlMemShared_free(aho->fail);
12873 /* do this last!!!! */
12874 PerlMemShared_free(ri->data->data[n]);
12875 PerlMemShared_free(ri->regstclass);
12881 /* trie structure. */
12883 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
12885 refcount = --trie->refcount;
12888 PerlMemShared_free(trie->charmap);
12889 PerlMemShared_free(trie->states);
12890 PerlMemShared_free(trie->trans);
12892 PerlMemShared_free(trie->bitmap);
12894 PerlMemShared_free(trie->jump);
12895 PerlMemShared_free(trie->wordinfo);
12896 /* do this last!!!! */
12897 PerlMemShared_free(ri->data->data[n]);
12902 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
12905 Safefree(ri->data->what);
12906 Safefree(ri->data);
12912 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
12913 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
12914 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
12917 re_dup - duplicate a regexp.
12919 This routine is expected to clone a given regexp structure. It is only
12920 compiled under USE_ITHREADS.
12922 After all of the core data stored in struct regexp is duplicated
12923 the regexp_engine.dupe method is used to copy any private data
12924 stored in the *pprivate pointer. This allows extensions to handle
12925 any duplication it needs to do.
12927 See pregfree() and regfree_internal() if you change anything here.
12929 #if defined(USE_ITHREADS)
12930 #ifndef PERL_IN_XSUB_RE
12932 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
12936 const struct regexp *r = (const struct regexp *)SvANY(sstr);
12937 struct regexp *ret = (struct regexp *)SvANY(dstr);
12939 PERL_ARGS_ASSERT_RE_DUP_GUTS;
12941 npar = r->nparens+1;
12942 Newx(ret->offs, npar, regexp_paren_pair);
12943 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12945 /* no need to copy these */
12946 Newx(ret->swap, npar, regexp_paren_pair);
12949 if (ret->substrs) {
12950 /* Do it this way to avoid reading from *r after the StructCopy().
12951 That way, if any of the sv_dup_inc()s dislodge *r from the L1
12952 cache, it doesn't matter. */
12953 const bool anchored = r->check_substr
12954 ? r->check_substr == r->anchored_substr
12955 : r->check_utf8 == r->anchored_utf8;
12956 Newx(ret->substrs, 1, struct reg_substr_data);
12957 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12959 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12960 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12961 ret->float_substr = sv_dup_inc(ret->float_substr, param);
12962 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
12964 /* check_substr and check_utf8, if non-NULL, point to either their
12965 anchored or float namesakes, and don't hold a second reference. */
12967 if (ret->check_substr) {
12969 assert(r->check_utf8 == r->anchored_utf8);
12970 ret->check_substr = ret->anchored_substr;
12971 ret->check_utf8 = ret->anchored_utf8;
12973 assert(r->check_substr == r->float_substr);
12974 assert(r->check_utf8 == r->float_utf8);
12975 ret->check_substr = ret->float_substr;
12976 ret->check_utf8 = ret->float_utf8;
12978 } else if (ret->check_utf8) {
12980 ret->check_utf8 = ret->anchored_utf8;
12982 ret->check_utf8 = ret->float_utf8;
12987 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
12990 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
12992 if (RX_MATCH_COPIED(dstr))
12993 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
12995 ret->subbeg = NULL;
12996 #ifdef PERL_OLD_COPY_ON_WRITE
12997 ret->saved_copy = NULL;
13000 if (ret->mother_re) {
13001 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
13002 /* Our storage points directly to our mother regexp, but that's
13003 1: a buffer in a different thread
13004 2: something we no longer hold a reference on
13005 so we need to copy it locally. */
13006 /* Note we need to use SvCUR(), rather than
13007 SvLEN(), on our mother_re, because it, in
13008 turn, may well be pointing to its own mother_re. */
13009 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
13010 SvCUR(ret->mother_re)+1));
13011 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
13013 ret->mother_re = NULL;
13017 #endif /* PERL_IN_XSUB_RE */
13022 This is the internal complement to regdupe() which is used to copy
13023 the structure pointed to by the *pprivate pointer in the regexp.
13024 This is the core version of the extension overridable cloning hook.
13025 The regexp structure being duplicated will be copied by perl prior
13026 to this and will be provided as the regexp *r argument, however
13027 with the /old/ structures pprivate pointer value. Thus this routine
13028 may override any copying normally done by perl.
13030 It returns a pointer to the new regexp_internal structure.
13034 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13037 struct regexp *const r = (struct regexp *)SvANY(rx);
13038 regexp_internal *reti;
13040 RXi_GET_DECL(r,ri);
13042 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13046 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
13047 Copy(ri->program, reti->program, len+1, regnode);
13050 reti->regstclass = NULL;
13053 struct reg_data *d;
13054 const int count = ri->data->count;
13057 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13058 char, struct reg_data);
13059 Newx(d->what, count, U8);
13062 for (i = 0; i < count; i++) {
13063 d->what[i] = ri->data->what[i];
13064 switch (d->what[i]) {
13065 /* legal options are one of: sSfpontTua
13066 see also regcomp.h and pregfree() */
13067 case 'a': /* actually an AV, but the dup function is identical. */
13070 case 'p': /* actually an AV, but the dup function is identical. */
13071 case 'u': /* actually an HV, but the dup function is identical. */
13072 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13075 /* This is cheating. */
13076 Newx(d->data[i], 1, struct regnode_charclass_class);
13077 StructCopy(ri->data->data[i], d->data[i],
13078 struct regnode_charclass_class);
13079 reti->regstclass = (regnode*)d->data[i];
13082 /* Compiled op trees are readonly and in shared memory,
13083 and can thus be shared without duplication. */
13085 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
13089 /* Trie stclasses are readonly and can thus be shared
13090 * without duplication. We free the stclass in pregfree
13091 * when the corresponding reg_ac_data struct is freed.
13093 reti->regstclass= ri->regstclass;
13097 ((reg_trie_data*)ri->data->data[i])->refcount++;
13101 d->data[i] = ri->data->data[i];
13104 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13113 reti->name_list_idx = ri->name_list_idx;
13115 #ifdef RE_TRACK_PATTERN_OFFSETS
13116 if (ri->u.offsets) {
13117 Newx(reti->u.offsets, 2*len+1, U32);
13118 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13121 SetProgLen(reti,len);
13124 return (void*)reti;
13127 #endif /* USE_ITHREADS */
13129 #ifndef PERL_IN_XSUB_RE
13132 - regnext - dig the "next" pointer out of a node
13135 Perl_regnext(pTHX_ register regnode *p)
13138 register I32 offset;
13143 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
13144 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13147 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13156 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13159 STRLEN l1 = strlen(pat1);
13160 STRLEN l2 = strlen(pat2);
13163 const char *message;
13165 PERL_ARGS_ASSERT_RE_CROAK2;
13171 Copy(pat1, buf, l1 , char);
13172 Copy(pat2, buf + l1, l2 , char);
13173 buf[l1 + l2] = '\n';
13174 buf[l1 + l2 + 1] = '\0';
13176 /* ANSI variant takes additional second argument */
13177 va_start(args, pat2);
13181 msv = vmess(buf, &args);
13183 message = SvPV_const(msv,l1);
13186 Copy(message, buf, l1 , char);
13187 buf[l1-1] = '\0'; /* Overwrite \n */
13188 Perl_croak(aTHX_ "%s", buf);
13191 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13193 #ifndef PERL_IN_XSUB_RE
13195 Perl_save_re_context(pTHX)
13199 struct re_save_state *state;
13201 SAVEVPTR(PL_curcop);
13202 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13204 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13205 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13206 SSPUSHUV(SAVEt_RE_STATE);
13208 Copy(&PL_reg_state, state, 1, struct re_save_state);
13210 PL_reg_start_tmp = 0;
13211 PL_reg_start_tmpl = 0;
13212 PL_reg_oldsaved = NULL;
13213 PL_reg_oldsavedlen = 0;
13214 PL_reg_maxiter = 0;
13215 PL_reg_leftiter = 0;
13216 PL_reg_poscache = NULL;
13217 PL_reg_poscache_size = 0;
13218 #ifdef PERL_OLD_COPY_ON_WRITE
13222 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13224 const REGEXP * const rx = PM_GETRE(PL_curpm);
13227 for (i = 1; i <= RX_NPARENS(rx); i++) {
13228 char digits[TYPE_CHARS(long)];
13229 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13230 GV *const *const gvp
13231 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13234 GV * const gv = *gvp;
13235 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13245 clear_re(pTHX_ void *r)
13248 ReREFCNT_dec((REGEXP *)r);
13254 S_put_byte(pTHX_ SV *sv, int c)
13256 PERL_ARGS_ASSERT_PUT_BYTE;
13258 /* Our definition of isPRINT() ignores locales, so only bytes that are
13259 not part of UTF-8 are considered printable. I assume that the same
13260 holds for UTF-EBCDIC.
13261 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13262 which Wikipedia says:
13264 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13265 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13266 identical, to the ASCII delete (DEL) or rubout control character.
13267 ) So the old condition can be simplified to !isPRINT(c) */
13270 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13273 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13277 const char string = c;
13278 if (c == '-' || c == ']' || c == '\\' || c == '^')
13279 sv_catpvs(sv, "\\");
13280 sv_catpvn(sv, &string, 1);
13285 #define CLEAR_OPTSTART \
13286 if (optstart) STMT_START { \
13287 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
13291 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
13293 STATIC const regnode *
13294 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13295 const regnode *last, const regnode *plast,
13296 SV* sv, I32 indent, U32 depth)
13299 register U8 op = PSEUDO; /* Arbitrary non-END op. */
13300 register const regnode *next;
13301 const regnode *optstart= NULL;
13303 RXi_GET_DECL(r,ri);
13304 GET_RE_DEBUG_FLAGS_DECL;
13306 PERL_ARGS_ASSERT_DUMPUNTIL;
13308 #ifdef DEBUG_DUMPUNTIL
13309 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13310 last ? last-start : 0,plast ? plast-start : 0);
13313 if (plast && plast < last)
13316 while (PL_regkind[op] != END && (!last || node < last)) {
13317 /* While that wasn't END last time... */
13320 if (op == CLOSE || op == WHILEM)
13322 next = regnext((regnode *)node);
13325 if (OP(node) == OPTIMIZED) {
13326 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13333 regprop(r, sv, node);
13334 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13335 (int)(2*indent + 1), "", SvPVX_const(sv));
13337 if (OP(node) != OPTIMIZED) {
13338 if (next == NULL) /* Next ptr. */
13339 PerlIO_printf(Perl_debug_log, " (0)");
13340 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13341 PerlIO_printf(Perl_debug_log, " (FAIL)");
13343 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13344 (void)PerlIO_putc(Perl_debug_log, '\n');
13348 if (PL_regkind[(U8)op] == BRANCHJ) {
13351 register const regnode *nnode = (OP(next) == LONGJMP
13352 ? regnext((regnode *)next)
13354 if (last && nnode > last)
13356 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13359 else if (PL_regkind[(U8)op] == BRANCH) {
13361 DUMPUNTIL(NEXTOPER(node), next);
13363 else if ( PL_regkind[(U8)op] == TRIE ) {
13364 const regnode *this_trie = node;
13365 const char op = OP(node);
13366 const U32 n = ARG(node);
13367 const reg_ac_data * const ac = op>=AHOCORASICK ?
13368 (reg_ac_data *)ri->data->data[n] :
13370 const reg_trie_data * const trie =
13371 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13373 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13375 const regnode *nextbranch= NULL;
13378 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13379 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13381 PerlIO_printf(Perl_debug_log, "%*s%s ",
13382 (int)(2*(indent+3)), "",
13383 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13384 PL_colors[0], PL_colors[1],
13385 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13386 PERL_PV_PRETTY_ELLIPSES |
13387 PERL_PV_PRETTY_LTGT
13392 U16 dist= trie->jump[word_idx+1];
13393 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13394 (UV)((dist ? this_trie + dist : next) - start));
13397 nextbranch= this_trie + trie->jump[0];
13398 DUMPUNTIL(this_trie + dist, nextbranch);
13400 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13401 nextbranch= regnext((regnode *)nextbranch);
13403 PerlIO_printf(Perl_debug_log, "\n");
13406 if (last && next > last)
13411 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
13412 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13413 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13415 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13417 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13419 else if ( op == PLUS || op == STAR) {
13420 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13422 else if (PL_regkind[(U8)op] == ANYOF) {
13423 /* arglen 1 + class block */
13424 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13425 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13426 node = NEXTOPER(node);
13428 else if (PL_regkind[(U8)op] == EXACT) {
13429 /* Literal string, where present. */
13430 node += NODE_SZ_STR(node) - 1;
13431 node = NEXTOPER(node);
13434 node = NEXTOPER(node);
13435 node += regarglen[(U8)op];
13437 if (op == CURLYX || op == OPEN)
13441 #ifdef DEBUG_DUMPUNTIL
13442 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13447 #endif /* DEBUGGING */
13451 * c-indentation-style: bsd
13452 * c-basic-offset: 4
13453 * indent-tabs-mode: t
13456 * ex: set ts=8 sts=4 sw=4 noet: