5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
75 #undef PERL_IN_XSUB_RE
76 #define PERL_IN_XSUB_RE 1
78 #undef PERL_IN_XSUB_RE
80 #ifndef PERL_IN_XSUB_RE
85 #ifdef PERL_IN_XSUB_RE
91 #include "dquote_static.c"
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U32 flags; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 REGEXP *rx_sv; /* The SV that is the regexp. */
114 regexp *rx; /* perl core regexp structure */
115 regexp_internal *rxi; /* internal data for regexp object pprivate field */
116 char *start; /* Start of input for compile */
117 char *end; /* End of input for compile */
118 char *parse; /* Input-scan pointer. */
119 I32 whilem_seen; /* number of WHILEM in this expr */
120 regnode *emit_start; /* Start of emitted-code area */
121 regnode *emit_bound; /* First regnode outside of the allocated space */
122 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
123 I32 naughty; /* How bad is this pattern? */
124 I32 sawback; /* Did we see \1, ...? */
126 I32 size; /* Code size. */
127 I32 npar; /* Capture buffer count, (OPEN). */
128 I32 cpar; /* Capture buffer count, (CLOSE). */
129 I32 nestroot; /* root parens we are in - used by accept */
133 regnode **open_parens; /* pointers to open parens */
134 regnode **close_parens; /* pointers to close parens */
135 regnode *opend; /* END node in program */
136 I32 utf8; /* whether the pattern is utf8 or not */
137 I32 orig_utf8; /* whether the pattern was originally in utf8 */
138 /* XXX use this for future optimisation of case
139 * where pattern must be upgraded to utf8. */
140 I32 uni_semantics; /* If a d charset modifier should use unicode
141 rules, even if the pattern is not in
143 HV *paren_names; /* Paren names */
145 regnode **recurse; /* Recurse regops */
146 I32 recurse_count; /* Number of recurse regops */
149 I32 override_recoding;
151 char *starttry; /* -Dr: where regtry was called. */
152 #define RExC_starttry (pRExC_state->starttry)
155 const char *lastparse;
157 AV *paren_name_list; /* idx -> name */
158 #define RExC_lastparse (pRExC_state->lastparse)
159 #define RExC_lastnum (pRExC_state->lastnum)
160 #define RExC_paren_name_list (pRExC_state->paren_name_list)
164 #define RExC_flags (pRExC_state->flags)
165 #define RExC_precomp (pRExC_state->precomp)
166 #define RExC_rx_sv (pRExC_state->rx_sv)
167 #define RExC_rx (pRExC_state->rx)
168 #define RExC_rxi (pRExC_state->rxi)
169 #define RExC_start (pRExC_state->start)
170 #define RExC_end (pRExC_state->end)
171 #define RExC_parse (pRExC_state->parse)
172 #define RExC_whilem_seen (pRExC_state->whilem_seen)
173 #ifdef RE_TRACK_PATTERN_OFFSETS
174 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
176 #define RExC_emit (pRExC_state->emit)
177 #define RExC_emit_start (pRExC_state->emit_start)
178 #define RExC_emit_bound (pRExC_state->emit_bound)
179 #define RExC_naughty (pRExC_state->naughty)
180 #define RExC_sawback (pRExC_state->sawback)
181 #define RExC_seen (pRExC_state->seen)
182 #define RExC_size (pRExC_state->size)
183 #define RExC_npar (pRExC_state->npar)
184 #define RExC_nestroot (pRExC_state->nestroot)
185 #define RExC_extralen (pRExC_state->extralen)
186 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
187 #define RExC_seen_evals (pRExC_state->seen_evals)
188 #define RExC_utf8 (pRExC_state->utf8)
189 #define RExC_uni_semantics (pRExC_state->uni_semantics)
190 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
191 #define RExC_open_parens (pRExC_state->open_parens)
192 #define RExC_close_parens (pRExC_state->close_parens)
193 #define RExC_opend (pRExC_state->opend)
194 #define RExC_paren_names (pRExC_state->paren_names)
195 #define RExC_recurse (pRExC_state->recurse)
196 #define RExC_recurse_count (pRExC_state->recurse_count)
197 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
198 #define RExC_contains_locale (pRExC_state->contains_locale)
199 #define RExC_override_recoding (pRExC_state->override_recoding)
202 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
203 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
204 ((*s) == '{' && regcurly(s)))
207 #undef SPSTART /* dratted cpp namespace... */
210 * Flags to be passed up and down.
212 #define WORST 0 /* Worst case. */
213 #define HASWIDTH 0x01 /* Known to match non-null strings. */
215 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
216 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
218 #define SPSTART 0x04 /* Starts with * or +. */
219 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
220 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
222 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
224 /* whether trie related optimizations are enabled */
225 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
226 #define TRIE_STUDY_OPT
227 #define FULL_TRIE_STUDY
233 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
234 #define PBITVAL(paren) (1 << ((paren) & 7))
235 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
236 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
237 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
239 /* If not already in utf8, do a longjmp back to the beginning */
240 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
241 #define REQUIRE_UTF8 STMT_START { \
242 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
245 /* About scan_data_t.
247 During optimisation we recurse through the regexp program performing
248 various inplace (keyhole style) optimisations. In addition study_chunk
249 and scan_commit populate this data structure with information about
250 what strings MUST appear in the pattern. We look for the longest
251 string that must appear at a fixed location, and we look for the
252 longest string that may appear at a floating location. So for instance
257 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
258 strings (because they follow a .* construct). study_chunk will identify
259 both FOO and BAR as being the longest fixed and floating strings respectively.
261 The strings can be composites, for instance
265 will result in a composite fixed substring 'foo'.
267 For each string some basic information is maintained:
269 - offset or min_offset
270 This is the position the string must appear at, or not before.
271 It also implicitly (when combined with minlenp) tells us how many
272 characters must match before the string we are searching for.
273 Likewise when combined with minlenp and the length of the string it
274 tells us how many characters must appear after the string we have
278 Only used for floating strings. This is the rightmost point that
279 the string can appear at. If set to I32 max it indicates that the
280 string can occur infinitely far to the right.
283 A pointer to the minimum length of the pattern that the string
284 was found inside. This is important as in the case of positive
285 lookahead or positive lookbehind we can have multiple patterns
290 The minimum length of the pattern overall is 3, the minimum length
291 of the lookahead part is 3, but the minimum length of the part that
292 will actually match is 1. So 'FOO's minimum length is 3, but the
293 minimum length for the F is 1. This is important as the minimum length
294 is used to determine offsets in front of and behind the string being
295 looked for. Since strings can be composites this is the length of the
296 pattern at the time it was committed with a scan_commit. Note that
297 the length is calculated by study_chunk, so that the minimum lengths
298 are not known until the full pattern has been compiled, thus the
299 pointer to the value.
303 In the case of lookbehind the string being searched for can be
304 offset past the start point of the final matching string.
305 If this value was just blithely removed from the min_offset it would
306 invalidate some of the calculations for how many chars must match
307 before or after (as they are derived from min_offset and minlen and
308 the length of the string being searched for).
309 When the final pattern is compiled and the data is moved from the
310 scan_data_t structure into the regexp structure the information
311 about lookbehind is factored in, with the information that would
312 have been lost precalculated in the end_shift field for the
315 The fields pos_min and pos_delta are used to store the minimum offset
316 and the delta to the maximum offset at the current point in the pattern.
320 typedef struct scan_data_t {
321 /*I32 len_min; unused */
322 /*I32 len_delta; unused */
326 I32 last_end; /* min value, <0 unless valid. */
329 SV **longest; /* Either &l_fixed, or &l_float. */
330 SV *longest_fixed; /* longest fixed string found in pattern */
331 I32 offset_fixed; /* offset where it starts */
332 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
333 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
334 SV *longest_float; /* longest floating string found in pattern */
335 I32 offset_float_min; /* earliest point in string it can appear */
336 I32 offset_float_max; /* latest point in string it can appear */
337 I32 *minlen_float; /* pointer to the minlen relevant to the string */
338 I32 lookbehind_float; /* is the position of the string modified by LB */
342 struct regnode_charclass_class *start_class;
346 * Forward declarations for pregcomp()'s friends.
349 static const scan_data_t zero_scan_data =
350 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
352 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
353 #define SF_BEFORE_SEOL 0x0001
354 #define SF_BEFORE_MEOL 0x0002
355 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
356 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
359 # define SF_FIX_SHIFT_EOL (0+2)
360 # define SF_FL_SHIFT_EOL (0+4)
362 # define SF_FIX_SHIFT_EOL (+2)
363 # define SF_FL_SHIFT_EOL (+4)
366 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
367 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
369 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
370 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
371 #define SF_IS_INF 0x0040
372 #define SF_HAS_PAR 0x0080
373 #define SF_IN_PAR 0x0100
374 #define SF_HAS_EVAL 0x0200
375 #define SCF_DO_SUBSTR 0x0400
376 #define SCF_DO_STCLASS_AND 0x0800
377 #define SCF_DO_STCLASS_OR 0x1000
378 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
379 #define SCF_WHILEM_VISITED_POS 0x2000
381 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
382 #define SCF_SEEN_ACCEPT 0x8000
384 #define UTF cBOOL(RExC_utf8)
385 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
386 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
387 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
388 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
389 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
390 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
391 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
393 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
395 #define OOB_UNICODE 12345678
396 #define OOB_NAMEDCLASS -1
398 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
399 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
402 /* length of regex to show in messages that don't mark a position within */
403 #define RegexLengthToShowInErrorMessages 127
406 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
407 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
408 * op/pragma/warn/regcomp.
410 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
411 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
413 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
416 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
417 * arg. Show regex, up to a maximum length. If it's too long, chop and add
420 #define _FAIL(code) STMT_START { \
421 const char *ellipses = ""; \
422 IV len = RExC_end - RExC_precomp; \
425 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
426 if (len > RegexLengthToShowInErrorMessages) { \
427 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
428 len = RegexLengthToShowInErrorMessages - 10; \
434 #define FAIL(msg) _FAIL( \
435 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
436 msg, (int)len, RExC_precomp, ellipses))
438 #define FAIL2(msg,arg) _FAIL( \
439 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
440 arg, (int)len, RExC_precomp, ellipses))
443 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
445 #define Simple_vFAIL(m) STMT_START { \
446 const IV offset = RExC_parse - RExC_precomp; \
447 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
448 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
452 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
454 #define vFAIL(m) STMT_START { \
456 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
461 * Like Simple_vFAIL(), but accepts two arguments.
463 #define Simple_vFAIL2(m,a1) STMT_START { \
464 const IV offset = RExC_parse - RExC_precomp; \
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
472 #define vFAIL2(m,a1) STMT_START { \
474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
475 Simple_vFAIL2(m, a1); \
480 * Like Simple_vFAIL(), but accepts three arguments.
482 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
483 const IV offset = RExC_parse - RExC_precomp; \
484 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
485 (int)offset, RExC_precomp, RExC_precomp + offset); \
489 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
491 #define vFAIL3(m,a1,a2) STMT_START { \
493 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
494 Simple_vFAIL3(m, a1, a2); \
498 * Like Simple_vFAIL(), but accepts four arguments.
500 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
501 const IV offset = RExC_parse - RExC_precomp; \
502 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
503 (int)offset, RExC_precomp, RExC_precomp + offset); \
506 #define ckWARNreg(loc,m) STMT_START { \
507 const IV offset = loc - RExC_precomp; \
508 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 (int)offset, RExC_precomp, RExC_precomp + offset); \
512 #define ckWARNregdep(loc,m) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
516 (int)offset, RExC_precomp, RExC_precomp + offset); \
519 #define ckWARN2regdep(loc,m, a1) STMT_START { \
520 const IV offset = loc - RExC_precomp; \
521 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
523 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
526 #define ckWARN2reg(loc, m, a1) STMT_START { \
527 const IV offset = loc - RExC_precomp; \
528 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
529 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
532 #define vWARN3(loc, m, a1, a2) STMT_START { \
533 const IV offset = loc - RExC_precomp; \
534 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
535 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
538 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
539 const IV offset = loc - RExC_precomp; \
540 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
541 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
544 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
545 const IV offset = loc - RExC_precomp; \
546 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
547 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
550 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
551 const IV offset = loc - RExC_precomp; \
552 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
553 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
556 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
557 const IV offset = loc - RExC_precomp; \
558 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
559 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
563 /* Allow for side effects in s */
564 #define REGC(c,s) STMT_START { \
565 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
568 /* Macros for recording node offsets. 20001227 mjd@plover.com
569 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
570 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
571 * Element 0 holds the number n.
572 * Position is 1 indexed.
574 #ifndef RE_TRACK_PATTERN_OFFSETS
575 #define Set_Node_Offset_To_R(node,byte)
576 #define Set_Node_Offset(node,byte)
577 #define Set_Cur_Node_Offset
578 #define Set_Node_Length_To_R(node,len)
579 #define Set_Node_Length(node,len)
580 #define Set_Node_Cur_Length(node)
581 #define Node_Offset(n)
582 #define Node_Length(n)
583 #define Set_Node_Offset_Length(node,offset,len)
584 #define ProgLen(ri) ri->u.proglen
585 #define SetProgLen(ri,x) ri->u.proglen = x
587 #define ProgLen(ri) ri->u.offsets[0]
588 #define SetProgLen(ri,x) ri->u.offsets[0] = x
589 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
591 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
592 __LINE__, (int)(node), (int)(byte))); \
594 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
596 RExC_offsets[2*(node)-1] = (byte); \
601 #define Set_Node_Offset(node,byte) \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
603 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
605 #define Set_Node_Length_To_R(node,len) STMT_START { \
607 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
608 __LINE__, (int)(node), (int)(len))); \
610 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
612 RExC_offsets[2*(node)] = (len); \
617 #define Set_Node_Length(node,len) \
618 Set_Node_Length_To_R((node)-RExC_emit_start, len)
619 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
620 #define Set_Node_Cur_Length(node) \
621 Set_Node_Length(node, RExC_parse - parse_start)
623 /* Get offsets and lengths */
624 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
625 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
627 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
628 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
629 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
633 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
634 #define EXPERIMENTAL_INPLACESCAN
635 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
637 #define DEBUG_STUDYDATA(str,data,depth) \
638 DEBUG_OPTIMISE_MORE_r(if(data){ \
639 PerlIO_printf(Perl_debug_log, \
640 "%*s" str "Pos:%"IVdf"/%"IVdf \
641 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
642 (int)(depth)*2, "", \
643 (IV)((data)->pos_min), \
644 (IV)((data)->pos_delta), \
645 (UV)((data)->flags), \
646 (IV)((data)->whilem_c), \
647 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
648 is_inf ? "INF " : "" \
650 if ((data)->last_found) \
651 PerlIO_printf(Perl_debug_log, \
652 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
653 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
654 SvPVX_const((data)->last_found), \
655 (IV)((data)->last_end), \
656 (IV)((data)->last_start_min), \
657 (IV)((data)->last_start_max), \
658 ((data)->longest && \
659 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
660 SvPVX_const((data)->longest_fixed), \
661 (IV)((data)->offset_fixed), \
662 ((data)->longest && \
663 (data)->longest==&((data)->longest_float)) ? "*" : "", \
664 SvPVX_const((data)->longest_float), \
665 (IV)((data)->offset_float_min), \
666 (IV)((data)->offset_float_max) \
668 PerlIO_printf(Perl_debug_log,"\n"); \
671 static void clear_re(pTHX_ void *r);
673 /* Mark that we cannot extend a found fixed substring at this point.
674 Update the longest found anchored substring and the longest found
675 floating substrings if needed. */
678 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
680 const STRLEN l = CHR_SVLEN(data->last_found);
681 const STRLEN old_l = CHR_SVLEN(*data->longest);
682 GET_RE_DEBUG_FLAGS_DECL;
684 PERL_ARGS_ASSERT_SCAN_COMMIT;
686 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
687 SvSetMagicSV(*data->longest, data->last_found);
688 if (*data->longest == data->longest_fixed) {
689 data->offset_fixed = l ? data->last_start_min : data->pos_min;
690 if (data->flags & SF_BEFORE_EOL)
692 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
694 data->flags &= ~SF_FIX_BEFORE_EOL;
695 data->minlen_fixed=minlenp;
696 data->lookbehind_fixed=0;
698 else { /* *data->longest == data->longest_float */
699 data->offset_float_min = l ? data->last_start_min : data->pos_min;
700 data->offset_float_max = (l
701 ? data->last_start_max
702 : data->pos_min + data->pos_delta);
703 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
704 data->offset_float_max = I32_MAX;
705 if (data->flags & SF_BEFORE_EOL)
707 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
709 data->flags &= ~SF_FL_BEFORE_EOL;
710 data->minlen_float=minlenp;
711 data->lookbehind_float=0;
714 SvCUR_set(data->last_found, 0);
716 SV * const sv = data->last_found;
717 if (SvUTF8(sv) && SvMAGICAL(sv)) {
718 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
724 data->flags &= ~SF_BEFORE_EOL;
725 DEBUG_STUDYDATA("commit: ",data,0);
728 /* Can match anything (initialization) */
730 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
732 PERL_ARGS_ASSERT_CL_ANYTHING;
734 ANYOF_BITMAP_SETALL(cl);
735 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
736 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
738 /* If any portion of the regex is to operate under locale rules,
739 * initialization includes it. The reason this isn't done for all regexes
740 * is that the optimizer was written under the assumption that locale was
741 * all-or-nothing. Given the complexity and lack of documentation in the
742 * optimizer, and that there are inadequate test cases for locale, so many
743 * parts of it may not work properly, it is safest to avoid locale unless
745 if (RExC_contains_locale) {
746 ANYOF_CLASS_SETALL(cl); /* /l uses class */
747 cl->flags |= ANYOF_LOCALE;
750 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
754 /* Can match anything (initialization) */
756 S_cl_is_anything(const struct regnode_charclass_class *cl)
760 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
762 for (value = 0; value <= ANYOF_MAX; value += 2)
763 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
765 if (!(cl->flags & ANYOF_UNICODE_ALL))
767 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
772 /* Can match anything (initialization) */
774 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
776 PERL_ARGS_ASSERT_CL_INIT;
778 Zero(cl, 1, struct regnode_charclass_class);
780 cl_anything(pRExC_state, cl);
781 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
784 /* These two functions currently do the exact same thing */
785 #define cl_init_zero S_cl_init
787 /* 'AND' a given class with another one. Can create false positives. 'cl'
788 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
789 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
791 S_cl_and(struct regnode_charclass_class *cl,
792 const struct regnode_charclass_class *and_with)
794 PERL_ARGS_ASSERT_CL_AND;
796 assert(and_with->type == ANYOF);
798 /* I (khw) am not sure all these restrictions are necessary XXX */
799 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
800 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
801 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
802 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
803 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
806 if (and_with->flags & ANYOF_INVERT)
807 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
808 cl->bitmap[i] &= ~and_with->bitmap[i];
810 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
811 cl->bitmap[i] &= and_with->bitmap[i];
812 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
814 if (and_with->flags & ANYOF_INVERT) {
816 /* Here, the and'ed node is inverted. Get the AND of the flags that
817 * aren't affected by the inversion. Those that are affected are
818 * handled individually below */
819 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
820 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
821 cl->flags |= affected_flags;
823 /* We currently don't know how to deal with things that aren't in the
824 * bitmap, but we know that the intersection is no greater than what
825 * is already in cl, so let there be false positives that get sorted
826 * out after the synthetic start class succeeds, and the node is
827 * matched for real. */
829 /* The inversion of these two flags indicate that the resulting
830 * intersection doesn't have them */
831 if (and_with->flags & ANYOF_UNICODE_ALL) {
832 cl->flags &= ~ANYOF_UNICODE_ALL;
834 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
835 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
838 else { /* and'd node is not inverted */
839 U8 outside_bitmap_but_not_utf8; /* Temp variable */
841 if (! ANYOF_NONBITMAP(and_with)) {
843 /* Here 'and_with' doesn't match anything outside the bitmap
844 * (except possibly ANYOF_UNICODE_ALL), which means the
845 * intersection can't either, except for ANYOF_UNICODE_ALL, in
846 * which case we don't know what the intersection is, but it's no
847 * greater than what cl already has, so can just leave it alone,
848 * with possible false positives */
849 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
850 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
851 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
854 else if (! ANYOF_NONBITMAP(cl)) {
856 /* Here, 'and_with' does match something outside the bitmap, and cl
857 * doesn't have a list of things to match outside the bitmap. If
858 * cl can match all code points above 255, the intersection will
859 * be those above-255 code points that 'and_with' matches. If cl
860 * can't match all Unicode code points, it means that it can't
861 * match anything outside the bitmap (since the 'if' that got us
862 * into this block tested for that), so we leave the bitmap empty.
864 if (cl->flags & ANYOF_UNICODE_ALL) {
865 ARG_SET(cl, ARG(and_with));
867 /* and_with's ARG may match things that don't require UTF8.
868 * And now cl's will too, in spite of this being an 'and'. See
869 * the comments below about the kludge */
870 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
874 /* Here, both 'and_with' and cl match something outside the
875 * bitmap. Currently we do not do the intersection, so just match
876 * whatever cl had at the beginning. */
880 /* Take the intersection of the two sets of flags. However, the
881 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
882 * kludge around the fact that this flag is not treated like the others
883 * which are initialized in cl_anything(). The way the optimizer works
884 * is that the synthetic start class (SSC) is initialized to match
885 * anything, and then the first time a real node is encountered, its
886 * values are AND'd with the SSC's with the result being the values of
887 * the real node. However, there are paths through the optimizer where
888 * the AND never gets called, so those initialized bits are set
889 * inappropriately, which is not usually a big deal, as they just cause
890 * false positives in the SSC, which will just mean a probably
891 * imperceptible slow down in execution. However this bit has a
892 * higher false positive consequence in that it can cause utf8.pm,
893 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
894 * bigger slowdown and also causes significant extra memory to be used.
895 * In order to prevent this, the code now takes a different tack. The
896 * bit isn't set unless some part of the regular expression needs it,
897 * but once set it won't get cleared. This means that these extra
898 * modules won't get loaded unless there was some path through the
899 * pattern that would have required them anyway, and so any false
900 * positives that occur by not ANDing them out when they could be
901 * aren't as severe as they would be if we treated this bit like all
903 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
904 & ANYOF_NONBITMAP_NON_UTF8;
905 cl->flags &= and_with->flags;
906 cl->flags |= outside_bitmap_but_not_utf8;
910 /* 'OR' a given class with another one. Can create false positives. 'cl'
911 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
912 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
914 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
916 PERL_ARGS_ASSERT_CL_OR;
918 if (or_with->flags & ANYOF_INVERT) {
920 /* Here, the or'd node is to be inverted. This means we take the
921 * complement of everything not in the bitmap, but currently we don't
922 * know what that is, so give up and match anything */
923 if (ANYOF_NONBITMAP(or_with)) {
924 cl_anything(pRExC_state, cl);
927 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
928 * <= (B1 | !B2) | (CL1 | !CL2)
929 * which is wasteful if CL2 is small, but we ignore CL2:
930 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
931 * XXXX Can we handle case-fold? Unclear:
932 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
933 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
935 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
936 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
937 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
940 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
941 cl->bitmap[i] |= ~or_with->bitmap[i];
942 } /* XXXX: logic is complicated otherwise */
944 cl_anything(pRExC_state, cl);
947 /* And, we can just take the union of the flags that aren't affected
948 * by the inversion */
949 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
951 /* For the remaining flags:
952 ANYOF_UNICODE_ALL and inverted means to not match anything above
953 255, which means that the union with cl should just be
954 what cl has in it, so can ignore this flag
955 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
956 is 127-255 to match them, but then invert that, so the
957 union with cl should just be what cl has in it, so can
960 } else { /* 'or_with' is not inverted */
961 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
962 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
963 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
964 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
967 /* OR char bitmap and class bitmap separately */
968 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
969 cl->bitmap[i] |= or_with->bitmap[i];
970 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
971 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
972 cl->classflags[i] |= or_with->classflags[i];
973 cl->flags |= ANYOF_CLASS;
976 else { /* XXXX: logic is complicated, leave it along for a moment. */
977 cl_anything(pRExC_state, cl);
980 if (ANYOF_NONBITMAP(or_with)) {
982 /* Use the added node's outside-the-bit-map match if there isn't a
983 * conflict. If there is a conflict (both nodes match something
984 * outside the bitmap, but what they match outside is not the same
985 * pointer, and hence not easily compared until XXX we extend
986 * inversion lists this far), give up and allow the start class to
987 * match everything outside the bitmap. If that stuff is all above
988 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
989 if (! ANYOF_NONBITMAP(cl)) {
990 ARG_SET(cl, ARG(or_with));
992 else if (ARG(cl) != ARG(or_with)) {
994 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
995 cl_anything(pRExC_state, cl);
998 cl->flags |= ANYOF_UNICODE_ALL;
1003 /* Take the union */
1004 cl->flags |= or_with->flags;
1008 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1009 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1010 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1011 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1016 dump_trie(trie,widecharmap,revcharmap)
1017 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1018 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1020 These routines dump out a trie in a somewhat readable format.
1021 The _interim_ variants are used for debugging the interim
1022 tables that are used to generate the final compressed
1023 representation which is what dump_trie expects.
1025 Part of the reason for their existence is to provide a form
1026 of documentation as to how the different representations function.
1031 Dumps the final compressed table form of the trie to Perl_debug_log.
1032 Used for debugging make_trie().
1036 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1037 AV *revcharmap, U32 depth)
1040 SV *sv=sv_newmortal();
1041 int colwidth= widecharmap ? 6 : 4;
1043 GET_RE_DEBUG_FLAGS_DECL;
1045 PERL_ARGS_ASSERT_DUMP_TRIE;
1047 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1048 (int)depth * 2 + 2,"",
1049 "Match","Base","Ofs" );
1051 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1052 SV ** const tmp = av_fetch( revcharmap, state, 0);
1054 PerlIO_printf( Perl_debug_log, "%*s",
1056 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1057 PL_colors[0], PL_colors[1],
1058 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1059 PERL_PV_ESCAPE_FIRSTCHAR
1064 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1065 (int)depth * 2 + 2,"");
1067 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1068 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1069 PerlIO_printf( Perl_debug_log, "\n");
1071 for( state = 1 ; state < trie->statecount ; state++ ) {
1072 const U32 base = trie->states[ state ].trans.base;
1074 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1076 if ( trie->states[ state ].wordnum ) {
1077 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1079 PerlIO_printf( Perl_debug_log, "%6s", "" );
1082 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1087 while( ( base + ofs < trie->uniquecharcount ) ||
1088 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1089 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1092 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1094 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1095 if ( ( base + ofs >= trie->uniquecharcount ) &&
1096 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1097 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1099 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1101 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1103 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1107 PerlIO_printf( Perl_debug_log, "]");
1110 PerlIO_printf( Perl_debug_log, "\n" );
1112 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1113 for (word=1; word <= trie->wordcount; word++) {
1114 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1115 (int)word, (int)(trie->wordinfo[word].prev),
1116 (int)(trie->wordinfo[word].len));
1118 PerlIO_printf(Perl_debug_log, "\n" );
1121 Dumps a fully constructed but uncompressed trie in list form.
1122 List tries normally only are used for construction when the number of
1123 possible chars (trie->uniquecharcount) is very high.
1124 Used for debugging make_trie().
1127 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1128 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1132 SV *sv=sv_newmortal();
1133 int colwidth= widecharmap ? 6 : 4;
1134 GET_RE_DEBUG_FLAGS_DECL;
1136 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1138 /* print out the table precompression. */
1139 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1140 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1141 "------:-----+-----------------\n" );
1143 for( state=1 ; state < next_alloc ; state ++ ) {
1146 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1147 (int)depth * 2 + 2,"", (UV)state );
1148 if ( ! trie->states[ state ].wordnum ) {
1149 PerlIO_printf( Perl_debug_log, "%5s| ","");
1151 PerlIO_printf( Perl_debug_log, "W%4x| ",
1152 trie->states[ state ].wordnum
1155 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1156 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1158 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1160 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1161 PL_colors[0], PL_colors[1],
1162 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1163 PERL_PV_ESCAPE_FIRSTCHAR
1165 TRIE_LIST_ITEM(state,charid).forid,
1166 (UV)TRIE_LIST_ITEM(state,charid).newstate
1169 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1170 (int)((depth * 2) + 14), "");
1173 PerlIO_printf( Perl_debug_log, "\n");
1178 Dumps a fully constructed but uncompressed trie in table form.
1179 This is the normal DFA style state transition table, with a few
1180 twists to facilitate compression later.
1181 Used for debugging make_trie().
1184 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1185 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1190 SV *sv=sv_newmortal();
1191 int colwidth= widecharmap ? 6 : 4;
1192 GET_RE_DEBUG_FLAGS_DECL;
1194 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1197 print out the table precompression so that we can do a visual check
1198 that they are identical.
1201 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1203 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1204 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1206 PerlIO_printf( Perl_debug_log, "%*s",
1208 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1209 PL_colors[0], PL_colors[1],
1210 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1211 PERL_PV_ESCAPE_FIRSTCHAR
1217 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1219 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1220 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1223 PerlIO_printf( Perl_debug_log, "\n" );
1225 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1227 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1228 (int)depth * 2 + 2,"",
1229 (UV)TRIE_NODENUM( state ) );
1231 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1232 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1234 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1236 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1238 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1239 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1241 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1242 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1250 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1251 startbranch: the first branch in the whole branch sequence
1252 first : start branch of sequence of branch-exact nodes.
1253 May be the same as startbranch
1254 last : Thing following the last branch.
1255 May be the same as tail.
1256 tail : item following the branch sequence
1257 count : words in the sequence
1258 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1259 depth : indent depth
1261 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1263 A trie is an N'ary tree where the branches are determined by digital
1264 decomposition of the key. IE, at the root node you look up the 1st character and
1265 follow that branch repeat until you find the end of the branches. Nodes can be
1266 marked as "accepting" meaning they represent a complete word. Eg:
1270 would convert into the following structure. Numbers represent states, letters
1271 following numbers represent valid transitions on the letter from that state, if
1272 the number is in square brackets it represents an accepting state, otherwise it
1273 will be in parenthesis.
1275 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1279 (1) +-i->(6)-+-s->[7]
1281 +-s->(3)-+-h->(4)-+-e->[5]
1283 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1285 This shows that when matching against the string 'hers' we will begin at state 1
1286 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1287 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1288 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1289 single traverse. We store a mapping from accepting to state to which word was
1290 matched, and then when we have multiple possibilities we try to complete the
1291 rest of the regex in the order in which they occured in the alternation.
1293 The only prior NFA like behaviour that would be changed by the TRIE support is
1294 the silent ignoring of duplicate alternations which are of the form:
1296 / (DUPE|DUPE) X? (?{ ... }) Y /x
1298 Thus EVAL blocks following a trie may be called a different number of times with
1299 and without the optimisation. With the optimisations dupes will be silently
1300 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1301 the following demonstrates:
1303 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1305 which prints out 'word' three times, but
1307 'words'=~/(word|word|word)(?{ print $1 })S/
1309 which doesnt print it out at all. This is due to other optimisations kicking in.
1311 Example of what happens on a structural level:
1313 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1315 1: CURLYM[1] {1,32767}(18)
1326 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1327 and should turn into:
1329 1: CURLYM[1] {1,32767}(18)
1331 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1339 Cases where tail != last would be like /(?foo|bar)baz/:
1349 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1350 and would end up looking like:
1353 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1360 d = uvuni_to_utf8_flags(d, uv, 0);
1362 is the recommended Unicode-aware way of saying
1367 #define TRIE_STORE_REVCHAR \
1370 SV *zlopp = newSV(2); \
1371 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1372 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1373 SvCUR_set(zlopp, kapow - flrbbbbb); \
1376 av_push(revcharmap, zlopp); \
1378 char ooooff = (char)uvc; \
1379 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1383 #define TRIE_READ_CHAR STMT_START { \
1387 if ( foldlen > 0 ) { \
1388 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1393 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1394 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1395 foldlen -= UNISKIP( uvc ); \
1396 scan = foldbuf + UNISKIP( uvc ); \
1399 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1409 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1410 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1411 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1412 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1414 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1415 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1416 TRIE_LIST_CUR( state )++; \
1419 #define TRIE_LIST_NEW(state) STMT_START { \
1420 Newxz( trie->states[ state ].trans.list, \
1421 4, reg_trie_trans_le ); \
1422 TRIE_LIST_CUR( state ) = 1; \
1423 TRIE_LIST_LEN( state ) = 4; \
1426 #define TRIE_HANDLE_WORD(state) STMT_START { \
1427 U16 dupe= trie->states[ state ].wordnum; \
1428 regnode * const noper_next = regnext( noper ); \
1431 /* store the word for dumping */ \
1433 if (OP(noper) != NOTHING) \
1434 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1436 tmp = newSVpvn_utf8( "", 0, UTF ); \
1437 av_push( trie_words, tmp ); \
1441 trie->wordinfo[curword].prev = 0; \
1442 trie->wordinfo[curword].len = wordlen; \
1443 trie->wordinfo[curword].accept = state; \
1445 if ( noper_next < tail ) { \
1447 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1448 trie->jump[curword] = (U16)(noper_next - convert); \
1450 jumper = noper_next; \
1452 nextbranch= regnext(cur); \
1456 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1457 /* chain, so that when the bits of chain are later */\
1458 /* linked together, the dups appear in the chain */\
1459 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1460 trie->wordinfo[dupe].prev = curword; \
1462 /* we haven't inserted this word yet. */ \
1463 trie->states[ state ].wordnum = curword; \
1468 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1469 ( ( base + charid >= ucharcount \
1470 && base + charid < ubound \
1471 && state == trie->trans[ base - ucharcount + charid ].check \
1472 && trie->trans[ base - ucharcount + charid ].next ) \
1473 ? trie->trans[ base - ucharcount + charid ].next \
1474 : ( state==1 ? special : 0 ) \
1478 #define MADE_JUMP_TRIE 2
1479 #define MADE_EXACT_TRIE 4
1482 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1485 /* first pass, loop through and scan words */
1486 reg_trie_data *trie;
1487 HV *widecharmap = NULL;
1488 AV *revcharmap = newAV();
1490 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1495 regnode *jumper = NULL;
1496 regnode *nextbranch = NULL;
1497 regnode *convert = NULL;
1498 U32 *prev_states; /* temp array mapping each state to previous one */
1499 /* we just use folder as a flag in utf8 */
1500 const U8 * folder = NULL;
1503 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1504 AV *trie_words = NULL;
1505 /* along with revcharmap, this only used during construction but both are
1506 * useful during debugging so we store them in the struct when debugging.
1509 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1510 STRLEN trie_charcount=0;
1512 SV *re_trie_maxbuff;
1513 GET_RE_DEBUG_FLAGS_DECL;
1515 PERL_ARGS_ASSERT_MAKE_TRIE;
1517 PERL_UNUSED_ARG(depth);
1522 case EXACTFU: folder = PL_fold_latin1; break;
1523 case EXACTF: folder = PL_fold; break;
1524 case EXACTFL: folder = PL_fold_locale; break;
1527 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1529 trie->startstate = 1;
1530 trie->wordcount = word_count;
1531 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1532 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1533 if (!(UTF && folder))
1534 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1535 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1536 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1539 trie_words = newAV();
1542 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1543 if (!SvIOK(re_trie_maxbuff)) {
1544 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1547 PerlIO_printf( Perl_debug_log,
1548 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1549 (int)depth * 2 + 2, "",
1550 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1551 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1555 /* Find the node we are going to overwrite */
1556 if ( first == startbranch && OP( last ) != BRANCH ) {
1557 /* whole branch chain */
1560 /* branch sub-chain */
1561 convert = NEXTOPER( first );
1564 /* -- First loop and Setup --
1566 We first traverse the branches and scan each word to determine if it
1567 contains widechars, and how many unique chars there are, this is
1568 important as we have to build a table with at least as many columns as we
1571 We use an array of integers to represent the character codes 0..255
1572 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1573 native representation of the character value as the key and IV's for the
1576 *TODO* If we keep track of how many times each character is used we can
1577 remap the columns so that the table compression later on is more
1578 efficient in terms of memory by ensuring the most common value is in the
1579 middle and the least common are on the outside. IMO this would be better
1580 than a most to least common mapping as theres a decent chance the most
1581 common letter will share a node with the least common, meaning the node
1582 will not be compressible. With a middle is most common approach the worst
1583 case is when we have the least common nodes twice.
1587 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1588 regnode * const noper = NEXTOPER( cur );
1589 const U8 *uc = (U8*)STRING( noper );
1590 const U8 * const e = uc + STR_LEN( noper );
1592 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1593 const U8 *scan = (U8*)NULL;
1594 U32 wordlen = 0; /* required init */
1596 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1598 if (OP(noper) == NOTHING) {
1602 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1603 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1604 regardless of encoding */
1606 for ( ; uc < e ; uc += len ) {
1607 TRIE_CHARCOUNT(trie)++;
1611 if ( !trie->charmap[ uvc ] ) {
1612 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1614 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1618 /* store the codepoint in the bitmap, and its folded
1620 TRIE_BITMAP_SET(trie,uvc);
1622 /* store the folded codepoint */
1623 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1626 /* store first byte of utf8 representation of
1627 variant codepoints */
1628 if (! UNI_IS_INVARIANT(uvc)) {
1629 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1632 set_bit = 0; /* We've done our bit :-) */
1637 widecharmap = newHV();
1639 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1642 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1644 if ( !SvTRUE( *svpp ) ) {
1645 sv_setiv( *svpp, ++trie->uniquecharcount );
1650 if( cur == first ) {
1653 } else if (chars < trie->minlen) {
1655 } else if (chars > trie->maxlen) {
1659 } /* end first pass */
1660 DEBUG_TRIE_COMPILE_r(
1661 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1662 (int)depth * 2 + 2,"",
1663 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1664 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1665 (int)trie->minlen, (int)trie->maxlen )
1669 We now know what we are dealing with in terms of unique chars and
1670 string sizes so we can calculate how much memory a naive
1671 representation using a flat table will take. If it's over a reasonable
1672 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1673 conservative but potentially much slower representation using an array
1676 At the end we convert both representations into the same compressed
1677 form that will be used in regexec.c for matching with. The latter
1678 is a form that cannot be used to construct with but has memory
1679 properties similar to the list form and access properties similar
1680 to the table form making it both suitable for fast searches and
1681 small enough that its feasable to store for the duration of a program.
1683 See the comment in the code where the compressed table is produced
1684 inplace from the flat tabe representation for an explanation of how
1685 the compression works.
1690 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1693 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1695 Second Pass -- Array Of Lists Representation
1697 Each state will be represented by a list of charid:state records
1698 (reg_trie_trans_le) the first such element holds the CUR and LEN
1699 points of the allocated array. (See defines above).
1701 We build the initial structure using the lists, and then convert
1702 it into the compressed table form which allows faster lookups
1703 (but cant be modified once converted).
1706 STRLEN transcount = 1;
1708 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1709 "%*sCompiling trie using list compiler\n",
1710 (int)depth * 2 + 2, ""));
1712 trie->states = (reg_trie_state *)
1713 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1714 sizeof(reg_trie_state) );
1718 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1720 regnode * const noper = NEXTOPER( cur );
1721 U8 *uc = (U8*)STRING( noper );
1722 const U8 * const e = uc + STR_LEN( noper );
1723 U32 state = 1; /* required init */
1724 U16 charid = 0; /* sanity init */
1725 U8 *scan = (U8*)NULL; /* sanity init */
1726 STRLEN foldlen = 0; /* required init */
1727 U32 wordlen = 0; /* required init */
1728 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1730 if (OP(noper) != NOTHING) {
1731 for ( ; uc < e ; uc += len ) {
1736 charid = trie->charmap[ uvc ];
1738 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1742 charid=(U16)SvIV( *svpp );
1745 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1752 if ( !trie->states[ state ].trans.list ) {
1753 TRIE_LIST_NEW( state );
1755 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1756 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1757 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1762 newstate = next_alloc++;
1763 prev_states[newstate] = state;
1764 TRIE_LIST_PUSH( state, charid, newstate );
1769 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1773 TRIE_HANDLE_WORD(state);
1775 } /* end second pass */
1777 /* next alloc is the NEXT state to be allocated */
1778 trie->statecount = next_alloc;
1779 trie->states = (reg_trie_state *)
1780 PerlMemShared_realloc( trie->states,
1782 * sizeof(reg_trie_state) );
1784 /* and now dump it out before we compress it */
1785 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1786 revcharmap, next_alloc,
1790 trie->trans = (reg_trie_trans *)
1791 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1798 for( state=1 ; state < next_alloc ; state ++ ) {
1802 DEBUG_TRIE_COMPILE_MORE_r(
1803 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1807 if (trie->states[state].trans.list) {
1808 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1812 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1813 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1814 if ( forid < minid ) {
1816 } else if ( forid > maxid ) {
1820 if ( transcount < tp + maxid - minid + 1) {
1822 trie->trans = (reg_trie_trans *)
1823 PerlMemShared_realloc( trie->trans,
1825 * sizeof(reg_trie_trans) );
1826 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1828 base = trie->uniquecharcount + tp - minid;
1829 if ( maxid == minid ) {
1831 for ( ; zp < tp ; zp++ ) {
1832 if ( ! trie->trans[ zp ].next ) {
1833 base = trie->uniquecharcount + zp - minid;
1834 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1835 trie->trans[ zp ].check = state;
1841 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1842 trie->trans[ tp ].check = state;
1847 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1848 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1849 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1850 trie->trans[ tid ].check = state;
1852 tp += ( maxid - minid + 1 );
1854 Safefree(trie->states[ state ].trans.list);
1857 DEBUG_TRIE_COMPILE_MORE_r(
1858 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1861 trie->states[ state ].trans.base=base;
1863 trie->lasttrans = tp + 1;
1867 Second Pass -- Flat Table Representation.
1869 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1870 We know that we will need Charcount+1 trans at most to store the data
1871 (one row per char at worst case) So we preallocate both structures
1872 assuming worst case.
1874 We then construct the trie using only the .next slots of the entry
1877 We use the .check field of the first entry of the node temporarily to
1878 make compression both faster and easier by keeping track of how many non
1879 zero fields are in the node.
1881 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1884 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1885 number representing the first entry of the node, and state as a
1886 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1887 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1888 are 2 entrys per node. eg:
1896 The table is internally in the right hand, idx form. However as we also
1897 have to deal with the states array which is indexed by nodenum we have to
1898 use TRIE_NODENUM() to convert.
1901 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1902 "%*sCompiling trie using table compiler\n",
1903 (int)depth * 2 + 2, ""));
1905 trie->trans = (reg_trie_trans *)
1906 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1907 * trie->uniquecharcount + 1,
1908 sizeof(reg_trie_trans) );
1909 trie->states = (reg_trie_state *)
1910 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1911 sizeof(reg_trie_state) );
1912 next_alloc = trie->uniquecharcount + 1;
1915 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1917 regnode * const noper = NEXTOPER( cur );
1918 const U8 *uc = (U8*)STRING( noper );
1919 const U8 * const e = uc + STR_LEN( noper );
1921 U32 state = 1; /* required init */
1923 U16 charid = 0; /* sanity init */
1924 U32 accept_state = 0; /* sanity init */
1925 U8 *scan = (U8*)NULL; /* sanity init */
1927 STRLEN foldlen = 0; /* required init */
1928 U32 wordlen = 0; /* required init */
1929 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1931 if ( OP(noper) != NOTHING ) {
1932 for ( ; uc < e ; uc += len ) {
1937 charid = trie->charmap[ uvc ];
1939 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1940 charid = svpp ? (U16)SvIV(*svpp) : 0;
1944 if ( !trie->trans[ state + charid ].next ) {
1945 trie->trans[ state + charid ].next = next_alloc;
1946 trie->trans[ state ].check++;
1947 prev_states[TRIE_NODENUM(next_alloc)]
1948 = TRIE_NODENUM(state);
1949 next_alloc += trie->uniquecharcount;
1951 state = trie->trans[ state + charid ].next;
1953 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1955 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1958 accept_state = TRIE_NODENUM( state );
1959 TRIE_HANDLE_WORD(accept_state);
1961 } /* end second pass */
1963 /* and now dump it out before we compress it */
1964 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1966 next_alloc, depth+1));
1970 * Inplace compress the table.*
1972 For sparse data sets the table constructed by the trie algorithm will
1973 be mostly 0/FAIL transitions or to put it another way mostly empty.
1974 (Note that leaf nodes will not contain any transitions.)
1976 This algorithm compresses the tables by eliminating most such
1977 transitions, at the cost of a modest bit of extra work during lookup:
1979 - Each states[] entry contains a .base field which indicates the
1980 index in the state[] array wheres its transition data is stored.
1982 - If .base is 0 there are no valid transitions from that node.
1984 - If .base is nonzero then charid is added to it to find an entry in
1987 -If trans[states[state].base+charid].check!=state then the
1988 transition is taken to be a 0/Fail transition. Thus if there are fail
1989 transitions at the front of the node then the .base offset will point
1990 somewhere inside the previous nodes data (or maybe even into a node
1991 even earlier), but the .check field determines if the transition is
1995 The following process inplace converts the table to the compressed
1996 table: We first do not compress the root node 1,and mark all its
1997 .check pointers as 1 and set its .base pointer as 1 as well. This
1998 allows us to do a DFA construction from the compressed table later,
1999 and ensures that any .base pointers we calculate later are greater
2002 - We set 'pos' to indicate the first entry of the second node.
2004 - We then iterate over the columns of the node, finding the first and
2005 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2006 and set the .check pointers accordingly, and advance pos
2007 appropriately and repreat for the next node. Note that when we copy
2008 the next pointers we have to convert them from the original
2009 NODEIDX form to NODENUM form as the former is not valid post
2012 - If a node has no transitions used we mark its base as 0 and do not
2013 advance the pos pointer.
2015 - If a node only has one transition we use a second pointer into the
2016 structure to fill in allocated fail transitions from other states.
2017 This pointer is independent of the main pointer and scans forward
2018 looking for null transitions that are allocated to a state. When it
2019 finds one it writes the single transition into the "hole". If the
2020 pointer doesnt find one the single transition is appended as normal.
2022 - Once compressed we can Renew/realloc the structures to release the
2025 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2026 specifically Fig 3.47 and the associated pseudocode.
2030 const U32 laststate = TRIE_NODENUM( next_alloc );
2033 trie->statecount = laststate;
2035 for ( state = 1 ; state < laststate ; state++ ) {
2037 const U32 stateidx = TRIE_NODEIDX( state );
2038 const U32 o_used = trie->trans[ stateidx ].check;
2039 U32 used = trie->trans[ stateidx ].check;
2040 trie->trans[ stateidx ].check = 0;
2042 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2043 if ( flag || trie->trans[ stateidx + charid ].next ) {
2044 if ( trie->trans[ stateidx + charid ].next ) {
2046 for ( ; zp < pos ; zp++ ) {
2047 if ( ! trie->trans[ zp ].next ) {
2051 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2052 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2053 trie->trans[ zp ].check = state;
2054 if ( ++zp > pos ) pos = zp;
2061 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2063 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2064 trie->trans[ pos ].check = state;
2069 trie->lasttrans = pos + 1;
2070 trie->states = (reg_trie_state *)
2071 PerlMemShared_realloc( trie->states, laststate
2072 * sizeof(reg_trie_state) );
2073 DEBUG_TRIE_COMPILE_MORE_r(
2074 PerlIO_printf( Perl_debug_log,
2075 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2076 (int)depth * 2 + 2,"",
2077 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2080 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2083 } /* end table compress */
2085 DEBUG_TRIE_COMPILE_MORE_r(
2086 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2087 (int)depth * 2 + 2, "",
2088 (UV)trie->statecount,
2089 (UV)trie->lasttrans)
2091 /* resize the trans array to remove unused space */
2092 trie->trans = (reg_trie_trans *)
2093 PerlMemShared_realloc( trie->trans, trie->lasttrans
2094 * sizeof(reg_trie_trans) );
2096 { /* Modify the program and insert the new TRIE node */
2097 U8 nodetype =(U8)(flags & 0xFF);
2101 regnode *optimize = NULL;
2102 #ifdef RE_TRACK_PATTERN_OFFSETS
2105 U32 mjd_nodelen = 0;
2106 #endif /* RE_TRACK_PATTERN_OFFSETS */
2107 #endif /* DEBUGGING */
2109 This means we convert either the first branch or the first Exact,
2110 depending on whether the thing following (in 'last') is a branch
2111 or not and whther first is the startbranch (ie is it a sub part of
2112 the alternation or is it the whole thing.)
2113 Assuming its a sub part we convert the EXACT otherwise we convert
2114 the whole branch sequence, including the first.
2116 /* Find the node we are going to overwrite */
2117 if ( first != startbranch || OP( last ) == BRANCH ) {
2118 /* branch sub-chain */
2119 NEXT_OFF( first ) = (U16)(last - first);
2120 #ifdef RE_TRACK_PATTERN_OFFSETS
2122 mjd_offset= Node_Offset((convert));
2123 mjd_nodelen= Node_Length((convert));
2126 /* whole branch chain */
2128 #ifdef RE_TRACK_PATTERN_OFFSETS
2131 const regnode *nop = NEXTOPER( convert );
2132 mjd_offset= Node_Offset((nop));
2133 mjd_nodelen= Node_Length((nop));
2137 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2138 (int)depth * 2 + 2, "",
2139 (UV)mjd_offset, (UV)mjd_nodelen)
2142 /* But first we check to see if there is a common prefix we can
2143 split out as an EXACT and put in front of the TRIE node. */
2144 trie->startstate= 1;
2145 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2147 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2151 const U32 base = trie->states[ state ].trans.base;
2153 if ( trie->states[state].wordnum )
2156 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2157 if ( ( base + ofs >= trie->uniquecharcount ) &&
2158 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2159 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2161 if ( ++count > 1 ) {
2162 SV **tmp = av_fetch( revcharmap, ofs, 0);
2163 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2164 if ( state == 1 ) break;
2166 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2168 PerlIO_printf(Perl_debug_log,
2169 "%*sNew Start State=%"UVuf" Class: [",
2170 (int)depth * 2 + 2, "",
2173 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2174 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2176 TRIE_BITMAP_SET(trie,*ch);
2178 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2180 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2184 TRIE_BITMAP_SET(trie,*ch);
2186 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2187 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2193 SV **tmp = av_fetch( revcharmap, idx, 0);
2195 char *ch = SvPV( *tmp, len );
2197 SV *sv=sv_newmortal();
2198 PerlIO_printf( Perl_debug_log,
2199 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2200 (int)depth * 2 + 2, "",
2202 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2203 PL_colors[0], PL_colors[1],
2204 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2205 PERL_PV_ESCAPE_FIRSTCHAR
2210 OP( convert ) = nodetype;
2211 str=STRING(convert);
2214 STR_LEN(convert) += len;
2220 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2225 trie->prefixlen = (state-1);
2227 regnode *n = convert+NODE_SZ_STR(convert);
2228 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2229 trie->startstate = state;
2230 trie->minlen -= (state - 1);
2231 trie->maxlen -= (state - 1);
2233 /* At least the UNICOS C compiler choked on this
2234 * being argument to DEBUG_r(), so let's just have
2237 #ifdef PERL_EXT_RE_BUILD
2243 regnode *fix = convert;
2244 U32 word = trie->wordcount;
2246 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2247 while( ++fix < n ) {
2248 Set_Node_Offset_Length(fix, 0, 0);
2251 SV ** const tmp = av_fetch( trie_words, word, 0 );
2253 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2254 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2256 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2264 NEXT_OFF(convert) = (U16)(tail - convert);
2265 DEBUG_r(optimize= n);
2271 if ( trie->maxlen ) {
2272 NEXT_OFF( convert ) = (U16)(tail - convert);
2273 ARG_SET( convert, data_slot );
2274 /* Store the offset to the first unabsorbed branch in
2275 jump[0], which is otherwise unused by the jump logic.
2276 We use this when dumping a trie and during optimisation. */
2278 trie->jump[0] = (U16)(nextbranch - convert);
2280 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2281 * and there is a bitmap
2282 * and the first "jump target" node we found leaves enough room
2283 * then convert the TRIE node into a TRIEC node, with the bitmap
2284 * embedded inline in the opcode - this is hypothetically faster.
2286 if ( !trie->states[trie->startstate].wordnum
2288 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2290 OP( convert ) = TRIEC;
2291 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2292 PerlMemShared_free(trie->bitmap);
2295 OP( convert ) = TRIE;
2297 /* store the type in the flags */
2298 convert->flags = nodetype;
2302 + regarglen[ OP( convert ) ];
2304 /* XXX We really should free up the resource in trie now,
2305 as we won't use them - (which resources?) dmq */
2307 /* needed for dumping*/
2308 DEBUG_r(if (optimize) {
2309 regnode *opt = convert;
2311 while ( ++opt < optimize) {
2312 Set_Node_Offset_Length(opt,0,0);
2315 Try to clean up some of the debris left after the
2318 while( optimize < jumper ) {
2319 mjd_nodelen += Node_Length((optimize));
2320 OP( optimize ) = OPTIMIZED;
2321 Set_Node_Offset_Length(optimize,0,0);
2324 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2326 } /* end node insert */
2327 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2329 /* Finish populating the prev field of the wordinfo array. Walk back
2330 * from each accept state until we find another accept state, and if
2331 * so, point the first word's .prev field at the second word. If the
2332 * second already has a .prev field set, stop now. This will be the
2333 * case either if we've already processed that word's accept state,
2334 * or that state had multiple words, and the overspill words were
2335 * already linked up earlier.
2342 for (word=1; word <= trie->wordcount; word++) {
2344 if (trie->wordinfo[word].prev)
2346 state = trie->wordinfo[word].accept;
2348 state = prev_states[state];
2351 prev = trie->states[state].wordnum;
2355 trie->wordinfo[word].prev = prev;
2357 Safefree(prev_states);
2361 /* and now dump out the compressed format */
2362 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2364 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2366 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2367 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2369 SvREFCNT_dec(revcharmap);
2373 : trie->startstate>1
2379 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2381 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2383 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2384 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2387 We find the fail state for each state in the trie, this state is the longest proper
2388 suffix of the current state's 'word' that is also a proper prefix of another word in our
2389 trie. State 1 represents the word '' and is thus the default fail state. This allows
2390 the DFA not to have to restart after its tried and failed a word at a given point, it
2391 simply continues as though it had been matching the other word in the first place.
2393 'abcdgu'=~/abcdefg|cdgu/
2394 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2395 fail, which would bring us to the state representing 'd' in the second word where we would
2396 try 'g' and succeed, proceeding to match 'cdgu'.
2398 /* add a fail transition */
2399 const U32 trie_offset = ARG(source);
2400 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2402 const U32 ucharcount = trie->uniquecharcount;
2403 const U32 numstates = trie->statecount;
2404 const U32 ubound = trie->lasttrans + ucharcount;
2408 U32 base = trie->states[ 1 ].trans.base;
2411 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2412 GET_RE_DEBUG_FLAGS_DECL;
2414 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2416 PERL_UNUSED_ARG(depth);
2420 ARG_SET( stclass, data_slot );
2421 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2422 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2423 aho->trie=trie_offset;
2424 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2425 Copy( trie->states, aho->states, numstates, reg_trie_state );
2426 Newxz( q, numstates, U32);
2427 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2430 /* initialize fail[0..1] to be 1 so that we always have
2431 a valid final fail state */
2432 fail[ 0 ] = fail[ 1 ] = 1;
2434 for ( charid = 0; charid < ucharcount ; charid++ ) {
2435 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2437 q[ q_write ] = newstate;
2438 /* set to point at the root */
2439 fail[ q[ q_write++ ] ]=1;
2442 while ( q_read < q_write) {
2443 const U32 cur = q[ q_read++ % numstates ];
2444 base = trie->states[ cur ].trans.base;
2446 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2447 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2449 U32 fail_state = cur;
2452 fail_state = fail[ fail_state ];
2453 fail_base = aho->states[ fail_state ].trans.base;
2454 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2456 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2457 fail[ ch_state ] = fail_state;
2458 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2460 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2462 q[ q_write++ % numstates] = ch_state;
2466 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2467 when we fail in state 1, this allows us to use the
2468 charclass scan to find a valid start char. This is based on the principle
2469 that theres a good chance the string being searched contains lots of stuff
2470 that cant be a start char.
2472 fail[ 0 ] = fail[ 1 ] = 0;
2473 DEBUG_TRIE_COMPILE_r({
2474 PerlIO_printf(Perl_debug_log,
2475 "%*sStclass Failtable (%"UVuf" states): 0",
2476 (int)(depth * 2), "", (UV)numstates
2478 for( q_read=1; q_read<numstates; q_read++ ) {
2479 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2481 PerlIO_printf(Perl_debug_log, "\n");
2484 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2489 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2490 * These need to be revisited when a newer toolchain becomes available.
2492 #if defined(__sparc64__) && defined(__GNUC__)
2493 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2494 # undef SPARC64_GCC_WORKAROUND
2495 # define SPARC64_GCC_WORKAROUND 1
2499 #define DEBUG_PEEP(str,scan,depth) \
2500 DEBUG_OPTIMISE_r({if (scan){ \
2501 SV * const mysv=sv_newmortal(); \
2502 regnode *Next = regnext(scan); \
2503 regprop(RExC_rx, mysv, scan); \
2504 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2505 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2506 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2513 #define JOIN_EXACT(scan,min,flags) \
2514 if (PL_regkind[OP(scan)] == EXACT) \
2515 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2518 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2519 /* Merge several consecutive EXACTish nodes into one. */
2520 regnode *n = regnext(scan);
2522 regnode *next = scan + NODE_SZ_STR(scan);
2526 regnode *stop = scan;
2527 GET_RE_DEBUG_FLAGS_DECL;
2529 PERL_UNUSED_ARG(depth);
2532 PERL_ARGS_ASSERT_JOIN_EXACT;
2533 #ifndef EXPERIMENTAL_INPLACESCAN
2534 PERL_UNUSED_ARG(flags);
2535 PERL_UNUSED_ARG(val);
2537 DEBUG_PEEP("join",scan,depth);
2539 /* Skip NOTHING, merge EXACT*. */
2541 ( PL_regkind[OP(n)] == NOTHING ||
2542 (stringok && (OP(n) == OP(scan))))
2544 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2546 if (OP(n) == TAIL || n > next)
2548 if (PL_regkind[OP(n)] == NOTHING) {
2549 DEBUG_PEEP("skip:",n,depth);
2550 NEXT_OFF(scan) += NEXT_OFF(n);
2551 next = n + NODE_STEP_REGNODE;
2558 else if (stringok) {
2559 const unsigned int oldl = STR_LEN(scan);
2560 regnode * const nnext = regnext(n);
2562 DEBUG_PEEP("merg",n,depth);
2565 if (oldl + STR_LEN(n) > U8_MAX)
2567 NEXT_OFF(scan) += NEXT_OFF(n);
2568 STR_LEN(scan) += STR_LEN(n);
2569 next = n + NODE_SZ_STR(n);
2570 /* Now we can overwrite *n : */
2571 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2579 #ifdef EXPERIMENTAL_INPLACESCAN
2580 if (flags && !NEXT_OFF(n)) {
2581 DEBUG_PEEP("atch", val, depth);
2582 if (reg_off_by_arg[OP(n)]) {
2583 ARG_SET(n, val - n);
2586 NEXT_OFF(n) = val - n;
2592 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2593 #define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2594 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2595 #define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2598 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2599 && ( STR_LEN(scan) >= 6 ) )
2602 Two problematic code points in Unicode casefolding of EXACT nodes:
2604 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2605 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2611 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2612 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2614 This means that in case-insensitive matching (or "loose matching",
2615 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2616 length of the above casefolded versions) can match a target string
2617 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2618 This would rather mess up the minimum length computation.
2620 What we'll do is to look for the tail four bytes, and then peek
2621 at the preceding two bytes to see whether we need to decrease
2622 the minimum length by four (six minus two).
2624 Thanks to the design of UTF-8, there cannot be false matches:
2625 A sequence of valid UTF-8 bytes cannot be a subsequence of
2626 another valid sequence of UTF-8 bytes.
2629 char * const s0 = STRING(scan), *s, *t;
2630 char * const s1 = s0 + STR_LEN(scan) - 1;
2631 char * const s2 = s1 - 4;
2632 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2633 const char t0[] = "\xaf\x49\xaf\x42";
2635 const char t0[] = "\xcc\x88\xcc\x81";
2637 const char * const t1 = t0 + 3;
2640 s < s2 && (t = ninstr(s, s1, t0, t1));
2643 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2644 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2646 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2647 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2655 n = scan + NODE_SZ_STR(scan);
2657 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2664 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2668 /* REx optimizer. Converts nodes into quicker variants "in place".
2669 Finds fixed substrings. */
2671 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2672 to the position after last scanned or to NULL. */
2674 #define INIT_AND_WITHP \
2675 assert(!and_withp); \
2676 Newx(and_withp,1,struct regnode_charclass_class); \
2677 SAVEFREEPV(and_withp)
2679 /* this is a chain of data about sub patterns we are processing that
2680 need to be handled separately/specially in study_chunk. Its so
2681 we can simulate recursion without losing state. */
2683 typedef struct scan_frame {
2684 regnode *last; /* last node to process in this frame */
2685 regnode *next; /* next node to process when last is reached */
2686 struct scan_frame *prev; /*previous frame*/
2687 I32 stop; /* what stopparen do we use */
2691 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2693 #define CASE_SYNST_FNC(nAmE) \
2695 if (flags & SCF_DO_STCLASS_AND) { \
2696 for (value = 0; value < 256; value++) \
2697 if (!is_ ## nAmE ## _cp(value)) \
2698 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2701 for (value = 0; value < 256; value++) \
2702 if (is_ ## nAmE ## _cp(value)) \
2703 ANYOF_BITMAP_SET(data->start_class, value); \
2707 if (flags & SCF_DO_STCLASS_AND) { \
2708 for (value = 0; value < 256; value++) \
2709 if (is_ ## nAmE ## _cp(value)) \
2710 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2713 for (value = 0; value < 256; value++) \
2714 if (!is_ ## nAmE ## _cp(value)) \
2715 ANYOF_BITMAP_SET(data->start_class, value); \
2722 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2723 I32 *minlenp, I32 *deltap,
2728 struct regnode_charclass_class *and_withp,
2729 U32 flags, U32 depth)
2730 /* scanp: Start here (read-write). */
2731 /* deltap: Write maxlen-minlen here. */
2732 /* last: Stop before this one. */
2733 /* data: string data about the pattern */
2734 /* stopparen: treat close N as END */
2735 /* recursed: which subroutines have we recursed into */
2736 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2739 I32 min = 0, pars = 0, code;
2740 regnode *scan = *scanp, *next;
2742 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2743 int is_inf_internal = 0; /* The studied chunk is infinite */
2744 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2745 scan_data_t data_fake;
2746 SV *re_trie_maxbuff = NULL;
2747 regnode *first_non_open = scan;
2748 I32 stopmin = I32_MAX;
2749 scan_frame *frame = NULL;
2750 GET_RE_DEBUG_FLAGS_DECL;
2752 PERL_ARGS_ASSERT_STUDY_CHUNK;
2755 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2759 while (first_non_open && OP(first_non_open) == OPEN)
2760 first_non_open=regnext(first_non_open);
2765 while ( scan && OP(scan) != END && scan < last ){
2766 /* Peephole optimizer: */
2767 DEBUG_STUDYDATA("Peep:", data,depth);
2768 DEBUG_PEEP("Peep",scan,depth);
2769 JOIN_EXACT(scan,&min,0);
2771 /* Follow the next-chain of the current node and optimize
2772 away all the NOTHINGs from it. */
2773 if (OP(scan) != CURLYX) {
2774 const int max = (reg_off_by_arg[OP(scan)]
2776 /* I32 may be smaller than U16 on CRAYs! */
2777 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2778 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2782 /* Skip NOTHING and LONGJMP. */
2783 while ((n = regnext(n))
2784 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2785 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2786 && off + noff < max)
2788 if (reg_off_by_arg[OP(scan)])
2791 NEXT_OFF(scan) = off;
2796 /* The principal pseudo-switch. Cannot be a switch, since we
2797 look into several different things. */
2798 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2799 || OP(scan) == IFTHEN) {
2800 next = regnext(scan);
2802 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2804 if (OP(next) == code || code == IFTHEN) {
2805 /* NOTE - There is similar code to this block below for handling
2806 TRIE nodes on a re-study. If you change stuff here check there
2808 I32 max1 = 0, min1 = I32_MAX, num = 0;
2809 struct regnode_charclass_class accum;
2810 regnode * const startbranch=scan;
2812 if (flags & SCF_DO_SUBSTR)
2813 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2814 if (flags & SCF_DO_STCLASS)
2815 cl_init_zero(pRExC_state, &accum);
2817 while (OP(scan) == code) {
2818 I32 deltanext, minnext, f = 0, fake;
2819 struct regnode_charclass_class this_class;
2822 data_fake.flags = 0;
2824 data_fake.whilem_c = data->whilem_c;
2825 data_fake.last_closep = data->last_closep;
2828 data_fake.last_closep = &fake;
2830 data_fake.pos_delta = delta;
2831 next = regnext(scan);
2832 scan = NEXTOPER(scan);
2834 scan = NEXTOPER(scan);
2835 if (flags & SCF_DO_STCLASS) {
2836 cl_init(pRExC_state, &this_class);
2837 data_fake.start_class = &this_class;
2838 f = SCF_DO_STCLASS_AND;
2840 if (flags & SCF_WHILEM_VISITED_POS)
2841 f |= SCF_WHILEM_VISITED_POS;
2843 /* we suppose the run is continuous, last=next...*/
2844 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2846 stopparen, recursed, NULL, f,depth+1);
2849 if (max1 < minnext + deltanext)
2850 max1 = minnext + deltanext;
2851 if (deltanext == I32_MAX)
2852 is_inf = is_inf_internal = 1;
2854 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2856 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2857 if ( stopmin > minnext)
2858 stopmin = min + min1;
2859 flags &= ~SCF_DO_SUBSTR;
2861 data->flags |= SCF_SEEN_ACCEPT;
2864 if (data_fake.flags & SF_HAS_EVAL)
2865 data->flags |= SF_HAS_EVAL;
2866 data->whilem_c = data_fake.whilem_c;
2868 if (flags & SCF_DO_STCLASS)
2869 cl_or(pRExC_state, &accum, &this_class);
2871 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2873 if (flags & SCF_DO_SUBSTR) {
2874 data->pos_min += min1;
2875 data->pos_delta += max1 - min1;
2876 if (max1 != min1 || is_inf)
2877 data->longest = &(data->longest_float);
2880 delta += max1 - min1;
2881 if (flags & SCF_DO_STCLASS_OR) {
2882 cl_or(pRExC_state, data->start_class, &accum);
2884 cl_and(data->start_class, and_withp);
2885 flags &= ~SCF_DO_STCLASS;
2888 else if (flags & SCF_DO_STCLASS_AND) {
2890 cl_and(data->start_class, &accum);
2891 flags &= ~SCF_DO_STCLASS;
2894 /* Switch to OR mode: cache the old value of
2895 * data->start_class */
2897 StructCopy(data->start_class, and_withp,
2898 struct regnode_charclass_class);
2899 flags &= ~SCF_DO_STCLASS_AND;
2900 StructCopy(&accum, data->start_class,
2901 struct regnode_charclass_class);
2902 flags |= SCF_DO_STCLASS_OR;
2903 data->start_class->flags |= ANYOF_EOS;
2907 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2910 Assuming this was/is a branch we are dealing with: 'scan' now
2911 points at the item that follows the branch sequence, whatever
2912 it is. We now start at the beginning of the sequence and look
2919 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2921 If we can find such a subsequence we need to turn the first
2922 element into a trie and then add the subsequent branch exact
2923 strings to the trie.
2927 1. patterns where the whole set of branches can be converted.
2929 2. patterns where only a subset can be converted.
2931 In case 1 we can replace the whole set with a single regop
2932 for the trie. In case 2 we need to keep the start and end
2935 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2936 becomes BRANCH TRIE; BRANCH X;
2938 There is an additional case, that being where there is a
2939 common prefix, which gets split out into an EXACT like node
2940 preceding the TRIE node.
2942 If x(1..n)==tail then we can do a simple trie, if not we make
2943 a "jump" trie, such that when we match the appropriate word
2944 we "jump" to the appropriate tail node. Essentially we turn
2945 a nested if into a case structure of sorts.
2950 if (!re_trie_maxbuff) {
2951 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2952 if (!SvIOK(re_trie_maxbuff))
2953 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2955 if ( SvIV(re_trie_maxbuff)>=0 ) {
2957 regnode *first = (regnode *)NULL;
2958 regnode *last = (regnode *)NULL;
2959 regnode *tail = scan;
2964 SV * const mysv = sv_newmortal(); /* for dumping */
2966 /* var tail is used because there may be a TAIL
2967 regop in the way. Ie, the exacts will point to the
2968 thing following the TAIL, but the last branch will
2969 point at the TAIL. So we advance tail. If we
2970 have nested (?:) we may have to move through several
2974 while ( OP( tail ) == TAIL ) {
2975 /* this is the TAIL generated by (?:) */
2976 tail = regnext( tail );
2981 regprop(RExC_rx, mysv, tail );
2982 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2983 (int)depth * 2 + 2, "",
2984 "Looking for TRIE'able sequences. Tail node is: ",
2985 SvPV_nolen_const( mysv )
2991 step through the branches, cur represents each
2992 branch, noper is the first thing to be matched
2993 as part of that branch and noper_next is the
2994 regnext() of that node. if noper is an EXACT
2995 and noper_next is the same as scan (our current
2996 position in the regex) then the EXACT branch is
2997 a possible optimization target. Once we have
2998 two or more consecutive such branches we can
2999 create a trie of the EXACT's contents and stich
3000 it in place. If the sequence represents all of
3001 the branches we eliminate the whole thing and
3002 replace it with a single TRIE. If it is a
3003 subsequence then we need to stitch it in. This
3004 means the first branch has to remain, and needs
3005 to be repointed at the item on the branch chain
3006 following the last branch optimized. This could
3007 be either a BRANCH, in which case the
3008 subsequence is internal, or it could be the
3009 item following the branch sequence in which
3010 case the subsequence is at the end.
3014 /* dont use tail as the end marker for this traverse */
3015 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3016 regnode * const noper = NEXTOPER( cur );
3017 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3018 regnode * const noper_next = regnext( noper );
3022 regprop(RExC_rx, mysv, cur);
3023 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3024 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3026 regprop(RExC_rx, mysv, noper);
3027 PerlIO_printf( Perl_debug_log, " -> %s",
3028 SvPV_nolen_const(mysv));
3031 regprop(RExC_rx, mysv, noper_next );
3032 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3033 SvPV_nolen_const(mysv));
3035 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3036 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3038 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3039 : PL_regkind[ OP( noper ) ] == EXACT )
3040 || OP(noper) == NOTHING )
3042 && noper_next == tail
3047 if ( !first || optype == NOTHING ) {
3048 if (!first) first = cur;
3049 optype = OP( noper );
3055 Currently the trie logic handles case insensitive matching properly only
3056 when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
3059 If/when this is fixed the following define can be swapped
3060 in below to fully enable trie logic.
3062 #define TRIE_TYPE_IS_SAFE 1
3065 #define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
3067 if ( last && TRIE_TYPE_IS_SAFE ) {
3068 make_trie( pRExC_state,
3069 startbranch, first, cur, tail, count,
3072 if ( PL_regkind[ OP( noper ) ] == EXACT
3074 && noper_next == tail
3079 optype = OP( noper );
3089 regprop(RExC_rx, mysv, cur);
3090 PerlIO_printf( Perl_debug_log,
3091 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3092 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3096 if ( last && TRIE_TYPE_IS_SAFE ) {
3097 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3098 #ifdef TRIE_STUDY_OPT
3099 if ( ((made == MADE_EXACT_TRIE &&
3100 startbranch == first)
3101 || ( first_non_open == first )) &&
3103 flags |= SCF_TRIE_RESTUDY;
3104 if ( startbranch == first
3107 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3117 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3118 scan = NEXTOPER(NEXTOPER(scan));
3119 } else /* single branch is optimized. */
3120 scan = NEXTOPER(scan);
3122 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3123 scan_frame *newframe = NULL;
3128 if (OP(scan) != SUSPEND) {
3129 /* set the pointer */
3130 if (OP(scan) == GOSUB) {
3132 RExC_recurse[ARG2L(scan)] = scan;
3133 start = RExC_open_parens[paren-1];
3134 end = RExC_close_parens[paren-1];
3137 start = RExC_rxi->program + 1;
3141 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3142 SAVEFREEPV(recursed);
3144 if (!PAREN_TEST(recursed,paren+1)) {
3145 PAREN_SET(recursed,paren+1);
3146 Newx(newframe,1,scan_frame);
3148 if (flags & SCF_DO_SUBSTR) {
3149 SCAN_COMMIT(pRExC_state,data,minlenp);
3150 data->longest = &(data->longest_float);
3152 is_inf = is_inf_internal = 1;
3153 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3154 cl_anything(pRExC_state, data->start_class);
3155 flags &= ~SCF_DO_STCLASS;
3158 Newx(newframe,1,scan_frame);
3161 end = regnext(scan);
3166 SAVEFREEPV(newframe);
3167 newframe->next = regnext(scan);
3168 newframe->last = last;
3169 newframe->stop = stopparen;
3170 newframe->prev = frame;
3180 else if (OP(scan) == EXACT) {
3181 I32 l = STR_LEN(scan);
3184 const U8 * const s = (U8*)STRING(scan);
3185 l = utf8_length(s, s + l);
3186 uc = utf8_to_uvchr(s, NULL);
3188 uc = *((U8*)STRING(scan));
3191 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3192 /* The code below prefers earlier match for fixed
3193 offset, later match for variable offset. */
3194 if (data->last_end == -1) { /* Update the start info. */
3195 data->last_start_min = data->pos_min;
3196 data->last_start_max = is_inf
3197 ? I32_MAX : data->pos_min + data->pos_delta;
3199 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3201 SvUTF8_on(data->last_found);
3203 SV * const sv = data->last_found;
3204 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3205 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3206 if (mg && mg->mg_len >= 0)
3207 mg->mg_len += utf8_length((U8*)STRING(scan),
3208 (U8*)STRING(scan)+STR_LEN(scan));
3210 data->last_end = data->pos_min + l;
3211 data->pos_min += l; /* As in the first entry. */
3212 data->flags &= ~SF_BEFORE_EOL;
3214 if (flags & SCF_DO_STCLASS_AND) {
3215 /* Check whether it is compatible with what we know already! */
3219 /* If compatible, we or it in below. It is compatible if is
3220 * in the bitmp and either 1) its bit or its fold is set, or 2)
3221 * it's for a locale. Even if there isn't unicode semantics
3222 * here, at runtime there may be because of matching against a
3223 * utf8 string, so accept a possible false positive for
3224 * latin1-range folds */
3226 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3227 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3228 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3229 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3234 ANYOF_CLASS_ZERO(data->start_class);
3235 ANYOF_BITMAP_ZERO(data->start_class);
3237 ANYOF_BITMAP_SET(data->start_class, uc);
3238 else if (uc >= 0x100) {
3241 /* Some Unicode code points fold to the Latin1 range; as
3242 * XXX temporary code, instead of figuring out if this is
3243 * one, just assume it is and set all the start class bits
3244 * that could be some such above 255 code point's fold
3245 * which will generate fals positives. As the code
3246 * elsewhere that does compute the fold settles down, it
3247 * can be extracted out and re-used here */
3248 for (i = 0; i < 256; i++){
3249 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3250 ANYOF_BITMAP_SET(data->start_class, i);
3254 data->start_class->flags &= ~ANYOF_EOS;
3256 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3258 else if (flags & SCF_DO_STCLASS_OR) {
3259 /* false positive possible if the class is case-folded */
3261 ANYOF_BITMAP_SET(data->start_class, uc);
3263 data->start_class->flags |= ANYOF_UNICODE_ALL;
3264 data->start_class->flags &= ~ANYOF_EOS;
3265 cl_and(data->start_class, and_withp);
3267 flags &= ~SCF_DO_STCLASS;
3269 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3270 I32 l = STR_LEN(scan);
3271 UV uc = *((U8*)STRING(scan));
3273 /* Search for fixed substrings supports EXACT only. */
3274 if (flags & SCF_DO_SUBSTR) {
3276 SCAN_COMMIT(pRExC_state, data, minlenp);
3279 const U8 * const s = (U8 *)STRING(scan);
3280 l = utf8_length(s, s + l);
3281 uc = utf8_to_uvchr(s, NULL);
3284 if (flags & SCF_DO_SUBSTR)
3286 if (flags & SCF_DO_STCLASS_AND) {
3287 /* Check whether it is compatible with what we know already! */
3290 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3291 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3292 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3296 ANYOF_CLASS_ZERO(data->start_class);
3297 ANYOF_BITMAP_ZERO(data->start_class);
3299 ANYOF_BITMAP_SET(data->start_class, uc);
3300 data->start_class->flags &= ~ANYOF_EOS;
3301 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3302 if (OP(scan) == EXACTFL) {
3303 /* XXX This set is probably no longer necessary, and
3304 * probably wrong as LOCALE now is on in the initial
3306 data->start_class->flags |= ANYOF_LOCALE;
3310 /* Also set the other member of the fold pair. In case
3311 * that unicode semantics is called for at runtime, use
3312 * the full latin1 fold. (Can't do this for locale,
3313 * because not known until runtime */
3314 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3317 else if (uc >= 0x100) {
3319 for (i = 0; i < 256; i++){
3320 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3321 ANYOF_BITMAP_SET(data->start_class, i);
3326 else if (flags & SCF_DO_STCLASS_OR) {
3327 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3328 /* false positive possible if the class is case-folded.
3329 Assume that the locale settings are the same... */
3331 ANYOF_BITMAP_SET(data->start_class, uc);
3332 if (OP(scan) != EXACTFL) {
3334 /* And set the other member of the fold pair, but
3335 * can't do that in locale because not known until
3337 ANYOF_BITMAP_SET(data->start_class,
3338 PL_fold_latin1[uc]);
3341 data->start_class->flags &= ~ANYOF_EOS;
3343 cl_and(data->start_class, and_withp);
3345 flags &= ~SCF_DO_STCLASS;
3347 else if (REGNODE_VARIES(OP(scan))) {
3348 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3349 I32 f = flags, pos_before = 0;
3350 regnode * const oscan = scan;
3351 struct regnode_charclass_class this_class;
3352 struct regnode_charclass_class *oclass = NULL;
3353 I32 next_is_eval = 0;
3355 switch (PL_regkind[OP(scan)]) {
3356 case WHILEM: /* End of (?:...)* . */
3357 scan = NEXTOPER(scan);
3360 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3361 next = NEXTOPER(scan);
3362 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3364 maxcount = REG_INFTY;
3365 next = regnext(scan);
3366 scan = NEXTOPER(scan);
3370 if (flags & SCF_DO_SUBSTR)
3375 if (flags & SCF_DO_STCLASS) {
3377 maxcount = REG_INFTY;
3378 next = regnext(scan);
3379 scan = NEXTOPER(scan);
3382 is_inf = is_inf_internal = 1;
3383 scan = regnext(scan);
3384 if (flags & SCF_DO_SUBSTR) {
3385 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3386 data->longest = &(data->longest_float);
3388 goto optimize_curly_tail;
3390 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3391 && (scan->flags == stopparen))
3396 mincount = ARG1(scan);
3397 maxcount = ARG2(scan);
3399 next = regnext(scan);
3400 if (OP(scan) == CURLYX) {
3401 I32 lp = (data ? *(data->last_closep) : 0);
3402 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3404 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3405 next_is_eval = (OP(scan) == EVAL);
3407 if (flags & SCF_DO_SUBSTR) {
3408 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3409 pos_before = data->pos_min;
3413 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3415 data->flags |= SF_IS_INF;
3417 if (flags & SCF_DO_STCLASS) {
3418 cl_init(pRExC_state, &this_class);
3419 oclass = data->start_class;
3420 data->start_class = &this_class;
3421 f |= SCF_DO_STCLASS_AND;
3422 f &= ~SCF_DO_STCLASS_OR;
3424 /* Exclude from super-linear cache processing any {n,m}
3425 regops for which the combination of input pos and regex
3426 pos is not enough information to determine if a match
3429 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3430 regex pos at the \s*, the prospects for a match depend not
3431 only on the input position but also on how many (bar\s*)
3432 repeats into the {4,8} we are. */
3433 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3434 f &= ~SCF_WHILEM_VISITED_POS;
3436 /* This will finish on WHILEM, setting scan, or on NULL: */
3437 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3438 last, data, stopparen, recursed, NULL,
3440 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3442 if (flags & SCF_DO_STCLASS)
3443 data->start_class = oclass;
3444 if (mincount == 0 || minnext == 0) {
3445 if (flags & SCF_DO_STCLASS_OR) {
3446 cl_or(pRExC_state, data->start_class, &this_class);
3448 else if (flags & SCF_DO_STCLASS_AND) {
3449 /* Switch to OR mode: cache the old value of
3450 * data->start_class */
3452 StructCopy(data->start_class, and_withp,
3453 struct regnode_charclass_class);
3454 flags &= ~SCF_DO_STCLASS_AND;
3455 StructCopy(&this_class, data->start_class,
3456 struct regnode_charclass_class);
3457 flags |= SCF_DO_STCLASS_OR;
3458 data->start_class->flags |= ANYOF_EOS;
3460 } else { /* Non-zero len */
3461 if (flags & SCF_DO_STCLASS_OR) {
3462 cl_or(pRExC_state, data->start_class, &this_class);
3463 cl_and(data->start_class, and_withp);
3465 else if (flags & SCF_DO_STCLASS_AND)
3466 cl_and(data->start_class, &this_class);
3467 flags &= ~SCF_DO_STCLASS;
3469 if (!scan) /* It was not CURLYX, but CURLY. */
3471 if ( /* ? quantifier ok, except for (?{ ... }) */
3472 (next_is_eval || !(mincount == 0 && maxcount == 1))
3473 && (minnext == 0) && (deltanext == 0)
3474 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3475 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3477 ckWARNreg(RExC_parse,
3478 "Quantifier unexpected on zero-length expression");
3481 min += minnext * mincount;
3482 is_inf_internal |= ((maxcount == REG_INFTY
3483 && (minnext + deltanext) > 0)
3484 || deltanext == I32_MAX);
3485 is_inf |= is_inf_internal;
3486 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3488 /* Try powerful optimization CURLYX => CURLYN. */
3489 if ( OP(oscan) == CURLYX && data
3490 && data->flags & SF_IN_PAR
3491 && !(data->flags & SF_HAS_EVAL)
3492 && !deltanext && minnext == 1 ) {
3493 /* Try to optimize to CURLYN. */
3494 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3495 regnode * const nxt1 = nxt;
3502 if (!REGNODE_SIMPLE(OP(nxt))
3503 && !(PL_regkind[OP(nxt)] == EXACT
3504 && STR_LEN(nxt) == 1))
3510 if (OP(nxt) != CLOSE)
3512 if (RExC_open_parens) {
3513 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3514 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3516 /* Now we know that nxt2 is the only contents: */
3517 oscan->flags = (U8)ARG(nxt);
3519 OP(nxt1) = NOTHING; /* was OPEN. */
3522 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3523 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3524 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3525 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3526 OP(nxt + 1) = OPTIMIZED; /* was count. */
3527 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3532 /* Try optimization CURLYX => CURLYM. */
3533 if ( OP(oscan) == CURLYX && data
3534 && !(data->flags & SF_HAS_PAR)
3535 && !(data->flags & SF_HAS_EVAL)
3536 && !deltanext /* atom is fixed width */
3537 && minnext != 0 /* CURLYM can't handle zero width */
3539 /* XXXX How to optimize if data == 0? */
3540 /* Optimize to a simpler form. */
3541 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3545 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3546 && (OP(nxt2) != WHILEM))
3548 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3549 /* Need to optimize away parenths. */
3550 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3551 /* Set the parenth number. */
3552 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3554 oscan->flags = (U8)ARG(nxt);
3555 if (RExC_open_parens) {
3556 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3557 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3559 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3560 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3563 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3564 OP(nxt + 1) = OPTIMIZED; /* was count. */
3565 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3566 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3569 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3570 regnode *nnxt = regnext(nxt1);
3572 if (reg_off_by_arg[OP(nxt1)])
3573 ARG_SET(nxt1, nxt2 - nxt1);
3574 else if (nxt2 - nxt1 < U16_MAX)
3575 NEXT_OFF(nxt1) = nxt2 - nxt1;
3577 OP(nxt) = NOTHING; /* Cannot beautify */
3582 /* Optimize again: */
3583 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3584 NULL, stopparen, recursed, NULL, 0,depth+1);
3589 else if ((OP(oscan) == CURLYX)
3590 && (flags & SCF_WHILEM_VISITED_POS)
3591 /* See the comment on a similar expression above.
3592 However, this time it's not a subexpression
3593 we care about, but the expression itself. */
3594 && (maxcount == REG_INFTY)
3595 && data && ++data->whilem_c < 16) {
3596 /* This stays as CURLYX, we can put the count/of pair. */
3597 /* Find WHILEM (as in regexec.c) */
3598 regnode *nxt = oscan + NEXT_OFF(oscan);
3600 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3602 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3603 | (RExC_whilem_seen << 4)); /* On WHILEM */
3605 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3607 if (flags & SCF_DO_SUBSTR) {
3608 SV *last_str = NULL;
3609 int counted = mincount != 0;
3611 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3612 #if defined(SPARC64_GCC_WORKAROUND)
3615 const char *s = NULL;
3618 if (pos_before >= data->last_start_min)
3621 b = data->last_start_min;
3624 s = SvPV_const(data->last_found, l);
3625 old = b - data->last_start_min;
3628 I32 b = pos_before >= data->last_start_min
3629 ? pos_before : data->last_start_min;
3631 const char * const s = SvPV_const(data->last_found, l);
3632 I32 old = b - data->last_start_min;
3636 old = utf8_hop((U8*)s, old) - (U8*)s;
3638 /* Get the added string: */
3639 last_str = newSVpvn_utf8(s + old, l, UTF);
3640 if (deltanext == 0 && pos_before == b) {
3641 /* What was added is a constant string */
3643 SvGROW(last_str, (mincount * l) + 1);
3644 repeatcpy(SvPVX(last_str) + l,
3645 SvPVX_const(last_str), l, mincount - 1);
3646 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3647 /* Add additional parts. */
3648 SvCUR_set(data->last_found,
3649 SvCUR(data->last_found) - l);
3650 sv_catsv(data->last_found, last_str);
3652 SV * sv = data->last_found;
3654 SvUTF8(sv) && SvMAGICAL(sv) ?
3655 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3656 if (mg && mg->mg_len >= 0)
3657 mg->mg_len += CHR_SVLEN(last_str) - l;
3659 data->last_end += l * (mincount - 1);
3662 /* start offset must point into the last copy */
3663 data->last_start_min += minnext * (mincount - 1);
3664 data->last_start_max += is_inf ? I32_MAX
3665 : (maxcount - 1) * (minnext + data->pos_delta);
3668 /* It is counted once already... */
3669 data->pos_min += minnext * (mincount - counted);
3670 data->pos_delta += - counted * deltanext +
3671 (minnext + deltanext) * maxcount - minnext * mincount;
3672 if (mincount != maxcount) {
3673 /* Cannot extend fixed substrings found inside
3675 SCAN_COMMIT(pRExC_state,data,minlenp);
3676 if (mincount && last_str) {
3677 SV * const sv = data->last_found;
3678 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3679 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3683 sv_setsv(sv, last_str);
3684 data->last_end = data->pos_min;
3685 data->last_start_min =
3686 data->pos_min - CHR_SVLEN(last_str);
3687 data->last_start_max = is_inf
3689 : data->pos_min + data->pos_delta
3690 - CHR_SVLEN(last_str);
3692 data->longest = &(data->longest_float);
3694 SvREFCNT_dec(last_str);
3696 if (data && (fl & SF_HAS_EVAL))
3697 data->flags |= SF_HAS_EVAL;
3698 optimize_curly_tail:
3699 if (OP(oscan) != CURLYX) {
3700 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3702 NEXT_OFF(oscan) += NEXT_OFF(next);
3705 default: /* REF, ANYOFV, and CLUMP only? */
3706 if (flags & SCF_DO_SUBSTR) {
3707 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3708 data->longest = &(data->longest_float);
3710 is_inf = is_inf_internal = 1;
3711 if (flags & SCF_DO_STCLASS_OR)
3712 cl_anything(pRExC_state, data->start_class);
3713 flags &= ~SCF_DO_STCLASS;
3717 else if (OP(scan) == LNBREAK) {
3718 if (flags & SCF_DO_STCLASS) {
3720 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3721 if (flags & SCF_DO_STCLASS_AND) {
3722 for (value = 0; value < 256; value++)
3723 if (!is_VERTWS_cp(value))
3724 ANYOF_BITMAP_CLEAR(data->start_class, value);
3727 for (value = 0; value < 256; value++)
3728 if (is_VERTWS_cp(value))
3729 ANYOF_BITMAP_SET(data->start_class, value);
3731 if (flags & SCF_DO_STCLASS_OR)
3732 cl_and(data->start_class, and_withp);
3733 flags &= ~SCF_DO_STCLASS;
3737 if (flags & SCF_DO_SUBSTR) {
3738 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3740 data->pos_delta += 1;
3741 data->longest = &(data->longest_float);
3744 else if (OP(scan) == FOLDCHAR) {
3745 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3746 flags &= ~SCF_DO_STCLASS;
3749 if (flags & SCF_DO_SUBSTR) {
3750 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3752 data->pos_delta += d;
3753 data->longest = &(data->longest_float);
3756 else if (REGNODE_SIMPLE(OP(scan))) {
3759 if (flags & SCF_DO_SUBSTR) {
3760 SCAN_COMMIT(pRExC_state,data,minlenp);
3764 if (flags & SCF_DO_STCLASS) {
3765 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3767 /* Some of the logic below assumes that switching
3768 locale on will only add false positives. */
3769 switch (PL_regkind[OP(scan)]) {
3773 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3774 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3775 cl_anything(pRExC_state, data->start_class);
3778 if (OP(scan) == SANY)
3780 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3781 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3782 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3783 cl_anything(pRExC_state, data->start_class);
3785 if (flags & SCF_DO_STCLASS_AND || !value)
3786 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3789 if (flags & SCF_DO_STCLASS_AND)
3790 cl_and(data->start_class,
3791 (struct regnode_charclass_class*)scan);
3793 cl_or(pRExC_state, data->start_class,
3794 (struct regnode_charclass_class*)scan);
3797 if (flags & SCF_DO_STCLASS_AND) {
3798 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3799 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3800 if (OP(scan) == ALNUMU) {
3801 for (value = 0; value < 256; value++) {
3802 if (!isWORDCHAR_L1(value)) {
3803 ANYOF_BITMAP_CLEAR(data->start_class, value);
3807 for (value = 0; value < 256; value++) {
3808 if (!isALNUM(value)) {
3809 ANYOF_BITMAP_CLEAR(data->start_class, value);
3816 if (data->start_class->flags & ANYOF_LOCALE)
3817 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3819 /* Even if under locale, set the bits for non-locale
3820 * in case it isn't a true locale-node. This will
3821 * create false positives if it truly is locale */
3822 if (OP(scan) == ALNUMU) {
3823 for (value = 0; value < 256; value++) {
3824 if (isWORDCHAR_L1(value)) {
3825 ANYOF_BITMAP_SET(data->start_class, value);
3829 for (value = 0; value < 256; value++) {
3830 if (isALNUM(value)) {
3831 ANYOF_BITMAP_SET(data->start_class, value);
3838 if (flags & SCF_DO_STCLASS_AND) {
3839 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3840 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3841 if (OP(scan) == NALNUMU) {
3842 for (value = 0; value < 256; value++) {
3843 if (isWORDCHAR_L1(value)) {
3844 ANYOF_BITMAP_CLEAR(data->start_class, value);
3848 for (value = 0; value < 256; value++) {
3849 if (isALNUM(value)) {
3850 ANYOF_BITMAP_CLEAR(data->start_class, value);
3857 if (data->start_class->flags & ANYOF_LOCALE)
3858 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3860 /* Even if under locale, set the bits for non-locale in
3861 * case it isn't a true locale-node. This will create
3862 * false positives if it truly is locale */
3863 if (OP(scan) == NALNUMU) {
3864 for (value = 0; value < 256; value++) {
3865 if (! isWORDCHAR_L1(value)) {
3866 ANYOF_BITMAP_SET(data->start_class, value);
3870 for (value = 0; value < 256; value++) {
3871 if (! isALNUM(value)) {
3872 ANYOF_BITMAP_SET(data->start_class, value);
3879 if (flags & SCF_DO_STCLASS_AND) {
3880 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3881 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3882 if (OP(scan) == SPACEU) {
3883 for (value = 0; value < 256; value++) {
3884 if (!isSPACE_L1(value)) {
3885 ANYOF_BITMAP_CLEAR(data->start_class, value);
3889 for (value = 0; value < 256; value++) {
3890 if (!isSPACE(value)) {
3891 ANYOF_BITMAP_CLEAR(data->start_class, value);
3898 if (data->start_class->flags & ANYOF_LOCALE) {
3899 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3901 if (OP(scan) == SPACEU) {
3902 for (value = 0; value < 256; value++) {
3903 if (isSPACE_L1(value)) {
3904 ANYOF_BITMAP_SET(data->start_class, value);
3908 for (value = 0; value < 256; value++) {
3909 if (isSPACE(value)) {
3910 ANYOF_BITMAP_SET(data->start_class, value);
3917 if (flags & SCF_DO_STCLASS_AND) {
3918 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3919 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3920 if (OP(scan) == NSPACEU) {
3921 for (value = 0; value < 256; value++) {
3922 if (isSPACE_L1(value)) {
3923 ANYOF_BITMAP_CLEAR(data->start_class, value);
3927 for (value = 0; value < 256; value++) {
3928 if (isSPACE(value)) {
3929 ANYOF_BITMAP_CLEAR(data->start_class, value);
3936 if (data->start_class->flags & ANYOF_LOCALE)
3937 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3938 if (OP(scan) == NSPACEU) {
3939 for (value = 0; value < 256; value++) {
3940 if (!isSPACE_L1(value)) {
3941 ANYOF_BITMAP_SET(data->start_class, value);
3946 for (value = 0; value < 256; value++) {
3947 if (!isSPACE(value)) {
3948 ANYOF_BITMAP_SET(data->start_class, value);
3955 if (flags & SCF_DO_STCLASS_AND) {
3956 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3957 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3958 for (value = 0; value < 256; value++)
3959 if (!isDIGIT(value))
3960 ANYOF_BITMAP_CLEAR(data->start_class, value);
3964 if (data->start_class->flags & ANYOF_LOCALE)
3965 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3966 for (value = 0; value < 256; value++)
3968 ANYOF_BITMAP_SET(data->start_class, value);
3972 if (flags & SCF_DO_STCLASS_AND) {
3973 if (!(data->start_class->flags & ANYOF_LOCALE))
3974 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3975 for (value = 0; value < 256; value++)
3977 ANYOF_BITMAP_CLEAR(data->start_class, value);
3980 if (data->start_class->flags & ANYOF_LOCALE)
3981 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3982 for (value = 0; value < 256; value++)
3983 if (!isDIGIT(value))
3984 ANYOF_BITMAP_SET(data->start_class, value);
3987 CASE_SYNST_FNC(VERTWS);
3988 CASE_SYNST_FNC(HORIZWS);
3991 if (flags & SCF_DO_STCLASS_OR)
3992 cl_and(data->start_class, and_withp);
3993 flags &= ~SCF_DO_STCLASS;
3996 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3997 data->flags |= (OP(scan) == MEOL
4001 else if ( PL_regkind[OP(scan)] == BRANCHJ
4002 /* Lookbehind, or need to calculate parens/evals/stclass: */
4003 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4004 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4005 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4006 || OP(scan) == UNLESSM )
4008 /* Negative Lookahead/lookbehind
4009 In this case we can't do fixed string optimisation.
4012 I32 deltanext, minnext, fake = 0;
4014 struct regnode_charclass_class intrnl;
4017 data_fake.flags = 0;
4019 data_fake.whilem_c = data->whilem_c;
4020 data_fake.last_closep = data->last_closep;
4023 data_fake.last_closep = &fake;
4024 data_fake.pos_delta = delta;
4025 if ( flags & SCF_DO_STCLASS && !scan->flags
4026 && OP(scan) == IFMATCH ) { /* Lookahead */
4027 cl_init(pRExC_state, &intrnl);
4028 data_fake.start_class = &intrnl;
4029 f |= SCF_DO_STCLASS_AND;
4031 if (flags & SCF_WHILEM_VISITED_POS)
4032 f |= SCF_WHILEM_VISITED_POS;
4033 next = regnext(scan);
4034 nscan = NEXTOPER(NEXTOPER(scan));
4035 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4036 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4039 FAIL("Variable length lookbehind not implemented");
4041 else if (minnext > (I32)U8_MAX) {
4042 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4044 scan->flags = (U8)minnext;
4047 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4049 if (data_fake.flags & SF_HAS_EVAL)
4050 data->flags |= SF_HAS_EVAL;
4051 data->whilem_c = data_fake.whilem_c;
4053 if (f & SCF_DO_STCLASS_AND) {
4054 if (flags & SCF_DO_STCLASS_OR) {
4055 /* OR before, AND after: ideally we would recurse with
4056 * data_fake to get the AND applied by study of the
4057 * remainder of the pattern, and then derecurse;
4058 * *** HACK *** for now just treat as "no information".
4059 * See [perl #56690].
4061 cl_init(pRExC_state, data->start_class);
4063 /* AND before and after: combine and continue */
4064 const int was = (data->start_class->flags & ANYOF_EOS);
4066 cl_and(data->start_class, &intrnl);
4068 data->start_class->flags |= ANYOF_EOS;
4072 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4074 /* Positive Lookahead/lookbehind
4075 In this case we can do fixed string optimisation,
4076 but we must be careful about it. Note in the case of
4077 lookbehind the positions will be offset by the minimum
4078 length of the pattern, something we won't know about
4079 until after the recurse.
4081 I32 deltanext, fake = 0;
4083 struct regnode_charclass_class intrnl;
4085 /* We use SAVEFREEPV so that when the full compile
4086 is finished perl will clean up the allocated
4087 minlens when it's all done. This way we don't
4088 have to worry about freeing them when we know
4089 they wont be used, which would be a pain.
4092 Newx( minnextp, 1, I32 );
4093 SAVEFREEPV(minnextp);
4096 StructCopy(data, &data_fake, scan_data_t);
4097 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4100 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4101 data_fake.last_found=newSVsv(data->last_found);
4105 data_fake.last_closep = &fake;
4106 data_fake.flags = 0;
4107 data_fake.pos_delta = delta;
4109 data_fake.flags |= SF_IS_INF;
4110 if ( flags & SCF_DO_STCLASS && !scan->flags
4111 && OP(scan) == IFMATCH ) { /* Lookahead */
4112 cl_init(pRExC_state, &intrnl);
4113 data_fake.start_class = &intrnl;
4114 f |= SCF_DO_STCLASS_AND;
4116 if (flags & SCF_WHILEM_VISITED_POS)
4117 f |= SCF_WHILEM_VISITED_POS;
4118 next = regnext(scan);
4119 nscan = NEXTOPER(NEXTOPER(scan));
4121 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4122 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4125 FAIL("Variable length lookbehind not implemented");
4127 else if (*minnextp > (I32)U8_MAX) {
4128 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4130 scan->flags = (U8)*minnextp;
4135 if (f & SCF_DO_STCLASS_AND) {
4136 const int was = (data->start_class->flags & ANYOF_EOS);
4138 cl_and(data->start_class, &intrnl);
4140 data->start_class->flags |= ANYOF_EOS;
4143 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4145 if (data_fake.flags & SF_HAS_EVAL)
4146 data->flags |= SF_HAS_EVAL;
4147 data->whilem_c = data_fake.whilem_c;
4148 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4149 if (RExC_rx->minlen<*minnextp)
4150 RExC_rx->minlen=*minnextp;
4151 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4152 SvREFCNT_dec(data_fake.last_found);
4154 if ( data_fake.minlen_fixed != minlenp )
4156 data->offset_fixed= data_fake.offset_fixed;
4157 data->minlen_fixed= data_fake.minlen_fixed;
4158 data->lookbehind_fixed+= scan->flags;
4160 if ( data_fake.minlen_float != minlenp )
4162 data->minlen_float= data_fake.minlen_float;
4163 data->offset_float_min=data_fake.offset_float_min;
4164 data->offset_float_max=data_fake.offset_float_max;
4165 data->lookbehind_float+= scan->flags;
4174 else if (OP(scan) == OPEN) {
4175 if (stopparen != (I32)ARG(scan))
4178 else if (OP(scan) == CLOSE) {
4179 if (stopparen == (I32)ARG(scan)) {
4182 if ((I32)ARG(scan) == is_par) {
4183 next = regnext(scan);
4185 if ( next && (OP(next) != WHILEM) && next < last)
4186 is_par = 0; /* Disable optimization */
4189 *(data->last_closep) = ARG(scan);
4191 else if (OP(scan) == EVAL) {
4193 data->flags |= SF_HAS_EVAL;
4195 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4196 if (flags & SCF_DO_SUBSTR) {
4197 SCAN_COMMIT(pRExC_state,data,minlenp);
4198 flags &= ~SCF_DO_SUBSTR;
4200 if (data && OP(scan)==ACCEPT) {
4201 data->flags |= SCF_SEEN_ACCEPT;
4206 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4208 if (flags & SCF_DO_SUBSTR) {
4209 SCAN_COMMIT(pRExC_state,data,minlenp);
4210 data->longest = &(data->longest_float);
4212 is_inf = is_inf_internal = 1;
4213 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4214 cl_anything(pRExC_state, data->start_class);
4215 flags &= ~SCF_DO_STCLASS;
4217 else if (OP(scan) == GPOS) {
4218 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4219 !(delta || is_inf || (data && data->pos_delta)))
4221 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4222 RExC_rx->extflags |= RXf_ANCH_GPOS;
4223 if (RExC_rx->gofs < (U32)min)
4224 RExC_rx->gofs = min;
4226 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4230 #ifdef TRIE_STUDY_OPT
4231 #ifdef FULL_TRIE_STUDY
4232 else if (PL_regkind[OP(scan)] == TRIE) {
4233 /* NOTE - There is similar code to this block above for handling
4234 BRANCH nodes on the initial study. If you change stuff here
4236 regnode *trie_node= scan;
4237 regnode *tail= regnext(scan);
4238 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4239 I32 max1 = 0, min1 = I32_MAX;
4240 struct regnode_charclass_class accum;
4242 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4243 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4244 if (flags & SCF_DO_STCLASS)
4245 cl_init_zero(pRExC_state, &accum);
4251 const regnode *nextbranch= NULL;
4254 for ( word=1 ; word <= trie->wordcount ; word++)
4256 I32 deltanext=0, minnext=0, f = 0, fake;
4257 struct regnode_charclass_class this_class;
4259 data_fake.flags = 0;
4261 data_fake.whilem_c = data->whilem_c;
4262 data_fake.last_closep = data->last_closep;
4265 data_fake.last_closep = &fake;
4266 data_fake.pos_delta = delta;
4267 if (flags & SCF_DO_STCLASS) {
4268 cl_init(pRExC_state, &this_class);
4269 data_fake.start_class = &this_class;
4270 f = SCF_DO_STCLASS_AND;
4272 if (flags & SCF_WHILEM_VISITED_POS)
4273 f |= SCF_WHILEM_VISITED_POS;
4275 if (trie->jump[word]) {
4277 nextbranch = trie_node + trie->jump[0];
4278 scan= trie_node + trie->jump[word];
4279 /* We go from the jump point to the branch that follows
4280 it. Note this means we need the vestigal unused branches
4281 even though they arent otherwise used.
4283 minnext = study_chunk(pRExC_state, &scan, minlenp,
4284 &deltanext, (regnode *)nextbranch, &data_fake,
4285 stopparen, recursed, NULL, f,depth+1);
4287 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4288 nextbranch= regnext((regnode*)nextbranch);
4290 if (min1 > (I32)(minnext + trie->minlen))
4291 min1 = minnext + trie->minlen;
4292 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4293 max1 = minnext + deltanext + trie->maxlen;
4294 if (deltanext == I32_MAX)
4295 is_inf = is_inf_internal = 1;
4297 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4299 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4300 if ( stopmin > min + min1)
4301 stopmin = min + min1;
4302 flags &= ~SCF_DO_SUBSTR;
4304 data->flags |= SCF_SEEN_ACCEPT;
4307 if (data_fake.flags & SF_HAS_EVAL)
4308 data->flags |= SF_HAS_EVAL;
4309 data->whilem_c = data_fake.whilem_c;
4311 if (flags & SCF_DO_STCLASS)
4312 cl_or(pRExC_state, &accum, &this_class);
4315 if (flags & SCF_DO_SUBSTR) {
4316 data->pos_min += min1;
4317 data->pos_delta += max1 - min1;
4318 if (max1 != min1 || is_inf)
4319 data->longest = &(data->longest_float);
4322 delta += max1 - min1;
4323 if (flags & SCF_DO_STCLASS_OR) {
4324 cl_or(pRExC_state, data->start_class, &accum);
4326 cl_and(data->start_class, and_withp);
4327 flags &= ~SCF_DO_STCLASS;
4330 else if (flags & SCF_DO_STCLASS_AND) {
4332 cl_and(data->start_class, &accum);
4333 flags &= ~SCF_DO_STCLASS;
4336 /* Switch to OR mode: cache the old value of
4337 * data->start_class */
4339 StructCopy(data->start_class, and_withp,
4340 struct regnode_charclass_class);
4341 flags &= ~SCF_DO_STCLASS_AND;
4342 StructCopy(&accum, data->start_class,
4343 struct regnode_charclass_class);
4344 flags |= SCF_DO_STCLASS_OR;
4345 data->start_class->flags |= ANYOF_EOS;
4352 else if (PL_regkind[OP(scan)] == TRIE) {
4353 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4356 min += trie->minlen;
4357 delta += (trie->maxlen - trie->minlen);
4358 flags &= ~SCF_DO_STCLASS; /* xxx */
4359 if (flags & SCF_DO_SUBSTR) {
4360 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4361 data->pos_min += trie->minlen;
4362 data->pos_delta += (trie->maxlen - trie->minlen);
4363 if (trie->maxlen != trie->minlen)
4364 data->longest = &(data->longest_float);
4366 if (trie->jump) /* no more substrings -- for now /grr*/
4367 flags &= ~SCF_DO_SUBSTR;
4369 #endif /* old or new */
4370 #endif /* TRIE_STUDY_OPT */
4372 /* Else: zero-length, ignore. */
4373 scan = regnext(scan);
4378 stopparen = frame->stop;
4379 frame = frame->prev;
4380 goto fake_study_recurse;
4385 DEBUG_STUDYDATA("pre-fin:",data,depth);
4388 *deltap = is_inf_internal ? I32_MAX : delta;
4389 if (flags & SCF_DO_SUBSTR && is_inf)
4390 data->pos_delta = I32_MAX - data->pos_min;
4391 if (is_par > (I32)U8_MAX)
4393 if (is_par && pars==1 && data) {
4394 data->flags |= SF_IN_PAR;
4395 data->flags &= ~SF_HAS_PAR;
4397 else if (pars && data) {
4398 data->flags |= SF_HAS_PAR;
4399 data->flags &= ~SF_IN_PAR;
4401 if (flags & SCF_DO_STCLASS_OR)
4402 cl_and(data->start_class, and_withp);
4403 if (flags & SCF_TRIE_RESTUDY)
4404 data->flags |= SCF_TRIE_RESTUDY;
4406 DEBUG_STUDYDATA("post-fin:",data,depth);
4408 return min < stopmin ? min : stopmin;
4412 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4414 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4416 PERL_ARGS_ASSERT_ADD_DATA;
4418 Renewc(RExC_rxi->data,
4419 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4420 char, struct reg_data);
4422 Renew(RExC_rxi->data->what, count + n, U8);
4424 Newx(RExC_rxi->data->what, n, U8);
4425 RExC_rxi->data->count = count + n;
4426 Copy(s, RExC_rxi->data->what + count, n, U8);
4430 /*XXX: todo make this not included in a non debugging perl */
4431 #ifndef PERL_IN_XSUB_RE
4433 Perl_reginitcolors(pTHX)
4436 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4438 char *t = savepv(s);
4442 t = strchr(t, '\t');
4448 PL_colors[i] = t = (char *)"";
4453 PL_colors[i++] = (char *)"";
4460 #ifdef TRIE_STUDY_OPT
4461 #define CHECK_RESTUDY_GOTO \
4463 (data.flags & SCF_TRIE_RESTUDY) \
4467 #define CHECK_RESTUDY_GOTO
4471 - pregcomp - compile a regular expression into internal code
4473 * We can't allocate space until we know how big the compiled form will be,
4474 * but we can't compile it (and thus know how big it is) until we've got a
4475 * place to put the code. So we cheat: we compile it twice, once with code
4476 * generation turned off and size counting turned on, and once "for real".
4477 * This also means that we don't allocate space until we are sure that the
4478 * thing really will compile successfully, and we never have to move the
4479 * code and thus invalidate pointers into it. (Note that it has to be in
4480 * one piece because free() must be able to free it all.) [NB: not true in perl]
4482 * Beware that the optimization-preparation code in here knows about some
4483 * of the structure of the compiled regexp. [I'll say.]
4488 #ifndef PERL_IN_XSUB_RE
4489 #define RE_ENGINE_PTR &reh_regexp_engine
4491 extern const struct regexp_engine my_reg_engine;
4492 #define RE_ENGINE_PTR &my_reg_engine
4495 #ifndef PERL_IN_XSUB_RE
4497 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4500 HV * const table = GvHV(PL_hintgv);
4502 PERL_ARGS_ASSERT_PREGCOMP;
4504 /* Dispatch a request to compile a regexp to correct
4507 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4508 GET_RE_DEBUG_FLAGS_DECL;
4509 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4510 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4512 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4515 return CALLREGCOMP_ENG(eng, pattern, flags);
4518 return Perl_re_compile(aTHX_ pattern, flags);
4523 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4528 register regexp_internal *ri;
4537 /* these are all flags - maybe they should be turned
4538 * into a single int with different bit masks */
4539 I32 sawlookahead = 0;
4542 bool used_setjump = FALSE;
4543 regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4548 RExC_state_t RExC_state;
4549 RExC_state_t * const pRExC_state = &RExC_state;
4550 #ifdef TRIE_STUDY_OPT
4552 RExC_state_t copyRExC_state;
4554 GET_RE_DEBUG_FLAGS_DECL;
4556 PERL_ARGS_ASSERT_RE_COMPILE;
4558 DEBUG_r(if (!PL_colorset) reginitcolors());
4560 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4561 RExC_uni_semantics = 0;
4562 RExC_contains_locale = 0;
4564 /****************** LONG JUMP TARGET HERE***********************/
4565 /* Longjmp back to here if have to switch in midstream to utf8 */
4566 if (! RExC_orig_utf8) {
4567 JMPENV_PUSH(jump_ret);
4568 used_setjump = TRUE;
4571 if (jump_ret == 0) { /* First time through */
4572 exp = SvPV(pattern, plen);
4574 /* ignore the utf8ness if the pattern is 0 length */
4576 RExC_utf8 = RExC_orig_utf8 = 0;
4580 SV *dsv= sv_newmortal();
4581 RE_PV_QUOTED_DECL(s, RExC_utf8,
4582 dsv, exp, plen, 60);
4583 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4584 PL_colors[4],PL_colors[5],s);
4587 else { /* longjumped back */
4590 /* If the cause for the longjmp was other than changing to utf8, pop
4591 * our own setjmp, and longjmp to the correct handler */
4592 if (jump_ret != UTF8_LONGJMP) {
4594 JMPENV_JUMP(jump_ret);
4599 /* It's possible to write a regexp in ascii that represents Unicode
4600 codepoints outside of the byte range, such as via \x{100}. If we
4601 detect such a sequence we have to convert the entire pattern to utf8
4602 and then recompile, as our sizing calculation will have been based
4603 on 1 byte == 1 character, but we will need to use utf8 to encode
4604 at least some part of the pattern, and therefore must convert the whole
4607 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4608 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4609 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4611 RExC_orig_utf8 = RExC_utf8 = 1;
4615 #ifdef TRIE_STUDY_OPT
4619 pm_flags = orig_pm_flags;
4621 if (initial_charset == REGEX_LOCALE_CHARSET) {
4622 RExC_contains_locale = 1;
4624 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4626 /* Set to use unicode semantics if the pattern is in utf8 and has the
4627 * 'depends' charset specified, as it means unicode when utf8 */
4628 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4632 RExC_flags = pm_flags;
4636 RExC_in_lookbehind = 0;
4637 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4638 RExC_seen_evals = 0;
4640 RExC_override_recoding = 0;
4642 /* First pass: determine size, legality. */
4650 RExC_emit = &PL_regdummy;
4651 RExC_whilem_seen = 0;
4652 RExC_open_parens = NULL;
4653 RExC_close_parens = NULL;
4655 RExC_paren_names = NULL;
4657 RExC_paren_name_list = NULL;
4659 RExC_recurse = NULL;
4660 RExC_recurse_count = 0;
4662 #if 0 /* REGC() is (currently) a NOP at the first pass.
4663 * Clever compilers notice this and complain. --jhi */
4664 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4666 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4667 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4668 RExC_precomp = NULL;
4672 /* Here, finished first pass. Get rid of any added setjmp */
4678 PerlIO_printf(Perl_debug_log,
4679 "Required size %"IVdf" nodes\n"
4680 "Starting second pass (creation)\n",
4683 RExC_lastparse=NULL;
4686 /* The first pass could have found things that force Unicode semantics */
4687 if ((RExC_utf8 || RExC_uni_semantics)
4688 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4690 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4693 /* Small enough for pointer-storage convention?
4694 If extralen==0, this means that we will not need long jumps. */
4695 if (RExC_size >= 0x10000L && RExC_extralen)
4696 RExC_size += RExC_extralen;
4699 if (RExC_whilem_seen > 15)
4700 RExC_whilem_seen = 15;
4702 /* Allocate space and zero-initialize. Note, the two step process
4703 of zeroing when in debug mode, thus anything assigned has to
4704 happen after that */
4705 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4706 r = (struct regexp*)SvANY(rx);
4707 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4708 char, regexp_internal);
4709 if ( r == NULL || ri == NULL )
4710 FAIL("Regexp out of space");
4712 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4713 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4715 /* bulk initialize base fields with 0. */
4716 Zero(ri, sizeof(regexp_internal), char);
4719 /* non-zero initialization begins here */
4721 r->engine= RE_ENGINE_PTR;
4722 r->extflags = pm_flags;
4724 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4725 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4727 /* The caret is output if there are any defaults: if not all the STD
4728 * flags are set, or if no character set specifier is needed */
4730 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4732 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4733 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4734 >> RXf_PMf_STD_PMMOD_SHIFT);
4735 const char *fptr = STD_PAT_MODS; /*"msix"*/
4737 /* Allocate for the worst case, which is all the std flags are turned
4738 * on. If more precision is desired, we could do a population count of
4739 * the flags set. This could be done with a small lookup table, or by
4740 * shifting, masking and adding, or even, when available, assembly
4741 * language for a machine-language population count.
4742 * We never output a minus, as all those are defaults, so are
4743 * covered by the caret */
4744 const STRLEN wraplen = plen + has_p + has_runon
4745 + has_default /* If needs a caret */
4747 /* If needs a character set specifier */
4748 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4749 + (sizeof(STD_PAT_MODS) - 1)
4750 + (sizeof("(?:)") - 1);
4752 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4754 SvFLAGS(rx) |= SvUTF8(pattern);
4757 /* If a default, cover it using the caret */
4759 *p++= DEFAULT_PAT_MOD;
4763 const char* const name = get_regex_charset_name(r->extflags, &len);
4764 Copy(name, p, len, char);
4768 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4771 while((ch = *fptr++)) {
4779 Copy(RExC_precomp, p, plen, char);
4780 assert ((RX_WRAPPED(rx) - p) < 16);
4781 r->pre_prefix = p - RX_WRAPPED(rx);
4787 SvCUR_set(rx, p - SvPVX_const(rx));
4791 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4793 if (RExC_seen & REG_SEEN_RECURSE) {
4794 Newxz(RExC_open_parens, RExC_npar,regnode *);
4795 SAVEFREEPV(RExC_open_parens);
4796 Newxz(RExC_close_parens,RExC_npar,regnode *);
4797 SAVEFREEPV(RExC_close_parens);
4800 /* Useful during FAIL. */
4801 #ifdef RE_TRACK_PATTERN_OFFSETS
4802 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4803 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4804 "%s %"UVuf" bytes for offset annotations.\n",
4805 ri->u.offsets ? "Got" : "Couldn't get",
4806 (UV)((2*RExC_size+1) * sizeof(U32))));
4808 SetProgLen(ri,RExC_size);
4812 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
4814 /* Second pass: emit code. */
4815 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4820 RExC_emit_start = ri->program;
4821 RExC_emit = ri->program;
4822 RExC_emit_bound = ri->program + RExC_size + 1;
4824 /* Store the count of eval-groups for security checks: */
4825 RExC_rx->seen_evals = RExC_seen_evals;
4826 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4827 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4831 /* XXXX To minimize changes to RE engine we always allocate
4832 3-units-long substrs field. */
4833 Newx(r->substrs, 1, struct reg_substr_data);
4834 if (RExC_recurse_count) {
4835 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4836 SAVEFREEPV(RExC_recurse);
4840 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4841 Zero(r->substrs, 1, struct reg_substr_data);
4843 #ifdef TRIE_STUDY_OPT
4845 StructCopy(&zero_scan_data, &data, scan_data_t);
4846 copyRExC_state = RExC_state;
4849 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4851 RExC_state = copyRExC_state;
4852 if (seen & REG_TOP_LEVEL_BRANCHES)
4853 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4855 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4856 if (data.last_found) {
4857 SvREFCNT_dec(data.longest_fixed);
4858 SvREFCNT_dec(data.longest_float);
4859 SvREFCNT_dec(data.last_found);
4861 StructCopy(&zero_scan_data, &data, scan_data_t);
4864 StructCopy(&zero_scan_data, &data, scan_data_t);
4867 /* Dig out information for optimizations. */
4868 r->extflags = RExC_flags; /* was pm_op */
4869 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4872 SvUTF8_on(rx); /* Unicode in it? */
4873 ri->regstclass = NULL;
4874 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4875 r->intflags |= PREGf_NAUGHTY;
4876 scan = ri->program + 1; /* First BRANCH. */
4878 /* testing for BRANCH here tells us whether there is "must appear"
4879 data in the pattern. If there is then we can use it for optimisations */
4880 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4882 STRLEN longest_float_length, longest_fixed_length;
4883 struct regnode_charclass_class ch_class; /* pointed to by data */
4885 I32 last_close = 0; /* pointed to by data */
4886 regnode *first= scan;
4887 regnode *first_next= regnext(first);
4889 * Skip introductions and multiplicators >= 1
4890 * so that we can extract the 'meat' of the pattern that must
4891 * match in the large if() sequence following.
4892 * NOTE that EXACT is NOT covered here, as it is normally
4893 * picked up by the optimiser separately.
4895 * This is unfortunate as the optimiser isnt handling lookahead
4896 * properly currently.
4899 while ((OP(first) == OPEN && (sawopen = 1)) ||
4900 /* An OR of *one* alternative - should not happen now. */
4901 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4902 /* for now we can't handle lookbehind IFMATCH*/
4903 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4904 (OP(first) == PLUS) ||
4905 (OP(first) == MINMOD) ||
4906 /* An {n,m} with n>0 */
4907 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4908 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4911 * the only op that could be a regnode is PLUS, all the rest
4912 * will be regnode_1 or regnode_2.
4915 if (OP(first) == PLUS)
4918 first += regarglen[OP(first)];
4920 first = NEXTOPER(first);
4921 first_next= regnext(first);
4924 /* Starting-point info. */
4926 DEBUG_PEEP("first:",first,0);
4927 /* Ignore EXACT as we deal with it later. */
4928 if (PL_regkind[OP(first)] == EXACT) {
4929 if (OP(first) == EXACT)
4930 NOOP; /* Empty, get anchored substr later. */
4932 ri->regstclass = first;
4935 else if (PL_regkind[OP(first)] == TRIE &&
4936 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4939 /* this can happen only on restudy */
4940 if ( OP(first) == TRIE ) {
4941 struct regnode_1 *trieop = (struct regnode_1 *)
4942 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4943 StructCopy(first,trieop,struct regnode_1);
4944 trie_op=(regnode *)trieop;
4946 struct regnode_charclass *trieop = (struct regnode_charclass *)
4947 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4948 StructCopy(first,trieop,struct regnode_charclass);
4949 trie_op=(regnode *)trieop;
4952 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4953 ri->regstclass = trie_op;
4956 else if (REGNODE_SIMPLE(OP(first)))
4957 ri->regstclass = first;
4958 else if (PL_regkind[OP(first)] == BOUND ||
4959 PL_regkind[OP(first)] == NBOUND)
4960 ri->regstclass = first;
4961 else if (PL_regkind[OP(first)] == BOL) {
4962 r->extflags |= (OP(first) == MBOL
4964 : (OP(first) == SBOL
4967 first = NEXTOPER(first);
4970 else if (OP(first) == GPOS) {
4971 r->extflags |= RXf_ANCH_GPOS;
4972 first = NEXTOPER(first);
4975 else if ((!sawopen || !RExC_sawback) &&
4976 (OP(first) == STAR &&
4977 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4978 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4980 /* turn .* into ^.* with an implied $*=1 */
4982 (OP(NEXTOPER(first)) == REG_ANY)
4985 r->extflags |= type;
4986 r->intflags |= PREGf_IMPLICIT;
4987 first = NEXTOPER(first);
4990 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4991 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4992 /* x+ must match at the 1st pos of run of x's */
4993 r->intflags |= PREGf_SKIP;
4995 /* Scan is after the zeroth branch, first is atomic matcher. */
4996 #ifdef TRIE_STUDY_OPT
4999 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5000 (IV)(first - scan + 1))
5004 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5005 (IV)(first - scan + 1))
5011 * If there's something expensive in the r.e., find the
5012 * longest literal string that must appear and make it the
5013 * regmust. Resolve ties in favor of later strings, since
5014 * the regstart check works with the beginning of the r.e.
5015 * and avoiding duplication strengthens checking. Not a
5016 * strong reason, but sufficient in the absence of others.
5017 * [Now we resolve ties in favor of the earlier string if
5018 * it happens that c_offset_min has been invalidated, since the
5019 * earlier string may buy us something the later one won't.]
5022 data.longest_fixed = newSVpvs("");
5023 data.longest_float = newSVpvs("");
5024 data.last_found = newSVpvs("");
5025 data.longest = &(data.longest_fixed);
5027 if (!ri->regstclass) {
5028 cl_init(pRExC_state, &ch_class);
5029 data.start_class = &ch_class;
5030 stclass_flag = SCF_DO_STCLASS_AND;
5031 } else /* XXXX Check for BOUND? */
5033 data.last_closep = &last_close;
5035 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5036 &data, -1, NULL, NULL,
5037 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5043 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5044 && data.last_start_min == 0 && data.last_end > 0
5045 && !RExC_seen_zerolen
5046 && !(RExC_seen & REG_SEEN_VERBARG)
5047 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5048 r->extflags |= RXf_CHECK_ALL;
5049 scan_commit(pRExC_state, &data,&minlen,0);
5050 SvREFCNT_dec(data.last_found);
5052 /* Note that code very similar to this but for anchored string
5053 follows immediately below, changes may need to be made to both.
5056 longest_float_length = CHR_SVLEN(data.longest_float);
5057 if (longest_float_length
5058 || (data.flags & SF_FL_BEFORE_EOL
5059 && (!(data.flags & SF_FL_BEFORE_MEOL)
5060 || (RExC_flags & RXf_PMf_MULTILINE))))
5064 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
5065 && data.offset_fixed == data.offset_float_min
5066 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
5067 goto remove_float; /* As in (a)+. */
5069 /* copy the information about the longest float from the reg_scan_data
5070 over to the program. */
5071 if (SvUTF8(data.longest_float)) {
5072 r->float_utf8 = data.longest_float;
5073 r->float_substr = NULL;
5075 r->float_substr = data.longest_float;
5076 r->float_utf8 = NULL;
5078 /* float_end_shift is how many chars that must be matched that
5079 follow this item. We calculate it ahead of time as once the
5080 lookbehind offset is added in we lose the ability to correctly
5082 ml = data.minlen_float ? *(data.minlen_float)
5083 : (I32)longest_float_length;
5084 r->float_end_shift = ml - data.offset_float_min
5085 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5086 + data.lookbehind_float;
5087 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5088 r->float_max_offset = data.offset_float_max;
5089 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5090 r->float_max_offset -= data.lookbehind_float;
5092 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5093 && (!(data.flags & SF_FL_BEFORE_MEOL)
5094 || (RExC_flags & RXf_PMf_MULTILINE)));
5095 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5099 r->float_substr = r->float_utf8 = NULL;
5100 SvREFCNT_dec(data.longest_float);
5101 longest_float_length = 0;
5104 /* Note that code very similar to this but for floating string
5105 is immediately above, changes may need to be made to both.
5108 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5109 if (longest_fixed_length
5110 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5111 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5112 || (RExC_flags & RXf_PMf_MULTILINE))))
5116 /* copy the information about the longest fixed
5117 from the reg_scan_data over to the program. */
5118 if (SvUTF8(data.longest_fixed)) {
5119 r->anchored_utf8 = data.longest_fixed;
5120 r->anchored_substr = NULL;
5122 r->anchored_substr = data.longest_fixed;
5123 r->anchored_utf8 = NULL;
5125 /* fixed_end_shift is how many chars that must be matched that
5126 follow this item. We calculate it ahead of time as once the
5127 lookbehind offset is added in we lose the ability to correctly
5129 ml = data.minlen_fixed ? *(data.minlen_fixed)
5130 : (I32)longest_fixed_length;
5131 r->anchored_end_shift = ml - data.offset_fixed
5132 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5133 + data.lookbehind_fixed;
5134 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5136 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5137 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5138 || (RExC_flags & RXf_PMf_MULTILINE)));
5139 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5142 r->anchored_substr = r->anchored_utf8 = NULL;
5143 SvREFCNT_dec(data.longest_fixed);
5144 longest_fixed_length = 0;
5147 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5148 ri->regstclass = NULL;
5150 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5152 && !(data.start_class->flags & ANYOF_EOS)
5153 && !cl_is_anything(data.start_class))
5155 const U32 n = add_data(pRExC_state, 1, "f");
5156 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5158 Newx(RExC_rxi->data->data[n], 1,
5159 struct regnode_charclass_class);
5160 StructCopy(data.start_class,
5161 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5162 struct regnode_charclass_class);
5163 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5164 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5165 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5166 regprop(r, sv, (regnode*)data.start_class);
5167 PerlIO_printf(Perl_debug_log,
5168 "synthetic stclass \"%s\".\n",
5169 SvPVX_const(sv));});
5172 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5173 if (longest_fixed_length > longest_float_length) {
5174 r->check_end_shift = r->anchored_end_shift;
5175 r->check_substr = r->anchored_substr;
5176 r->check_utf8 = r->anchored_utf8;
5177 r->check_offset_min = r->check_offset_max = r->anchored_offset;
5178 if (r->extflags & RXf_ANCH_SINGLE)
5179 r->extflags |= RXf_NOSCAN;
5182 r->check_end_shift = r->float_end_shift;
5183 r->check_substr = r->float_substr;
5184 r->check_utf8 = r->float_utf8;
5185 r->check_offset_min = r->float_min_offset;
5186 r->check_offset_max = r->float_max_offset;
5188 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5189 This should be changed ASAP! */
5190 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5191 r->extflags |= RXf_USE_INTUIT;
5192 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5193 r->extflags |= RXf_INTUIT_TAIL;
5195 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5196 if ( (STRLEN)minlen < longest_float_length )
5197 minlen= longest_float_length;
5198 if ( (STRLEN)minlen < longest_fixed_length )
5199 minlen= longest_fixed_length;
5203 /* Several toplevels. Best we can is to set minlen. */
5205 struct regnode_charclass_class ch_class;
5208 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5210 scan = ri->program + 1;
5211 cl_init(pRExC_state, &ch_class);
5212 data.start_class = &ch_class;
5213 data.last_closep = &last_close;
5216 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5217 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5221 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5222 = r->float_substr = r->float_utf8 = NULL;
5224 if (!(data.start_class->flags & ANYOF_EOS)
5225 && !cl_is_anything(data.start_class))
5227 const U32 n = add_data(pRExC_state, 1, "f");
5228 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5230 Newx(RExC_rxi->data->data[n], 1,
5231 struct regnode_charclass_class);
5232 StructCopy(data.start_class,
5233 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5234 struct regnode_charclass_class);
5235 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5236 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5237 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5238 regprop(r, sv, (regnode*)data.start_class);
5239 PerlIO_printf(Perl_debug_log,
5240 "synthetic stclass \"%s\".\n",
5241 SvPVX_const(sv));});
5245 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5246 the "real" pattern. */
5248 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5249 (IV)minlen, (IV)r->minlen);
5251 r->minlenret = minlen;
5252 if (r->minlen < minlen)
5255 if (RExC_seen & REG_SEEN_GPOS)
5256 r->extflags |= RXf_GPOS_SEEN;
5257 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5258 r->extflags |= RXf_LOOKBEHIND_SEEN;
5259 if (RExC_seen & REG_SEEN_EVAL)
5260 r->extflags |= RXf_EVAL_SEEN;
5261 if (RExC_seen & REG_SEEN_CANY)
5262 r->extflags |= RXf_CANY_SEEN;
5263 if (RExC_seen & REG_SEEN_VERBARG)
5264 r->intflags |= PREGf_VERBARG_SEEN;
5265 if (RExC_seen & REG_SEEN_CUTGROUP)
5266 r->intflags |= PREGf_CUTGROUP_SEEN;
5267 if (RExC_paren_names)
5268 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5270 RXp_PAREN_NAMES(r) = NULL;
5272 #ifdef STUPID_PATTERN_CHECKS
5273 if (RX_PRELEN(rx) == 0)
5274 r->extflags |= RXf_NULL;
5275 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5276 /* XXX: this should happen BEFORE we compile */
5277 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5278 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5279 r->extflags |= RXf_WHITE;
5280 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5281 r->extflags |= RXf_START_ONLY;
5283 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5284 /* XXX: this should happen BEFORE we compile */
5285 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5287 regnode *first = ri->program + 1;
5290 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5291 r->extflags |= RXf_NULL;
5292 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5293 r->extflags |= RXf_START_ONLY;
5294 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5295 && OP(regnext(first)) == END)
5296 r->extflags |= RXf_WHITE;
5300 if (RExC_paren_names) {
5301 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5302 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5305 ri->name_list_idx = 0;
5307 if (RExC_recurse_count) {
5308 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5309 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5310 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5313 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5314 /* assume we don't need to swap parens around before we match */
5317 PerlIO_printf(Perl_debug_log,"Final program:\n");
5320 #ifdef RE_TRACK_PATTERN_OFFSETS
5321 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5322 const U32 len = ri->u.offsets[0];
5324 GET_RE_DEBUG_FLAGS_DECL;
5325 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5326 for (i = 1; i <= len; i++) {
5327 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5328 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5329 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5331 PerlIO_printf(Perl_debug_log, "\n");
5337 #undef RE_ENGINE_PTR
5341 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5344 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5346 PERL_UNUSED_ARG(value);
5348 if (flags & RXapif_FETCH) {
5349 return reg_named_buff_fetch(rx, key, flags);
5350 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5351 Perl_croak_no_modify(aTHX);
5353 } else if (flags & RXapif_EXISTS) {
5354 return reg_named_buff_exists(rx, key, flags)
5357 } else if (flags & RXapif_REGNAMES) {
5358 return reg_named_buff_all(rx, flags);
5359 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5360 return reg_named_buff_scalar(rx, flags);
5362 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5368 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5371 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5372 PERL_UNUSED_ARG(lastkey);
5374 if (flags & RXapif_FIRSTKEY)
5375 return reg_named_buff_firstkey(rx, flags);
5376 else if (flags & RXapif_NEXTKEY)
5377 return reg_named_buff_nextkey(rx, flags);
5379 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5385 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5388 AV *retarray = NULL;
5390 struct regexp *const rx = (struct regexp *)SvANY(r);
5392 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5394 if (flags & RXapif_ALL)
5397 if (rx && RXp_PAREN_NAMES(rx)) {
5398 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5401 SV* sv_dat=HeVAL(he_str);
5402 I32 *nums=(I32*)SvPVX(sv_dat);
5403 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5404 if ((I32)(rx->nparens) >= nums[i]
5405 && rx->offs[nums[i]].start != -1
5406 && rx->offs[nums[i]].end != -1)
5409 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5413 ret = newSVsv(&PL_sv_undef);
5416 av_push(retarray, ret);
5419 return newRV_noinc(MUTABLE_SV(retarray));
5426 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5429 struct regexp *const rx = (struct regexp *)SvANY(r);
5431 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5433 if (rx && RXp_PAREN_NAMES(rx)) {
5434 if (flags & RXapif_ALL) {
5435 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5437 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5451 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5453 struct regexp *const rx = (struct regexp *)SvANY(r);
5455 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5457 if ( rx && RXp_PAREN_NAMES(rx) ) {
5458 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5460 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5467 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5469 struct regexp *const rx = (struct regexp *)SvANY(r);
5470 GET_RE_DEBUG_FLAGS_DECL;
5472 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5474 if (rx && RXp_PAREN_NAMES(rx)) {
5475 HV *hv = RXp_PAREN_NAMES(rx);
5477 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5480 SV* sv_dat = HeVAL(temphe);
5481 I32 *nums = (I32*)SvPVX(sv_dat);
5482 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5483 if ((I32)(rx->lastparen) >= nums[i] &&
5484 rx->offs[nums[i]].start != -1 &&
5485 rx->offs[nums[i]].end != -1)
5491 if (parno || flags & RXapif_ALL) {
5492 return newSVhek(HeKEY_hek(temphe));
5500 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5505 struct regexp *const rx = (struct regexp *)SvANY(r);
5507 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5509 if (rx && RXp_PAREN_NAMES(rx)) {
5510 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5511 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5512 } else if (flags & RXapif_ONE) {
5513 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5514 av = MUTABLE_AV(SvRV(ret));
5515 length = av_len(av);
5517 return newSViv(length + 1);
5519 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5523 return &PL_sv_undef;
5527 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5529 struct regexp *const rx = (struct regexp *)SvANY(r);
5532 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5534 if (rx && RXp_PAREN_NAMES(rx)) {
5535 HV *hv= RXp_PAREN_NAMES(rx);
5537 (void)hv_iterinit(hv);
5538 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5541 SV* sv_dat = HeVAL(temphe);
5542 I32 *nums = (I32*)SvPVX(sv_dat);
5543 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5544 if ((I32)(rx->lastparen) >= nums[i] &&
5545 rx->offs[nums[i]].start != -1 &&
5546 rx->offs[nums[i]].end != -1)
5552 if (parno || flags & RXapif_ALL) {
5553 av_push(av, newSVhek(HeKEY_hek(temphe)));
5558 return newRV_noinc(MUTABLE_SV(av));
5562 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5565 struct regexp *const rx = (struct regexp *)SvANY(r);
5570 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5573 sv_setsv(sv,&PL_sv_undef);
5577 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5579 i = rx->offs[0].start;
5583 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5585 s = rx->subbeg + rx->offs[0].end;
5586 i = rx->sublen - rx->offs[0].end;
5589 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5590 (s1 = rx->offs[paren].start) != -1 &&
5591 (t1 = rx->offs[paren].end) != -1)
5595 s = rx->subbeg + s1;
5597 sv_setsv(sv,&PL_sv_undef);
5600 assert(rx->sublen >= (s - rx->subbeg) + i );
5602 const int oldtainted = PL_tainted;
5604 sv_setpvn(sv, s, i);
5605 PL_tainted = oldtainted;
5606 if ( (rx->extflags & RXf_CANY_SEEN)
5607 ? (RXp_MATCH_UTF8(rx)
5608 && (!i || is_utf8_string((U8*)s, i)))
5609 : (RXp_MATCH_UTF8(rx)) )
5616 if (RXp_MATCH_TAINTED(rx)) {
5617 if (SvTYPE(sv) >= SVt_PVMG) {
5618 MAGIC* const mg = SvMAGIC(sv);
5621 SvMAGIC_set(sv, mg->mg_moremagic);
5623 if ((mgt = SvMAGIC(sv))) {
5624 mg->mg_moremagic = mgt;
5625 SvMAGIC_set(sv, mg);
5635 sv_setsv(sv,&PL_sv_undef);
5641 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5642 SV const * const value)
5644 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5646 PERL_UNUSED_ARG(rx);
5647 PERL_UNUSED_ARG(paren);
5648 PERL_UNUSED_ARG(value);
5651 Perl_croak_no_modify(aTHX);
5655 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5658 struct regexp *const rx = (struct regexp *)SvANY(r);
5662 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5664 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5666 /* $` / ${^PREMATCH} */
5667 case RX_BUFF_IDX_PREMATCH:
5668 if (rx->offs[0].start != -1) {
5669 i = rx->offs[0].start;
5677 /* $' / ${^POSTMATCH} */
5678 case RX_BUFF_IDX_POSTMATCH:
5679 if (rx->offs[0].end != -1) {
5680 i = rx->sublen - rx->offs[0].end;
5682 s1 = rx->offs[0].end;
5688 /* $& / ${^MATCH}, $1, $2, ... */
5690 if (paren <= (I32)rx->nparens &&
5691 (s1 = rx->offs[paren].start) != -1 &&
5692 (t1 = rx->offs[paren].end) != -1)
5697 if (ckWARN(WARN_UNINITIALIZED))
5698 report_uninit((const SV *)sv);
5703 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5704 const char * const s = rx->subbeg + s1;
5709 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5716 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5718 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5719 PERL_UNUSED_ARG(rx);
5723 return newSVpvs("Regexp");
5726 /* Scans the name of a named buffer from the pattern.
5727 * If flags is REG_RSN_RETURN_NULL returns null.
5728 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5729 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5730 * to the parsed name as looked up in the RExC_paren_names hash.
5731 * If there is an error throws a vFAIL().. type exception.
5734 #define REG_RSN_RETURN_NULL 0
5735 #define REG_RSN_RETURN_NAME 1
5736 #define REG_RSN_RETURN_DATA 2
5739 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5741 char *name_start = RExC_parse;
5743 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5745 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5746 /* skip IDFIRST by using do...while */
5749 RExC_parse += UTF8SKIP(RExC_parse);
5750 } while (isALNUM_utf8((U8*)RExC_parse));
5754 } while (isALNUM(*RExC_parse));
5759 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5760 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5761 if ( flags == REG_RSN_RETURN_NAME)
5763 else if (flags==REG_RSN_RETURN_DATA) {
5766 if ( ! sv_name ) /* should not happen*/
5767 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5768 if (RExC_paren_names)
5769 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5771 sv_dat = HeVAL(he_str);
5773 vFAIL("Reference to nonexistent named group");
5777 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5784 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5785 int rem=(int)(RExC_end - RExC_parse); \
5794 if (RExC_lastparse!=RExC_parse) \
5795 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5798 iscut ? "..." : "<" \
5801 PerlIO_printf(Perl_debug_log,"%16s",""); \
5804 num = RExC_size + 1; \
5806 num=REG_NODE_NUM(RExC_emit); \
5807 if (RExC_lastnum!=num) \
5808 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5810 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5811 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5812 (int)((depth*2)), "", \
5816 RExC_lastparse=RExC_parse; \
5821 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5822 DEBUG_PARSE_MSG((funcname)); \
5823 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5825 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5826 DEBUG_PARSE_MSG((funcname)); \
5827 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5830 /* This section of code defines the inversion list object and its methods. The
5831 * interfaces are highly subject to change, so as much as possible is static to
5832 * this file. An inversion list is here implemented as a malloc'd C array with
5833 * some added info. More will be coming when functionality is added later.
5835 * It is currently implemented as an HV to the outside world, but is actually
5836 * an SV pointing to an array of UVs that the SV thinks are bytes. This allows
5837 * us to have an array of UV whose memory management is automatically handled
5838 * by the existing facilities for SV's.
5840 * Some of the methods should always be private to the implementation, and some
5841 * should eventually be made public */
5843 #define INVLIST_INITIAL_LEN 10
5845 PERL_STATIC_INLINE UV*
5846 S_invlist_array(pTHX_ HV* const invlist)
5848 /* Returns the pointer to the inversion list's array. Every time the
5849 * length changes, this needs to be called in case malloc or realloc moved
5852 PERL_ARGS_ASSERT_INVLIST_ARRAY;
5854 return (UV *) SvPVX(invlist);
5857 PERL_STATIC_INLINE UV
5858 S_invlist_len(pTHX_ HV* const invlist)
5860 /* Returns the current number of elements in the inversion list's array */
5862 PERL_ARGS_ASSERT_INVLIST_LEN;
5864 return SvCUR(invlist) / sizeof(UV);
5867 PERL_STATIC_INLINE UV
5868 S_invlist_max(pTHX_ HV* const invlist)
5870 /* Returns the maximum number of elements storable in the inversion list's
5871 * array, without having to realloc() */
5873 PERL_ARGS_ASSERT_INVLIST_MAX;
5875 return SvLEN(invlist) / sizeof(UV);
5878 PERL_STATIC_INLINE void
5879 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5881 /* Sets the current number of elements stored in the inversion list */
5883 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5885 SvCUR_set(invlist, len * sizeof(UV));
5888 PERL_STATIC_INLINE void
5889 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5892 /* Sets the maximum number of elements storable in the inversion list
5893 * without having to realloc() */
5895 PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5897 if (max < invlist_len(invlist)) {
5898 Perl_croak(aTHX_ "panic: Can't make max size '%"UVuf"' less than current length %"UVuf" in inversion list", invlist_max(invlist), invlist_len(invlist));
5901 SvLEN_set(invlist, max * sizeof(UV));
5904 #ifndef PERL_IN_XSUB_RE
5906 Perl__new_invlist(pTHX_ IV initial_size)
5909 /* Return a pointer to a newly constructed inversion list, with enough
5910 * space to store 'initial_size' elements. If that number is negative, a
5911 * system default is used instead */
5913 if (initial_size < 0) {
5914 initial_size = INVLIST_INITIAL_LEN;
5917 /* Allocate the initial space */
5918 return (HV *) newSV(initial_size * sizeof(UV));
5922 PERL_STATIC_INLINE void
5923 S_invlist_destroy(pTHX_ HV* const invlist)
5925 /* Inversion list destructor */
5927 PERL_ARGS_ASSERT_INVLIST_DESTROY;
5929 SvREFCNT_dec(invlist);
5933 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5935 /* Grow the maximum size of an inversion list */
5937 PERL_ARGS_ASSERT_INVLIST_EXTEND;
5939 SvGROW((SV *)invlist, new_max * sizeof(UV));
5942 PERL_STATIC_INLINE void
5943 S_invlist_trim(pTHX_ HV* const invlist)
5945 PERL_ARGS_ASSERT_INVLIST_TRIM;
5947 /* Change the length of the inversion list to how many entries it currently
5950 SvPV_shrink_to_cur((SV *) invlist);
5953 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
5956 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
5958 #ifndef PERL_IN_XSUB_RE
5960 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
5962 /* Subject to change or removal. Append the range from 'start' to 'end' at
5963 * the end of the inversion list. The range must be above any existing
5966 UV* array = invlist_array(invlist);
5967 UV max = invlist_max(invlist);
5968 UV len = invlist_len(invlist);
5970 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
5974 /* Here, the existing list is non-empty. The current max entry in the
5975 * list is generally the first value not in the set, except when the
5976 * set extends to the end of permissible values, in which case it is
5977 * the first entry in that final set, and so this call is an attempt to
5978 * append out-of-order */
5980 UV final_element = len - 1;
5981 if (array[final_element] > start
5982 || ELEMENT_IN_INVLIST_SET(final_element))
5984 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
5987 /* Here, it is a legal append. If the new range begins with the first
5988 * value not in the set, it is extending the set, so the new first
5989 * value not in the set is one greater than the newly extended range.
5991 if (array[final_element] == start) {
5992 if (end != UV_MAX) {
5993 array[final_element] = end + 1;
5996 /* But if the end is the maximum representable on the machine,
5997 * just let the range that this would extend have no end */
5998 invlist_set_len(invlist, len - 1);
6004 /* Here the new range doesn't extend any existing set. Add it */
6006 len += 2; /* Includes an element each for the start and end of range */
6008 /* If overflows the existing space, extend, which may cause the array to be
6011 invlist_extend(invlist, len);
6012 array = invlist_array(invlist);
6015 invlist_set_len(invlist, len);
6017 /* The next item on the list starts the range, the one after that is
6018 * one past the new range. */
6019 array[len - 2] = start;
6020 if (end != UV_MAX) {
6021 array[len - 1] = end + 1;
6024 /* But if the end is the maximum representable on the machine, just let
6025 * the range have no end */
6026 invlist_set_len(invlist, len - 1);
6032 S_invlist_union(pTHX_ HV* const a, HV* const b)
6034 /* Return a new inversion list which is the union of two inversion lists.
6035 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6036 * Richard Gillam, published by Addison-Wesley, and explained at some
6037 * length there. The preface says to incorporate its examples into your
6038 * code at your own risk.
6040 * The algorithm is like a merge sort.
6042 * XXX A potential performance improvement is to keep track as we go along
6043 * if only one of the inputs contributes to the result, meaning the other
6044 * is a subset of that one. In that case, we can skip the final copy and
6045 * return the larger of the input lists */
6047 UV* array_a = invlist_array(a); /* a's array */
6048 UV* array_b = invlist_array(b);
6049 UV len_a = invlist_len(a); /* length of a's array */
6050 UV len_b = invlist_len(b);
6052 HV* u; /* the resulting union */
6056 UV i_a = 0; /* current index into a's array */
6060 /* running count, as explained in the algorithm source book; items are
6061 * stopped accumulating and are output when the count changes to/from 0.
6062 * The count is incremented when we start a range that's in the set, and
6063 * decremented when we start a range that's not in the set. So its range
6064 * is 0 to 2. Only when the count is zero is something not in the set.
6068 PERL_ARGS_ASSERT_INVLIST_UNION;
6070 /* Size the union for the worst case: that the sets are completely
6072 u = _new_invlist(len_a + len_b);
6073 array_u = invlist_array(u);
6075 /* Go through each list item by item, stopping when exhausted one of
6077 while (i_a < len_a && i_b < len_b) {
6078 UV cp; /* The element to potentially add to the union's array */
6079 bool cp_in_set; /* is it in the the input list's set or not */
6081 /* We need to take one or the other of the two inputs for the union.
6082 * Since we are merging two sorted lists, we take the smaller of the
6083 * next items. In case of a tie, we take the one that is in its set
6084 * first. If we took one not in the set first, it would decrement the
6085 * count, possibly to 0 which would cause it to be output as ending the
6086 * range, and the next time through we would take the same number, and
6087 * output it again as beginning the next range. By doing it the
6088 * opposite way, there is no possibility that the count will be
6089 * momentarily decremented to 0, and thus the two adjoining ranges will
6090 * be seamlessly merged. (In a tie and both are in the set or both not
6091 * in the set, it doesn't matter which we take first.) */
6092 if (array_a[i_a] < array_b[i_b]
6093 || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6095 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6099 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6103 /* Here, have chosen which of the two inputs to look at. Only output
6104 * if the running count changes to/from 0, which marks the
6105 * beginning/end of a range in that's in the set */
6108 array_u[i_u++] = cp;
6115 array_u[i_u++] = cp;
6120 /* Here, we are finished going through at least one of the lists, which
6121 * means there is something remaining in at most one. We check if the list
6122 * that hasn't been exhausted is positioned such that we are in the middle
6123 * of a range in its set or not. (We are in the set if the next item in
6124 * the array marks the beginning of something not in the set) If in the
6125 * set, we decrement 'count'; if 0, there is potentially more to output.
6126 * There are four cases:
6127 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6128 * in the union is entirely from the non-exhausted set.
6129 * 2) Both were in their sets, count is 2. Nothing further should
6130 * be output, as everything that remains will be in the exhausted
6131 * list's set, hence in the union; decrementing to 1 but not 0 insures
6133 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6134 * Nothing further should be output because the union includes
6135 * everything from the exhausted set. Not decrementing insures that.
6136 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6137 * decrementing to 0 insures that we look at the remainder of the
6138 * non-exhausted set */
6139 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6140 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6145 /* The final length is what we've output so far, plus what else is about to
6146 * be output. (If 'count' is non-zero, then the input list we exhausted
6147 * has everything remaining up to the machine's limit in its set, and hence
6148 * in the union, so there will be no further output. */
6151 /* At most one of the subexpressions will be non-zero */
6152 len_u += (len_a - i_a) + (len_b - i_b);
6155 /* Set result to final length, which can change the pointer to array_u, so
6157 if (len_u != invlist_len(u)) {
6158 invlist_set_len(u, len_u);
6160 array_u = invlist_array(u);
6163 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6164 * the other) ended with everything above it not in its set. That means
6165 * that the remaining part of the union is precisely the same as the
6166 * non-exhausted list, so can just copy it unchanged. (If both list were
6167 * exhausted at the same time, then the operations below will be both 0.)
6170 IV copy_count; /* At most one will have a non-zero copy count */
6171 if ((copy_count = len_a - i_a) > 0) {
6172 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6174 else if ((copy_count = len_b - i_b) > 0) {
6175 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6183 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6185 /* Return the intersection of two inversion lists. The basis for this
6186 * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6187 * by Addison-Wesley, and explained at some length there. The preface says
6188 * to incorporate its examples into your code at your own risk.
6190 * The algorithm is like a merge sort, and is essentially the same as the
6194 UV* array_a = invlist_array(a); /* a's array */
6195 UV* array_b = invlist_array(b);
6196 UV len_a = invlist_len(a); /* length of a's array */
6197 UV len_b = invlist_len(b);
6199 HV* r; /* the resulting intersection */
6203 UV i_a = 0; /* current index into a's array */
6207 /* running count, as explained in the algorithm source book; items are
6208 * stopped accumulating and are output when the count changes to/from 2.
6209 * The count is incremented when we start a range that's in the set, and
6210 * decremented when we start a range that's not in the set. So its range
6211 * is 0 to 2. Only when the count is 2 is something in the intersection.
6215 PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6217 /* Size the intersection for the worst case: that the intersection ends up
6218 * fragmenting everything to be completely disjoint */
6219 r= _new_invlist(len_a + len_b);
6220 array_r = invlist_array(r);
6222 /* Go through each list item by item, stopping when exhausted one of
6224 while (i_a < len_a && i_b < len_b) {
6225 UV cp; /* The element to potentially add to the intersection's
6227 bool cp_in_set; /* Is it in the input list's set or not */
6229 /* We need to take one or the other of the two inputs for the union.
6230 * Since we are merging two sorted lists, we take the smaller of the
6231 * next items. In case of a tie, we take the one that is not in its
6232 * set first (a difference from the union algorithm). If we took one
6233 * in the set first, it would increment the count, possibly to 2 which
6234 * would cause it to be output as starting a range in the intersection,
6235 * and the next time through we would take that same number, and output
6236 * it again as ending the set. By doing it the opposite of this, we
6237 * there is no possibility that the count will be momentarily
6238 * incremented to 2. (In a tie and both are in the set or both not in
6239 * the set, it doesn't matter which we take first.) */
6240 if (array_a[i_a] < array_b[i_b]
6241 || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6243 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6247 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6251 /* Here, have chosen which of the two inputs to look at. Only output
6252 * if the running count changes to/from 2, which marks the
6253 * beginning/end of a range that's in the intersection */
6257 array_r[i_r++] = cp;
6262 array_r[i_r++] = cp;
6268 /* Here, we are finished going through at least one of the sets, which
6269 * means there is something remaining in at most one. See the comments in
6271 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6272 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6277 /* The final length is what we've output so far plus what else is in the
6278 * intersection. Only one of the subexpressions below will be non-zero */
6281 len_r += (len_a - i_a) + (len_b - i_b);
6284 /* Set result to final length, which can change the pointer to array_r, so
6286 if (len_r != invlist_len(r)) {
6287 invlist_set_len(r, len_r);
6289 array_r = invlist_array(r);
6292 /* Finish outputting any remaining */
6293 if (count == 2) { /* Only one of will have a non-zero copy count */
6295 if ((copy_count = len_a - i_a) > 0) {
6296 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6298 else if ((copy_count = len_b - i_b) > 0) {
6299 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6307 S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
6309 /* Add the range from 'start' to 'end' inclusive to the inversion list's
6310 * set. A pointer to the inversion list is returned. This may actually be
6311 * a new list, in which case the passed in one has been destroyed. The
6312 * passed in inversion list can be NULL, in which case a new one is created
6313 * with just the one range in it */
6319 if (invlist == NULL) {
6320 invlist = _new_invlist(2);
6324 len = invlist_len(invlist);
6327 /* If comes after the final entry, can just append it to the end */
6329 || start >= invlist_array(invlist)
6330 [invlist_len(invlist) - 1])
6332 _append_range_to_invlist(invlist, start, end);
6336 /* Here, can't just append things, create and return a new inversion list
6337 * which is the union of this range and the existing inversion list */
6338 range_invlist = _new_invlist(2);
6339 _append_range_to_invlist(range_invlist, start, end);
6341 added_invlist = invlist_union(invlist, range_invlist);
6343 /* The passed in list can be freed, as well as our temporary */
6344 invlist_destroy(range_invlist);
6345 if (invlist != added_invlist) {
6346 invlist_destroy(invlist);
6349 return added_invlist;
6352 PERL_STATIC_INLINE HV*
6353 S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6354 return add_range_to_invlist(invlist, cp, cp);
6357 /* End of inversion list object */
6360 - reg - regular expression, i.e. main body or parenthesized thing
6362 * Caller must absorb opening parenthesis.
6364 * Combining parenthesis handling with the base level of regular expression
6365 * is a trifle forced, but the need to tie the tails of the branches to what
6366 * follows makes it hard to avoid.
6368 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6370 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6372 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6376 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6377 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6380 register regnode *ret; /* Will be the head of the group. */
6381 register regnode *br;
6382 register regnode *lastbr;
6383 register regnode *ender = NULL;
6384 register I32 parno = 0;
6386 U32 oregflags = RExC_flags;
6387 bool have_branch = 0;
6389 I32 freeze_paren = 0;
6390 I32 after_freeze = 0;
6392 /* for (?g), (?gc), and (?o) warnings; warning
6393 about (?c) will warn about (?g) -- japhy */
6395 #define WASTED_O 0x01
6396 #define WASTED_G 0x02
6397 #define WASTED_C 0x04
6398 #define WASTED_GC (0x02|0x04)
6399 I32 wastedflags = 0x00;
6401 char * parse_start = RExC_parse; /* MJD */
6402 char * const oregcomp_parse = RExC_parse;
6404 GET_RE_DEBUG_FLAGS_DECL;
6406 PERL_ARGS_ASSERT_REG;
6407 DEBUG_PARSE("reg ");
6409 *flagp = 0; /* Tentatively. */
6412 /* Make an OPEN node, if parenthesized. */
6414 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6415 char *start_verb = RExC_parse;
6416 STRLEN verb_len = 0;
6417 char *start_arg = NULL;
6418 unsigned char op = 0;
6420 int internal_argval = 0; /* internal_argval is only useful if !argok */
6421 while ( *RExC_parse && *RExC_parse != ')' ) {
6422 if ( *RExC_parse == ':' ) {
6423 start_arg = RExC_parse + 1;
6429 verb_len = RExC_parse - start_verb;
6432 while ( *RExC_parse && *RExC_parse != ')' )
6434 if ( *RExC_parse != ')' )
6435 vFAIL("Unterminated verb pattern argument");
6436 if ( RExC_parse == start_arg )
6439 if ( *RExC_parse != ')' )
6440 vFAIL("Unterminated verb pattern");
6443 switch ( *start_verb ) {
6444 case 'A': /* (*ACCEPT) */
6445 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6447 internal_argval = RExC_nestroot;
6450 case 'C': /* (*COMMIT) */
6451 if ( memEQs(start_verb,verb_len,"COMMIT") )
6454 case 'F': /* (*FAIL) */
6455 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6460 case ':': /* (*:NAME) */
6461 case 'M': /* (*MARK:NAME) */
6462 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6467 case 'P': /* (*PRUNE) */
6468 if ( memEQs(start_verb,verb_len,"PRUNE") )
6471 case 'S': /* (*SKIP) */
6472 if ( memEQs(start_verb,verb_len,"SKIP") )
6475 case 'T': /* (*THEN) */
6476 /* [19:06] <TimToady> :: is then */
6477 if ( memEQs(start_verb,verb_len,"THEN") ) {
6479 RExC_seen |= REG_SEEN_CUTGROUP;
6485 vFAIL3("Unknown verb pattern '%.*s'",
6486 verb_len, start_verb);
6489 if ( start_arg && internal_argval ) {
6490 vFAIL3("Verb pattern '%.*s' may not have an argument",
6491 verb_len, start_verb);
6492 } else if ( argok < 0 && !start_arg ) {
6493 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6494 verb_len, start_verb);
6496 ret = reganode(pRExC_state, op, internal_argval);
6497 if ( ! internal_argval && ! SIZE_ONLY ) {
6499 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6500 ARG(ret) = add_data( pRExC_state, 1, "S" );
6501 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6508 if (!internal_argval)
6509 RExC_seen |= REG_SEEN_VERBARG;
6510 } else if ( start_arg ) {
6511 vFAIL3("Verb pattern '%.*s' may not have an argument",
6512 verb_len, start_verb);
6514 ret = reg_node(pRExC_state, op);
6516 nextchar(pRExC_state);
6519 if (*RExC_parse == '?') { /* (?...) */
6520 bool is_logical = 0;
6521 const char * const seqstart = RExC_parse;
6522 bool has_use_defaults = FALSE;
6525 paren = *RExC_parse++;
6526 ret = NULL; /* For look-ahead/behind. */
6529 case 'P': /* (?P...) variants for those used to PCRE/Python */
6530 paren = *RExC_parse++;
6531 if ( paren == '<') /* (?P<...>) named capture */
6533 else if (paren == '>') { /* (?P>name) named recursion */
6534 goto named_recursion;
6536 else if (paren == '=') { /* (?P=...) named backref */
6537 /* this pretty much dupes the code for \k<NAME> in regatom(), if
6538 you change this make sure you change that */
6539 char* name_start = RExC_parse;
6541 SV *sv_dat = reg_scan_name(pRExC_state,
6542 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6543 if (RExC_parse == name_start || *RExC_parse != ')')
6544 vFAIL2("Sequence %.3s... not terminated",parse_start);
6547 num = add_data( pRExC_state, 1, "S" );
6548 RExC_rxi->data->data[num]=(void*)sv_dat;
6549 SvREFCNT_inc_simple_void(sv_dat);
6552 ret = reganode(pRExC_state,
6555 : (MORE_ASCII_RESTRICTED)
6557 : (AT_LEAST_UNI_SEMANTICS)
6565 Set_Node_Offset(ret, parse_start+1);
6566 Set_Node_Cur_Length(ret); /* MJD */
6568 nextchar(pRExC_state);
6572 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6574 case '<': /* (?<...) */
6575 if (*RExC_parse == '!')
6577 else if (*RExC_parse != '=')
6583 case '\'': /* (?'...') */
6584 name_start= RExC_parse;
6585 svname = reg_scan_name(pRExC_state,
6586 SIZE_ONLY ? /* reverse test from the others */
6587 REG_RSN_RETURN_NAME :
6588 REG_RSN_RETURN_NULL);
6589 if (RExC_parse == name_start) {
6591 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6594 if (*RExC_parse != paren)
6595 vFAIL2("Sequence (?%c... not terminated",
6596 paren=='>' ? '<' : paren);
6600 if (!svname) /* shouldn't happen */
6602 "panic: reg_scan_name returned NULL");
6603 if (!RExC_paren_names) {
6604 RExC_paren_names= newHV();
6605 sv_2mortal(MUTABLE_SV(RExC_paren_names));
6607 RExC_paren_name_list= newAV();
6608 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6611 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6613 sv_dat = HeVAL(he_str);
6615 /* croak baby croak */
6617 "panic: paren_name hash element allocation failed");
6618 } else if ( SvPOK(sv_dat) ) {
6619 /* (?|...) can mean we have dupes so scan to check
6620 its already been stored. Maybe a flag indicating
6621 we are inside such a construct would be useful,
6622 but the arrays are likely to be quite small, so
6623 for now we punt -- dmq */
6624 IV count = SvIV(sv_dat);
6625 I32 *pv = (I32*)SvPVX(sv_dat);
6627 for ( i = 0 ; i < count ; i++ ) {
6628 if ( pv[i] == RExC_npar ) {
6634 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6635 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6636 pv[count] = RExC_npar;
6637 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6640 (void)SvUPGRADE(sv_dat,SVt_PVNV);
6641 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6643 SvIV_set(sv_dat, 1);
6646 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6647 SvREFCNT_dec(svname);
6650 /*sv_dump(sv_dat);*/
6652 nextchar(pRExC_state);
6654 goto capturing_parens;
6656 RExC_seen |= REG_SEEN_LOOKBEHIND;
6657 RExC_in_lookbehind++;
6659 case '=': /* (?=...) */
6660 RExC_seen_zerolen++;
6662 case '!': /* (?!...) */
6663 RExC_seen_zerolen++;
6664 if (*RExC_parse == ')') {
6665 ret=reg_node(pRExC_state, OPFAIL);
6666 nextchar(pRExC_state);
6670 case '|': /* (?|...) */
6671 /* branch reset, behave like a (?:...) except that
6672 buffers in alternations share the same numbers */
6674 after_freeze = freeze_paren = RExC_npar;
6676 case ':': /* (?:...) */
6677 case '>': /* (?>...) */
6679 case '$': /* (?$...) */
6680 case '@': /* (?@...) */
6681 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6683 case '#': /* (?#...) */
6684 while (*RExC_parse && *RExC_parse != ')')
6686 if (*RExC_parse != ')')
6687 FAIL("Sequence (?#... not terminated");
6688 nextchar(pRExC_state);
6691 case '0' : /* (?0) */
6692 case 'R' : /* (?R) */
6693 if (*RExC_parse != ')')
6694 FAIL("Sequence (?R) not terminated");
6695 ret = reg_node(pRExC_state, GOSTART);
6696 *flagp |= POSTPONED;
6697 nextchar(pRExC_state);
6700 { /* named and numeric backreferences */
6702 case '&': /* (?&NAME) */
6703 parse_start = RExC_parse - 1;
6706 SV *sv_dat = reg_scan_name(pRExC_state,
6707 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6708 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6710 goto gen_recurse_regop;
6713 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6715 vFAIL("Illegal pattern");
6717 goto parse_recursion;
6719 case '-': /* (?-1) */
6720 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6721 RExC_parse--; /* rewind to let it be handled later */
6725 case '1': case '2': case '3': case '4': /* (?1) */
6726 case '5': case '6': case '7': case '8': case '9':
6729 num = atoi(RExC_parse);
6730 parse_start = RExC_parse - 1; /* MJD */
6731 if (*RExC_parse == '-')
6733 while (isDIGIT(*RExC_parse))
6735 if (*RExC_parse!=')')
6736 vFAIL("Expecting close bracket");
6739 if ( paren == '-' ) {
6741 Diagram of capture buffer numbering.
6742 Top line is the normal capture buffer numbers
6743 Bottom line is the negative indexing as from
6747 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6751 num = RExC_npar + num;
6754 vFAIL("Reference to nonexistent group");
6756 } else if ( paren == '+' ) {
6757 num = RExC_npar + num - 1;
6760 ret = reganode(pRExC_state, GOSUB, num);
6762 if (num > (I32)RExC_rx->nparens) {
6764 vFAIL("Reference to nonexistent group");
6766 ARG2L_SET( ret, RExC_recurse_count++);
6768 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6769 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6773 RExC_seen |= REG_SEEN_RECURSE;
6774 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6775 Set_Node_Offset(ret, parse_start); /* MJD */
6777 *flagp |= POSTPONED;
6778 nextchar(pRExC_state);
6780 } /* named and numeric backreferences */
6783 case '?': /* (??...) */
6785 if (*RExC_parse != '{') {
6787 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6790 *flagp |= POSTPONED;
6791 paren = *RExC_parse++;
6793 case '{': /* (?{...}) */
6798 char *s = RExC_parse;
6800 RExC_seen_zerolen++;
6801 RExC_seen |= REG_SEEN_EVAL;
6802 while (count && (c = *RExC_parse)) {
6813 if (*RExC_parse != ')') {
6815 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6819 OP_4tree *sop, *rop;
6820 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6823 Perl_save_re_context(aTHX);
6824 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6825 sop->op_private |= OPpREFCOUNTED;
6826 /* re_dup will OpREFCNT_inc */
6827 OpREFCNT_set(sop, 1);
6830 n = add_data(pRExC_state, 3, "nop");
6831 RExC_rxi->data->data[n] = (void*)rop;
6832 RExC_rxi->data->data[n+1] = (void*)sop;
6833 RExC_rxi->data->data[n+2] = (void*)pad;
6836 else { /* First pass */
6837 if (PL_reginterp_cnt < ++RExC_seen_evals
6839 /* No compiled RE interpolated, has runtime
6840 components ===> unsafe. */
6841 FAIL("Eval-group not allowed at runtime, use re 'eval'");
6842 if (PL_tainting && PL_tainted)
6843 FAIL("Eval-group in insecure regular expression");
6844 #if PERL_VERSION > 8
6845 if (IN_PERL_COMPILETIME)
6850 nextchar(pRExC_state);
6852 ret = reg_node(pRExC_state, LOGICAL);
6855 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6856 /* deal with the length of this later - MJD */
6859 ret = reganode(pRExC_state, EVAL, n);
6860 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6861 Set_Node_Offset(ret, parse_start);
6864 case '(': /* (?(?{...})...) and (?(?=...)...) */
6867 if (RExC_parse[0] == '?') { /* (?(?...)) */
6868 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6869 || RExC_parse[1] == '<'
6870 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6873 ret = reg_node(pRExC_state, LOGICAL);
6876 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6880 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6881 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6883 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6884 char *name_start= RExC_parse++;
6886 SV *sv_dat=reg_scan_name(pRExC_state,
6887 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6888 if (RExC_parse == name_start || *RExC_parse != ch)
6889 vFAIL2("Sequence (?(%c... not terminated",
6890 (ch == '>' ? '<' : ch));
6893 num = add_data( pRExC_state, 1, "S" );
6894 RExC_rxi->data->data[num]=(void*)sv_dat;
6895 SvREFCNT_inc_simple_void(sv_dat);
6897 ret = reganode(pRExC_state,NGROUPP,num);
6898 goto insert_if_check_paren;
6900 else if (RExC_parse[0] == 'D' &&
6901 RExC_parse[1] == 'E' &&
6902 RExC_parse[2] == 'F' &&
6903 RExC_parse[3] == 'I' &&
6904 RExC_parse[4] == 'N' &&
6905 RExC_parse[5] == 'E')
6907 ret = reganode(pRExC_state,DEFINEP,0);
6910 goto insert_if_check_paren;
6912 else if (RExC_parse[0] == 'R') {
6915 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6916 parno = atoi(RExC_parse++);
6917 while (isDIGIT(*RExC_parse))
6919 } else if (RExC_parse[0] == '&') {
6922 sv_dat = reg_scan_name(pRExC_state,
6923 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6924 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6926 ret = reganode(pRExC_state,INSUBP,parno);
6927 goto insert_if_check_paren;
6929 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6932 parno = atoi(RExC_parse++);
6934 while (isDIGIT(*RExC_parse))
6936 ret = reganode(pRExC_state, GROUPP, parno);
6938 insert_if_check_paren:
6939 if ((c = *nextchar(pRExC_state)) != ')')
6940 vFAIL("Switch condition not recognized");
6942 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6943 br = regbranch(pRExC_state, &flags, 1,depth+1);
6945 br = reganode(pRExC_state, LONGJMP, 0);
6947 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6948 c = *nextchar(pRExC_state);
6953 vFAIL("(?(DEFINE)....) does not allow branches");
6954 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6955 regbranch(pRExC_state, &flags, 1,depth+1);
6956 REGTAIL(pRExC_state, ret, lastbr);
6959 c = *nextchar(pRExC_state);
6964 vFAIL("Switch (?(condition)... contains too many branches");
6965 ender = reg_node(pRExC_state, TAIL);
6966 REGTAIL(pRExC_state, br, ender);
6968 REGTAIL(pRExC_state, lastbr, ender);
6969 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6972 REGTAIL(pRExC_state, ret, ender);
6973 RExC_size++; /* XXX WHY do we need this?!!
6974 For large programs it seems to be required
6975 but I can't figure out why. -- dmq*/
6979 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6983 RExC_parse--; /* for vFAIL to print correctly */
6984 vFAIL("Sequence (? incomplete");
6986 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6988 has_use_defaults = TRUE;
6989 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6990 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
6991 ? REGEX_UNICODE_CHARSET
6992 : REGEX_DEPENDS_CHARSET);
6996 parse_flags: /* (?i) */
6998 U32 posflags = 0, negflags = 0;
6999 U32 *flagsp = &posflags;
7000 char has_charset_modifier = '\0';
7001 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
7002 ? REGEX_UNICODE_CHARSET
7003 : REGEX_DEPENDS_CHARSET;
7005 while (*RExC_parse) {
7006 /* && strchr("iogcmsx", *RExC_parse) */
7007 /* (?g), (?gc) and (?o) are useless here
7008 and must be globally applied -- japhy */
7009 switch (*RExC_parse) {
7010 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7011 case LOCALE_PAT_MOD:
7012 if (has_charset_modifier) {
7013 goto excess_modifier;
7015 else if (flagsp == &negflags) {
7018 cs = REGEX_LOCALE_CHARSET;
7019 has_charset_modifier = LOCALE_PAT_MOD;
7020 RExC_contains_locale = 1;
7022 case UNICODE_PAT_MOD:
7023 if (has_charset_modifier) {
7024 goto excess_modifier;
7026 else if (flagsp == &negflags) {
7029 cs = REGEX_UNICODE_CHARSET;
7030 has_charset_modifier = UNICODE_PAT_MOD;
7032 case ASCII_RESTRICT_PAT_MOD:
7033 if (flagsp == &negflags) {
7036 if (has_charset_modifier) {
7037 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
7038 goto excess_modifier;
7040 /* Doubled modifier implies more restricted */
7041 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7044 cs = REGEX_ASCII_RESTRICTED_CHARSET;
7046 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
7048 case DEPENDS_PAT_MOD:
7049 if (has_use_defaults) {
7050 goto fail_modifiers;
7052 else if (flagsp == &negflags) {
7055 else if (has_charset_modifier) {
7056 goto excess_modifier;
7059 /* The dual charset means unicode semantics if the
7060 * pattern (or target, not known until runtime) are
7061 * utf8, or something in the pattern indicates unicode
7063 cs = (RExC_utf8 || RExC_uni_semantics)
7064 ? REGEX_UNICODE_CHARSET
7065 : REGEX_DEPENDS_CHARSET;
7066 has_charset_modifier = DEPENDS_PAT_MOD;
7070 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
7071 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
7073 else if (has_charset_modifier == *(RExC_parse - 1)) {
7074 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
7077 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
7082 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
7084 case ONCE_PAT_MOD: /* 'o' */
7085 case GLOBAL_PAT_MOD: /* 'g' */
7086 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7087 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7088 if (! (wastedflags & wflagbit) ) {
7089 wastedflags |= wflagbit;
7092 "Useless (%s%c) - %suse /%c modifier",
7093 flagsp == &negflags ? "?-" : "?",
7095 flagsp == &negflags ? "don't " : "",
7102 case CONTINUE_PAT_MOD: /* 'c' */
7103 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7104 if (! (wastedflags & WASTED_C) ) {
7105 wastedflags |= WASTED_GC;
7108 "Useless (%sc) - %suse /gc modifier",
7109 flagsp == &negflags ? "?-" : "?",
7110 flagsp == &negflags ? "don't " : ""
7115 case KEEPCOPY_PAT_MOD: /* 'p' */
7116 if (flagsp == &negflags) {
7118 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7120 *flagsp |= RXf_PMf_KEEPCOPY;
7124 /* A flag is a default iff it is following a minus, so
7125 * if there is a minus, it means will be trying to
7126 * re-specify a default which is an error */
7127 if (has_use_defaults || flagsp == &negflags) {
7130 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7134 wastedflags = 0; /* reset so (?g-c) warns twice */
7140 RExC_flags |= posflags;
7141 RExC_flags &= ~negflags;
7142 set_regex_charset(&RExC_flags, cs);
7144 oregflags |= posflags;
7145 oregflags &= ~negflags;
7146 set_regex_charset(&oregflags, cs);
7148 nextchar(pRExC_state);
7159 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7164 }} /* one for the default block, one for the switch */
7171 ret = reganode(pRExC_state, OPEN, parno);
7174 RExC_nestroot = parno;
7175 if (RExC_seen & REG_SEEN_RECURSE
7176 && !RExC_open_parens[parno-1])
7178 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7179 "Setting open paren #%"IVdf" to %d\n",
7180 (IV)parno, REG_NODE_NUM(ret)));
7181 RExC_open_parens[parno-1]= ret;
7184 Set_Node_Length(ret, 1); /* MJD */
7185 Set_Node_Offset(ret, RExC_parse); /* MJD */
7193 /* Pick up the branches, linking them together. */
7194 parse_start = RExC_parse; /* MJD */
7195 br = regbranch(pRExC_state, &flags, 1,depth+1);
7197 /* branch_len = (paren != 0); */
7201 if (*RExC_parse == '|') {
7202 if (!SIZE_ONLY && RExC_extralen) {
7203 reginsert(pRExC_state, BRANCHJ, br, depth+1);
7206 reginsert(pRExC_state, BRANCH, br, depth+1);
7207 Set_Node_Length(br, paren != 0);
7208 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7212 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
7214 else if (paren == ':') {
7215 *flagp |= flags&SIMPLE;
7217 if (is_open) { /* Starts with OPEN. */
7218 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
7220 else if (paren != '?') /* Not Conditional */
7222 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7224 while (*RExC_parse == '|') {
7225 if (!SIZE_ONLY && RExC_extralen) {
7226 ender = reganode(pRExC_state, LONGJMP,0);
7227 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7230 RExC_extralen += 2; /* Account for LONGJMP. */
7231 nextchar(pRExC_state);
7233 if (RExC_npar > after_freeze)
7234 after_freeze = RExC_npar;
7235 RExC_npar = freeze_paren;
7237 br = regbranch(pRExC_state, &flags, 0, depth+1);
7241 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
7243 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7246 if (have_branch || paren != ':') {
7247 /* Make a closing node, and hook it on the end. */
7250 ender = reg_node(pRExC_state, TAIL);
7253 ender = reganode(pRExC_state, CLOSE, parno);
7254 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7255 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7256 "Setting close paren #%"IVdf" to %d\n",
7257 (IV)parno, REG_NODE_NUM(ender)));
7258 RExC_close_parens[parno-1]= ender;
7259 if (RExC_nestroot == parno)
7262 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7263 Set_Node_Length(ender,1); /* MJD */
7269 *flagp &= ~HASWIDTH;
7272 ender = reg_node(pRExC_state, SUCCEED);
7275 ender = reg_node(pRExC_state, END);
7277 assert(!RExC_opend); /* there can only be one! */
7282 REGTAIL(pRExC_state, lastbr, ender);
7284 if (have_branch && !SIZE_ONLY) {
7286 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7288 /* Hook the tails of the branches to the closing node. */
7289 for (br = ret; br; br = regnext(br)) {
7290 const U8 op = PL_regkind[OP(br)];
7292 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7294 else if (op == BRANCHJ) {
7295 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7303 static const char parens[] = "=!<,>";
7305 if (paren && (p = strchr(parens, paren))) {
7306 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7307 int flag = (p - parens) > 1;
7310 node = SUSPEND, flag = 0;
7311 reginsert(pRExC_state, node,ret, depth+1);
7312 Set_Node_Cur_Length(ret);
7313 Set_Node_Offset(ret, parse_start + 1);
7315 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7319 /* Check for proper termination. */
7321 RExC_flags = oregflags;
7322 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7323 RExC_parse = oregcomp_parse;
7324 vFAIL("Unmatched (");
7327 else if (!paren && RExC_parse < RExC_end) {
7328 if (*RExC_parse == ')') {
7330 vFAIL("Unmatched )");
7333 FAIL("Junk on end of regexp"); /* "Can't happen". */
7337 if (RExC_in_lookbehind) {
7338 RExC_in_lookbehind--;
7340 if (after_freeze > RExC_npar)
7341 RExC_npar = after_freeze;
7346 - regbranch - one alternative of an | operator
7348 * Implements the concatenation operator.
7351 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7354 register regnode *ret;
7355 register regnode *chain = NULL;
7356 register regnode *latest;
7357 I32 flags = 0, c = 0;
7358 GET_RE_DEBUG_FLAGS_DECL;
7360 PERL_ARGS_ASSERT_REGBRANCH;
7362 DEBUG_PARSE("brnc");
7367 if (!SIZE_ONLY && RExC_extralen)
7368 ret = reganode(pRExC_state, BRANCHJ,0);
7370 ret = reg_node(pRExC_state, BRANCH);
7371 Set_Node_Length(ret, 1);
7375 if (!first && SIZE_ONLY)
7376 RExC_extralen += 1; /* BRANCHJ */
7378 *flagp = WORST; /* Tentatively. */
7381 nextchar(pRExC_state);
7382 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7384 latest = regpiece(pRExC_state, &flags,depth+1);
7385 if (latest == NULL) {
7386 if (flags & TRYAGAIN)
7390 else if (ret == NULL)
7392 *flagp |= flags&(HASWIDTH|POSTPONED);
7393 if (chain == NULL) /* First piece. */
7394 *flagp |= flags&SPSTART;
7397 REGTAIL(pRExC_state, chain, latest);
7402 if (chain == NULL) { /* Loop ran zero times. */
7403 chain = reg_node(pRExC_state, NOTHING);
7408 *flagp |= flags&SIMPLE;
7415 - regpiece - something followed by possible [*+?]
7417 * Note that the branching code sequences used for ? and the general cases
7418 * of * and + are somewhat optimized: they use the same NOTHING node as
7419 * both the endmarker for their branch list and the body of the last branch.
7420 * It might seem that this node could be dispensed with entirely, but the
7421 * endmarker role is not redundant.
7424 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7427 register regnode *ret;
7429 register char *next;
7431 const char * const origparse = RExC_parse;
7433 I32 max = REG_INFTY;
7435 const char *maxpos = NULL;
7436 GET_RE_DEBUG_FLAGS_DECL;
7438 PERL_ARGS_ASSERT_REGPIECE;
7440 DEBUG_PARSE("piec");
7442 ret = regatom(pRExC_state, &flags,depth+1);
7444 if (flags & TRYAGAIN)
7451 if (op == '{' && regcurly(RExC_parse)) {
7453 parse_start = RExC_parse; /* MJD */
7454 next = RExC_parse + 1;
7455 while (isDIGIT(*next) || *next == ',') {
7464 if (*next == '}') { /* got one */
7468 min = atoi(RExC_parse);
7472 maxpos = RExC_parse;
7474 if (!max && *maxpos != '0')
7475 max = REG_INFTY; /* meaning "infinity" */
7476 else if (max >= REG_INFTY)
7477 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7479 nextchar(pRExC_state);
7482 if ((flags&SIMPLE)) {
7483 RExC_naughty += 2 + RExC_naughty / 2;
7484 reginsert(pRExC_state, CURLY, ret, depth+1);
7485 Set_Node_Offset(ret, parse_start+1); /* MJD */
7486 Set_Node_Cur_Length(ret);
7489 regnode * const w = reg_node(pRExC_state, WHILEM);
7492 REGTAIL(pRExC_state, ret, w);
7493 if (!SIZE_ONLY && RExC_extralen) {
7494 reginsert(pRExC_state, LONGJMP,ret, depth+1);
7495 reginsert(pRExC_state, NOTHING,ret, depth+1);
7496 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
7498 reginsert(pRExC_state, CURLYX,ret, depth+1);
7500 Set_Node_Offset(ret, parse_start+1);
7501 Set_Node_Length(ret,
7502 op == '{' ? (RExC_parse - parse_start) : 1);
7504 if (!SIZE_ONLY && RExC_extralen)
7505 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
7506 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7508 RExC_whilem_seen++, RExC_extralen += 3;
7509 RExC_naughty += 4 + RExC_naughty; /* compound interest */
7518 vFAIL("Can't do {n,m} with n > m");
7520 ARG1_SET(ret, (U16)min);
7521 ARG2_SET(ret, (U16)max);
7533 #if 0 /* Now runtime fix should be reliable. */
7535 /* if this is reinstated, don't forget to put this back into perldiag:
7537 =item Regexp *+ operand could be empty at {#} in regex m/%s/
7539 (F) The part of the regexp subject to either the * or + quantifier
7540 could match an empty string. The {#} shows in the regular
7541 expression about where the problem was discovered.
7545 if (!(flags&HASWIDTH) && op != '?')
7546 vFAIL("Regexp *+ operand could be empty");
7549 parse_start = RExC_parse;
7550 nextchar(pRExC_state);
7552 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7554 if (op == '*' && (flags&SIMPLE)) {
7555 reginsert(pRExC_state, STAR, ret, depth+1);
7559 else if (op == '*') {
7563 else if (op == '+' && (flags&SIMPLE)) {
7564 reginsert(pRExC_state, PLUS, ret, depth+1);
7568 else if (op == '+') {
7572 else if (op == '?') {
7577 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7578 ckWARN3reg(RExC_parse,
7579 "%.*s matches null string many times",
7580 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7584 if (RExC_parse < RExC_end && *RExC_parse == '?') {
7585 nextchar(pRExC_state);
7586 reginsert(pRExC_state, MINMOD, ret, depth+1);
7587 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7589 #ifndef REG_ALLOW_MINMOD_SUSPEND
7592 if (RExC_parse < RExC_end && *RExC_parse == '+') {
7594 nextchar(pRExC_state);
7595 ender = reg_node(pRExC_state, SUCCEED);
7596 REGTAIL(pRExC_state, ret, ender);
7597 reginsert(pRExC_state, SUSPEND, ret, depth+1);
7599 ender = reg_node(pRExC_state, TAIL);
7600 REGTAIL(pRExC_state, ret, ender);
7604 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7606 vFAIL("Nested quantifiers");
7613 /* reg_namedseq(pRExC_state,UVp, UV depth)
7615 This is expected to be called by a parser routine that has
7616 recognized '\N' and needs to handle the rest. RExC_parse is
7617 expected to point at the first char following the N at the time
7620 The \N may be inside (indicated by valuep not being NULL) or outside a
7623 \N may begin either a named sequence, or if outside a character class, mean
7624 to match a non-newline. For non single-quoted regexes, the tokenizer has
7625 attempted to decide which, and in the case of a named sequence converted it
7626 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7627 where c1... are the characters in the sequence. For single-quoted regexes,
7628 the tokenizer passes the \N sequence through unchanged; this code will not
7629 attempt to determine this nor expand those. The net effect is that if the
7630 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7631 signals that this \N occurrence means to match a non-newline.
7633 Only the \N{U+...} form should occur in a character class, for the same
7634 reason that '.' inside a character class means to just match a period: it
7635 just doesn't make sense.
7637 If valuep is non-null then it is assumed that we are parsing inside
7638 of a charclass definition and the first codepoint in the resolved
7639 string is returned via *valuep and the routine will return NULL.
7640 In this mode if a multichar string is returned from the charnames
7641 handler, a warning will be issued, and only the first char in the
7642 sequence will be examined. If the string returned is zero length
7643 then the value of *valuep is undefined and NON-NULL will
7644 be returned to indicate failure. (This will NOT be a valid pointer
7647 If valuep is null then it is assumed that we are parsing normal text and a
7648 new EXACT node is inserted into the program containing the resolved string,
7649 and a pointer to the new node is returned. But if the string is zero length
7650 a NOTHING node is emitted instead.
7652 On success RExC_parse is set to the char following the endbrace.
7653 Parsing failures will generate a fatal error via vFAIL(...)
7656 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
7658 char * endbrace; /* '}' following the name */
7659 regnode *ret = NULL;
7662 GET_RE_DEBUG_FLAGS_DECL;
7664 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7668 /* The [^\n] meaning of \N ignores spaces and comments under the /x
7669 * modifier. The other meaning does not */
7670 p = (RExC_flags & RXf_PMf_EXTENDED)
7671 ? regwhite( pRExC_state, RExC_parse )
7674 /* Disambiguate between \N meaning a named character versus \N meaning
7675 * [^\n]. The former is assumed when it can't be the latter. */
7676 if (*p != '{' || regcurly(p)) {
7679 /* no bare \N in a charclass */
7680 vFAIL("\\N in a character class must be a named character: \\N{...}");
7682 nextchar(pRExC_state);
7683 ret = reg_node(pRExC_state, REG_ANY);
7684 *flagp |= HASWIDTH|SIMPLE;
7687 Set_Node_Length(ret, 1); /* MJD */
7691 /* Here, we have decided it should be a named sequence */
7693 /* The test above made sure that the next real character is a '{', but
7694 * under the /x modifier, it could be separated by space (or a comment and
7695 * \n) and this is not allowed (for consistency with \x{...} and the
7696 * tokenizer handling of \N{NAME}). */
7697 if (*RExC_parse != '{') {
7698 vFAIL("Missing braces on \\N{}");
7701 RExC_parse++; /* Skip past the '{' */
7703 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7704 || ! (endbrace == RExC_parse /* nothing between the {} */
7705 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
7706 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7708 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
7709 vFAIL("\\N{NAME} must be resolved by the lexer");
7712 if (endbrace == RExC_parse) { /* empty: \N{} */
7714 RExC_parse = endbrace + 1;
7715 return reg_node(pRExC_state,NOTHING);
7719 ckWARNreg(RExC_parse,
7720 "Ignoring zero length \\N{} in character class"
7722 RExC_parse = endbrace + 1;
7725 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7728 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
7729 RExC_parse += 2; /* Skip past the 'U+' */
7731 if (valuep) { /* In a bracketed char class */
7732 /* We only pay attention to the first char of
7733 multichar strings being returned. I kinda wonder
7734 if this makes sense as it does change the behaviour
7735 from earlier versions, OTOH that behaviour was broken
7736 as well. XXX Solution is to recharacterize as
7737 [rest-of-class]|multi1|multi2... */
7739 STRLEN length_of_hex;
7740 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7741 | PERL_SCAN_DISALLOW_PREFIX
7742 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7744 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7745 if (endchar < endbrace) {
7746 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7749 length_of_hex = (STRLEN)(endchar - RExC_parse);
7750 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7752 /* The tokenizer should have guaranteed validity, but it's possible to
7753 * bypass it by using single quoting, so check */
7754 if (length_of_hex == 0
7755 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7757 RExC_parse += length_of_hex; /* Includes all the valid */
7758 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7759 ? UTF8SKIP(RExC_parse)
7761 /* Guard against malformed utf8 */
7762 if (RExC_parse >= endchar) RExC_parse = endchar;
7763 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7766 RExC_parse = endbrace + 1;
7767 if (endchar == endbrace) return NULL;
7769 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
7771 else { /* Not a char class */
7773 /* What is done here is to convert this to a sub-pattern of the form
7774 * (?:\x{char1}\x{char2}...)
7775 * and then call reg recursively. That way, it retains its atomicness,
7776 * while not having to worry about special handling that some code
7777 * points may have. toke.c has converted the original Unicode values
7778 * to native, so that we can just pass on the hex values unchanged. We
7779 * do have to set a flag to keep recoding from happening in the
7782 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
7784 char *endchar; /* Points to '.' or '}' ending cur char in the input
7786 char *orig_end = RExC_end;
7788 while (RExC_parse < endbrace) {
7790 /* Code points are separated by dots. If none, there is only one
7791 * code point, and is terminated by the brace */
7792 endchar = RExC_parse + strcspn(RExC_parse, ".}");
7794 /* Convert to notation the rest of the code understands */
7795 sv_catpv(substitute_parse, "\\x{");
7796 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
7797 sv_catpv(substitute_parse, "}");
7799 /* Point to the beginning of the next character in the sequence. */
7800 RExC_parse = endchar + 1;
7802 sv_catpv(substitute_parse, ")");
7804 RExC_parse = SvPV(substitute_parse, len);
7806 /* Don't allow empty number */
7808 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7810 RExC_end = RExC_parse + len;
7812 /* The values are Unicode, and therefore not subject to recoding */
7813 RExC_override_recoding = 1;
7815 ret = reg(pRExC_state, 1, flagp, depth+1);
7817 RExC_parse = endbrace;
7818 RExC_end = orig_end;
7819 RExC_override_recoding = 0;
7821 nextchar(pRExC_state);
7831 * It returns the code point in utf8 for the value in *encp.
7832 * value: a code value in the source encoding
7833 * encp: a pointer to an Encode object
7835 * If the result from Encode is not a single character,
7836 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7839 S_reg_recode(pTHX_ const char value, SV **encp)
7842 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7843 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7844 const STRLEN newlen = SvCUR(sv);
7845 UV uv = UNICODE_REPLACEMENT;
7847 PERL_ARGS_ASSERT_REG_RECODE;
7851 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7854 if (!newlen || numlen != newlen) {
7855 uv = UNICODE_REPLACEMENT;
7863 - regatom - the lowest level
7865 Try to identify anything special at the start of the pattern. If there
7866 is, then handle it as required. This may involve generating a single regop,
7867 such as for an assertion; or it may involve recursing, such as to
7868 handle a () structure.
7870 If the string doesn't start with something special then we gobble up
7871 as much literal text as we can.
7873 Once we have been able to handle whatever type of thing started the
7874 sequence, we return.
7876 Note: we have to be careful with escapes, as they can be both literal
7877 and special, and in the case of \10 and friends can either, depending
7878 on context. Specifically there are two separate switches for handling
7879 escape sequences, with the one for handling literal escapes requiring
7880 a dummy entry for all of the special escapes that are actually handled
7885 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7888 register regnode *ret = NULL;
7890 char *parse_start = RExC_parse;
7892 GET_RE_DEBUG_FLAGS_DECL;
7893 DEBUG_PARSE("atom");
7894 *flagp = WORST; /* Tentatively. */
7896 PERL_ARGS_ASSERT_REGATOM;
7899 switch ((U8)*RExC_parse) {
7901 RExC_seen_zerolen++;
7902 nextchar(pRExC_state);
7903 if (RExC_flags & RXf_PMf_MULTILINE)
7904 ret = reg_node(pRExC_state, MBOL);
7905 else if (RExC_flags & RXf_PMf_SINGLELINE)
7906 ret = reg_node(pRExC_state, SBOL);
7908 ret = reg_node(pRExC_state, BOL);
7909 Set_Node_Length(ret, 1); /* MJD */
7912 nextchar(pRExC_state);
7914 RExC_seen_zerolen++;
7915 if (RExC_flags & RXf_PMf_MULTILINE)
7916 ret = reg_node(pRExC_state, MEOL);
7917 else if (RExC_flags & RXf_PMf_SINGLELINE)
7918 ret = reg_node(pRExC_state, SEOL);
7920 ret = reg_node(pRExC_state, EOL);
7921 Set_Node_Length(ret, 1); /* MJD */
7924 nextchar(pRExC_state);
7925 if (RExC_flags & RXf_PMf_SINGLELINE)
7926 ret = reg_node(pRExC_state, SANY);
7928 ret = reg_node(pRExC_state, REG_ANY);
7929 *flagp |= HASWIDTH|SIMPLE;
7931 Set_Node_Length(ret, 1); /* MJD */
7935 char * const oregcomp_parse = ++RExC_parse;
7936 ret = regclass(pRExC_state,depth+1);
7937 if (*RExC_parse != ']') {
7938 RExC_parse = oregcomp_parse;
7939 vFAIL("Unmatched [");
7941 nextchar(pRExC_state);
7942 *flagp |= HASWIDTH|SIMPLE;
7943 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7947 nextchar(pRExC_state);
7948 ret = reg(pRExC_state, 1, &flags,depth+1);
7950 if (flags & TRYAGAIN) {
7951 if (RExC_parse == RExC_end) {
7952 /* Make parent create an empty node if needed. */
7960 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7964 if (flags & TRYAGAIN) {
7968 vFAIL("Internal urp");
7969 /* Supposed to be caught earlier. */
7972 if (!regcurly(RExC_parse)) {
7981 vFAIL("Quantifier follows nothing");
7986 This switch handles escape sequences that resolve to some kind
7987 of special regop and not to literal text. Escape sequnces that
7988 resolve to literal text are handled below in the switch marked
7991 Every entry in this switch *must* have a corresponding entry
7992 in the literal escape switch. However, the opposite is not
7993 required, as the default for this switch is to jump to the
7994 literal text handling code.
7996 switch ((U8)*++RExC_parse) {
7997 /* Special Escapes */
7999 RExC_seen_zerolen++;
8000 ret = reg_node(pRExC_state, SBOL);
8002 goto finish_meta_pat;
8004 ret = reg_node(pRExC_state, GPOS);
8005 RExC_seen |= REG_SEEN_GPOS;
8007 goto finish_meta_pat;
8009 RExC_seen_zerolen++;
8010 ret = reg_node(pRExC_state, KEEPS);
8012 /* XXX:dmq : disabling in-place substitution seems to
8013 * be necessary here to avoid cases of memory corruption, as
8014 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8016 RExC_seen |= REG_SEEN_LOOKBEHIND;
8017 goto finish_meta_pat;
8019 ret = reg_node(pRExC_state, SEOL);
8021 RExC_seen_zerolen++; /* Do not optimize RE away */
8022 goto finish_meta_pat;
8024 ret = reg_node(pRExC_state, EOS);
8026 RExC_seen_zerolen++; /* Do not optimize RE away */
8027 goto finish_meta_pat;
8029 ret = reg_node(pRExC_state, CANY);
8030 RExC_seen |= REG_SEEN_CANY;
8031 *flagp |= HASWIDTH|SIMPLE;
8032 goto finish_meta_pat;
8034 ret = reg_node(pRExC_state, CLUMP);
8036 goto finish_meta_pat;
8038 switch (get_regex_charset(RExC_flags)) {
8039 case REGEX_LOCALE_CHARSET:
8042 case REGEX_UNICODE_CHARSET:
8045 case REGEX_ASCII_RESTRICTED_CHARSET:
8046 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8049 case REGEX_DEPENDS_CHARSET:
8055 ret = reg_node(pRExC_state, op);
8056 *flagp |= HASWIDTH|SIMPLE;
8057 goto finish_meta_pat;
8059 switch (get_regex_charset(RExC_flags)) {
8060 case REGEX_LOCALE_CHARSET:
8063 case REGEX_UNICODE_CHARSET:
8066 case REGEX_ASCII_RESTRICTED_CHARSET:
8067 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8070 case REGEX_DEPENDS_CHARSET:
8076 ret = reg_node(pRExC_state, op);
8077 *flagp |= HASWIDTH|SIMPLE;
8078 goto finish_meta_pat;
8080 RExC_seen_zerolen++;
8081 RExC_seen |= REG_SEEN_LOOKBEHIND;
8082 switch (get_regex_charset(RExC_flags)) {
8083 case REGEX_LOCALE_CHARSET:
8086 case REGEX_UNICODE_CHARSET:
8089 case REGEX_ASCII_RESTRICTED_CHARSET:
8090 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8093 case REGEX_DEPENDS_CHARSET:
8099 ret = reg_node(pRExC_state, op);
8100 FLAGS(ret) = get_regex_charset(RExC_flags);
8102 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8103 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8105 goto finish_meta_pat;
8107 RExC_seen_zerolen++;
8108 RExC_seen |= REG_SEEN_LOOKBEHIND;
8109 switch (get_regex_charset(RExC_flags)) {
8110 case REGEX_LOCALE_CHARSET:
8113 case REGEX_UNICODE_CHARSET:
8116 case REGEX_ASCII_RESTRICTED_CHARSET:
8117 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8120 case REGEX_DEPENDS_CHARSET:
8126 ret = reg_node(pRExC_state, op);
8127 FLAGS(ret) = get_regex_charset(RExC_flags);
8129 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8130 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8132 goto finish_meta_pat;
8134 switch (get_regex_charset(RExC_flags)) {
8135 case REGEX_LOCALE_CHARSET:
8138 case REGEX_UNICODE_CHARSET:
8141 case REGEX_ASCII_RESTRICTED_CHARSET:
8142 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8145 case REGEX_DEPENDS_CHARSET:
8151 ret = reg_node(pRExC_state, op);
8152 *flagp |= HASWIDTH|SIMPLE;
8153 goto finish_meta_pat;
8155 switch (get_regex_charset(RExC_flags)) {
8156 case REGEX_LOCALE_CHARSET:
8159 case REGEX_UNICODE_CHARSET:
8162 case REGEX_ASCII_RESTRICTED_CHARSET:
8163 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8166 case REGEX_DEPENDS_CHARSET:
8172 ret = reg_node(pRExC_state, op);
8173 *flagp |= HASWIDTH|SIMPLE;
8174 goto finish_meta_pat;
8176 switch (get_regex_charset(RExC_flags)) {
8177 case REGEX_LOCALE_CHARSET:
8180 case REGEX_ASCII_RESTRICTED_CHARSET:
8181 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8184 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8185 case REGEX_UNICODE_CHARSET:
8191 ret = reg_node(pRExC_state, op);
8192 *flagp |= HASWIDTH|SIMPLE;
8193 goto finish_meta_pat;
8195 switch (get_regex_charset(RExC_flags)) {
8196 case REGEX_LOCALE_CHARSET:
8199 case REGEX_ASCII_RESTRICTED_CHARSET:
8200 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8203 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8204 case REGEX_UNICODE_CHARSET:
8210 ret = reg_node(pRExC_state, op);
8211 *flagp |= HASWIDTH|SIMPLE;
8212 goto finish_meta_pat;
8214 ret = reg_node(pRExC_state, LNBREAK);
8215 *flagp |= HASWIDTH|SIMPLE;
8216 goto finish_meta_pat;
8218 ret = reg_node(pRExC_state, HORIZWS);
8219 *flagp |= HASWIDTH|SIMPLE;
8220 goto finish_meta_pat;
8222 ret = reg_node(pRExC_state, NHORIZWS);
8223 *flagp |= HASWIDTH|SIMPLE;
8224 goto finish_meta_pat;
8226 ret = reg_node(pRExC_state, VERTWS);
8227 *flagp |= HASWIDTH|SIMPLE;
8228 goto finish_meta_pat;
8230 ret = reg_node(pRExC_state, NVERTWS);
8231 *flagp |= HASWIDTH|SIMPLE;
8233 nextchar(pRExC_state);
8234 Set_Node_Length(ret, 2); /* MJD */
8239 char* const oldregxend = RExC_end;
8241 char* parse_start = RExC_parse - 2;
8244 if (RExC_parse[1] == '{') {
8245 /* a lovely hack--pretend we saw [\pX] instead */
8246 RExC_end = strchr(RExC_parse, '}');
8248 const U8 c = (U8)*RExC_parse;
8250 RExC_end = oldregxend;
8251 vFAIL2("Missing right brace on \\%c{}", c);
8256 RExC_end = RExC_parse + 2;
8257 if (RExC_end > oldregxend)
8258 RExC_end = oldregxend;
8262 ret = regclass(pRExC_state,depth+1);
8264 RExC_end = oldregxend;
8267 Set_Node_Offset(ret, parse_start + 2);
8268 Set_Node_Cur_Length(ret);
8269 nextchar(pRExC_state);
8270 *flagp |= HASWIDTH|SIMPLE;
8274 /* Handle \N and \N{NAME} here and not below because it can be
8275 multicharacter. join_exact() will join them up later on.
8276 Also this makes sure that things like /\N{BLAH}+/ and
8277 \N{BLAH} being multi char Just Happen. dmq*/
8279 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
8281 case 'k': /* Handle \k<NAME> and \k'NAME' */
8284 char ch= RExC_parse[1];
8285 if (ch != '<' && ch != '\'' && ch != '{') {
8287 vFAIL2("Sequence %.2s... not terminated",parse_start);
8289 /* this pretty much dupes the code for (?P=...) in reg(), if
8290 you change this make sure you change that */
8291 char* name_start = (RExC_parse += 2);
8293 SV *sv_dat = reg_scan_name(pRExC_state,
8294 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8295 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8296 if (RExC_parse == name_start || *RExC_parse != ch)
8297 vFAIL2("Sequence %.3s... not terminated",parse_start);
8300 num = add_data( pRExC_state, 1, "S" );
8301 RExC_rxi->data->data[num]=(void*)sv_dat;
8302 SvREFCNT_inc_simple_void(sv_dat);
8306 ret = reganode(pRExC_state,
8309 : (MORE_ASCII_RESTRICTED)
8311 : (AT_LEAST_UNI_SEMANTICS)
8319 /* override incorrect value set in reganode MJD */
8320 Set_Node_Offset(ret, parse_start+1);
8321 Set_Node_Cur_Length(ret); /* MJD */
8322 nextchar(pRExC_state);
8328 case '1': case '2': case '3': case '4':
8329 case '5': case '6': case '7': case '8': case '9':
8332 bool isg = *RExC_parse == 'g';
8337 if (*RExC_parse == '{') {
8341 if (*RExC_parse == '-') {
8345 if (hasbrace && !isDIGIT(*RExC_parse)) {
8346 if (isrel) RExC_parse--;
8348 goto parse_named_seq;
8350 num = atoi(RExC_parse);
8351 if (isg && num == 0)
8352 vFAIL("Reference to invalid group 0");
8354 num = RExC_npar - num;
8356 vFAIL("Reference to nonexistent or unclosed group");
8358 if (!isg && num > 9 && num >= RExC_npar)
8361 char * const parse_start = RExC_parse - 1; /* MJD */
8362 while (isDIGIT(*RExC_parse))
8364 if (parse_start == RExC_parse - 1)
8365 vFAIL("Unterminated \\g... pattern");
8367 if (*RExC_parse != '}')
8368 vFAIL("Unterminated \\g{...} pattern");
8372 if (num > (I32)RExC_rx->nparens)
8373 vFAIL("Reference to nonexistent group");
8376 ret = reganode(pRExC_state,
8379 : (MORE_ASCII_RESTRICTED)
8381 : (AT_LEAST_UNI_SEMANTICS)
8389 /* override incorrect value set in reganode MJD */
8390 Set_Node_Offset(ret, parse_start+1);
8391 Set_Node_Cur_Length(ret); /* MJD */
8393 nextchar(pRExC_state);
8398 if (RExC_parse >= RExC_end)
8399 FAIL("Trailing \\");
8402 /* Do not generate "unrecognized" warnings here, we fall
8403 back into the quick-grab loop below */
8410 if (RExC_flags & RXf_PMf_EXTENDED) {
8411 if ( reg_skipcomment( pRExC_state ) )
8418 parse_start = RExC_parse - 1;
8431 char_state latest_char_state = generic_char;
8432 register STRLEN len;
8437 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8438 regnode * orig_emit;
8441 orig_emit = RExC_emit; /* Save the original output node position in
8442 case we need to output a different node
8444 ret = reg_node(pRExC_state,
8445 (U8) ((! FOLD) ? EXACT
8448 : (MORE_ASCII_RESTRICTED)
8450 : (AT_LEAST_UNI_SEMANTICS)
8455 for (len = 0, p = RExC_parse - 1;
8456 len < 127 && p < RExC_end;
8459 char * const oldp = p;
8461 if (RExC_flags & RXf_PMf_EXTENDED)
8462 p = regwhite( pRExC_state, p );
8473 /* Literal Escapes Switch
8475 This switch is meant to handle escape sequences that
8476 resolve to a literal character.
8478 Every escape sequence that represents something
8479 else, like an assertion or a char class, is handled
8480 in the switch marked 'Special Escapes' above in this
8481 routine, but also has an entry here as anything that
8482 isn't explicitly mentioned here will be treated as
8483 an unescaped equivalent literal.
8487 /* These are all the special escapes. */
8488 case 'A': /* Start assertion */
8489 case 'b': case 'B': /* Word-boundary assertion*/
8490 case 'C': /* Single char !DANGEROUS! */
8491 case 'd': case 'D': /* digit class */
8492 case 'g': case 'G': /* generic-backref, pos assertion */
8493 case 'h': case 'H': /* HORIZWS */
8494 case 'k': case 'K': /* named backref, keep marker */
8495 case 'N': /* named char sequence */
8496 case 'p': case 'P': /* Unicode property */
8497 case 'R': /* LNBREAK */
8498 case 's': case 'S': /* space class */
8499 case 'v': case 'V': /* VERTWS */
8500 case 'w': case 'W': /* word class */
8501 case 'X': /* eXtended Unicode "combining character sequence" */
8502 case 'z': case 'Z': /* End of line/string assertion */
8506 /* Anything after here is an escape that resolves to a
8507 literal. (Except digits, which may or may not)
8526 ender = ASCII_TO_NATIVE('\033');
8530 ender = ASCII_TO_NATIVE('\007');
8535 STRLEN brace_len = len;
8537 const char* error_msg;
8539 bool valid = grok_bslash_o(p,
8546 RExC_parse = p; /* going to die anyway; point
8547 to exact spot of failure */
8554 if (PL_encoding && ender < 0x100) {
8555 goto recode_encoding;
8564 char* const e = strchr(p, '}');
8568 vFAIL("Missing right brace on \\x{}");
8571 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8572 | PERL_SCAN_DISALLOW_PREFIX;
8573 STRLEN numlen = e - p - 1;
8574 ender = grok_hex(p + 1, &numlen, &flags, NULL);
8581 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8583 ender = grok_hex(p, &numlen, &flags, NULL);
8586 if (PL_encoding && ender < 0x100)
8587 goto recode_encoding;
8591 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8593 case '0': case '1': case '2': case '3':case '4':
8594 case '5': case '6': case '7': case '8':case '9':
8596 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8598 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8600 ender = grok_oct(p, &numlen, &flags, NULL);
8610 if (PL_encoding && ender < 0x100)
8611 goto recode_encoding;
8614 if (! RExC_override_recoding) {
8615 SV* enc = PL_encoding;
8616 ender = reg_recode((const char)(U8)ender, &enc);
8617 if (!enc && SIZE_ONLY)
8618 ckWARNreg(p, "Invalid escape in the specified encoding");
8624 FAIL("Trailing \\");
8627 if (!SIZE_ONLY&& isALPHA(*p)) {
8628 /* Include any { following the alpha to emphasize
8629 * that it could be part of an escape at some point
8631 int len = (*(p + 1) == '{') ? 2 : 1;
8632 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8634 goto normal_default;
8639 if (UTF8_IS_START(*p) && UTF) {
8641 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8642 &numlen, UTF8_ALLOW_DEFAULT);
8648 } /* End of switch on the literal */
8650 /* Certain characters are problematic because their folded
8651 * length is so different from their original length that it
8652 * isn't handleable by the optimizer. They are therefore not
8653 * placed in an EXACTish node; and are here handled specially.
8654 * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8655 * putting it in a special node keeps regexec from having to
8656 * deal with a non-utf8 multi-char fold */
8658 && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)))
8660 /* We look for either side of the fold. For example \xDF
8661 * folds to 'ss'. We look for both the single character
8662 * \xDF and the sequence 'ss'. When we find something that
8663 * could be one of those, we stop and flush whatever we
8664 * have output so far into the EXACTish node that was being
8665 * built. Then restore the input pointer to what it was.
8666 * regatom will return that EXACT node, and will be called
8667 * again, positioned so the first character is the one in
8668 * question, which we return in a different node type.
8669 * The multi-char folds are a sequence, so the occurrence
8670 * of the first character in that sequence doesn't
8671 * necessarily mean that what follows is the rest of the
8672 * sequence. We keep track of that with a state machine,
8673 * with the state being set to the latest character
8674 * processed before the current one. Most characters will
8675 * set the state to 0, but if one occurs that is part of a
8676 * potential tricky fold sequence, the state is set to that
8677 * character, and the next loop iteration sees if the state
8678 * should progress towards the final folded-from character,
8679 * or if it was a false alarm. If it turns out to be a
8680 * false alarm, the character(s) will be output in a new
8681 * EXACTish node, and join_exact() will later combine them.
8682 * In the case of the 'ss' sequence, which is more common
8683 * and more easily checked, some look-ahead is done to
8684 * save time by ruling-out some false alarms */
8687 latest_char_state = generic_char;
8691 case 0x17F: /* LATIN SMALL LETTER LONG S */
8692 if (AT_LEAST_UNI_SEMANTICS) {
8693 if (latest_char_state == char_s) { /* 'ss' */
8694 ender = LATIN_SMALL_LETTER_SHARP_S;
8697 else if (p < RExC_end) {
8699 /* Look-ahead at the next character. If it
8700 * is also an s, we handle as a sharp s
8701 * tricky regnode. */
8702 if (*p == 's' || *p == 'S') {
8704 /* But first flush anything in the
8705 * EXACTish buffer */
8710 p++; /* Account for swallowing this
8712 ender = LATIN_SMALL_LETTER_SHARP_S;
8715 /* Here, the next character is not a
8716 * literal 's', but still could
8717 * evaluate to one if part of a \o{},
8718 * \x or \OCTAL-DIGIT. The minimum
8719 * length required for that is 4, eg
8723 && (isDIGIT(*(p + 1))
8725 || *(p + 1) == 'o' ))
8728 /* Here, it could be an 's', too much
8729 * bother to figure it out here. Flush
8730 * the buffer if any; when come back
8731 * here, set the state so know that the
8732 * previous char was an 's' */
8734 latest_char_state = generic_char;
8738 latest_char_state = char_s;
8744 /* Here, can't be an 'ss' sequence, or at least not
8745 * one that could fold to/from the sharp ss */
8746 latest_char_state = generic_char;
8748 case 0x03C5: /* First char in upsilon series */
8749 if (p < RExC_end - 4) { /* Need >= 4 bytes left */
8750 latest_char_state = upsilon_1;
8757 latest_char_state = generic_char;
8760 case 0x03B9: /* First char in iota series */
8761 if (p < RExC_end - 4) {
8762 latest_char_state = iota_1;
8769 latest_char_state = generic_char;
8773 if (latest_char_state == upsilon_1) {
8774 latest_char_state = upsilon_2;
8776 else if (latest_char_state == iota_1) {
8777 latest_char_state = iota_2;
8780 latest_char_state = generic_char;
8784 if (latest_char_state == upsilon_2) {
8785 ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
8788 else if (latest_char_state == iota_2) {
8789 ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
8792 latest_char_state = generic_char;
8795 /* These are the tricky fold characters. Flush any
8797 case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
8798 case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
8799 case LATIN_SMALL_LETTER_SHARP_S:
8800 case LATIN_CAPITAL_LETTER_SHARP_S:
8809 char* const oldregxend = RExC_end;
8810 U8 tmpbuf[UTF8_MAXBYTES+1];
8812 /* Here, we know we need to generate a special
8813 * regnode, and 'ender' contains the tricky
8814 * character. What's done is to pretend it's in a
8815 * [bracketed] class, and let the code that deals
8816 * with those handle it, as that code has all the
8817 * intelligence necessary. First save the current
8818 * parse state, get rid of the already allocated
8819 * but empty EXACT node that the ANYOFV node will
8820 * replace, and point the parse to a buffer which
8821 * we fill with the character we want the regclass
8822 * code to think is being parsed */
8823 RExC_emit = orig_emit;
8824 RExC_parse = (char *) tmpbuf;
8826 U8 *d = uvchr_to_utf8(tmpbuf, ender);
8828 RExC_end = (char *) d;
8830 else { /* ender above 255 already excluded */
8831 tmpbuf[0] = (U8) ender;
8833 RExC_end = RExC_parse + 1;
8836 ret = regclass(pRExC_state,depth+1);
8838 /* Here, have parsed the buffer. Reset the parse to
8839 * the actual input, and return */
8840 RExC_end = oldregxend;
8843 Set_Node_Offset(ret, RExC_parse);
8844 Set_Node_Cur_Length(ret);
8845 nextchar(pRExC_state);
8846 *flagp |= HASWIDTH|SIMPLE;
8852 if ( RExC_flags & RXf_PMf_EXTENDED)
8853 p = regwhite( pRExC_state, p );
8855 /* Prime the casefolded buffer. Locale rules, which apply
8856 * only to code points < 256, aren't known until execution,
8857 * so for them, just output the original character using
8859 if (LOC && ender < 256) {
8860 if (UNI_IS_INVARIANT(ender)) {
8861 *tmpbuf = (U8) ender;
8864 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8865 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8869 else if (isASCII(ender)) { /* Note: Here can't also be LOC
8871 ender = toLOWER(ender);
8872 *tmpbuf = (U8) ender;
8875 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8877 /* Locale and /aa require more selectivity about the
8878 * fold, so are handled below. Otherwise, here, just
8880 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8883 /* Under locale rules or /aa we are not to mix,
8884 * respectively, ords < 256 or ASCII with non-. So
8885 * reject folds that mix them, using only the
8886 * non-folded code point. So do the fold to a
8887 * temporary, and inspect each character in it. */
8888 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8890 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8891 U8* e = s + foldlen;
8892 bool fold_ok = TRUE;
8896 || (LOC && (UTF8_IS_INVARIANT(*s)
8897 || UTF8_IS_DOWNGRADEABLE_START(*s))))
8905 Copy(trialbuf, tmpbuf, foldlen, U8);
8909 uvuni_to_utf8(tmpbuf, ender);
8910 foldlen = UNISKIP(ender);
8914 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8919 /* Emit all the Unicode characters. */
8921 for (foldbuf = tmpbuf;
8923 foldlen -= numlen) {
8924 ender = utf8_to_uvchr(foldbuf, &numlen);
8926 const STRLEN unilen = reguni(pRExC_state, ender, s);
8929 /* In EBCDIC the numlen
8930 * and unilen can differ. */
8932 if (numlen >= foldlen)
8936 break; /* "Can't happen." */
8940 const STRLEN unilen = reguni(pRExC_state, ender, s);
8949 REGC((char)ender, s++);
8955 /* Emit all the Unicode characters. */
8957 for (foldbuf = tmpbuf;
8959 foldlen -= numlen) {
8960 ender = utf8_to_uvchr(foldbuf, &numlen);
8962 const STRLEN unilen = reguni(pRExC_state, ender, s);
8965 /* In EBCDIC the numlen
8966 * and unilen can differ. */
8968 if (numlen >= foldlen)
8976 const STRLEN unilen = reguni(pRExC_state, ender, s);
8985 REGC((char)ender, s++);
8988 loopdone: /* Jumped to when encounters something that shouldn't be in
8991 Set_Node_Cur_Length(ret); /* MJD */
8992 nextchar(pRExC_state);
8994 /* len is STRLEN which is unsigned, need to copy to signed */
8997 vFAIL("Internal disaster");
9001 if (len == 1 && UNI_IS_INVARIANT(ender))
9005 RExC_size += STR_SZ(len);
9008 RExC_emit += STR_SZ(len);
9016 /* Jumped to when an unrecognized character set is encountered */
9018 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9023 S_regwhite( RExC_state_t *pRExC_state, char *p )
9025 const char *e = RExC_end;
9027 PERL_ARGS_ASSERT_REGWHITE;
9032 else if (*p == '#') {
9041 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9049 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9050 Character classes ([:foo:]) can also be negated ([:^foo:]).
9051 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9052 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9053 but trigger failures because they are currently unimplemented. */
9055 #define POSIXCC_DONE(c) ((c) == ':')
9056 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9057 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9060 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9063 I32 namedclass = OOB_NAMEDCLASS;
9065 PERL_ARGS_ASSERT_REGPPOSIXCC;
9067 if (value == '[' && RExC_parse + 1 < RExC_end &&
9068 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9069 POSIXCC(UCHARAT(RExC_parse))) {
9070 const char c = UCHARAT(RExC_parse);
9071 char* const s = RExC_parse++;
9073 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9075 if (RExC_parse == RExC_end)
9076 /* Grandfather lone [:, [=, [. */
9079 const char* const t = RExC_parse++; /* skip over the c */
9082 if (UCHARAT(RExC_parse) == ']') {
9083 const char *posixcc = s + 1;
9084 RExC_parse++; /* skip over the ending ] */
9087 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9088 const I32 skip = t - posixcc;
9090 /* Initially switch on the length of the name. */
9093 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9094 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9097 /* Names all of length 5. */
9098 /* alnum alpha ascii blank cntrl digit graph lower
9099 print punct space upper */
9100 /* Offset 4 gives the best switch position. */
9101 switch (posixcc[4]) {
9103 if (memEQ(posixcc, "alph", 4)) /* alpha */
9104 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9107 if (memEQ(posixcc, "spac", 4)) /* space */
9108 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9111 if (memEQ(posixcc, "grap", 4)) /* graph */
9112 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9115 if (memEQ(posixcc, "asci", 4)) /* ascii */
9116 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9119 if (memEQ(posixcc, "blan", 4)) /* blank */
9120 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9123 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9124 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9127 if (memEQ(posixcc, "alnu", 4)) /* alnum */
9128 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9131 if (memEQ(posixcc, "lowe", 4)) /* lower */
9132 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9133 else if (memEQ(posixcc, "uppe", 4)) /* upper */
9134 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9137 if (memEQ(posixcc, "digi", 4)) /* digit */
9138 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9139 else if (memEQ(posixcc, "prin", 4)) /* print */
9140 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9141 else if (memEQ(posixcc, "punc", 4)) /* punct */
9142 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9147 if (memEQ(posixcc, "xdigit", 6))
9148 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9152 if (namedclass == OOB_NAMEDCLASS)
9153 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9155 assert (posixcc[skip] == ':');
9156 assert (posixcc[skip+1] == ']');
9157 } else if (!SIZE_ONLY) {
9158 /* [[=foo=]] and [[.foo.]] are still future. */
9160 /* adjust RExC_parse so the warning shows after
9162 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9164 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9167 /* Maternal grandfather:
9168 * "[:" ending in ":" but not in ":]" */
9178 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9182 PERL_ARGS_ASSERT_CHECKPOSIXCC;
9184 if (POSIXCC(UCHARAT(RExC_parse))) {
9185 const char *s = RExC_parse;
9186 const char c = *s++;
9190 if (*s && c == *s && s[1] == ']') {
9192 "POSIX syntax [%c %c] belongs inside character classes",
9195 /* [[=foo=]] and [[.foo.]] are still future. */
9196 if (POSIXCC_NOTYET(c)) {
9197 /* adjust RExC_parse so the error shows after
9199 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9201 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9207 /* No locale test, and always Unicode semantics */
9208 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
9210 for (value = 0; value < 256; value++) \
9212 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9216 case ANYOF_N##NAME: \
9217 for (value = 0; value < 256; value++) \
9219 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9224 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9225 * there are two tests passed in, to use depending on that. There aren't any
9226 * cases where the label is different from the name, so no need for that
9228 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
9230 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
9231 else if (UNI_SEMANTICS) { \
9232 for (value = 0; value < 256; value++) { \
9233 if (TEST_8(value)) stored += \
9234 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9238 for (value = 0; value < 128; value++) { \
9239 if (TEST_7(UNI_TO_NATIVE(value))) stored += \
9240 set_regclass_bit(pRExC_state, ret, \
9241 (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9247 case ANYOF_N##NAME: \
9248 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
9249 else if (UNI_SEMANTICS) { \
9250 for (value = 0; value < 256; value++) { \
9251 if (! TEST_8(value)) stored += \
9252 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9256 for (value = 0; value < 128; value++) { \
9257 if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
9258 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9260 if (AT_LEAST_ASCII_RESTRICTED) { \
9261 for (value = 128; value < 256; value++) { \
9262 stored += set_regclass_bit( \
9263 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9265 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
9268 /* For a non-ut8 target string with DEPENDS semantics, all above \
9269 * ASCII Latin1 code points match the complement of any of the \
9270 * classes. But in utf8, they have their Unicode semantics, so \
9271 * can't just set them in the bitmap, or else regexec.c will think \
9272 * they matched when they shouldn't. */ \
9273 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
9281 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9284 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9285 * Locale folding is done at run-time, so this function should not be
9286 * called for nodes that are for locales.
9288 * This function sets the bit corresponding to the fold of the input
9289 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
9292 * It also knows about the characters that are in the bitmap that have
9293 * folds that are matchable only outside it, and sets the appropriate lists
9296 * It returns the number of bits that actually changed from 0 to 1 */
9301 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9303 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9306 /* It assumes the bit for 'value' has already been set */
9307 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9308 ANYOF_BITMAP_SET(node, fold);
9311 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9312 /* Certain Latin1 characters have matches outside the bitmap. To get
9313 * here, 'value' is one of those characters. None of these matches is
9314 * valid for ASCII characters under /aa, which have been excluded by
9315 * the 'if' above. The matches fall into three categories:
9316 * 1) They are singly folded-to or -from an above 255 character, as
9317 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9319 * 2) They are part of a multi-char fold with another character in the
9320 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9321 * 3) They are part of a multi-char fold with a character not in the
9322 * bitmap, such as various ligatures.
9323 * We aren't dealing fully with multi-char folds, except we do deal
9324 * with the pattern containing a character that has a multi-char fold
9325 * (not so much the inverse).
9326 * For types 1) and 3), the matches only happen when the target string
9327 * is utf8; that's not true for 2), and we set a flag for it.
9329 * The code below adds to the passed in inversion list the single fold
9330 * closures for 'value'. The values are hard-coded here so that an
9331 * innocent-looking character class, like /[ks]/i won't have to go out
9332 * to disk to find the possible matches. XXX It would be better to
9333 * generate these via regen, in case a new version of the Unicode
9334 * standard adds new mappings, though that is not really likely. */
9339 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9343 /* LATIN SMALL LETTER LONG S */
9344 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9347 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9348 GREEK_SMALL_LETTER_MU);
9349 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9350 GREEK_CAPITAL_LETTER_MU);
9352 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9353 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9355 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9356 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
9357 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9358 PL_fold_latin1[value]);
9361 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9362 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9363 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9365 case LATIN_SMALL_LETTER_SHARP_S:
9366 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9367 LATIN_CAPITAL_LETTER_SHARP_S);
9369 /* Under /a, /d, and /u, this can match the two chars "ss" */
9370 if (! MORE_ASCII_RESTRICTED) {
9371 add_alternate(alternate_ptr, (U8 *) "ss", 2);
9373 /* And under /u or /a, it can match even if the target is
9375 if (AT_LEAST_UNI_SEMANTICS) {
9376 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9390 /* These all are targets of multi-character folds from code
9391 * points that require UTF8 to express, so they can't match
9392 * unless the target string is in UTF-8, so no action here is
9393 * necessary, as regexec.c properly handles the general case
9394 * for UTF-8 matching */
9397 /* Use deprecated warning to increase the chances of this
9399 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9403 else if (DEPENDS_SEMANTICS
9405 && PL_fold_latin1[value] != value)
9407 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9408 * folds only when the target string is in UTF-8. We add the fold
9409 * here to the list of things to match outside the bitmap, which
9410 * won't be looked at unless it is UTF8 (or else if something else
9411 * says to look even if not utf8, but those things better not happen
9412 * under DEPENDS semantics. */
9413 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
9420 PERL_STATIC_INLINE U8
9421 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9423 /* This inline function sets a bit in the bitmap if not already set, and if
9424 * appropriate, its fold, returning the number of bits that actually
9425 * changed from 0 to 1 */
9429 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9431 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
9435 ANYOF_BITMAP_SET(node, value);
9438 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
9439 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
9446 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9448 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9449 * alternate list, pointed to by 'alternate_ptr'. This is an array of
9450 * the multi-character folds of characters in the node */
9453 PERL_ARGS_ASSERT_ADD_ALTERNATE;
9455 if (! *alternate_ptr) {
9456 *alternate_ptr = newAV();
9458 sv = newSVpvn_utf8((char*)string, len, TRUE);
9459 av_push(*alternate_ptr, sv);
9464 parse a class specification and produce either an ANYOF node that
9465 matches the pattern or perhaps will be optimized into an EXACTish node
9466 instead. The node contains a bit map for the first 256 characters, with the
9467 corresponding bit set if that character is in the list. For characters
9468 above 255, a range list is used */
9471 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9474 register UV nextvalue;
9475 register IV prevvalue = OOB_UNICODE;
9476 register IV range = 0;
9477 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9478 register regnode *ret;
9481 char *rangebegin = NULL;
9482 bool need_class = 0;
9483 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
9485 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9486 than just initialized. */
9489 /* code points this node matches that can't be stored in the bitmap */
9490 HV* nonbitmap = NULL;
9492 /* The items that are to match that aren't stored in the bitmap, but are a
9493 * result of things that are stored there. This is the fold closure of
9494 * such a character, either because it has DEPENDS semantics and shouldn't
9495 * be matched unless the target string is utf8, or is a code point that is
9496 * too large for the bit map, as for example, the fold of the MICRO SIGN is
9497 * above 255. This all is solely for performance reasons. By having this
9498 * code know the outside-the-bitmap folds that the bitmapped characters are
9499 * involved with, we don't have to go out to disk to find the list of
9500 * matches, unless the character class includes code points that aren't
9501 * storable in the bit map. That means that a character class with an 's'
9502 * in it, for example, doesn't need to go out to disk to find everything
9503 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
9504 * empty unless there is something whose fold we don't know about, and will
9505 * have to go out to the disk to find. */
9506 HV* l1_fold_invlist = NULL;
9508 /* List of multi-character folds that are matched by this node */
9509 AV* unicode_alternate = NULL;
9511 UV literal_endpoint = 0;
9513 UV stored = 0; /* how many chars stored in the bitmap */
9515 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9516 case we need to change the emitted regop to an EXACT. */
9517 const char * orig_parse = RExC_parse;
9518 GET_RE_DEBUG_FLAGS_DECL;
9520 PERL_ARGS_ASSERT_REGCLASS;
9522 PERL_UNUSED_ARG(depth);
9525 DEBUG_PARSE("clas");
9527 /* Assume we are going to generate an ANYOF node. */
9528 ret = reganode(pRExC_state, ANYOF, 0);
9532 ANYOF_FLAGS(ret) = 0;
9535 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
9539 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9541 /* We have decided to not allow multi-char folds in inverted character
9542 * classes, due to the confusion that can happen, even with classes
9543 * that are designed for a non-Unicode world: You have the peculiar
9545 "s s" =~ /^[^\xDF]+$/i => Y
9546 "ss" =~ /^[^\xDF]+$/i => N
9548 * See [perl #89750] */
9549 allow_full_fold = FALSE;
9553 RExC_size += ANYOF_SKIP;
9554 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9557 RExC_emit += ANYOF_SKIP;
9559 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9561 ANYOF_BITMAP_ZERO(ret);
9562 listsv = newSVpvs("# comment\n");
9563 initial_listsv_len = SvCUR(listsv);
9566 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9568 if (!SIZE_ONLY && POSIXCC(nextvalue))
9569 checkposixcc(pRExC_state);
9571 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9572 if (UCHARAT(RExC_parse) == ']')
9576 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9580 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9583 rangebegin = RExC_parse;
9585 value = utf8n_to_uvchr((U8*)RExC_parse,
9586 RExC_end - RExC_parse,
9587 &numlen, UTF8_ALLOW_DEFAULT);
9588 RExC_parse += numlen;
9591 value = UCHARAT(RExC_parse++);
9593 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9594 if (value == '[' && POSIXCC(nextvalue))
9595 namedclass = regpposixcc(pRExC_state, value);
9596 else if (value == '\\') {
9598 value = utf8n_to_uvchr((U8*)RExC_parse,
9599 RExC_end - RExC_parse,
9600 &numlen, UTF8_ALLOW_DEFAULT);
9601 RExC_parse += numlen;
9604 value = UCHARAT(RExC_parse++);
9605 /* Some compilers cannot handle switching on 64-bit integer
9606 * values, therefore value cannot be an UV. Yes, this will
9607 * be a problem later if we want switch on Unicode.
9608 * A similar issue a little bit later when switching on
9609 * namedclass. --jhi */
9610 switch ((I32)value) {
9611 case 'w': namedclass = ANYOF_ALNUM; break;
9612 case 'W': namedclass = ANYOF_NALNUM; break;
9613 case 's': namedclass = ANYOF_SPACE; break;
9614 case 'S': namedclass = ANYOF_NSPACE; break;
9615 case 'd': namedclass = ANYOF_DIGIT; break;
9616 case 'D': namedclass = ANYOF_NDIGIT; break;
9617 case 'v': namedclass = ANYOF_VERTWS; break;
9618 case 'V': namedclass = ANYOF_NVERTWS; break;
9619 case 'h': namedclass = ANYOF_HORIZWS; break;
9620 case 'H': namedclass = ANYOF_NHORIZWS; break;
9621 case 'N': /* Handle \N{NAME} in class */
9623 /* We only pay attention to the first char of
9624 multichar strings being returned. I kinda wonder
9625 if this makes sense as it does change the behaviour
9626 from earlier versions, OTOH that behaviour was broken
9628 UV v; /* value is register so we cant & it /grrr */
9629 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
9639 if (RExC_parse >= RExC_end)
9640 vFAIL2("Empty \\%c{}", (U8)value);
9641 if (*RExC_parse == '{') {
9642 const U8 c = (U8)value;
9643 e = strchr(RExC_parse++, '}');
9645 vFAIL2("Missing right brace on \\%c{}", c);
9646 while (isSPACE(UCHARAT(RExC_parse)))
9648 if (e == RExC_parse)
9649 vFAIL2("Empty \\%c{}", c);
9651 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9659 if (UCHARAT(RExC_parse) == '^') {
9662 value = value == 'p' ? 'P' : 'p'; /* toggle */
9663 while (isSPACE(UCHARAT(RExC_parse))) {
9669 /* Add the property name to the list. If /i matching, give
9670 * a different name which consists of the normal name
9671 * sandwiched between two underscores and '_i'. The design
9672 * is discussed in the commit message for this. */
9673 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9674 (value=='p' ? '+' : '!'),
9683 /* The \p could match something in the Latin1 range, hence
9684 * something that isn't utf8 */
9685 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9686 namedclass = ANYOF_MAX; /* no official name, but it's named */
9688 /* \p means they want Unicode semantics */
9689 RExC_uni_semantics = 1;
9692 case 'n': value = '\n'; break;
9693 case 'r': value = '\r'; break;
9694 case 't': value = '\t'; break;
9695 case 'f': value = '\f'; break;
9696 case 'b': value = '\b'; break;
9697 case 'e': value = ASCII_TO_NATIVE('\033');break;
9698 case 'a': value = ASCII_TO_NATIVE('\007');break;
9700 RExC_parse--; /* function expects to be pointed at the 'o' */
9702 const char* error_msg;
9703 bool valid = grok_bslash_o(RExC_parse,
9708 RExC_parse += numlen;
9713 if (PL_encoding && value < 0x100) {
9714 goto recode_encoding;
9718 if (*RExC_parse == '{') {
9719 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9720 | PERL_SCAN_DISALLOW_PREFIX;
9721 char * const e = strchr(RExC_parse++, '}');
9723 vFAIL("Missing right brace on \\x{}");
9725 numlen = e - RExC_parse;
9726 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9730 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9732 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9733 RExC_parse += numlen;
9735 if (PL_encoding && value < 0x100)
9736 goto recode_encoding;
9739 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9741 case '0': case '1': case '2': case '3': case '4':
9742 case '5': case '6': case '7':
9744 /* Take 1-3 octal digits */
9745 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9747 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9748 RExC_parse += numlen;
9749 if (PL_encoding && value < 0x100)
9750 goto recode_encoding;
9754 if (! RExC_override_recoding) {
9755 SV* enc = PL_encoding;
9756 value = reg_recode((const char)(U8)value, &enc);
9757 if (!enc && SIZE_ONLY)
9758 ckWARNreg(RExC_parse,
9759 "Invalid escape in the specified encoding");
9763 /* Allow \_ to not give an error */
9764 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9765 ckWARN2reg(RExC_parse,
9766 "Unrecognized escape \\%c in character class passed through",
9771 } /* end of \blah */
9777 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9779 /* What matches in a locale is not known until runtime, so need to
9780 * (one time per class) allocate extra space to pass to regexec.
9781 * The space will contain a bit for each named class that is to be
9782 * matched against. This isn't needed for \p{} and pseudo-classes,
9783 * as they are not affected by locale, and hence are dealt with
9785 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9788 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9791 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9792 ANYOF_CLASS_ZERO(ret);
9794 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9797 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
9798 * literal, as is the character that began the false range, i.e.
9799 * the 'a' in the examples */
9803 RExC_parse >= rangebegin ?
9804 RExC_parse - rangebegin : 0;
9805 ckWARN4reg(RExC_parse,
9806 "False [] range \"%*.*s\"",
9810 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9811 if (prevvalue < 256) {
9813 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
9816 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
9820 range = 0; /* this was not a true range */
9826 const char *what = NULL;
9829 /* Possible truncation here but in some 64-bit environments
9830 * the compiler gets heartburn about switch on 64-bit values.
9831 * A similar issue a little earlier when switching on value.
9833 switch ((I32)namedclass) {
9835 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9836 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9837 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9838 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9839 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9840 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9841 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9842 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9843 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9844 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9845 /* \s, \w match all unicode if utf8. */
9846 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9847 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9848 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9849 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9850 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9853 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9855 for (value = 0; value < 128; value++)
9857 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9860 what = NULL; /* Doesn't match outside ascii, so
9861 don't want to add +utf8:: */
9865 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9867 for (value = 128; value < 256; value++)
9869 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9871 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9877 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9879 /* consecutive digits assumed */
9880 for (value = '0'; value <= '9'; value++)
9882 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9889 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9891 /* consecutive digits assumed */
9892 for (value = 0; value < '0'; value++)
9894 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9895 for (value = '9' + 1; value < 256; value++)
9897 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9901 if (AT_LEAST_ASCII_RESTRICTED ) {
9902 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9906 /* this is to handle \p and \P */
9909 vFAIL("Invalid [::] class");
9912 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9913 /* Strings such as "+utf8::isWord\n" */
9914 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9919 } /* end of namedclass \blah */
9922 if (prevvalue > (IV)value) /* b-a */ {
9923 const int w = RExC_parse - rangebegin;
9924 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9925 range = 0; /* not a valid range */
9929 prevvalue = value; /* save the beginning of the range */
9930 if (RExC_parse+1 < RExC_end
9931 && *RExC_parse == '-'
9932 && RExC_parse[1] != ']')
9936 /* a bad range like \w-, [:word:]- ? */
9937 if (namedclass > OOB_NAMEDCLASS) {
9938 if (ckWARN(WARN_REGEXP)) {
9940 RExC_parse >= rangebegin ?
9941 RExC_parse - rangebegin : 0;
9943 "False [] range \"%*.*s\"",
9948 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9950 range = 1; /* yeah, it's a range! */
9951 continue; /* but do it the next time */
9955 /* non-Latin1 code point implies unicode semantics. Must be set in
9956 * pass1 so is there for the whole of pass 2 */
9958 RExC_uni_semantics = 1;
9961 /* now is the next time */
9963 if (prevvalue < 256) {
9964 const IV ceilvalue = value < 256 ? value : 255;
9967 /* In EBCDIC [\x89-\x91] should include
9968 * the \x8e but [i-j] should not. */
9969 if (literal_endpoint == 2 &&
9970 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9971 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9973 if (isLOWER(prevvalue)) {
9974 for (i = prevvalue; i <= ceilvalue; i++)
9975 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9977 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9980 for (i = prevvalue; i <= ceilvalue; i++)
9981 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9983 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9989 for (i = prevvalue; i <= ceilvalue; i++) {
9990 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9994 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
9995 const UV natvalue = NATIVE_TO_UNI(value);
9996 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
9999 literal_endpoint = 0;
10003 range = 0; /* this range (if it was one) is done now */
10010 /****** !SIZE_ONLY AFTER HERE *********/
10012 /* If folding and there are code points above 255, we calculate all
10013 * characters that could fold to or from the ones already on the list */
10014 if (FOLD && nonbitmap) {
10017 HV* fold_intersection;
10020 /* This is a list of all the characters that participate in folds
10021 * (except marks, etc in multi-char folds */
10022 if (! PL_utf8_foldable) {
10023 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
10024 PL_utf8_foldable = _swash_to_invlist(swash);
10027 /* This is a hash that for a particular fold gives all characters
10028 * that are involved in it */
10029 if (! PL_utf8_foldclosures) {
10031 /* If we were unable to find any folds, then we likely won't be
10032 * able to find the closures. So just create an empty list.
10033 * Folding will effectively be restricted to the non-Unicode rules
10034 * hard-coded into Perl. (This case happens legitimately during
10035 * compilation of Perl itself before the Unicode tables are
10037 if (invlist_len(PL_utf8_foldable) == 0) {
10038 PL_utf8_foldclosures = _new_invlist(0);
10040 /* If the folds haven't been read in, call a fold function
10042 if (! PL_utf8_tofold) {
10043 U8 dummy[UTF8_MAXBYTES+1];
10045 to_utf8_fold((U8*) "A", dummy, &dummy_len);
10047 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10051 /* Only the characters in this class that participate in folds need
10052 * be checked. Get the intersection of this class and all the
10053 * possible characters that are foldable. This can quickly narrow
10054 * down a large class */
10055 fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
10057 /* Now look at the foldable characters in this class individually */
10058 fold_list = invlist_array(fold_intersection);
10059 for (i = 0; i < invlist_len(fold_intersection); i++) {
10062 /* The next entry is the beginning of the range that is in the
10064 UV start = fold_list[i++];
10067 /* The next entry is the beginning of the next range, which
10068 * isn't in the class, so the end of the current range is one
10069 * less than that */
10070 UV end = fold_list[i] - 1;
10072 /* Look at every character in the range */
10073 for (j = start; j <= end; j++) {
10076 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10079 _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
10081 if (foldlen > (STRLEN)UNISKIP(f)) {
10083 /* Any multicharacter foldings (disallowed in
10084 * lookbehind patterns) require the following
10085 * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10086 * E folds into "pq" and F folds into "rst", all other
10087 * characters fold to single characters. We save away
10088 * these multicharacter foldings, to be later saved as
10089 * part of the additional "s" data. */
10090 if (! RExC_in_lookbehind) {
10092 U8* e = foldbuf + foldlen;
10094 /* If any of the folded characters of this are in
10095 * the Latin1 range, tell the regex engine that
10096 * this can match a non-utf8 target string. The
10097 * only multi-byte fold whose source is in the
10098 * Latin1 range (U+00DF) applies only when the
10099 * target string is utf8, or under unicode rules */
10100 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10103 /* Can't mix ascii with non- under /aa */
10104 if (MORE_ASCII_RESTRICTED
10105 && (isASCII(*loc) != isASCII(j)))
10107 goto end_multi_fold;
10109 if (UTF8_IS_INVARIANT(*loc)
10110 || UTF8_IS_DOWNGRADEABLE_START(*loc))
10112 /* Can't mix above and below 256 under
10115 goto end_multi_fold;
10118 |= ANYOF_NONBITMAP_NON_UTF8;
10121 loc += UTF8SKIP(loc);
10125 add_alternate(&unicode_alternate, foldbuf, foldlen);
10129 /* This is special-cased, as it is the only letter which
10130 * has both a multi-fold and single-fold in Latin1. All
10131 * the other chars that have single and multi-folds are
10132 * always in utf8, and the utf8 folding algorithm catches
10134 if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
10135 stored += set_regclass_bit(pRExC_state,
10137 LATIN_SMALL_LETTER_SHARP_S,
10138 &l1_fold_invlist, &unicode_alternate);
10142 /* Single character fold. Add everything in its fold
10143 * closure to the list that this node should match */
10146 /* The fold closures data structure is a hash with the
10147 * keys being every character that is folded to, like
10148 * 'k', and the values each an array of everything that
10149 * folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
10150 if ((listp = hv_fetch(PL_utf8_foldclosures,
10151 (char *) foldbuf, foldlen, FALSE)))
10153 AV* list = (AV*) *listp;
10155 for (k = 0; k <= av_len(list); k++) {
10156 SV** c_p = av_fetch(list, k, FALSE);
10159 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10163 /* /aa doesn't allow folds between ASCII and
10164 * non-; /l doesn't allow them between above
10166 if ((MORE_ASCII_RESTRICTED
10167 && (isASCII(c) != isASCII(j)))
10168 || (LOC && ((c < 256) != (j < 256))))
10173 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10174 stored += set_regclass_bit(pRExC_state,
10177 &l1_fold_invlist, &unicode_alternate);
10179 /* It may be that the code point is already
10180 * in this range or already in the bitmap,
10181 * in which case we need do nothing */
10182 else if ((c < start || c > end)
10184 || ! ANYOF_BITMAP_TEST(ret, c)))
10186 nonbitmap = add_cp_to_invlist(nonbitmap, c);
10193 invlist_destroy(fold_intersection);
10196 /* Combine the two lists into one. */
10197 if (l1_fold_invlist) {
10199 HV* temp = invlist_union(nonbitmap, l1_fold_invlist);
10200 invlist_destroy(nonbitmap);
10202 invlist_destroy(l1_fold_invlist);
10205 nonbitmap = l1_fold_invlist;
10209 /* Here, we have calculated what code points should be in the character
10210 * class. Now we can see about various optimizations. Fold calculation
10211 * needs to take place before inversion. Otherwise /[^k]/i would invert to
10212 * include K, which under /i would match k. */
10214 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
10215 * set the FOLD flag yet, so this this does optimize those. It doesn't
10216 * optimize locale. Doing so perhaps could be done as long as there is
10217 * nothing like \w in it; some thought also would have to be given to the
10218 * interaction with above 0x100 chars */
10220 && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10221 && ! unicode_alternate
10223 && SvCUR(listsv) == initial_listsv_len)
10225 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10226 ANYOF_BITMAP(ret)[value] ^= 0xFF;
10227 stored = 256 - stored;
10229 /* The inversion means that everything above 255 is matched; and at the
10230 * same time we clear the invert flag */
10231 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
10234 /* Folding in the bitmap is taken care of above, but not for locale (for
10235 * which we have to wait to see what folding is in effect at runtime), and
10236 * for things not in the bitmap. Set run-time fold flag for these */
10237 if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
10238 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10241 /* A single character class can be "optimized" into an EXACTish node.
10242 * Note that since we don't currently count how many characters there are
10243 * outside the bitmap, we are XXX missing optimization possibilities for
10244 * them. This optimization can't happen unless this is a truly single
10245 * character class, which means that it can't be an inversion into a
10246 * many-character class, and there must be no possibility of there being
10247 * things outside the bitmap. 'stored' (only) for locales doesn't include
10248 * \w, etc, so have to make a special test that they aren't present
10250 * Similarly A 2-character class of the very special form like [bB] can be
10251 * optimized into an EXACTFish node, but only for non-locales, and for
10252 * characters which only have the two folds; so things like 'fF' and 'Ii'
10253 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10256 && ! unicode_alternate
10257 && SvCUR(listsv) == initial_listsv_len
10258 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
10259 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10260 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10261 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10262 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10263 /* If the latest code point has a fold whose
10264 * bit is set, it must be the only other one */
10265 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10266 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10268 /* Note that the information needed to decide to do this optimization
10269 * is not currently available until the 2nd pass, and that the actually
10270 * used EXACTish node takes less space than the calculated ANYOF node,
10271 * and hence the amount of space calculated in the first pass is larger
10272 * than actually used, so this optimization doesn't gain us any space.
10273 * But an EXACT node is faster than an ANYOF node, and can be combined
10274 * with any adjacent EXACT nodes later by the optimizer for further
10275 * gains. The speed of executing an EXACTF is similar to an ANYOF
10276 * node, so the optimization advantage comes from the ability to join
10277 * it to adjacent EXACT nodes */
10279 const char * cur_parse= RExC_parse;
10281 RExC_emit = (regnode *)orig_emit;
10282 RExC_parse = (char *)orig_parse;
10286 /* A locale node with one point can be folded; all the other cases
10287 * with folding will have two points, since we calculate them above
10289 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10295 } /* else 2 chars in the bit map: the folds of each other */
10296 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10298 /* To join adjacent nodes, they must be the exact EXACTish type.
10299 * Try to use the most likely type, by using EXACTFU if the regex
10300 * calls for them, or is required because the character is
10304 else { /* Otherwise, more likely to be EXACTF type */
10308 ret = reg_node(pRExC_state, op);
10309 RExC_parse = (char *)cur_parse;
10310 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10311 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10312 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10314 RExC_emit += STR_SZ(2);
10317 *STRING(ret)= (char)value;
10319 RExC_emit += STR_SZ(1);
10321 SvREFCNT_dec(listsv);
10326 UV* nonbitmap_array = invlist_array(nonbitmap);
10327 UV nonbitmap_len = invlist_len(nonbitmap);
10330 /* Here have the full list of items to match that aren't in the
10331 * bitmap. Convert to the structure that the rest of the code is
10332 * expecting. XXX That rest of the code should convert to this
10334 for (i = 0; i < nonbitmap_len; i++) {
10336 /* The next entry is the beginning of the range that is in the
10338 UV start = nonbitmap_array[i++];
10341 /* The next entry is the beginning of the next range, which isn't
10342 * in the class, so the end of the current range is one less than
10343 * that. But if there is no next range, it means that the range
10344 * begun by 'start' extends to infinity, which for this platform
10345 * ends at UV_MAX */
10346 if (i == nonbitmap_len) {
10350 end = nonbitmap_array[i] - 1;
10353 if (start == end) {
10354 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10357 /* The \t sets the whole range */
10358 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10363 invlist_destroy(nonbitmap);
10366 if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10367 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10368 SvREFCNT_dec(listsv);
10369 SvREFCNT_dec(unicode_alternate);
10373 AV * const av = newAV();
10375 /* The 0th element stores the character class description
10376 * in its textual form: used later (regexec.c:Perl_regclass_swash())
10377 * to initialize the appropriate swash (which gets stored in
10378 * the 1st element), and also useful for dumping the regnode.
10379 * The 2nd element stores the multicharacter foldings,
10380 * used later (regexec.c:S_reginclass()). */
10381 av_store(av, 0, listsv);
10382 av_store(av, 1, NULL);
10384 /* Store any computed multi-char folds only if we are allowing
10386 if (allow_full_fold) {
10387 av_store(av, 2, MUTABLE_SV(unicode_alternate));
10388 if (unicode_alternate) { /* This node is variable length */
10393 av_store(av, 2, NULL);
10395 rv = newRV_noinc(MUTABLE_SV(av));
10396 n = add_data(pRExC_state, 1, "s");
10397 RExC_rxi->data->data[n] = (void*)rv;
10405 /* reg_skipcomment()
10407 Absorbs an /x style # comments from the input stream.
10408 Returns true if there is more text remaining in the stream.
10409 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10410 terminates the pattern without including a newline.
10412 Note its the callers responsibility to ensure that we are
10413 actually in /x mode
10418 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10422 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10424 while (RExC_parse < RExC_end)
10425 if (*RExC_parse++ == '\n') {
10430 /* we ran off the end of the pattern without ending
10431 the comment, so we have to add an \n when wrapping */
10432 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10440 Advances the parse position, and optionally absorbs
10441 "whitespace" from the inputstream.
10443 Without /x "whitespace" means (?#...) style comments only,
10444 with /x this means (?#...) and # comments and whitespace proper.
10446 Returns the RExC_parse point from BEFORE the scan occurs.
10448 This is the /x friendly way of saying RExC_parse++.
10452 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10454 char* const retval = RExC_parse++;
10456 PERL_ARGS_ASSERT_NEXTCHAR;
10459 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10460 RExC_parse[2] == '#') {
10461 while (*RExC_parse != ')') {
10462 if (RExC_parse == RExC_end)
10463 FAIL("Sequence (?#... not terminated");
10469 if (RExC_flags & RXf_PMf_EXTENDED) {
10470 if (isSPACE(*RExC_parse)) {
10474 else if (*RExC_parse == '#') {
10475 if ( reg_skipcomment( pRExC_state ) )
10484 - reg_node - emit a node
10486 STATIC regnode * /* Location. */
10487 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10490 register regnode *ptr;
10491 regnode * const ret = RExC_emit;
10492 GET_RE_DEBUG_FLAGS_DECL;
10494 PERL_ARGS_ASSERT_REG_NODE;
10497 SIZE_ALIGN(RExC_size);
10501 if (RExC_emit >= RExC_emit_bound)
10502 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10504 NODE_ALIGN_FILL(ret);
10506 FILL_ADVANCE_NODE(ptr, op);
10507 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
10508 #ifdef RE_TRACK_PATTERN_OFFSETS
10509 if (RExC_offsets) { /* MJD */
10510 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10511 "reg_node", __LINE__,
10513 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10514 ? "Overwriting end of array!\n" : "OK",
10515 (UV)(RExC_emit - RExC_emit_start),
10516 (UV)(RExC_parse - RExC_start),
10517 (UV)RExC_offsets[0]));
10518 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10526 - reganode - emit a node with an argument
10528 STATIC regnode * /* Location. */
10529 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10532 register regnode *ptr;
10533 regnode * const ret = RExC_emit;
10534 GET_RE_DEBUG_FLAGS_DECL;
10536 PERL_ARGS_ASSERT_REGANODE;
10539 SIZE_ALIGN(RExC_size);
10544 assert(2==regarglen[op]+1);
10546 Anything larger than this has to allocate the extra amount.
10547 If we changed this to be:
10549 RExC_size += (1 + regarglen[op]);
10551 then it wouldn't matter. Its not clear what side effect
10552 might come from that so its not done so far.
10557 if (RExC_emit >= RExC_emit_bound)
10558 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10560 NODE_ALIGN_FILL(ret);
10562 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10563 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
10564 #ifdef RE_TRACK_PATTERN_OFFSETS
10565 if (RExC_offsets) { /* MJD */
10566 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10570 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10571 "Overwriting end of array!\n" : "OK",
10572 (UV)(RExC_emit - RExC_emit_start),
10573 (UV)(RExC_parse - RExC_start),
10574 (UV)RExC_offsets[0]));
10575 Set_Cur_Node_Offset;
10583 - reguni - emit (if appropriate) a Unicode character
10586 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10590 PERL_ARGS_ASSERT_REGUNI;
10592 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10596 - reginsert - insert an operator in front of already-emitted operand
10598 * Means relocating the operand.
10601 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10604 register regnode *src;
10605 register regnode *dst;
10606 register regnode *place;
10607 const int offset = regarglen[(U8)op];
10608 const int size = NODE_STEP_REGNODE + offset;
10609 GET_RE_DEBUG_FLAGS_DECL;
10611 PERL_ARGS_ASSERT_REGINSERT;
10612 PERL_UNUSED_ARG(depth);
10613 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10614 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10623 if (RExC_open_parens) {
10625 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10626 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10627 if ( RExC_open_parens[paren] >= opnd ) {
10628 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10629 RExC_open_parens[paren] += size;
10631 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10633 if ( RExC_close_parens[paren] >= opnd ) {
10634 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10635 RExC_close_parens[paren] += size;
10637 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10642 while (src > opnd) {
10643 StructCopy(--src, --dst, regnode);
10644 #ifdef RE_TRACK_PATTERN_OFFSETS
10645 if (RExC_offsets) { /* MJD 20010112 */
10646 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10650 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10651 ? "Overwriting end of array!\n" : "OK",
10652 (UV)(src - RExC_emit_start),
10653 (UV)(dst - RExC_emit_start),
10654 (UV)RExC_offsets[0]));
10655 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10656 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10662 place = opnd; /* Op node, where operand used to be. */
10663 #ifdef RE_TRACK_PATTERN_OFFSETS
10664 if (RExC_offsets) { /* MJD */
10665 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10669 (UV)(place - RExC_emit_start) > RExC_offsets[0]
10670 ? "Overwriting end of array!\n" : "OK",
10671 (UV)(place - RExC_emit_start),
10672 (UV)(RExC_parse - RExC_start),
10673 (UV)RExC_offsets[0]));
10674 Set_Node_Offset(place, RExC_parse);
10675 Set_Node_Length(place, 1);
10678 src = NEXTOPER(place);
10679 FILL_ADVANCE_NODE(place, op);
10680 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
10681 Zero(src, offset, regnode);
10685 - regtail - set the next-pointer at the end of a node chain of p to val.
10686 - SEE ALSO: regtail_study
10688 /* TODO: All three parms should be const */
10690 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10693 register regnode *scan;
10694 GET_RE_DEBUG_FLAGS_DECL;
10696 PERL_ARGS_ASSERT_REGTAIL;
10698 PERL_UNUSED_ARG(depth);
10704 /* Find last node. */
10707 regnode * const temp = regnext(scan);
10709 SV * const mysv=sv_newmortal();
10710 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10711 regprop(RExC_rx, mysv, scan);
10712 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10713 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10714 (temp == NULL ? "->" : ""),
10715 (temp == NULL ? PL_reg_name[OP(val)] : "")
10723 if (reg_off_by_arg[OP(scan)]) {
10724 ARG_SET(scan, val - scan);
10727 NEXT_OFF(scan) = val - scan;
10733 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10734 - Look for optimizable sequences at the same time.
10735 - currently only looks for EXACT chains.
10737 This is experimental code. The idea is to use this routine to perform
10738 in place optimizations on branches and groups as they are constructed,
10739 with the long term intention of removing optimization from study_chunk so
10740 that it is purely analytical.
10742 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10743 to control which is which.
10746 /* TODO: All four parms should be const */
10749 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10752 register regnode *scan;
10754 #ifdef EXPERIMENTAL_INPLACESCAN
10757 GET_RE_DEBUG_FLAGS_DECL;
10759 PERL_ARGS_ASSERT_REGTAIL_STUDY;
10765 /* Find last node. */
10769 regnode * const temp = regnext(scan);
10770 #ifdef EXPERIMENTAL_INPLACESCAN
10771 if (PL_regkind[OP(scan)] == EXACT)
10772 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10776 switch (OP(scan)) {
10782 if( exact == PSEUDO )
10784 else if ( exact != OP(scan) )
10793 SV * const mysv=sv_newmortal();
10794 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10795 regprop(RExC_rx, mysv, scan);
10796 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10797 SvPV_nolen_const(mysv),
10798 REG_NODE_NUM(scan),
10799 PL_reg_name[exact]);
10806 SV * const mysv_val=sv_newmortal();
10807 DEBUG_PARSE_MSG("");
10808 regprop(RExC_rx, mysv_val, val);
10809 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10810 SvPV_nolen_const(mysv_val),
10811 (IV)REG_NODE_NUM(val),
10815 if (reg_off_by_arg[OP(scan)]) {
10816 ARG_SET(scan, val - scan);
10819 NEXT_OFF(scan) = val - scan;
10827 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10831 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10837 for (bit=0; bit<32; bit++) {
10838 if (flags & (1<<bit)) {
10839 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
10842 if (!set++ && lead)
10843 PerlIO_printf(Perl_debug_log, "%s",lead);
10844 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10847 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10848 if (!set++ && lead) {
10849 PerlIO_printf(Perl_debug_log, "%s",lead);
10852 case REGEX_UNICODE_CHARSET:
10853 PerlIO_printf(Perl_debug_log, "UNICODE");
10855 case REGEX_LOCALE_CHARSET:
10856 PerlIO_printf(Perl_debug_log, "LOCALE");
10858 case REGEX_ASCII_RESTRICTED_CHARSET:
10859 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10861 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10862 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10865 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10871 PerlIO_printf(Perl_debug_log, "\n");
10873 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10879 Perl_regdump(pTHX_ const regexp *r)
10883 SV * const sv = sv_newmortal();
10884 SV *dsv= sv_newmortal();
10885 RXi_GET_DECL(r,ri);
10886 GET_RE_DEBUG_FLAGS_DECL;
10888 PERL_ARGS_ASSERT_REGDUMP;
10890 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10892 /* Header fields of interest. */
10893 if (r->anchored_substr) {
10894 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10895 RE_SV_DUMPLEN(r->anchored_substr), 30);
10896 PerlIO_printf(Perl_debug_log,
10897 "anchored %s%s at %"IVdf" ",
10898 s, RE_SV_TAIL(r->anchored_substr),
10899 (IV)r->anchored_offset);
10900 } else if (r->anchored_utf8) {
10901 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10902 RE_SV_DUMPLEN(r->anchored_utf8), 30);
10903 PerlIO_printf(Perl_debug_log,
10904 "anchored utf8 %s%s at %"IVdf" ",
10905 s, RE_SV_TAIL(r->anchored_utf8),
10906 (IV)r->anchored_offset);
10908 if (r->float_substr) {
10909 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10910 RE_SV_DUMPLEN(r->float_substr), 30);
10911 PerlIO_printf(Perl_debug_log,
10912 "floating %s%s at %"IVdf"..%"UVuf" ",
10913 s, RE_SV_TAIL(r->float_substr),
10914 (IV)r->float_min_offset, (UV)r->float_max_offset);
10915 } else if (r->float_utf8) {
10916 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10917 RE_SV_DUMPLEN(r->float_utf8), 30);
10918 PerlIO_printf(Perl_debug_log,
10919 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10920 s, RE_SV_TAIL(r->float_utf8),
10921 (IV)r->float_min_offset, (UV)r->float_max_offset);
10923 if (r->check_substr || r->check_utf8)
10924 PerlIO_printf(Perl_debug_log,
10926 (r->check_substr == r->float_substr
10927 && r->check_utf8 == r->float_utf8
10928 ? "(checking floating" : "(checking anchored"));
10929 if (r->extflags & RXf_NOSCAN)
10930 PerlIO_printf(Perl_debug_log, " noscan");
10931 if (r->extflags & RXf_CHECK_ALL)
10932 PerlIO_printf(Perl_debug_log, " isall");
10933 if (r->check_substr || r->check_utf8)
10934 PerlIO_printf(Perl_debug_log, ") ");
10936 if (ri->regstclass) {
10937 regprop(r, sv, ri->regstclass);
10938 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10940 if (r->extflags & RXf_ANCH) {
10941 PerlIO_printf(Perl_debug_log, "anchored");
10942 if (r->extflags & RXf_ANCH_BOL)
10943 PerlIO_printf(Perl_debug_log, "(BOL)");
10944 if (r->extflags & RXf_ANCH_MBOL)
10945 PerlIO_printf(Perl_debug_log, "(MBOL)");
10946 if (r->extflags & RXf_ANCH_SBOL)
10947 PerlIO_printf(Perl_debug_log, "(SBOL)");
10948 if (r->extflags & RXf_ANCH_GPOS)
10949 PerlIO_printf(Perl_debug_log, "(GPOS)");
10950 PerlIO_putc(Perl_debug_log, ' ');
10952 if (r->extflags & RXf_GPOS_SEEN)
10953 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10954 if (r->intflags & PREGf_SKIP)
10955 PerlIO_printf(Perl_debug_log, "plus ");
10956 if (r->intflags & PREGf_IMPLICIT)
10957 PerlIO_printf(Perl_debug_log, "implicit ");
10958 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10959 if (r->extflags & RXf_EVAL_SEEN)
10960 PerlIO_printf(Perl_debug_log, "with eval ");
10961 PerlIO_printf(Perl_debug_log, "\n");
10962 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10964 PERL_ARGS_ASSERT_REGDUMP;
10965 PERL_UNUSED_CONTEXT;
10966 PERL_UNUSED_ARG(r);
10967 #endif /* DEBUGGING */
10971 - regprop - printable representation of opcode
10973 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10976 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10977 if (flags & ANYOF_INVERT) \
10978 /*make sure the invert info is in each */ \
10979 sv_catpvs(sv, "^"); \
10985 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10990 RXi_GET_DECL(prog,progi);
10991 GET_RE_DEBUG_FLAGS_DECL;
10993 PERL_ARGS_ASSERT_REGPROP;
10997 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
10998 /* It would be nice to FAIL() here, but this may be called from
10999 regexec.c, and it would be hard to supply pRExC_state. */
11000 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
11001 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
11003 k = PL_regkind[OP(o)];
11006 sv_catpvs(sv, " ");
11007 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
11008 * is a crude hack but it may be the best for now since
11009 * we have no flag "this EXACTish node was UTF-8"
11011 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
11012 PERL_PV_ESCAPE_UNI_DETECT |
11013 PERL_PV_ESCAPE_NONASCII |
11014 PERL_PV_PRETTY_ELLIPSES |
11015 PERL_PV_PRETTY_LTGT |
11016 PERL_PV_PRETTY_NOCLEAR
11018 } else if (k == TRIE) {
11019 /* print the details of the trie in dumpuntil instead, as
11020 * progi->data isn't available here */
11021 const char op = OP(o);
11022 const U32 n = ARG(o);
11023 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
11024 (reg_ac_data *)progi->data->data[n] :
11026 const reg_trie_data * const trie
11027 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
11029 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
11030 DEBUG_TRIE_COMPILE_r(
11031 Perl_sv_catpvf(aTHX_ sv,
11032 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
11033 (UV)trie->startstate,
11034 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
11035 (UV)trie->wordcount,
11038 (UV)TRIE_CHARCOUNT(trie),
11039 (UV)trie->uniquecharcount
11042 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
11044 int rangestart = -1;
11045 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
11046 sv_catpvs(sv, "[");
11047 for (i = 0; i <= 256; i++) {
11048 if (i < 256 && BITMAP_TEST(bitmap,i)) {
11049 if (rangestart == -1)
11051 } else if (rangestart != -1) {
11052 if (i <= rangestart + 3)
11053 for (; rangestart < i; rangestart++)
11054 put_byte(sv, rangestart);
11056 put_byte(sv, rangestart);
11057 sv_catpvs(sv, "-");
11058 put_byte(sv, i - 1);
11063 sv_catpvs(sv, "]");
11066 } else if (k == CURLY) {
11067 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
11068 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
11069 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
11071 else if (k == WHILEM && o->flags) /* Ordinal/of */
11072 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
11073 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
11074 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
11075 if ( RXp_PAREN_NAMES(prog) ) {
11076 if ( k != REF || (OP(o) < NREF)) {
11077 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
11078 SV **name= av_fetch(list, ARG(o), 0 );
11080 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11083 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
11084 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
11085 I32 *nums=(I32*)SvPVX(sv_dat);
11086 SV **name= av_fetch(list, nums[0], 0 );
11089 for ( n=0; n<SvIVX(sv_dat); n++ ) {
11090 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
11091 (n ? "," : ""), (IV)nums[n]);
11093 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11097 } else if (k == GOSUB)
11098 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
11099 else if (k == VERB) {
11101 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
11102 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
11103 } else if (k == LOGICAL)
11104 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
11105 else if (k == FOLDCHAR)
11106 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
11107 else if (k == ANYOF) {
11108 int i, rangestart = -1;
11109 const U8 flags = ANYOF_FLAGS(o);
11112 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11113 static const char * const anyofs[] = {
11146 if (flags & ANYOF_LOCALE)
11147 sv_catpvs(sv, "{loc}");
11148 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
11149 sv_catpvs(sv, "{i}");
11150 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
11151 if (flags & ANYOF_INVERT)
11152 sv_catpvs(sv, "^");
11154 /* output what the standard cp 0-255 bitmap matches */
11155 for (i = 0; i <= 256; i++) {
11156 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11157 if (rangestart == -1)
11159 } else if (rangestart != -1) {
11160 if (i <= rangestart + 3)
11161 for (; rangestart < i; rangestart++)
11162 put_byte(sv, rangestart);
11164 put_byte(sv, rangestart);
11165 sv_catpvs(sv, "-");
11166 put_byte(sv, i - 1);
11173 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11174 /* output any special charclass tests (used entirely under use locale) */
11175 if (ANYOF_CLASS_TEST_ANY_SET(o))
11176 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11177 if (ANYOF_CLASS_TEST(o,i)) {
11178 sv_catpv(sv, anyofs[i]);
11182 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11184 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11185 sv_catpvs(sv, "{non-utf8-latin1-all}");
11188 /* output information about the unicode matching */
11189 if (flags & ANYOF_UNICODE_ALL)
11190 sv_catpvs(sv, "{unicode_all}");
11191 else if (ANYOF_NONBITMAP(o))
11192 sv_catpvs(sv, "{unicode}");
11193 if (flags & ANYOF_NONBITMAP_NON_UTF8)
11194 sv_catpvs(sv, "{outside bitmap}");
11196 if (ANYOF_NONBITMAP(o)) {
11198 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11202 U8 s[UTF8_MAXBYTES_CASE+1];
11204 for (i = 0; i <= 256; i++) { /* just the first 256 */
11205 uvchr_to_utf8(s, i);
11207 if (i < 256 && swash_fetch(sw, s, TRUE)) {
11208 if (rangestart == -1)
11210 } else if (rangestart != -1) {
11211 if (i <= rangestart + 3)
11212 for (; rangestart < i; rangestart++) {
11213 const U8 * const e = uvchr_to_utf8(s,rangestart);
11215 for(p = s; p < e; p++)
11219 const U8 *e = uvchr_to_utf8(s,rangestart);
11221 for (p = s; p < e; p++)
11223 sv_catpvs(sv, "-");
11224 e = uvchr_to_utf8(s, i-1);
11225 for (p = s; p < e; p++)
11232 sv_catpvs(sv, "..."); /* et cetera */
11236 char *s = savesvpv(lv);
11237 char * const origs = s;
11239 while (*s && *s != '\n')
11243 const char * const t = ++s;
11261 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11263 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11264 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11266 PERL_UNUSED_CONTEXT;
11267 PERL_UNUSED_ARG(sv);
11268 PERL_UNUSED_ARG(o);
11269 PERL_UNUSED_ARG(prog);
11270 #endif /* DEBUGGING */
11274 Perl_re_intuit_string(pTHX_ REGEXP * const r)
11275 { /* Assume that RE_INTUIT is set */
11277 struct regexp *const prog = (struct regexp *)SvANY(r);
11278 GET_RE_DEBUG_FLAGS_DECL;
11280 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11281 PERL_UNUSED_CONTEXT;
11285 const char * const s = SvPV_nolen_const(prog->check_substr
11286 ? prog->check_substr : prog->check_utf8);
11288 if (!PL_colorset) reginitcolors();
11289 PerlIO_printf(Perl_debug_log,
11290 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11292 prog->check_substr ? "" : "utf8 ",
11293 PL_colors[5],PL_colors[0],
11296 (strlen(s) > 60 ? "..." : ""));
11299 return prog->check_substr ? prog->check_substr : prog->check_utf8;
11305 handles refcounting and freeing the perl core regexp structure. When
11306 it is necessary to actually free the structure the first thing it
11307 does is call the 'free' method of the regexp_engine associated to
11308 the regexp, allowing the handling of the void *pprivate; member
11309 first. (This routine is not overridable by extensions, which is why
11310 the extensions free is called first.)
11312 See regdupe and regdupe_internal if you change anything here.
11314 #ifndef PERL_IN_XSUB_RE
11316 Perl_pregfree(pTHX_ REGEXP *r)
11322 Perl_pregfree2(pTHX_ REGEXP *rx)
11325 struct regexp *const r = (struct regexp *)SvANY(rx);
11326 GET_RE_DEBUG_FLAGS_DECL;
11328 PERL_ARGS_ASSERT_PREGFREE2;
11330 if (r->mother_re) {
11331 ReREFCNT_dec(r->mother_re);
11333 CALLREGFREE_PVT(rx); /* free the private data */
11334 SvREFCNT_dec(RXp_PAREN_NAMES(r));
11337 SvREFCNT_dec(r->anchored_substr);
11338 SvREFCNT_dec(r->anchored_utf8);
11339 SvREFCNT_dec(r->float_substr);
11340 SvREFCNT_dec(r->float_utf8);
11341 Safefree(r->substrs);
11343 RX_MATCH_COPY_FREE(rx);
11344 #ifdef PERL_OLD_COPY_ON_WRITE
11345 SvREFCNT_dec(r->saved_copy);
11352 This is a hacky workaround to the structural issue of match results
11353 being stored in the regexp structure which is in turn stored in
11354 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11355 could be PL_curpm in multiple contexts, and could require multiple
11356 result sets being associated with the pattern simultaneously, such
11357 as when doing a recursive match with (??{$qr})
11359 The solution is to make a lightweight copy of the regexp structure
11360 when a qr// is returned from the code executed by (??{$qr}) this
11361 lightweight copy doesn't actually own any of its data except for
11362 the starp/end and the actual regexp structure itself.
11368 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11370 struct regexp *ret;
11371 struct regexp *const r = (struct regexp *)SvANY(rx);
11372 register const I32 npar = r->nparens+1;
11374 PERL_ARGS_ASSERT_REG_TEMP_COPY;
11377 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11378 ret = (struct regexp *)SvANY(ret_x);
11380 (void)ReREFCNT_inc(rx);
11381 /* We can take advantage of the existing "copied buffer" mechanism in SVs
11382 by pointing directly at the buffer, but flagging that the allocated
11383 space in the copy is zero. As we've just done a struct copy, it's now
11384 a case of zero-ing that, rather than copying the current length. */
11385 SvPV_set(ret_x, RX_WRAPPED(rx));
11386 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11387 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11388 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11389 SvLEN_set(ret_x, 0);
11390 SvSTASH_set(ret_x, NULL);
11391 SvMAGIC_set(ret_x, NULL);
11392 Newx(ret->offs, npar, regexp_paren_pair);
11393 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11395 Newx(ret->substrs, 1, struct reg_substr_data);
11396 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11398 SvREFCNT_inc_void(ret->anchored_substr);
11399 SvREFCNT_inc_void(ret->anchored_utf8);
11400 SvREFCNT_inc_void(ret->float_substr);
11401 SvREFCNT_inc_void(ret->float_utf8);
11403 /* check_substr and check_utf8, if non-NULL, point to either their
11404 anchored or float namesakes, and don't hold a second reference. */
11406 RX_MATCH_COPIED_off(ret_x);
11407 #ifdef PERL_OLD_COPY_ON_WRITE
11408 ret->saved_copy = NULL;
11410 ret->mother_re = rx;
11416 /* regfree_internal()
11418 Free the private data in a regexp. This is overloadable by
11419 extensions. Perl takes care of the regexp structure in pregfree(),
11420 this covers the *pprivate pointer which technically perl doesn't
11421 know about, however of course we have to handle the
11422 regexp_internal structure when no extension is in use.
11424 Note this is called before freeing anything in the regexp
11429 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11432 struct regexp *const r = (struct regexp *)SvANY(rx);
11433 RXi_GET_DECL(r,ri);
11434 GET_RE_DEBUG_FLAGS_DECL;
11436 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11442 SV *dsv= sv_newmortal();
11443 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11444 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11445 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11446 PL_colors[4],PL_colors[5],s);
11449 #ifdef RE_TRACK_PATTERN_OFFSETS
11451 Safefree(ri->u.offsets); /* 20010421 MJD */
11454 int n = ri->data->count;
11455 PAD* new_comppad = NULL;
11460 /* If you add a ->what type here, update the comment in regcomp.h */
11461 switch (ri->data->what[n]) {
11466 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11469 Safefree(ri->data->data[n]);
11472 new_comppad = MUTABLE_AV(ri->data->data[n]);
11475 if (new_comppad == NULL)
11476 Perl_croak(aTHX_ "panic: pregfree comppad");
11477 PAD_SAVE_LOCAL(old_comppad,
11478 /* Watch out for global destruction's random ordering. */
11479 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11482 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11485 op_free((OP_4tree*)ri->data->data[n]);
11487 PAD_RESTORE_LOCAL(old_comppad);
11488 SvREFCNT_dec(MUTABLE_SV(new_comppad));
11489 new_comppad = NULL;
11494 { /* Aho Corasick add-on structure for a trie node.
11495 Used in stclass optimization only */
11497 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11499 refcount = --aho->refcount;
11502 PerlMemShared_free(aho->states);
11503 PerlMemShared_free(aho->fail);
11504 /* do this last!!!! */
11505 PerlMemShared_free(ri->data->data[n]);
11506 PerlMemShared_free(ri->regstclass);
11512 /* trie structure. */
11514 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11516 refcount = --trie->refcount;
11519 PerlMemShared_free(trie->charmap);
11520 PerlMemShared_free(trie->states);
11521 PerlMemShared_free(trie->trans);
11523 PerlMemShared_free(trie->bitmap);
11525 PerlMemShared_free(trie->jump);
11526 PerlMemShared_free(trie->wordinfo);
11527 /* do this last!!!! */
11528 PerlMemShared_free(ri->data->data[n]);
11533 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11536 Safefree(ri->data->what);
11537 Safefree(ri->data);
11543 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11544 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11545 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
11548 re_dup - duplicate a regexp.
11550 This routine is expected to clone a given regexp structure. It is only
11551 compiled under USE_ITHREADS.
11553 After all of the core data stored in struct regexp is duplicated
11554 the regexp_engine.dupe method is used to copy any private data
11555 stored in the *pprivate pointer. This allows extensions to handle
11556 any duplication it needs to do.
11558 See pregfree() and regfree_internal() if you change anything here.
11560 #if defined(USE_ITHREADS)
11561 #ifndef PERL_IN_XSUB_RE
11563 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11567 const struct regexp *r = (const struct regexp *)SvANY(sstr);
11568 struct regexp *ret = (struct regexp *)SvANY(dstr);
11570 PERL_ARGS_ASSERT_RE_DUP_GUTS;
11572 npar = r->nparens+1;
11573 Newx(ret->offs, npar, regexp_paren_pair);
11574 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11576 /* no need to copy these */
11577 Newx(ret->swap, npar, regexp_paren_pair);
11580 if (ret->substrs) {
11581 /* Do it this way to avoid reading from *r after the StructCopy().
11582 That way, if any of the sv_dup_inc()s dislodge *r from the L1
11583 cache, it doesn't matter. */
11584 const bool anchored = r->check_substr
11585 ? r->check_substr == r->anchored_substr
11586 : r->check_utf8 == r->anchored_utf8;
11587 Newx(ret->substrs, 1, struct reg_substr_data);
11588 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11590 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11591 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11592 ret->float_substr = sv_dup_inc(ret->float_substr, param);
11593 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11595 /* check_substr and check_utf8, if non-NULL, point to either their
11596 anchored or float namesakes, and don't hold a second reference. */
11598 if (ret->check_substr) {
11600 assert(r->check_utf8 == r->anchored_utf8);
11601 ret->check_substr = ret->anchored_substr;
11602 ret->check_utf8 = ret->anchored_utf8;
11604 assert(r->check_substr == r->float_substr);
11605 assert(r->check_utf8 == r->float_utf8);
11606 ret->check_substr = ret->float_substr;
11607 ret->check_utf8 = ret->float_utf8;
11609 } else if (ret->check_utf8) {
11611 ret->check_utf8 = ret->anchored_utf8;
11613 ret->check_utf8 = ret->float_utf8;
11618 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11621 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11623 if (RX_MATCH_COPIED(dstr))
11624 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
11626 ret->subbeg = NULL;
11627 #ifdef PERL_OLD_COPY_ON_WRITE
11628 ret->saved_copy = NULL;
11631 if (ret->mother_re) {
11632 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11633 /* Our storage points directly to our mother regexp, but that's
11634 1: a buffer in a different thread
11635 2: something we no longer hold a reference on
11636 so we need to copy it locally. */
11637 /* Note we need to sue SvCUR() on our mother_re, because it, in
11638 turn, may well be pointing to its own mother_re. */
11639 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11640 SvCUR(ret->mother_re)+1));
11641 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11643 ret->mother_re = NULL;
11647 #endif /* PERL_IN_XSUB_RE */
11652 This is the internal complement to regdupe() which is used to copy
11653 the structure pointed to by the *pprivate pointer in the regexp.
11654 This is the core version of the extension overridable cloning hook.
11655 The regexp structure being duplicated will be copied by perl prior
11656 to this and will be provided as the regexp *r argument, however
11657 with the /old/ structures pprivate pointer value. Thus this routine
11658 may override any copying normally done by perl.
11660 It returns a pointer to the new regexp_internal structure.
11664 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11667 struct regexp *const r = (struct regexp *)SvANY(rx);
11668 regexp_internal *reti;
11670 RXi_GET_DECL(r,ri);
11672 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11674 npar = r->nparens+1;
11677 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11678 Copy(ri->program, reti->program, len+1, regnode);
11681 reti->regstclass = NULL;
11684 struct reg_data *d;
11685 const int count = ri->data->count;
11688 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11689 char, struct reg_data);
11690 Newx(d->what, count, U8);
11693 for (i = 0; i < count; i++) {
11694 d->what[i] = ri->data->what[i];
11695 switch (d->what[i]) {
11696 /* legal options are one of: sSfpontTua
11697 see also regcomp.h and pregfree() */
11698 case 'a': /* actually an AV, but the dup function is identical. */
11701 case 'p': /* actually an AV, but the dup function is identical. */
11702 case 'u': /* actually an HV, but the dup function is identical. */
11703 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11706 /* This is cheating. */
11707 Newx(d->data[i], 1, struct regnode_charclass_class);
11708 StructCopy(ri->data->data[i], d->data[i],
11709 struct regnode_charclass_class);
11710 reti->regstclass = (regnode*)d->data[i];
11713 /* Compiled op trees are readonly and in shared memory,
11714 and can thus be shared without duplication. */
11716 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11720 /* Trie stclasses are readonly and can thus be shared
11721 * without duplication. We free the stclass in pregfree
11722 * when the corresponding reg_ac_data struct is freed.
11724 reti->regstclass= ri->regstclass;
11728 ((reg_trie_data*)ri->data->data[i])->refcount++;
11732 d->data[i] = ri->data->data[i];
11735 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11744 reti->name_list_idx = ri->name_list_idx;
11746 #ifdef RE_TRACK_PATTERN_OFFSETS
11747 if (ri->u.offsets) {
11748 Newx(reti->u.offsets, 2*len+1, U32);
11749 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11752 SetProgLen(reti,len);
11755 return (void*)reti;
11758 #endif /* USE_ITHREADS */
11760 #ifndef PERL_IN_XSUB_RE
11763 - regnext - dig the "next" pointer out of a node
11766 Perl_regnext(pTHX_ register regnode *p)
11769 register I32 offset;
11774 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
11775 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11778 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11787 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11790 STRLEN l1 = strlen(pat1);
11791 STRLEN l2 = strlen(pat2);
11794 const char *message;
11796 PERL_ARGS_ASSERT_RE_CROAK2;
11802 Copy(pat1, buf, l1 , char);
11803 Copy(pat2, buf + l1, l2 , char);
11804 buf[l1 + l2] = '\n';
11805 buf[l1 + l2 + 1] = '\0';
11807 /* ANSI variant takes additional second argument */
11808 va_start(args, pat2);
11812 msv = vmess(buf, &args);
11814 message = SvPV_const(msv,l1);
11817 Copy(message, buf, l1 , char);
11818 buf[l1-1] = '\0'; /* Overwrite \n */
11819 Perl_croak(aTHX_ "%s", buf);
11822 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
11824 #ifndef PERL_IN_XSUB_RE
11826 Perl_save_re_context(pTHX)
11830 struct re_save_state *state;
11832 SAVEVPTR(PL_curcop);
11833 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11835 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11836 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11837 SSPUSHUV(SAVEt_RE_STATE);
11839 Copy(&PL_reg_state, state, 1, struct re_save_state);
11841 PL_reg_start_tmp = 0;
11842 PL_reg_start_tmpl = 0;
11843 PL_reg_oldsaved = NULL;
11844 PL_reg_oldsavedlen = 0;
11845 PL_reg_maxiter = 0;
11846 PL_reg_leftiter = 0;
11847 PL_reg_poscache = NULL;
11848 PL_reg_poscache_size = 0;
11849 #ifdef PERL_OLD_COPY_ON_WRITE
11853 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11855 const REGEXP * const rx = PM_GETRE(PL_curpm);
11858 for (i = 1; i <= RX_NPARENS(rx); i++) {
11859 char digits[TYPE_CHARS(long)];
11860 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11861 GV *const *const gvp
11862 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11865 GV * const gv = *gvp;
11866 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11876 clear_re(pTHX_ void *r)
11879 ReREFCNT_dec((REGEXP *)r);
11885 S_put_byte(pTHX_ SV *sv, int c)
11887 PERL_ARGS_ASSERT_PUT_BYTE;
11889 /* Our definition of isPRINT() ignores locales, so only bytes that are
11890 not part of UTF-8 are considered printable. I assume that the same
11891 holds for UTF-EBCDIC.
11892 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11893 which Wikipedia says:
11895 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11896 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11897 identical, to the ASCII delete (DEL) or rubout control character.
11898 ) So the old condition can be simplified to !isPRINT(c) */
11901 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11904 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11908 const char string = c;
11909 if (c == '-' || c == ']' || c == '\\' || c == '^')
11910 sv_catpvs(sv, "\\");
11911 sv_catpvn(sv, &string, 1);
11916 #define CLEAR_OPTSTART \
11917 if (optstart) STMT_START { \
11918 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11922 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11924 STATIC const regnode *
11925 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11926 const regnode *last, const regnode *plast,
11927 SV* sv, I32 indent, U32 depth)
11930 register U8 op = PSEUDO; /* Arbitrary non-END op. */
11931 register const regnode *next;
11932 const regnode *optstart= NULL;
11934 RXi_GET_DECL(r,ri);
11935 GET_RE_DEBUG_FLAGS_DECL;
11937 PERL_ARGS_ASSERT_DUMPUNTIL;
11939 #ifdef DEBUG_DUMPUNTIL
11940 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11941 last ? last-start : 0,plast ? plast-start : 0);
11944 if (plast && plast < last)
11947 while (PL_regkind[op] != END && (!last || node < last)) {
11948 /* While that wasn't END last time... */
11951 if (op == CLOSE || op == WHILEM)
11953 next = regnext((regnode *)node);
11956 if (OP(node) == OPTIMIZED) {
11957 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11964 regprop(r, sv, node);
11965 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11966 (int)(2*indent + 1), "", SvPVX_const(sv));
11968 if (OP(node) != OPTIMIZED) {
11969 if (next == NULL) /* Next ptr. */
11970 PerlIO_printf(Perl_debug_log, " (0)");
11971 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11972 PerlIO_printf(Perl_debug_log, " (FAIL)");
11974 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11975 (void)PerlIO_putc(Perl_debug_log, '\n');
11979 if (PL_regkind[(U8)op] == BRANCHJ) {
11982 register const regnode *nnode = (OP(next) == LONGJMP
11983 ? regnext((regnode *)next)
11985 if (last && nnode > last)
11987 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11990 else if (PL_regkind[(U8)op] == BRANCH) {
11992 DUMPUNTIL(NEXTOPER(node), next);
11994 else if ( PL_regkind[(U8)op] == TRIE ) {
11995 const regnode *this_trie = node;
11996 const char op = OP(node);
11997 const U32 n = ARG(node);
11998 const reg_ac_data * const ac = op>=AHOCORASICK ?
11999 (reg_ac_data *)ri->data->data[n] :
12001 const reg_trie_data * const trie =
12002 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
12004 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
12006 const regnode *nextbranch= NULL;
12009 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
12010 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
12012 PerlIO_printf(Perl_debug_log, "%*s%s ",
12013 (int)(2*(indent+3)), "",
12014 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
12015 PL_colors[0], PL_colors[1],
12016 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
12017 PERL_PV_PRETTY_ELLIPSES |
12018 PERL_PV_PRETTY_LTGT
12023 U16 dist= trie->jump[word_idx+1];
12024 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
12025 (UV)((dist ? this_trie + dist : next) - start));
12028 nextbranch= this_trie + trie->jump[0];
12029 DUMPUNTIL(this_trie + dist, nextbranch);
12031 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
12032 nextbranch= regnext((regnode *)nextbranch);
12034 PerlIO_printf(Perl_debug_log, "\n");
12037 if (last && next > last)
12042 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
12043 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
12044 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
12046 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
12048 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
12050 else if ( op == PLUS || op == STAR) {
12051 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
12053 else if (PL_regkind[(U8)op] == ANYOF) {
12054 /* arglen 1 + class block */
12055 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
12056 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
12057 node = NEXTOPER(node);
12059 else if (PL_regkind[(U8)op] == EXACT) {
12060 /* Literal string, where present. */
12061 node += NODE_SZ_STR(node) - 1;
12062 node = NEXTOPER(node);
12065 node = NEXTOPER(node);
12066 node += regarglen[(U8)op];
12068 if (op == CURLYX || op == OPEN)
12072 #ifdef DEBUG_DUMPUNTIL
12073 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
12078 #endif /* DEBUGGING */
12082 * c-indentation-style: bsd
12083 * c-basic-offset: 4
12084 * indent-tabs-mode: t
12087 * ex: set ts=8 sts=4 sw=4 noet: