X-Git-Url: http://git.vpit.fr/?p=perl%2Fmodules%2Fre-engine-Hooks.git;a=blobdiff_plain;f=src%2F5016003%2Fregcomp.c;fp=src%2F5016003%2Fregcomp.c;h=d3a891a6be874e55c51aafb65e2bb43b4eefc878;hp=0000000000000000000000000000000000000000;hb=a76f00e22fd80312f10e8bf44bf8343b4e4d78e6;hpb=8f657e98a7fe5e93f5c7ec58c77e4793ab82bc8a diff --git a/src/5016003/regcomp.c b/src/5016003/regcomp.c new file mode 100644 index 0000000..d3a891a --- /dev/null +++ b/src/5016003/regcomp.c @@ -0,0 +1,13470 @@ +/* regcomp.c + */ + +/* + * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee + * + * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] + */ + +/* This file contains functions for compiling a regular expression. See + * also regexec.c which funnily enough, contains functions for executing + * a regular expression. + * + * This file is also copied at build time to ext/re/re_comp.c, where + * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. + * This causes the main functions to be compiled under new names and with + * debugging support added, which makes "use re 'debug'" work. + */ + +/* NOTE: this is derived from Henry Spencer's regexp code, and should not + * be confused with the original package (see point 3 below). Thanks, Henry! + */ + +/* Additional note: this code is very heavily munged from Henry's version + * in places. In some spots I've traded clarity for efficiency, so don't + * blame Henry for some of the lack of readability. + */ + +/* The names of the functions have been changed from regcomp and + * regexec to pregcomp and pregexec in order to avoid conflicts + * with the POSIX routines of the same names. +*/ + +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +/* + * pregcomp and pregexec -- regsub and regerror are not used in perl + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. 
+ * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * + * + **** Alterations to Henry's code are... + **** + **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + **** by Larry Wall and others + **** + **** You may distribute under the terms of either the GNU General Public + **** License or the Artistic License, as specified in the README file. + + * + * Beware that some of this code is subtly aware of the way operator + * precedence is structured in regular expressions. Serious changes in + * regular-expression syntax might require a total rethink. + */ +#include "EXTERN.h" +#define PERL_IN_REGCOMP_C +#include "perl.h" + +#ifndef PERL_IN_XSUB_RE +#include "re_defs.h" +#endif + +#define REG_COMP_C +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +#else +# include "regcomp.h" +#endif + +#include "dquote_static.c" +#ifndef PERL_IN_XSUB_RE +# include "charclass_invlists.h" +#endif + +#ifdef op +#undef op +#endif /* op */ + +#ifdef MSDOS +# if defined(BUGGY_MSC6) + /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ +# pragma optimize("a",off) + /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ +# pragma optimize("w",on ) +# endif /* BUGGY_MSC6 */ +#endif /* MSDOS */ + +#ifndef STATIC +#define STATIC static +#endif + +typedef struct RExC_state_t { + U32 flags; /* are we folding, multilining? */ + char *precomp; /* uncompiled string. 
*/ + REGEXP *rx_sv; /* The SV that is the regexp. */ + regexp *rx; /* perl core regexp structure */ + regexp_internal *rxi; /* internal data for regexp object pprivate field */ + char *start; /* Start of input for compile */ + char *end; /* End of input for compile */ + char *parse; /* Input-scan pointer. */ + I32 whilem_seen; /* number of WHILEM in this expr */ + regnode *emit_start; /* Start of emitted-code area */ + regnode *emit_bound; /* First regnode outside of the allocated space */ + regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */ + I32 naughty; /* How bad is this pattern? */ + I32 sawback; /* Did we see \1, ...? */ + U32 seen; + I32 size; /* Code size. */ + I32 npar; /* Capture buffer count, (OPEN). */ + I32 cpar; /* Capture buffer count, (CLOSE). */ + I32 nestroot; /* root parens we are in - used by accept */ + I32 extralen; + I32 seen_zerolen; + I32 seen_evals; + regnode **open_parens; /* pointers to open parens */ + regnode **close_parens; /* pointers to close parens */ + regnode *opend; /* END node in program */ + I32 utf8; /* whether the pattern is utf8 or not */ + I32 orig_utf8; /* whether the pattern was originally in utf8 */ + /* XXX use this for future optimisation of case + * where pattern must be upgraded to utf8. */ + I32 uni_semantics; /* If a d charset modifier should use unicode + rules, even if the pattern is not in + utf8 */ + HV *paren_names; /* Paren names */ + + regnode **recurse; /* Recurse regops */ + I32 recurse_count; /* Number of recurse regops */ + I32 in_lookbehind; + I32 contains_locale; + I32 override_recoding; +#if ADD_TO_REGEXEC + char *starttry; /* -Dr: where regtry was called. 
*/ +#define RExC_starttry (pRExC_state->starttry) +#endif +#ifdef DEBUGGING + const char *lastparse; + I32 lastnum; + AV *paren_name_list; /* idx -> name */ +#define RExC_lastparse (pRExC_state->lastparse) +#define RExC_lastnum (pRExC_state->lastnum) +#define RExC_paren_name_list (pRExC_state->paren_name_list) +#endif +} RExC_state_t; + +#define RExC_flags (pRExC_state->flags) +#define RExC_precomp (pRExC_state->precomp) +#define RExC_rx_sv (pRExC_state->rx_sv) +#define RExC_rx (pRExC_state->rx) +#define RExC_rxi (pRExC_state->rxi) +#define RExC_start (pRExC_state->start) +#define RExC_end (pRExC_state->end) +#define RExC_parse (pRExC_state->parse) +#define RExC_whilem_seen (pRExC_state->whilem_seen) +#ifdef RE_TRACK_PATTERN_OFFSETS +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ +#endif +#define RExC_emit (pRExC_state->emit) +#define RExC_emit_start (pRExC_state->emit_start) +#define RExC_emit_bound (pRExC_state->emit_bound) +#define RExC_naughty (pRExC_state->naughty) +#define RExC_sawback (pRExC_state->sawback) +#define RExC_seen (pRExC_state->seen) +#define RExC_size (pRExC_state->size) +#define RExC_npar (pRExC_state->npar) +#define RExC_nestroot (pRExC_state->nestroot) +#define RExC_extralen (pRExC_state->extralen) +#define RExC_seen_zerolen (pRExC_state->seen_zerolen) +#define RExC_seen_evals (pRExC_state->seen_evals) +#define RExC_utf8 (pRExC_state->utf8) +#define RExC_uni_semantics (pRExC_state->uni_semantics) +#define RExC_orig_utf8 (pRExC_state->orig_utf8) +#define RExC_open_parens (pRExC_state->open_parens) +#define RExC_close_parens (pRExC_state->close_parens) +#define RExC_opend (pRExC_state->opend) +#define RExC_paren_names (pRExC_state->paren_names) +#define RExC_recurse (pRExC_state->recurse) +#define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_in_lookbehind (pRExC_state->in_lookbehind) +#define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_override_recoding 
(pRExC_state->override_recoding) + + +#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') +#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ + ((*s) == '{' && regcurly(s))) + +#ifdef SPSTART +#undef SPSTART /* dratted cpp namespace... */ +#endif +/* + * Flags to be passed up and down. + */ +#define WORST 0 /* Worst case. */ +#define HASWIDTH 0x01 /* Known to match non-null strings. */ + +/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single + * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */ +#define SIMPLE 0x02 +#define SPSTART 0x04 /* Starts with * or +. */ +#define TRYAGAIN 0x08 /* Weeded out a declaration. */ +#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ + +#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) + +/* whether trie related optimizations are enabled */ +#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION +#define TRIE_STUDY_OPT +#define FULL_TRIE_STUDY +#define TRIE_STCLASS +#endif + + + +#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] +#define PBITVAL(paren) (1 << ((paren) & 7)) +#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) +#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) +#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) + +/* If not already in utf8, do a longjmp back to the beginning */ +#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */ +#define REQUIRE_UTF8 STMT_START { \ + if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \ + } STMT_END + +/* About scan_data_t. + + During optimisation we recurse through the regexp program performing + various inplace (keyhole style) optimisations. In addition study_chunk + and scan_commit populate this data structure with information about + what strings MUST appear in the pattern. 
We look for the longest + string that must appear at a fixed location, and we look for the + longest string that may appear at a floating location. So for instance + in the pattern: + + /FOO[xX]A.*B[xX]BAR/ + + Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating + strings (because they follow a .* construct). study_chunk will identify + both FOO and BAR as being the longest fixed and floating strings respectively. + + The strings can be composites, for instance + + /(f)(o)(o)/ + + will result in a composite fixed substring 'foo'. + + For each string some basic information is maintained: + + - offset or min_offset + This is the position the string must appear at, or not before. + It also implicitly (when combined with minlenp) tells us how many + characters must match before the string we are searching for. + Likewise when combined with minlenp and the length of the string it + tells us how many characters must appear after the string we have + found. + + - max_offset + Only used for floating strings. This is the rightmost point that + the string can appear at. If set to I32 max it indicates that the + string can occur infinitely far to the right. + + - minlenp + A pointer to the minimum length of the pattern that the string + was found inside. This is important as in the case of positive + lookahead or positive lookbehind we can have multiple patterns + involved. Consider + + /(?=FOO).*F/ + + The minimum length of the pattern overall is 3, the minimum length + of the lookahead part is 3, but the minimum length of the part that + will actually match is 1. So 'FOO's minimum length is 3, but the + minimum length for the F is 1. This is important as the minimum length + is used to determine offsets in front of and behind the string being + looked for. Since strings can be composites this is the length of the + pattern at the time it was committed with a scan_commit. 
Note that + the length is calculated by study_chunk, so that the minimum lengths + are not known until the full pattern has been compiled, thus the + pointer to the value. + + - lookbehind + + In the case of lookbehind the string being searched for can be + offset past the start point of the final matching string. + If this value was just blithely removed from the min_offset it would + invalidate some of the calculations for how many chars must match + before or after (as they are derived from min_offset and minlen and + the length of the string being searched for). + When the final pattern is compiled and the data is moved from the + scan_data_t structure into the regexp structure the information + about lookbehind is factored in, with the information that would + have been lost precalculated in the end_shift field for the + associated string. + + The fields pos_min and pos_delta are used to store the minimum offset + and the delta to the maximum offset at the current point in the pattern. + +*/ + +typedef struct scan_data_t { + /*I32 len_min; unused */ + /*I32 len_delta; unused */ + I32 pos_min; + I32 pos_delta; + SV *last_found; + I32 last_end; /* min value, <0 unless valid. */ + I32 last_start_min; + I32 last_start_max; + SV **longest; /* Either &l_fixed, or &l_float. 
*/ + SV *longest_fixed; /* longest fixed string found in pattern */ + I32 offset_fixed; /* offset where it starts */ + I32 *minlen_fixed; /* pointer to the minlen relevant to the string */ + I32 lookbehind_fixed; /* is the position of the string modified by LB */ + SV *longest_float; /* longest floating string found in pattern */ + I32 offset_float_min; /* earliest point in string it can appear */ + I32 offset_float_max; /* latest point in string it can appear */ + I32 *minlen_float; /* pointer to the minlen relevant to the string */ + I32 lookbehind_float; /* is the position of the string modified by LB */ + I32 flags; + I32 whilem_c; + I32 *last_closep; + struct regnode_charclass_class *start_class; +} scan_data_t; + +/* + * Forward declarations for pregcomp()'s friends. + */ + +static const scan_data_t zero_scan_data = + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0}; + +#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) +#define SF_BEFORE_SEOL 0x0001 +#define SF_BEFORE_MEOL 0x0002 +#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) +#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) + +#ifdef NO_UNARY_PLUS +# define SF_FIX_SHIFT_EOL (0+2) +# define SF_FL_SHIFT_EOL (0+4) +#else +# define SF_FIX_SHIFT_EOL (+2) +# define SF_FL_SHIFT_EOL (+4) +#endif + +#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) +#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) + +#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) +#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ +#define SF_IS_INF 0x0040 +#define SF_HAS_PAR 0x0080 +#define SF_IN_PAR 0x0100 +#define SF_HAS_EVAL 0x0200 +#define SCF_DO_SUBSTR 0x0400 +#define SCF_DO_STCLASS_AND 0x0800 +#define SCF_DO_STCLASS_OR 0x1000 +#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) +#define SCF_WHILEM_VISITED_POS 0x2000 + +#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? 
*/ +#define SCF_SEEN_ACCEPT 0x8000 + +#define UTF cBOOL(RExC_utf8) + +/* The enums for all these are ordered so things work out correctly */ +#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET) +#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) +#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) + +#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) + +#define OOB_UNICODE 12345678 +#define OOB_NAMEDCLASS -1 + +#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) +#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) + + +/* length of regex to show in messages that don't mark a position within */ +#define RegexLengthToShowInErrorMessages 127 + +/* + * If MARKER[12] are adjusted, be sure to adjust the constants at the top + * of t/op/regmesg.t, the tests in t/op/re_tests, and those in + * op/pragma/warn/regcomp. + */ +#define MARKER1 "<-- HERE" /* marker as it appears in the description */ +#define MARKER2 " <-- HERE " /* marker as it appears within the regex */ + +#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * arg. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define _FAIL(code) STMT_START { \ + const char *ellipses = ""; \ + IV len = RExC_end - RExC_precomp; \ + \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." 
*/ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + code; \ +} STMT_END + +#define FAIL(msg) _FAIL( \ + Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ + msg, (int)len, RExC_precomp, ellipses)) + +#define FAIL2(msg,arg) _FAIL( \ + Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ + arg, (int)len, RExC_precomp, ellipses)) + +/* + * Simple_vFAIL -- like FAIL, but marks the current location in the scan + */ +#define Simple_vFAIL(m) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ + m, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() + */ +#define vFAIL(m) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ + Simple_vFAIL(m); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts two arguments. + */ +#define Simple_vFAIL2(m,a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). + */ +#define vFAIL2(m,a1) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ + Simple_vFAIL2(m, a1); \ +} STMT_END + + +/* + * Like Simple_vFAIL(), but accepts three arguments. + */ +#define Simple_vFAIL3(m, a1, a2) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). + */ +#define vFAIL3(m,a1,a2) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ + Simple_vFAIL3(m, a1, a2); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts four arguments. 
+ */ +#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define ckWARNreg(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define ckWARNregdep(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + m REPORT_LOCATION, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define ckWARN2regdep(loc,m, a1) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define ckWARN2reg(loc, m, a1) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define vWARN3(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define vWARN4(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + 
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + +#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + + +/* Allow for side effects in s */ +#define REGC(c,s) STMT_START { \ + if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ +} STMT_END + +/* Macros for recording node offsets. 20001227 mjd@plover.com + * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in + * element 2*n-1 of the array. Element #2n holds the byte length node #n. + * Element 0 holds the number n. + * Position is 1 indexed. + */ +#ifndef RE_TRACK_PATTERN_OFFSETS +#define Set_Node_Offset_To_R(node,byte) +#define Set_Node_Offset(node,byte) +#define Set_Cur_Node_Offset +#define Set_Node_Length_To_R(node,len) +#define Set_Node_Length(node,len) +#define Set_Node_Cur_Length(node) +#define Node_Offset(n) +#define Node_Length(n) +#define Set_Node_Offset_Length(node,offset,len) +#define ProgLen(ri) ri->u.proglen +#define SetProgLen(ri,x) ri->u.proglen = x +#else +#define ProgLen(ri) ri->u.offsets[0] +#define SetProgLen(ri,x) ri->u.offsets[0] = x +#define Set_Node_Offset_To_R(node,byte) STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ + __LINE__, (int)(node), (int)(byte))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \ + } else { \ + RExC_offsets[2*(node)-1] = (byte); \ + } \ + } \ +} STMT_END + +#define Set_Node_Offset(node,byte) \ + Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) +#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) + +#define Set_Node_Length_To_R(node,len) STMT_START { \ + if (! 
SIZE_ONLY) { \ + MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ + __LINE__, (int)(node), (int)(len))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \ + } else { \ + RExC_offsets[2*(node)] = (len); \ + } \ + } \ +} STMT_END + +#define Set_Node_Length(node,len) \ + Set_Node_Length_To_R((node)-RExC_emit_start, len) +#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) +#define Set_Node_Cur_Length(node) \ + Set_Node_Length(node, RExC_parse - parse_start) + +/* Get offsets and lengths */ +#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) +#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) + +#define Set_Node_Offset_Length(node,offset,len) STMT_START { \ + Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ + Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ +} STMT_END +#endif + +#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS +#define EXPERIMENTAL_INPLACESCAN +#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ + +#define DEBUG_STUDYDATA(str,data,depth) \ +DEBUG_OPTIMISE_MORE_r(if(data){ \ + PerlIO_printf(Perl_debug_log, \ + "%*s" str "Pos:%"IVdf"/%"IVdf \ + " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ + (int)(depth)*2, "", \ + (IV)((data)->pos_min), \ + (IV)((data)->pos_delta), \ + (UV)((data)->flags), \ + (IV)((data)->whilem_c), \ + (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ + is_inf ? "INF " : "" \ + ); \ + if ((data)->last_found) \ + PerlIO_printf(Perl_debug_log, \ + "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ + " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ + SvPVX_const((data)->last_found), \ + (IV)((data)->last_end), \ + (IV)((data)->last_start_min), \ + (IV)((data)->last_start_max), \ + ((data)->longest && \ + (data)->longest==&((data)->longest_fixed)) ? "*" : "", \ + SvPVX_const((data)->longest_fixed), \ + (IV)((data)->offset_fixed), \ + ((data)->longest && \ + (data)->longest==&((data)->longest_float)) ? 
"*" : "", \ + SvPVX_const((data)->longest_float), \ + (IV)((data)->offset_float_min), \ + (IV)((data)->offset_float_max) \ + ); \ + PerlIO_printf(Perl_debug_log,"\n"); \ +}); + +static void clear_re(pTHX_ void *r); + +/* Mark that we cannot extend a found fixed substring at this point. + Update the longest found anchored substring and the longest found + floating substrings if needed. */ + +STATIC void +S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf) +{ + const STRLEN l = CHR_SVLEN(data->last_found); + const STRLEN old_l = CHR_SVLEN(*data->longest); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_SCAN_COMMIT; + + if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { + SvSetMagicSV(*data->longest, data->last_found); + if (*data->longest == data->longest_fixed) { + data->offset_fixed = l ? data->last_start_min : data->pos_min; + if (data->flags & SF_BEFORE_EOL) + data->flags + |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); + else + data->flags &= ~SF_FIX_BEFORE_EOL; + data->minlen_fixed=minlenp; + data->lookbehind_fixed=0; + } + else { /* *data->longest == data->longest_float */ + data->offset_float_min = l ? data->last_start_min : data->pos_min; + data->offset_float_max = (l + ? 
data->last_start_max + : data->pos_min + data->pos_delta); + if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX) + data->offset_float_max = I32_MAX; + if (data->flags & SF_BEFORE_EOL) + data->flags + |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); + else + data->flags &= ~SF_FL_BEFORE_EOL; + data->minlen_float=minlenp; + data->lookbehind_float=0; + } + } + SvCUR_set(data->last_found, 0); + { + SV * const sv = data->last_found; + if (SvUTF8(sv) && SvMAGICAL(sv)) { + MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); + if (mg) + mg->mg_len = 0; + } + } + data->last_end = -1; + data->flags &= ~SF_BEFORE_EOL; + DEBUG_STUDYDATA("commit: ",data,0); +} + +/* Can match anything (initialization) */ +STATIC void +S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +{ + PERL_ARGS_ASSERT_CL_ANYTHING; + + ANYOF_BITMAP_SETALL(cl); + cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL + |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL; + + /* If any portion of the regex is to operate under locale rules, + * initialization includes it. The reason this isn't done for all regexes + * is that the optimizer was written under the assumption that locale was + * all-or-nothing. Given the complexity and lack of documentation in the + * optimizer, and that there are inadequate test cases for locale, so many + * parts of it may not work properly, it is safest to avoid locale unless + * necessary. 
*/ + if (RExC_contains_locale) { + ANYOF_CLASS_SETALL(cl); /* /l uses class */ + cl->flags |= ANYOF_LOCALE; + } + else { + ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */ + } +} + +/* Returns 1 if cl can match anything, else 0 (NOTE(review): check the value + 1 test against ANYOF_MAX) */ +STATIC int +S_cl_is_anything(const struct regnode_charclass_class *cl) +{ + int value; + + PERL_ARGS_ASSERT_CL_IS_ANYTHING; + + for (value = 0; value <= ANYOF_MAX; value += 2) + if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) + return 1; + if (!(cl->flags & ANYOF_UNICODE_ALL)) + return 0; + if (!ANYOF_BITMAP_TESTALLSET((const void*)cl)) + return 0; + return 1; +} + +/* Zero cl, then initialize it as an ANYOF node that can match anything */ +STATIC void +S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +{ + PERL_ARGS_ASSERT_CL_INIT; + + Zero(cl, 1, struct regnode_charclass_class); + cl->type = ANYOF; + cl_anything(pRExC_state, cl); + ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); +} + +/* cl_init_zero currently does exactly the same thing as cl_init */ +#define cl_init_zero S_cl_init + +/* 'AND' a given class with another one. Can create false positives. 'cl' + * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. 
*/ +STATIC void +S_cl_and(struct regnode_charclass_class *cl, + const struct regnode_charclass_class *and_with) +{ + PERL_ARGS_ASSERT_CL_AND; + + assert(and_with->type == ANYOF); + + /* I (khw) am not sure all these restrictions are necessary XXX */ + if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) + && !(ANYOF_CLASS_TEST_ANY_SET(cl)) + && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) + && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD) + && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) { + int i; + + if (and_with->flags & ANYOF_INVERT) + for (i = 0; i < ANYOF_BITMAP_SIZE; i++) + cl->bitmap[i] &= ~and_with->bitmap[i]; + else + for (i = 0; i < ANYOF_BITMAP_SIZE; i++) + cl->bitmap[i] &= and_with->bitmap[i]; + } /* XXXX: otherwise the logic is complicated; leave the bitmap alone for the moment. */ + + if (and_with->flags & ANYOF_INVERT) { + + /* Here, the and'ed node is inverted. Get the AND of the flags that + * aren't affected by the inversion. Those that are affected are + * handled individually below */ + U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS; + cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS); + cl->flags |= affected_flags; + + /* We currently don't know how to deal with things that aren't in the + * bitmap, but we know that the intersection is no greater than what + * is already in cl, so let there be false positives that get sorted + * out after the synthetic start class succeeds, and the node is + * matched for real. */ + + /* The inversion of these two flags indicate that the resulting + * intersection doesn't have them */ + if (and_with->flags & ANYOF_UNICODE_ALL) { + cl->flags &= ~ANYOF_UNICODE_ALL; + } + if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) { + cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; + } + } + else { /* and'd node is not inverted */ + U8 outside_bitmap_but_not_utf8; /* Temp variable */ + + if (! 
ANYOF_NONBITMAP(and_with)) { + + /* Here 'and_with' doesn't match anything outside the bitmap + * (except possibly ANYOF_UNICODE_ALL), which means the + * intersection can't either, except for ANYOF_UNICODE_ALL, in + * which case we don't know what the intersection is, but it's no + * greater than what cl already has, so can just leave it alone, + * with possible false positives */ + if (! (and_with->flags & ANYOF_UNICODE_ALL)) { + ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); + cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8; + } + } + else if (! ANYOF_NONBITMAP(cl)) { + + /* Here, 'and_with' does match something outside the bitmap, and cl + * doesn't have a list of things to match outside the bitmap. If + * cl can match all code points above 255, the intersection will + * be those above-255 code points that 'and_with' matches. If cl + * can't match all Unicode code points, it means that it can't + * match anything outside the bitmap (since the 'if' that got us + * into this block tested for that), so we leave the bitmap empty. + */ + if (cl->flags & ANYOF_UNICODE_ALL) { + ARG_SET(cl, ARG(and_with)); + + /* and_with's ARG may match things that don't require UTF8. + * And now cl's will too, in spite of this being an 'and'. See + * the comments below about the kludge */ + cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8; + } + } + else { + /* Here, both 'and_with' and cl match something outside the + * bitmap. Currently we do not do the intersection, so just match + * whatever cl had at the beginning. */ + } + + + /* Take the intersection of the two sets of flags. However, the + * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a + * kludge around the fact that this flag is not treated like the others + * which are initialized in cl_anything(). 
The way the optimizer works + * is that the synthetic start class (SSC) is initialized to match + * anything, and then the first time a real node is encountered, its + * values are AND'd with the SSC's with the result being the values of + * the real node. However, there are paths through the optimizer where + * the AND never gets called, so those initialized bits are set + * inappropriately, which is not usually a big deal, as they just cause + * false positives in the SSC, which will just mean a probably + * imperceptible slow down in execution. However this bit has a + * higher false positive consequence in that it can cause utf8.pm, + * utf8_heavy.pl ... to be loaded when not necessary, which is a much + * bigger slowdown and also causes significant extra memory to be used. + * In order to prevent this, the code now takes a different tack. The + * bit isn't set unless some part of the regular expression needs it, + * but once set it won't get cleared. This means that these extra + * modules won't get loaded unless there was some path through the + * pattern that would have required them anyway, and so any false + * positives that occur by not ANDing them out when they could be + * aren't as severe as they would be if we treated this bit like all + * the others */ + outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags) + & ANYOF_NONBITMAP_NON_UTF8; + cl->flags &= and_with->flags; + cl->flags |= outside_bitmap_but_not_utf8; + } +} + +/* 'OR' a given class with another one. Can create false positives. 'cl' + * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if + * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */ +STATIC void +S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) +{ + PERL_ARGS_ASSERT_CL_OR; + + if (or_with->flags & ANYOF_INVERT) { + + /* Here, the or'd node is to be inverted. 
This means we take the + * complement of everything not in the bitmap, but currently we don't + * know what that is, so give up and match anything */ + if (ANYOF_NONBITMAP(or_with)) { + cl_anything(pRExC_state, cl); + } + /* We do not use + * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) + * <= (B1 | !B2) | (CL1 | !CL2) + * which is wasteful if CL2 is small, but we ignore CL2: + * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 + * XXXX Can we handle case-fold? Unclear: + * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = + * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) + */ + else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) + && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) + && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) { + int i; + + for (i = 0; i < ANYOF_BITMAP_SIZE; i++) + cl->bitmap[i] |= ~or_with->bitmap[i]; + } /* XXXX: logic is complicated otherwise */ + else { + cl_anything(pRExC_state, cl); + } + + /* And, we can just take the union of the flags that aren't affected + * by the inversion */ + cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS; + + /* For the remaining flags: + ANYOF_UNICODE_ALL and inverted means to not match anything above + 255, which means that the union with cl should just be + what cl has in it, so can ignore this flag + ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord + is 127-255 to match them, but then invert that, so the + union with cl should just be what cl has in it, so can + ignore this flag + */ + } else { /* 'or_with' is not inverted */ + /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ + if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) + && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) + || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) { + int i; + + /* OR char bitmap and class bitmap separately */ + for (i = 0; i < ANYOF_BITMAP_SIZE; i++) + cl->bitmap[i] |= or_with->bitmap[i]; + if (ANYOF_CLASS_TEST_ANY_SET(or_with)) { + for (i 
= 0; i < ANYOF_CLASSBITMAP_SIZE; i++) + cl->classflags[i] |= or_with->classflags[i]; + cl->flags |= ANYOF_CLASS; + } + } + else { /* XXXX: logic is complicated, leave it alone for a moment. */ + cl_anything(pRExC_state, cl); + } + + if (ANYOF_NONBITMAP(or_with)) { + + /* Use the added node's outside-the-bit-map match if there isn't a + * conflict. If there is a conflict (both nodes match something + * outside the bitmap, but what they match outside is not the same + * pointer, and hence not easily compared until XXX we extend + * inversion lists this far), give up and allow the start class to + * match everything outside the bitmap. If that stuff is all above + * 255, can just set UNICODE_ALL, otherwise could be anything. */ + if (! ANYOF_NONBITMAP(cl)) { + ARG_SET(cl, ARG(or_with)); + } + else if (ARG(cl) != ARG(or_with)) { + + if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) { + cl_anything(pRExC_state, cl); + } + else { + cl->flags |= ANYOF_UNICODE_ALL; + } + } + } + + /* Take the union */ + cl->flags |= or_with->flags; + } +} + +/* Accessors for the interim list form of a trie state, + * trie->states[N].trans.list. Element 0 of each list is a header: its + * .forid holds the next free index (TRIE_LIST_CUR) and its .newstate holds + * the allocated length (TRIE_LIST_LEN); the transitions themselves start at + * index 1. TRIE_LIST_USED(s) is the number of transitions stored for state + * s, or 0 when that state has no list. All of these expect a variable + * named 'trie' at the point of use. TRIE_LIST_USED tests the list of the + * state it is given rather than of whatever variable happens to be called + * 'state' at the expansion site. */ +#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] +#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) +#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) +#define TRIE_LIST_USED(idx) ( trie->states[idx].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 ) + + +#ifdef DEBUGGING +/* + dump_trie(trie,widecharmap,revcharmap) + dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) + dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) + + These routines dump out a trie in a somewhat readable format. + The _interim_ variants are used for debugging the interim + tables that are used to generate the final compressed + representation which is what dump_trie expects. + + Part of the reason for their existence is to provide a form + of documentation as to how the different representations function.
+ +*/ + +/* + Dumps the final compressed table form of the trie to Perl_debug_log. + Used for debugging make_trie(). + + trie : the trie to dump; states 1 .. trie->statecount-1 are printed + widecharmap : only tested for being non-NULL, to choose a column width + of 6 instead of 4 + revcharmap : entry N supplies the text printed as the heading of + column N (N runs over 0 .. trie->uniquecharcount-1) + depth : nesting depth; the table rows are indented by + depth*2+2 columns +*/ + +STATIC void +S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, + AV *revcharmap, U32 depth) +{ + U32 state; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + U16 word; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE; + + PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", + (int)depth * 2 + 2,"", + "Match","Base","Ofs" ); + + for( state = 0 ; state < trie->uniquecharcount ; state++ ) { + SV ** const tmp = av_fetch( revcharmap, state, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + } + } + PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------", + (int)depth * 2 + 2,""); + + for( state = 0 ; state < trie->uniquecharcount ; state++ ) + PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); + PerlIO_printf( Perl_debug_log, "\n"); + + for( state = 1 ; state < trie->statecount ; state++ ) { + const U32 base = trie->states[ state ].trans.base; + + PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); + + if ( trie->states[ state ].wordnum ) { + PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum ); + } else { + PerlIO_printf( Perl_debug_log, "%6s", "" ); + } + + PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base ); + + if ( base ) { + U32 ofs = 0; + + while( ( base + ofs < trie->uniquecharcount ) || + ( base + ofs - trie->uniquecharcount < trie->lasttrans + && trie->trans[ base + ofs - trie->uniquecharcount ].check != state)) + ofs++; + + PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); + + for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { + if ( ( base + ofs >= trie->uniquecharcount ) && + ( base + ofs -
trie->uniquecharcount < trie->lasttrans ) && + trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + { + PerlIO_printf( Perl_debug_log, "%*"UVXf, + colwidth, + (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next ); + } else { + PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); + } + } + + PerlIO_printf( Perl_debug_log, "]"); + + } + PerlIO_printf( Perl_debug_log, "\n" ); + } + PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, ""); + for (word=1; word <= trie->wordcount; word++) { + PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", + (int)word, (int)(trie->wordinfo[word].prev), + (int)(trie->wordinfo[word].len)); + } + PerlIO_printf(Perl_debug_log, "\n" ); +} +/* + Dumps a fully constructed but uncompressed trie in list form. + List tries normally only are used for construction when the number of + possible chars (trie->uniquecharcount) is very high. + Used for debugging make_trie(). + + next_alloc is one past the last state printed (states 1 .. next_alloc-1). + Each state's transitions are read through TRIE_LIST_USED() and + TRIE_LIST_ITEM(), i.e. from trie->states[state].trans.list. + widecharmap is only tested for being non-NULL, to choose a column width + of 6 instead of 4; depth controls the indentation. +*/ +STATIC void +S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, + HV *widecharmap, AV *revcharmap, U32 next_alloc, + U32 depth) +{ + U32 state; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; + + /* print out the table precompression. */ + PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", + (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", + "------:-----+-----------------\n" ); + + for( state=1 ; state < next_alloc ; state ++ ) { + U16 charid; + + PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", + (int)depth * 2 + 2,"", (UV)state ); + if ( !
trie->states[ state ].wordnum ) { + PerlIO_printf( Perl_debug_log, "%5s| ",""); + } else { + PerlIO_printf( Perl_debug_log, "W%4x| ", + trie->states[ state ].wordnum + ); + } + for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { + SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) , + TRIE_LIST_ITEM(state,charid).forid, + (UV)TRIE_LIST_ITEM(state,charid).newstate + ); + if (!(charid % 10)) /* start a continuation row after every tenth list index */ + PerlIO_printf(Perl_debug_log, "\n%*s| ", + (int)((depth * 2) + 14), ""); + } + } + PerlIO_printf( Perl_debug_log, "\n"); + } +} + +/* + Dumps a fully constructed but uncompressed trie in table form. + This is the normal DFA style state transition table, with a few + twists to facilitate compression later. + Used for debugging make_trie(). + + The heading row is built from revcharmap entries + 0 .. trie->uniquecharcount-1. Table rows are walked in steps of + trie->uniquecharcount, starting at index 1 and stopping before + next_alloc; each row is labelled with TRIE_NODENUM() of its starting + index. widecharmap is only tested for being non-NULL, to choose a + column width of 6 instead of 4; depth controls the indentation. +*/ +STATIC void +S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, + HV *widecharmap, AV *revcharmap, U32 next_alloc, + U32 depth) +{ + U32 state; + U16 charid; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; + + /* + print out the table precompression so that we can do a visual check + that they are identical. + */ + + PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); + + for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { + SV ** const tmp = av_fetch( revcharmap, charid, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ?
PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + } + } + + PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" ); + + for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { + PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------"); + } + + PerlIO_printf( Perl_debug_log, "\n" ); + + for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { + + PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", + (int)depth * 2 + 2,"", + (UV)TRIE_NODENUM( state ) ); + + for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { + UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); + if (v) + PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v ); + else + PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); + } + if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { + PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); + } else { + PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, + trie->states[ TRIE_NODENUM( state ) ].wordnum ); + } + } +} + +#endif + + +/* make_trie(startbranch,first,last,tail,word_count,flags,depth) + startbranch: the first branch in the whole branch sequence + first : start branch of sequence of branch-exact nodes. + May be the same as startbranch + last : Thing following the last branch. + May be the same as tail. + tail : item following the branch sequence + count : words in the sequence + flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ + depth : indent depth + +Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. + +A trie is an N'ary tree where the branches are determined by digital +decomposition of the key. IE, at the root node you look up the 1st character and +follow that branch repeat until you find the end of the branches. Nodes can be +marked as "accepting" meaning they represent a complete word. 
Eg: + + /he|she|his|hers/ + +would convert into the following structure. Numbers represent states, letters +following numbers represent valid transitions on the letter from that state, if +the number is in square brackets it represents an accepting state, otherwise it +will be in parentheses. + + +-h->+-e->[3]-+-r->(8)-+-s->[9] + | | + | (2) + | | + (1) +-i->(6)-+-s->[7] + | + +-s->(3)-+-h->(4)-+-e->[5] + + Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) + +This shows that when matching against the string 'hers' we will begin at state 1 +read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, +then read 'r' and go to state 8 followed by 's' which takes us to state 9 which +is also accepting. Thus we know that we can match both 'he' and 'hers' with a +single traverse. We store a mapping from accepting state to which word was +matched, and then when we have multiple possibilities we try to complete the +rest of the regex in the order in which they occurred in the alternation. + +The only prior NFA like behaviour that would be changed by the TRIE support is +the silent ignoring of duplicate alternations which are of the form: + + / (DUPE|DUPE) X? (?{ ... }) Y /x + +Thus EVAL blocks following a trie may be called a different number of times with +and without the optimisation. With the optimisations dupes will be silently +ignored. This inconsistent behaviour of EVAL type nodes is well established as +the following demonstrates: + + 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ + +which prints out 'word' three times, but + + 'words'=~/(word|word|word)(?{ print $1 })S/ + +which doesn't print it out at all. This is due to other optimisations kicking in.
+ +Example of what happens on a structural level: + +The regexp /(ac|ad|ab)+/ will produce the following debug output: + + 1: CURLYM[1] {1,32767}(18) + 5: BRANCH(8) + 6: EXACT <ac>(16) + 8: BRANCH(11) + 9: EXACT <ad>(16) + 11: BRANCH(14) + 12: EXACT <ab>(16) + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +This would be optimizable with startbranch=5, first=5, last=16, tail=16 +and should turn into: + + 1: CURLYM[1] {1,32767}(18) + 5: TRIE(16) + [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] + <ac> + <ad> + <ab> + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +Cases where tail != last would be like /(?:foo|bar)baz/: + + 1: BRANCH(4) + 2: EXACT <foo>(8) + 4: BRANCH(7) + 5: EXACT <bar>(8) + 7: TAIL(8) + 8: EXACT <baz>(10) + 10: END(0) + +which would be optimizable with startbranch=1, first=1, last=7, tail=8 +and would end up looking like: + + 1: TRIE(8) + [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] + <foo> + <bar> + 7: TAIL(8) + 8: EXACT <baz>(10) + 10: END(0) + + d = uvuni_to_utf8_flags(d, uv, 0); + +is the recommended Unicode-aware way of saying + + *(d++) = uv; +*/ +/* TRIE_STORE_REVCHAR(val): append to 'revcharmap' a new SV holding 'val' -- + * encoded with uvuni_to_utf8() and flagged as UTF-8 when UTF is true, + * otherwise stored as the single byte (char)val. Expects a variable named + * 'revcharmap' at the point of use. */ +#define TRIE_STORE_REVCHAR(val) \ + STMT_START { \ + if (UTF) { \ + SV *zlopp = newSV(7); /* XXX: optimize me */ \ + unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ + unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \ + SvCUR_set(zlopp, kapow - flrbbbbb); \ + SvPOK_on(zlopp); \ + SvUTF8_on(zlopp); \ + av_push(revcharmap, zlopp); \ + } else { \ + char ooooff = (char)val; \ + av_push(revcharmap, newSVpvn(&ooooff, 1)); \ + } \ + } STMT_END +/* TRIE_READ_CHAR: bump 'wordlen' and load the next code point of the word + * into 'uvc', setting 'len' as it goes. Takes no arguments: it reads and + * writes variables of the expansion site (uc, uvc, len, wordlen, folder, + * foldlen, foldbuf, skiplen, scan, uniflags). */ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need folding */ \ + uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* if we use this folder we have to obey unicode rules on latin-1 data */ \ + if ( foldlen > 0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + scan +=
len; \ + len = 0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \ + skiplen = UNISKIP(uvc); \ + foldlen -= skiplen; \ + scan = foldbuf + skiplen; \ + } \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ +} STMT_END + + +/* TRIE_LIST_PUSH(state,fid,ns): append a (fid, ns) entry to the transition + * list of 'state'. When the next free index has reached the allocated + * length, the length is doubled and the list is Renew()ed first. */ +#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ + if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ + U32 ging = TRIE_LIST_LEN( state ) *= 2; \ + Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ + } \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ + TRIE_LIST_CUR( state )++; \ +} STMT_END +/* TRIE_LIST_NEW(state): allocate a zeroed four-element list for 'state' and + * initialise its bookkeeping: next free index 1, allocated length 4. */ +#define TRIE_LIST_NEW(state) STMT_START { \ + Newxz( trie->states[ state ].trans.list, \ + 4, reg_trie_trans_le ); \ + TRIE_LIST_CUR( state ) = 1; \ + TRIE_LIST_LEN( state ) = 4; \ +} STMT_END +/* TRIE_HANDLE_WORD(state): record in trie->wordinfo[] that the word just + * scanned ends in 'state', fill in trie->jump[] when the node after the + * word lies before 'tail', and chain duplicate words through + * wordinfo[].prev. Uses variables of the expansion site (trie, noper, cur, + * tail, convert, curword, wordlen, word_count, jumper, nextbranch and, + * under DEBUG_r, trie_words). */ +#define TRIE_HANDLE_WORD(state) STMT_START { \ + U16 dupe= trie->states[ state ].wordnum; \ + regnode * const noper_next = regnext( noper ); \ + \ + DEBUG_r({ \ + /* store the word for dumping */ \ + SV* tmp; \ + if (OP(noper) != NOTHING) \ + tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ + else \ + tmp = newSVpvn_utf8( "", 0, UTF ); \ + av_push( trie_words, tmp ); \ + }); \ + \ + curword++; \ + trie->wordinfo[curword].prev = 0; \ + trie->wordinfo[curword].len = wordlen; \ + trie->wordinfo[curword].accept = state; \ + \ + if ( noper_next < tail ) { \ + if (!trie->jump) \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ + trie->jump[curword] = (U16)(noper_next - convert); \ + if (!jumper) \ + jumper = noper_next; \ + if (!nextbranch) \ + nextbranch= regnext(cur); \ + } \ + \ + if ( dupe ) { \ + /* It's a dupe.
Pre-insert into the wordinfo[].prev */\ + /* chain, so that when the bits of chain are later */\ + /* linked together, the dups appear in the chain */\ + trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ + trie->wordinfo[dupe].prev = curword; \ + } else { \ + /* we haven't inserted this word yet. */ \ + trie->states[ state ].wordnum = curword; \ + } \ +} STMT_END + +/* TRIE_TRANS_STATE: look up the transition for 'charid' out of 'state'. + * Yields trie->trans[base - ucharcount + charid].next when base + charid + * lies in [ucharcount, ubound), that entry's .check equals 'state' and its + * .next is non-zero; otherwise yields 'special' for state 1 and 0 for any + * other state. 'trie' and 'ubound' are taken from the expansion site. */ +#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ + ( ( base + charid >= ucharcount \ + && base + charid < ubound \ + && state == trie->trans[ base - ucharcount + charid ].check \ + && trie->trans[ base - ucharcount + charid ].next ) \ + ? trie->trans[ base - ucharcount + charid ].next \ + : ( state==1 ? special : 0 ) \ + ) +/* NOTE(review): presumably bit values for the I32 result of S_make_trie() -- + * their use is not visible from here, confirm against the callers. */ +#define MADE_TRIE 1 +#define MADE_JUMP_TRIE 2 +#define MADE_EXACT_TRIE 4 + +STATIC I32 +S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) +{ + dVAR; + /* first pass, loop through and scan words */ + reg_trie_data *trie; + HV *widecharmap = NULL; + AV *revcharmap = newAV(); + regnode *cur; + const U32 uniflags = UTF8_ALLOW_DEFAULT; + STRLEN len = 0; + UV uvc = 0; + U16 curword = 0; + U32 next_alloc = 0; + regnode *jumper = NULL; + regnode *nextbranch = NULL; + regnode *convert = NULL; + U32 *prev_states; /* temp array mapping each state to previous one */ + /* we just use folder as a flag in utf8 */ + const U8 * folder = NULL; + +#ifdef DEBUGGING + const U32 data_slot = add_data( pRExC_state, 4, "tuuu" ); + AV *trie_words = NULL; + /* along with revcharmap, this only used during construction but both are + * useful during debugging so we store them in the struct when debugging.
+ */ +#else + const U32 data_slot = add_data( pRExC_state, 2, "tu" ); + STRLEN trie_charcount=0; +#endif + SV *re_trie_maxbuff; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + switch (flags) { + case EXACT: break; + case EXACTFA: + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: + case EXACTFU: folder = PL_fold_latin1; break; + case EXACTF: folder = PL_fold; break; + case EXACTFL: folder = PL_fold_locale; break; + default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); + } + + trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); + trie->refcount = 1; + trie->startstate = 1; + trie->wordcount = word_count; + RExC_rxi->data->data[ data_slot ] = (void*)trie; + trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); + if (flags == EXACT) + trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); + trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( + trie->wordcount+1, sizeof(reg_trie_wordinfo)); + + DEBUG_r({ + trie_words = newAV(); + }); + + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + if (!SvIOK(re_trie_maxbuff)) { + sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); + } + DEBUG_OPTIMISE_r({ + PerlIO_printf( Perl_debug_log, + "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + (int)depth * 2 + 2, "", + REG_NODE_NUM(startbranch),REG_NODE_NUM(first), + REG_NODE_NUM(last), REG_NODE_NUM(tail), + (int)depth); + }); + + /* Find the node we are going to overwrite */ + if ( first == startbranch && OP( last ) != BRANCH ) { + /* whole branch chain */ + convert = first; + } else { + /* branch sub-chain */ + convert = NEXTOPER( first ); + } + + /* -- First loop and Setup -- + + We first traverse the branches and scan each word to determine if it + contains widechars, and how many unique chars there are, this is + important as we have to build a table with at least as many 
columns as we + have unique chars. + + We use an array of integers to represent the character codes 0..255 + (trie->charmap) and we use a an HV* to store Unicode characters. We use the + native representation of the character value as the key and IV's for the + coded index. + + *TODO* If we keep track of how many times each character is used we can + remap the columns so that the table compression later on is more + efficient in terms of memory by ensuring the most common value is in the + middle and the least common are on the outside. IMO this would be better + than a most to least common mapping as theres a decent chance the most + common letter will share a node with the least common, meaning the node + will not be compressible. With a middle is most common approach the worst + case is when we have the least common nodes twice. + + */ + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + regnode * const noper = NEXTOPER( cur ); + const U8 *uc = (U8*)STRING( noper ); + const U8 * const e = uc + STR_LEN( noper ); + STRLEN foldlen = 0; + U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + STRLEN skiplen = 0; + const U8 *scan = (U8*)NULL; + U32 wordlen = 0; /* required init */ + STRLEN chars = 0; + bool set_bit = trie->bitmap ? 
1 : 0; /*store the first char in the bitmap?*/ + + if (OP(noper) == NOTHING) { + trie->minlen= 0; + continue; + } + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ + TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte + regardless of encoding */ + if (OP( noper ) == EXACTFU_SS) { + /* false positives are ok, so just set this */ + TRIE_BITMAP_SET(trie,0xDF); + } + } + for ( ; uc < e ; uc += len ) { + TRIE_CHARCOUNT(trie)++; + TRIE_READ_CHAR; + chars++; + if ( uvc < 256 ) { + if ( folder ) { + U8 folded= folder[ (U8) uvc ]; + if ( !trie->charmap[ folded ] ) { + trie->charmap[ folded ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( folded ); + } + } + if ( !trie->charmap[ uvc ] ) { + trie->charmap[ uvc ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( uvc ); + } + if ( set_bit ) { + /* store the codepoint in the bitmap, and its folded + * equivalent. */ + TRIE_BITMAP_SET(trie, uvc); + + /* store the folded codepoint */ + if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); + + if ( !UTF ) { + /* store first byte of utf8 representation of + variant codepoints */ + if (! 
UNI_IS_INVARIANT(uvc)) { + TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); + } + } + set_bit = 0; /* We've done our bit :-) */ + } + } else { + SV** svpp; + if ( !widecharmap ) + widecharmap = newHV(); + + svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); + + if ( !svpp ) + Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc ); + + if ( !SvTRUE( *svpp ) ) { + sv_setiv( *svpp, ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR(uvc); + } + } + } + if( cur == first ) { + trie->minlen = chars; + trie->maxlen = chars; + } else if (chars < trie->minlen) { + trie->minlen = chars; + } else if (chars > trie->maxlen) { + trie->maxlen = chars; + } + if (OP( noper ) == EXACTFU_SS) { + /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/ + if (trie->minlen > 1) + trie->minlen= 1; + } + if (OP( noper ) == EXACTFU_TRICKYFOLD) { + /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" + * - We assume that any such sequence might match a 2 byte string */ + if (trie->minlen > 2 ) + trie->minlen= 2; + } + + } /* end first pass */ + DEBUG_TRIE_COMPILE_r( + PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + (int)depth * 2 + 2,"", + ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, + (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, + (int)trie->minlen, (int)trie->maxlen ) + ); + + /* + We now know what we are dealing with in terms of unique chars and + string sizes so we can calculate how much memory a naive + representation using a flat table will take. If it's over a reasonable + limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory + conservative but potentially much slower representation using an array + of lists. + + At the end we convert both representations into the same compressed + form that will be used in regexec.c for matching with. 
The latter + is a form that cannot be used to construct with but has memory + properties similar to the list form and access properties similar + to the table form making it both suitable for fast searches and + small enough that its feasable to store for the duration of a program. + + See the comment in the code where the compressed table is produced + inplace from the flat tabe representation for an explanation of how + the compression works. + + */ + + + Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); + prev_states[1] = 0; + + if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) { + /* + Second Pass -- Array Of Lists Representation + + Each state will be represented by a list of charid:state records + (reg_trie_trans_le) the first such element holds the CUR and LEN + points of the allocated array. (See defines above). + + We build the initial structure using the lists, and then convert + it into the compressed table form which allows faster lookups + (but cant be modified once converted). 
+ */ + + STRLEN transcount = 1; + + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + "%*sCompiling trie using list compiler\n", + (int)depth * 2 + 2, "")); + + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); + TRIE_LIST_NEW(1); + next_alloc = 2; + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + + regnode * const noper = NEXTOPER( cur ); + U8 *uc = (U8*)STRING( noper ); + const U8 * const e = uc + STR_LEN( noper ); + U32 state = 1; /* required init */ + U16 charid = 0; /* sanity init */ + U8 *scan = (U8*)NULL; /* sanity init */ + STRLEN foldlen = 0; /* required init */ + U32 wordlen = 0; /* required init */ + U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + STRLEN skiplen = 0; + + if (OP(noper) != NOTHING) { + for ( ; uc < e ; uc += len ) { + + TRIE_READ_CHAR; + + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + if ( !svpp ) { + charid = 0; + } else { + charid=(U16)SvIV( *svpp ); + } + } + /* charid is now 0 if we dont know the char read, or nonzero if we do */ + if ( charid ) { + + U16 check; + U32 newstate = 0; + + charid--; + if ( !trie->states[ state ].trans.list ) { + TRIE_LIST_NEW( state ); + } + for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { + if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { + newstate = TRIE_LIST_ITEM( state, check ).newstate; + break; + } + } + if ( ! newstate ) { + newstate = next_alloc++; + prev_states[newstate] = state; + TRIE_LIST_PUSH( state, charid, newstate ); + transcount++; + } + state = newstate; + } else { + Perl_croak( aTHX_ "panic! 
In trie construction, no char mapping for %"IVdf, uvc ); + } + } + } + TRIE_HANDLE_WORD(state); + + } /* end second pass */ + + /* next alloc is the NEXT state to be allocated */ + trie->statecount = next_alloc; + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, + next_alloc + * sizeof(reg_trie_state) ); + + /* and now dump it out before we compress it */ + DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, + revcharmap, next_alloc, + depth+1) + ); + + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); + { + U32 state; + U32 tp = 0; + U32 zp = 0; + + + for( state=1 ; state < next_alloc ; state ++ ) { + U32 base=0; + + /* + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp) + ); + */ + + if (trie->states[state].trans.list) { + U16 minid=TRIE_LIST_ITEM( state, 1).forid; + U16 maxid=minid; + U16 idx; + + for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { + const U16 forid = TRIE_LIST_ITEM( state, idx).forid; + if ( forid < minid ) { + minid=forid; + } else if ( forid > maxid ) { + maxid=forid; + } + } + if ( transcount < tp + maxid - minid + 1) { + transcount *= 2; + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, + transcount + * sizeof(reg_trie_trans) ); + Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans ); + } + base = trie->uniquecharcount + tp - minid; + if ( maxid == minid ) { + U32 set = 0; + for ( ; zp < tp ; zp++ ) { + if ( ! 
trie->trans[ zp ].next ) { + base = trie->uniquecharcount + zp - minid; + trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ zp ].check = state; + set = 1; + break; + } + } + if ( !set ) { + trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ tp ].check = state; + tp++; + zp = tp; + } + } else { + for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { + const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid; + trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate; + trie->trans[ tid ].check = state; + } + tp += ( maxid - minid + 1 ); + } + Safefree(trie->states[ state ].trans.list); + } + /* + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, " base: %d\n",base); + ); + */ + trie->states[ state ].trans.base=base; + } + trie->lasttrans = tp + 1; + } + } else { + /* + Second Pass -- Flat Table Representation. + + we dont use the 0 slot of either trans[] or states[] so we add 1 to each. + We know that we will need Charcount+1 trans at most to store the data + (one row per char at worst case) So we preallocate both structures + assuming worst case. + + We then construct the trie using only the .next slots of the entry + structs. + + We use the .check field of the first entry of the node temporarily to + make compression both faster and easier by keeping track of how many non + zero fields are in the node. + + Since trans are numbered from 1 any 0 pointer in the table is a FAIL + transition. + + There are two terms at use here: state as a TRIE_NODEIDX() which is a + number representing the first entry of the node, and state as a + TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and + TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there + are 2 entrys per node. eg: + + A B A B + 1. 2 4 1. 3 7 + 2. 0 3 3. 0 5 + 3. 0 0 5. 0 0 + 4. 0 0 7. 0 0 + + The table is internally in the right hand, idx form. 
However as we also + have to deal with the states array which is indexed by nodenum we have to + use TRIE_NODENUM() to convert. + + */ + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + "%*sCompiling trie using table compiler\n", + (int)depth * 2 + 2, "")); + + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) + * trie->uniquecharcount + 1, + sizeof(reg_trie_trans) ); + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); + next_alloc = trie->uniquecharcount + 1; + + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + + regnode * const noper = NEXTOPER( cur ); + const U8 *uc = (U8*)STRING( noper ); + const U8 * const e = uc + STR_LEN( noper ); + + U32 state = 1; /* required init */ + + U16 charid = 0; /* sanity init */ + U32 accept_state = 0; /* sanity init */ + U8 *scan = (U8*)NULL; /* sanity init */ + + STRLEN foldlen = 0; /* required init */ + U32 wordlen = 0; /* required init */ + STRLEN skiplen = 0; + U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + + + if ( OP(noper) != NOTHING ) { + for ( ; uc < e ; uc += len ) { + + TRIE_READ_CHAR; + + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + charid = svpp ? (U16)SvIV(*svpp) : 0; + } + if ( charid ) { + charid--; + if ( !trie->trans[ state + charid ].next ) { + trie->trans[ state + charid ].next = next_alloc; + trie->trans[ state ].check++; + prev_states[TRIE_NODENUM(next_alloc)] + = TRIE_NODENUM(state); + next_alloc += trie->uniquecharcount; + } + state = trie->trans[ state + charid ].next; + } else { + Perl_croak( aTHX_ "panic! 
In trie construction, no char mapping for %"IVdf, uvc ); + } + /* charid is now 0 if we dont know the char read, or nonzero if we do */ + } + } + accept_state = TRIE_NODENUM( state ); + TRIE_HANDLE_WORD(accept_state); + + } /* end second pass */ + + /* and now dump it out before we compress it */ + DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, + revcharmap, + next_alloc, depth+1)); + + { + /* + * Inplace compress the table.* + + For sparse data sets the table constructed by the trie algorithm will + be mostly 0/FAIL transitions or to put it another way mostly empty. + (Note that leaf nodes will not contain any transitions.) + + This algorithm compresses the tables by eliminating most such + transitions, at the cost of a modest bit of extra work during lookup: + + - Each states[] entry contains a .base field which indicates the + index in the state[] array wheres its transition data is stored. + + - If .base is 0 there are no valid transitions from that node. + + - If .base is nonzero then charid is added to it to find an entry in + the trans array. + + -If trans[states[state].base+charid].check!=state then the + transition is taken to be a 0/Fail transition. Thus if there are fail + transitions at the front of the node then the .base offset will point + somewhere inside the previous nodes data (or maybe even into a node + even earlier), but the .check field determines if the transition is + valid. + + XXX - wrong maybe? + The following process inplace converts the table to the compressed + table: We first do not compress the root node 1,and mark all its + .check pointers as 1 and set its .base pointer as 1 as well. This + allows us to do a DFA construction from the compressed table later, + and ensures that any .base pointers we calculate later are greater + than 0. + + - We set 'pos' to indicate the first entry of the second node. + + - We then iterate over the columns of the node, finding the first and + last used entry at l and m. 
We then copy l..m into pos..(pos+m-l), + and set the .check pointers accordingly, and advance pos + appropriately and repreat for the next node. Note that when we copy + the next pointers we have to convert them from the original + NODEIDX form to NODENUM form as the former is not valid post + compression. + + - If a node has no transitions used we mark its base as 0 and do not + advance the pos pointer. + + - If a node only has one transition we use a second pointer into the + structure to fill in allocated fail transitions from other states. + This pointer is independent of the main pointer and scans forward + looking for null transitions that are allocated to a state. When it + finds one it writes the single transition into the "hole". If the + pointer doesnt find one the single transition is appended as normal. + + - Once compressed we can Renew/realloc the structures to release the + excess space. + + See "Table-Compression Methods" in sec 3.9 of the Red Dragon, + specifically Fig 3.47 and the associated pseudocode. + + demq + */ + const U32 laststate = TRIE_NODENUM( next_alloc ); + U32 state, charid; + U32 pos = 0, zp=0; + trie->statecount = laststate; + + for ( state = 1 ; state < laststate ; state++ ) { + U8 flag = 0; + const U32 stateidx = TRIE_NODEIDX( state ); + const U32 o_used = trie->trans[ stateidx ].check; + U32 used = trie->trans[ stateidx ].check; + trie->trans[ stateidx ].check = 0; + + for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) { + if ( flag || trie->trans[ stateidx + charid ].next ) { + if ( trie->trans[ stateidx + charid ].next ) { + if (o_used == 1) { + for ( ; zp < pos ; zp++ ) { + if ( ! 
trie->trans[ zp ].next ) { + break; + } + } + trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ; + trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->trans[ zp ].check = state; + if ( ++zp > pos ) pos = zp; + break; + } + used--; + } + if ( !flag ) { + flag = 1; + trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ; + } + trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->trans[ pos ].check = state; + pos++; + } + } + } + trie->lasttrans = pos + 1; + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, laststate + * sizeof(reg_trie_state) ); + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, + "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + (int)depth * 2 + 2,"", + (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), + (IV)next_alloc, + (IV)pos, + ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); + ); + + } /* end table compress */ + } + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", + (int)depth * 2 + 2, "", + (UV)trie->statecount, + (UV)trie->lasttrans) + ); + /* resize the trans array to remove unused space */ + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, trie->lasttrans + * sizeof(reg_trie_trans) ); + + { /* Modify the program and insert the new TRIE node */ + U8 nodetype =(U8)(flags & 0xFF); + char *str=NULL; + +#ifdef DEBUGGING + regnode *optimize = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + + U32 mjd_offset = 0; + U32 mjd_nodelen = 0; +#endif /* RE_TRACK_PATTERN_OFFSETS */ +#endif /* DEBUGGING */ + /* + This means we convert either the first branch or the first Exact, + depending on whether the thing following (in 'last') is a branch + or not and whther first is the startbranch (ie is it a sub part of + the alternation or is it the whole thing.) 
+ Assuming its a sub part we convert the EXACT otherwise we convert + the whole branch sequence, including the first. + */ + /* Find the node we are going to overwrite */ + if ( first != startbranch || OP( last ) == BRANCH ) { + /* branch sub-chain */ + NEXT_OFF( first ) = (U16)(last - first); +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_r({ + mjd_offset= Node_Offset((convert)); + mjd_nodelen= Node_Length((convert)); + }); +#endif + /* whole branch chain */ + } +#ifdef RE_TRACK_PATTERN_OFFSETS + else { + DEBUG_r({ + const regnode *nop = NEXTOPER( convert ); + mjd_offset= Node_Offset((nop)); + mjd_nodelen= Node_Length((nop)); + }); + } + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", + (int)depth * 2 + 2, "", + (UV)mjd_offset, (UV)mjd_nodelen) + ); +#endif + /* But first we check to see if there is a common prefix we can + split out as an EXACT and put in front of the TRIE node. */ + trie->startstate= 1; + if ( trie->bitmap && !widecharmap && !trie->jump ) { + U32 state; + for ( state = 1 ; state < trie->statecount-1 ; state++ ) { + U32 ofs = 0; + I32 idx = -1; + U32 count = 0; + const U32 base = trie->states[ state ].trans.base; + + if ( trie->states[state].wordnum ) + count = 1; + + for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { + if ( ( base + ofs >= trie->uniquecharcount ) && + ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && + trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + { + if ( ++count > 1 ) { + SV **tmp = av_fetch( revcharmap, ofs, 0); + const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); + if ( state == 1 ) break; + if ( count == 2 ) { + Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*sNew Start State=%"UVuf" Class: [", + (int)depth * 2 + 2, "", + (UV)state)); + if (idx >= 0) { + SV ** const tmp = av_fetch( revcharmap, idx, 0); + const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); + + TRIE_BITMAP_SET(trie,*ch); + if ( 
folder ) + TRIE_BITMAP_SET(trie, folder[ *ch ]); + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, "%s", (char*)ch) + ); + } + } + TRIE_BITMAP_SET(trie,*ch); + if ( folder ) + TRIE_BITMAP_SET(trie,folder[ *ch ]); + DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch)); + } + idx = ofs; + } + } + if ( count == 1 ) { + SV **tmp = av_fetch( revcharmap, idx, 0); + STRLEN len; + char *ch = SvPV( *tmp, len ); + DEBUG_OPTIMISE_r({ + SV *sv=sv_newmortal(); + PerlIO_printf( Perl_debug_log, + "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", + (int)depth * 2 + 2, "", + (UV)state, (UV)idx, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + }); + if ( state==1 ) { + OP( convert ) = nodetype; + str=STRING(convert); + STR_LEN(convert)=0; + } + STR_LEN(convert) += len; + while (len--) + *str++ = *ch++; + } else { +#ifdef DEBUGGING + if (state>1) + DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); +#endif + break; + } + } + trie->prefixlen = (state-1); + if (str) { + regnode *n = convert+NODE_SZ_STR(convert); + NEXT_OFF(convert) = NODE_SZ_STR(convert); + trie->startstate = state; + trie->minlen -= (state - 1); + trie->maxlen -= (state - 1); +#ifdef DEBUGGING + /* At least the UNICOS C compiler choked on this + * being argument to DEBUG_r(), so let's just have + * it right here. 
*/ + if ( +#ifdef PERL_EXT_RE_BUILD + 1 +#else + DEBUG_r_TEST +#endif + ) { + regnode *fix = convert; + U32 word = trie->wordcount; + mjd_nodelen++; + Set_Node_Offset_Length(convert, mjd_offset, state - 1); + while( ++fix < n ) { + Set_Node_Offset_Length(fix, 0, 0); + } + while (word--) { + SV ** const tmp = av_fetch( trie_words, word, 0 ); + if (tmp) { + if ( STR_LEN(convert) <= SvCUR(*tmp) ) + sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); + else + sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); + } + } + } +#endif + if (trie->maxlen) { + convert = n; + } else { + NEXT_OFF(convert) = (U16)(tail - convert); + DEBUG_r(optimize= n); + } + } + } + if (!jumper) + jumper = last; + if ( trie->maxlen ) { + NEXT_OFF( convert ) = (U16)(tail - convert); + ARG_SET( convert, data_slot ); + /* Store the offset to the first unabsorbed branch in + jump[0], which is otherwise unused by the jump logic. + We use this when dumping a trie and during optimisation. */ + if (trie->jump) + trie->jump[0] = (U16)(nextbranch - convert); + + /* If the start state is not accepting (meaning there is no empty string/NOTHING) + * and there is a bitmap + * and the first "jump target" node we found leaves enough room + * then convert the TRIE node into a TRIEC node, with the bitmap + * embedded inline in the opcode - this is hypothetically faster. + */ + if ( !trie->states[trie->startstate].wordnum + && trie->bitmap + && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) + { + OP( convert ) = TRIEC; + Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); + PerlMemShared_free(trie->bitmap); + trie->bitmap= NULL; + } else + OP( convert ) = TRIE; + + /* store the type in the flags */ + convert->flags = nodetype; + DEBUG_r({ + optimize = convert + + NODE_STEP_REGNODE + + regarglen[ OP( convert ) ]; + }); + /* XXX We really should free up the resource in trie now, + as we won't use them - (which resources?) 
dmq */ + } + /* needed for dumping*/ + DEBUG_r(if (optimize) { + regnode *opt = convert; + + while ( ++opt < optimize) { + Set_Node_Offset_Length(opt,0,0); + } + /* + Try to clean up some of the debris left after the + optimisation. + */ + while( optimize < jumper ) { + mjd_nodelen += Node_Length((optimize)); + OP( optimize ) = OPTIMIZED; + Set_Node_Offset_Length(optimize,0,0); + optimize++; + } + Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen); + }); + } /* end node insert */ + REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert); + + /* Finish populating the prev field of the wordinfo array. Walk back + * from each accept state until we find another accept state, and if + * so, point the first word's .prev field at the second word. If the + * second already has a .prev field set, stop now. This will be the + * case either if we've already processed that word's accept state, + * or that state had multiple words, and the overspill words were + * already linked up earlier. + */ + { + U16 word; + U32 state; + U16 prev; + + for (word=1; word <= trie->wordcount; word++) { + prev = 0; + if (trie->wordinfo[word].prev) + continue; + state = trie->wordinfo[word].accept; + while (state) { + state = prev_states[state]; + if (!state) + break; + prev = trie->states[state].wordnum; + if (prev) + break; + } + trie->wordinfo[word].prev = prev; + } + Safefree(prev_states); + } + + + /* and now dump out the compressed format */ + DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); + + RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; +#ifdef DEBUGGING + RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; + RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; +#else + SvREFCNT_dec(revcharmap); +#endif + return trie->jump + ? MADE_JUMP_TRIE + : trie->startstate>1 + ? 
MADE_EXACT_TRIE + : MADE_TRIE; +} + +STATIC void +S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) +{ +/* The Trie is constructed and compressed now so we can build a fail array if it's needed + + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 + ISBN 0-201-10088-6 + + We find the fail state for each state in the trie, this state is the longest proper + suffix of the current state's 'word' that is also a proper prefix of another word in our + trie. State 1 represents the word '' and is thus the default fail state. This allows + the DFA not to have to restart after its tried and failed a word at a given point, it + simply continues as though it had been matching the other word in the first place. + Consider + 'abcdgu'=~/abcdefg|cdgu/ + When we get to 'd' we are still matching the first word, we would encounter 'g' which would + fail, which would bring us to the state representing 'd' in the second word where we would + try 'g' and succeed, proceeding to match 'cdgu'. 
+ */ + /* add a fail transition */ + const U32 trie_offset = ARG(source); + reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; + U32 *q; + const U32 ucharcount = trie->uniquecharcount; + const U32 numstates = trie->statecount; + const U32 ubound = trie->lasttrans + ucharcount; + U32 q_read = 0; + U32 q_write = 0; + U32 charid; + U32 base = trie->states[ 1 ].trans.base; + U32 *fail; + reg_ac_data *aho; + const U32 data_slot = add_data( pRExC_state, 1, "T" ); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + + ARG_SET( stclass, data_slot ); + aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); + RExC_rxi->data->data[ data_slot ] = (void*)aho; + aho->trie=trie_offset; + aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); + Copy( trie->states, aho->states, numstates, reg_trie_state ); + Newxz( q, numstates, U32); + aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); + aho->refcount = 1; + fail = aho->fail; + /* initialize fail[0..1] to be 1 so that we always have + a valid final fail state */ + fail[ 0 ] = fail[ 1 ] = 1; + + for ( charid = 0; charid < ucharcount ; charid++ ) { + const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); + if ( newstate ) { + q[ q_write ] = newstate; + /* set to point at the root */ + fail[ q[ q_write++ ] ]=1; + } + } + while ( q_read < q_write) { + const U32 cur = q[ q_read++ % numstates ]; + base = trie->states[ cur ].trans.base; + + for ( charid = 0 ; charid < ucharcount ; charid++ ) { + const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); + if (ch_state) { + U32 fail_state = cur; + U32 fail_base; + do { + fail_state = fail[ fail_state ]; + fail_base = aho->states[ fail_state ].trans.base; + } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); + + fail_state = TRIE_TRANS_STATE( fail_state, fail_base, 
ucharcount, charid, 1 ); + fail[ ch_state ] = fail_state; + if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) + { + aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; + } + q[ q_write++ % numstates] = ch_state; + } + } + } + /* restore fail[0..1] to 0 so that we "fall out" of the AC loop + when we fail in state 1, this allows us to use the + charclass scan to find a valid start char. This is based on the principle + that theres a good chance the string being searched contains lots of stuff + that cant be a start char. + */ + fail[ 0 ] = fail[ 1 ] = 0; + DEBUG_TRIE_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%*sStclass Failtable (%"UVuf" states): 0", + (int)(depth * 2), "", (UV)numstates + ); + for( q_read=1; q_read%3d: %s (%d)\n", \ + (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ + Next ? (REG_NODE_NUM(Next)) : 0 ); \ + }}); + + +/* The below joins as many adjacent EXACTish nodes as possible into a single + * one, and looks for problematic sequences of characters whose folds vs. + * non-folds have sufficiently different lengths, that the optimizer would be + * fooled into rejecting legitimate matches of them, and the trie construction + * code can't cope with them. The joining is only done if: + * 1) there is room in the current conglomerated node to entirely contain the + * next one. + * 2) they are the exact same node type + * + * The adjacent nodes actually may be separated by NOTHING kind nodes, and + * these get optimized out + * + * If there are problematic code sequences, *min_subtract is set to the delta + * that the minimum size of the node can be less than its actual size. And, + * the node type of the result is changed to reflect that it contains these + * sequences. 
+ * + * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF + * and contains LATIN SMALL LETTER SHARP S + * + * This is as good a place as any to discuss the design of handling these + * problematic sequences. It's been wrong in Perl for a very long time. There + * are three code points in Unicode whose folded lengths differ so much from + * the un-folded lengths that it causes problems for the optimizer and trie + * construction. Why only these are problematic, and not others where lengths + * also differ is something I (khw) do not understand. New versions of Unicode + * might add more such code points. Hopefully the logic in fold_grind.t that + * figures out what to test (in part by verifying that each size-combination + * gets tested) will catch any that do come along, so they can be added to the + * special handling below. The chances of new ones are actually rather small, + * as most, if not all, of the world's scripts that have casefolding have + * already been encoded by Unicode. Also, a number of Unicode's decisions were + * made to allow compatibility with pre-existing standards, and almost all of + * those have already been dealt with. These would otherwise be the most + * likely candidates for generating further tricky sequences. In other words, + * Unicode by itself is unlikely to add new ones unless it is for compatibility + * with pre-existing standards, and there aren't many of those left. + * + * The previous designs for dealing with these involved assigning a special + * node for them. This approach doesn't work, as evidenced by this example: + * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches + * Both these fold to "sss", but if the pattern is parsed to create a node + * that would match just the \xDF, it won't be able to handle the case where a + * successful match would have to cross the node's boundary. 
The new approach + * that hopefully generally solves the problem generates an EXACTFU_SS node + * that is "sss". + * + * There are a number of components to the approach (a lot of work for just + * three code points!): + * 1) This routine examines each EXACTFish node that could contain the + * problematic sequences. It returns in *min_subtract how much to + * subtract from the actual length of the string to get a real minimum + * for one that could match it. This number is usually 0 except for the + * problematic sequences. This delta is used by the caller to adjust the + * min length of the match, and the delta between min and max, so that the + * optimizer doesn't reject these possibilities based on size constraints. + * 2) These sequences are not currently correctly handled by the trie code + * either, so it changes the joined node type to ops that are not handled + * by tries, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD. + * 3) This is sufficient for the two Greek sequences (described below), but + * the one involving the Sharp s (\xDF) needs more. The node type + * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" + * sequence in it. For non-UTF-8 patterns and strings, this is the only + * case where there is a possible fold length change. That means that a + * regular EXACTFU node without UTF-8 involvement doesn't have to concern + * itself with length changes, and so can be processed faster. regexec.c + * takes advantage of this. Generally, an EXACTFish node that is in UTF-8 + * is pre-folded by regcomp.c. This saves effort in regex matching. + * However, probably mostly for historical reasons, the pre-folding isn't + * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL + * nodes, as what they fold to isn't known until runtime.) The fold + * possibilities for the non-UTF8 patterns are quite simple, except for + * the sharp s. 
All the ones that don't involve a UTF-8 target string + * are members of a fold-pair, and arrays are set up for all of them + * that quickly find the other member of the pair. It might actually + * be faster to pre-fold these, but it isn't currently done, except for + * the sharp s. Code elsewhere in this file makes sure that it gets + * folded to 'ss', even if the pattern isn't UTF-8. This avoids the + * issues described in the next item. + * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches + * 'ss' or not is not knowable at compile time. It will match iff the + * target string is in UTF-8, unlike the EXACTFU nodes, where it always + * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus + * it can't be folded to "ss" at compile time, unlike EXACTFU does as + * described in item 3). An assumption that the optimizer part of + * regexec.c (probably unwittingly) makes is that a character in the + * pattern corresponds to at most a single character in the target string. + * (And I do mean character, and not byte here, unlike other parts of the + * documentation that have never been updated to account for multibyte + * Unicode.) This assumption is wrong only in this case, as all other + * cases are either 1-1 folds when no UTF-8 is involved; or is true by + * virtue of having this file pre-fold UTF-8 patterns. I'm + * reluctant to try to change this assumption, so instead the code punts. + * This routine examines EXACTF nodes for the sharp s, and returns a + * boolean indicating whether or not the node is an EXACTF node that + * contains a sharp s. When it is true, the caller sets a flag that later + * causes the optimizer in this file to not set values for the floating + * and fixed string lengths, and thus avoids the optimizer code in + * regexec.c that makes the invalid assumption. Thus, there is no + * optimization based on string lengths for EXACTF nodes that contain the + * sharp s. 
This only happens for /id rules (which means the pattern + * isn't in UTF-8). + */ + +#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ + if (PL_regkind[OP(scan)] == EXACT) \ + join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) + +STATIC U32 +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) { + /* Merge several consecutive EXACTish nodes into one. */ + regnode *n = regnext(scan); + U32 stringok = 1; + regnode *next = scan + NODE_SZ_STR(scan); + U32 merged = 0; + U32 stopnow = 0; +#ifdef DEBUGGING + regnode *stop = scan; + GET_RE_DEBUG_FLAGS_DECL; +#else + PERL_UNUSED_ARG(depth); +#endif + + PERL_ARGS_ASSERT_JOIN_EXACT; +#ifndef EXPERIMENTAL_INPLACESCAN + PERL_UNUSED_ARG(flags); + PERL_UNUSED_ARG(val); +#endif + DEBUG_PEEP("join",scan,depth); + + /* Look through the subsequent nodes in the chain. Skip NOTHING, merge + * EXACT ones that are mergeable to the current one. 
*/ + while (n + && (PL_regkind[OP(n)] == NOTHING + || (stringok && OP(n) == OP(scan))) + && NEXT_OFF(n) + && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) + { + + if (OP(n) == TAIL || n > next) + stringok = 0; + if (PL_regkind[OP(n)] == NOTHING) { + DEBUG_PEEP("skip:",n,depth); + NEXT_OFF(scan) += NEXT_OFF(n); + next = n + NODE_STEP_REGNODE; +#ifdef DEBUGGING + if (stringok) + stop = n; +#endif + n = regnext(n); + } + else if (stringok) { + const unsigned int oldl = STR_LEN(scan); + regnode * const nnext = regnext(n); + + if (oldl + STR_LEN(n) > U8_MAX) + break; + + DEBUG_PEEP("merg",n,depth); + merged++; + + NEXT_OFF(scan) += NEXT_OFF(n); + STR_LEN(scan) += STR_LEN(n); + next = n + NODE_SZ_STR(n); + /* Now we can overwrite *n : */ + Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); +#ifdef DEBUGGING + stop = next - 1; +#endif + n = nnext; + if (stopnow) break; + } + +#ifdef EXPERIMENTAL_INPLACESCAN + if (flags && !NEXT_OFF(n)) { + DEBUG_PEEP("atch", val, depth); + if (reg_off_by_arg[OP(n)]) { + ARG_SET(n, val - n); + } + else { + NEXT_OFF(n) = val - n; + } + stopnow = 1; + } +#endif + } + + *min_subtract = 0; + *has_exactf_sharp_s = FALSE; + + /* Here, all the adjacent mergeable EXACTish nodes have been merged. We + * can now analyze for sequences of problematic code points. (Prior to + * this final joining, sequences could have been split over boundaries, and + * hence missed). The sequences only happen in folding, hence for any + * non-EXACT EXACTish node */ + if (OP(scan) != EXACT) { + U8 *s; + U8 * s0 = (U8*) STRING(scan); + U8 * const s_end = s0 + STR_LEN(scan); + + /* The below is perhaps overboard, but this allows us to save a test + * each time through the loop at the expense of a mask. This is + * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a + * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64. 
+ * This uses an exclusive 'or' to find that bit and then inverts it to + * form a mask, with just a single 0, in the bit position where 'S' and + * 's' differ. */ + const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); + const U8 s_masked = 's' & S_or_s_mask; + + /* One pass is made over the node's string looking for all the + * possibilities. to avoid some tests in the loop, there are two main + * cases, for UTF-8 patterns (which can't have EXACTF nodes) and + * non-UTF-8 */ + if (UTF) { + + /* There are two problematic Greek code points in Unicode + * casefolding + * + * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + * + * which casefold to + * + * Unicode UTF-8 + * + * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 + * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 + * + * This means that in case-insensitive matching (or "loose + * matching", as Unicode calls it), an EXACTF of length six (the + * UTF-8 encoded byte length of the above casefolded versions) can + * match a target string of length two (the byte length of UTF-8 + * encoded U+0390 or U+03B0). This would rather mess up the + * minimum length computation. (there are other code points that + * also fold to these two sequences, but the delta is smaller) + * + * If these sequences are found, the minimum length is decreased by + * four (six minus two). + * + * Similarly, 'ss' may match the single char and byte LATIN SMALL + * LETTER SHARP S. 
We decrease the min length by 1 for each + * occurrence of 'ss' found */ + +#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ +# define U390_first_byte 0xb4 + const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42"; +# define U3B0_first_byte 0xb5 + const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42"; +#else +# define U390_first_byte 0xce + const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81"; +# define U3B0_first_byte 0xcf + const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81"; +#endif + const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte; + yields a net of 0 */ + /* Examine the string for one of the problematic sequences */ + for (s = s0; + s < s_end - 1; /* Can stop 1 before the end, as minimum length + * sequence we are looking for is 2 */ + s += UTF8SKIP(s)) + { + + /* Look for the first byte in each problematic sequence */ + switch (*s) { + /* We don't have to worry about other things that fold to + * 's' (such as the long s, U+017F), as all above-latin1 + * code points have been pre-folded */ + case 's': + case 'S': + + /* Current character is an 's' or 'S'. If next one is + * as well, we have the dreaded sequence */ + if (((*(s+1) & S_or_s_mask) == s_masked) + /* These two node types don't have special handling + * for 'ss' */ + && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + { + *min_subtract += 1; + OP(scan) = EXACTFU_SS; + s++; /* No need to look at this character again */ + } + break; + + case U390_first_byte: + if (s_end - s >= len + + /* The 1's are because are skipping comparing the + * first byte */ + && memEQ(s + 1, U390_tail, len - 1)) + { + goto greek_sequence; + } + break; + + case U3B0_first_byte: + if (! (s_end - s >= len + && memEQ(s + 1, U3B0_tail, len - 1))) + { + break; + } + greek_sequence: + *min_subtract += 4; + + /* This can't currently be handled by trie's, so change + * the node type to indicate this. If EXACTFA and + * EXACTFL were ever to be handled by trie's, this + * would have to be changed. 
If this node has already + * been changed to EXACTFU_SS in this loop, leave it as + * is. (I (khw) think it doesn't matter in regexec.c + * for UTF patterns, but no need to change it */ + if (OP(scan) == EXACTFU) { + OP(scan) = EXACTFU_TRICKYFOLD; + } + s += 6; /* We already know what this sequence is. Skip + the rest of it */ + break; + } + } + } + else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + + /* Here, the pattern is not UTF-8. We need to look only for the + * 'ss' sequence, and in the EXACTF case, the sharp s, which can be + * in the final position. Otherwise we can stop looking 1 byte + * earlier because have to find both the first and second 's' */ + const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; + + for (s = s0; s < upper; s++) { + switch (*s) { + case 'S': + case 's': + if (s_end - s > 1 + && ((*(s+1) & S_or_s_mask) == s_masked)) + { + *min_subtract += 1; + + /* EXACTF nodes need to know that the minimum + * length changed so that a sharp s in the string + * can match this ss in the pattern, but they + * remain EXACTF nodes, as they are not trie'able, + * so don't have to invent a new node type to + * exclude them from the trie code */ + if (OP(scan) != EXACTF) { + OP(scan) = EXACTFU_SS; + } + s++; + } + break; + case LATIN_SMALL_LETTER_SHARP_S: + if (OP(scan) == EXACTF) { + *has_exactf_sharp_s = TRUE; + } + break; + } + } + } + } + +#ifdef DEBUGGING + /* Allow dumping but overwriting the collection of skipped + * ops and/or strings with fake optimized ops */ + n = scan + NODE_SZ_STR(scan); + while (n <= stop) { + OP(n) = OPTIMIZED; + FLAGS(n) = 0; + NEXT_OFF(n) = 0; + n++; + } +#endif + DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)}); + return stopnow; +} + +/* REx optimizer. Converts nodes into quicker variants "in place". + Finds fixed substrings. */ + +/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set + to the position after last scanned or to NULL. 
*/ + +#define INIT_AND_WITHP \ + assert(!and_withp); \ + Newx(and_withp,1,struct regnode_charclass_class); \ + SAVEFREEPV(and_withp) + +/* this is a chain of data about sub patterns we are processing that + need to be handled separately/specially in study_chunk. Its so + we can simulate recursion without losing state. */ +struct scan_frame; +typedef struct scan_frame { + regnode *last; /* last node to process in this frame */ + regnode *next; /* next node to process when last is reached */ + struct scan_frame *prev; /*previous frame*/ + I32 stop; /* what stopparen do we use */ +} scan_frame; + + +#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) + +#define CASE_SYNST_FNC(nAmE) \ +case nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break; \ +case N ## nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break + + + +STATIC I32 +S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, + I32 *minlenp, I32 *deltap, + regnode *last, + scan_data_t *data, + I32 stopparen, + U8* recursed, + struct regnode_charclass_class *and_withp, + U32 flags, U32 depth) + /* scanp: Start here (read-write). */ + /* deltap: Write maxlen-minlen here. */ + /* last: Stop before this one. 
*/ + /* data: string data about the pattern */ + /* stopparen: treat close N as END */ + /* recursed: which subroutines have we recursed into */ + /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ +{ + dVAR; + I32 min = 0, pars = 0, code; + regnode *scan = *scanp, *next; + I32 delta = 0; + int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); + int is_inf_internal = 0; /* The studied chunk is infinite */ + I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; + scan_data_t data_fake; + SV *re_trie_maxbuff = NULL; + regnode *first_non_open = scan; + I32 stopmin = I32_MAX; + scan_frame *frame = NULL; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_STUDY_CHUNK; + +#ifdef DEBUGGING + StructCopy(&zero_scan_data, &data_fake, scan_data_t); +#endif + + if ( depth == 0 ) { + while (first_non_open && OP(first_non_open) == OPEN) + first_non_open=regnext(first_non_open); + } + + + fake_study_recurse: + while ( scan && OP(scan) != END && scan < last ){ + UV min_subtract = 0; /* How much to subtract from the minimum node + length to get a real minimum (because the + folded version may be shorter) */ + bool has_exactf_sharp_s = FALSE; + /* Peephole optimizer: */ + DEBUG_STUDYDATA("Peep:", data,depth); + DEBUG_PEEP("Peep",scan,depth); + + /* Its not clear to khw or hv why this is done here, and not in the + * clauses that deal with EXACT nodes. khw's guess is that it's + * because of a previous design */ + JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); + + /* Follow the next-chain of the current node and optimize + away all the NOTHINGs from it. */ + if (OP(scan) != CURLYX) { + const int max = (reg_off_by_arg[OP(scan)] + ? I32_MAX + /* I32 may be smaller than U16 on CRAYs! */ + : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); + int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); + int noff; + regnode *n = scan; + + /* Skip NOTHING and LONGJMP. 
*/ + while ((n = regnext(n)) + && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) + || ((OP(n) == LONGJMP) && (noff = ARG(n)))) + && off + noff < max) + off += noff; + if (reg_off_by_arg[OP(scan)]) + ARG(scan) = off; + else + NEXT_OFF(scan) = off; + } + + + + /* The principal pseudo-switch. Cannot be a switch, since we + look into several different things. */ + if (OP(scan) == BRANCH || OP(scan) == BRANCHJ + || OP(scan) == IFTHEN) { + next = regnext(scan); + code = OP(scan); + /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ + + if (OP(next) == code || code == IFTHEN) { + /* NOTE - There is similar code to this block below for handling + TRIE nodes on a re-study. If you change stuff here check there + too. */ + I32 max1 = 0, min1 = I32_MAX, num = 0; + struct regnode_charclass_class accum; + regnode * const startbranch=scan; + + if (flags & SCF_DO_SUBSTR) + SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ + if (flags & SCF_DO_STCLASS) + cl_init_zero(pRExC_state, &accum); + + while (OP(scan) == code) { + I32 deltanext, minnext, f = 0, fake; + struct regnode_charclass_class this_class; + + num++; + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + + data_fake.pos_delta = delta; + next = regnext(scan); + scan = NEXTOPER(scan); + if (code != BRANCH) + scan = NEXTOPER(scan); + if (flags & SCF_DO_STCLASS) { + cl_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f = SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + /* we suppose the run is continuous, last=next...*/ + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, + next, &data_fake, + stopparen, recursed, NULL, f,depth+1); + if (min1 > minnext) + min1 = minnext; + if (max1 < minnext + deltanext) + max1 = minnext + deltanext; + if (deltanext == I32_MAX) + 
is_inf = is_inf_internal = 1; + scan = next; + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > minnext) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + cl_or(pRExC_state, &accum, &this_class); + } + if (code == IFTHEN && num < 2) /* Empty ELSE branch */ + min1 = 0; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->longest = &(data->longest_float); + } + min += min1; + delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + cl_or(pRExC_state, data->start_class, &accum); + if (min1) { + cl_and(data->start_class, and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + cl_and(data->start_class, &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, + struct regnode_charclass_class); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, + struct regnode_charclass_class); + flags |= SCF_DO_STCLASS_OR; + data->start_class->flags |= ANYOF_EOS; + } + } + + if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) { + /* demq. + + Assuming this was/is a branch we are dealing with: 'scan' now + points at the item that follows the branch sequence, whatever + it is. We now start at the beginning of the sequence and look + for subsequences of + + BRANCH->EXACT=>x1 + BRANCH->EXACT=>x2 + tail + + which would be constructed from a pattern like /A|LIST|OF|WORDS/ + + If we can find such a subsequence we need to turn the first + element into a trie and then add the subsequent branch exact + strings to the trie. 
+ + We have two cases + + 1. patterns where the whole set of branches can be converted. + + 2. patterns where only a subset can be converted. + + In case 1 we can replace the whole set with a single regop + for the trie. In case 2 we need to keep the start and end + branches so + + 'BRANCH EXACT; BRANCH EXACT; BRANCH X' + becomes BRANCH TRIE; BRANCH X; + + There is an additional case, that being where there is a + common prefix, which gets split out into an EXACT like node + preceding the TRIE node. + + If x(1..n)==tail then we can do a simple trie, if not we make + a "jump" trie, such that when we match the appropriate word + we "jump" to the appropriate tail node. Essentially we turn + a nested if into a case structure of sorts. + + */ + + int made=0; + if (!re_trie_maxbuff) { + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + if (!SvIOK(re_trie_maxbuff)) + sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); + } + if ( SvIV(re_trie_maxbuff)>=0 ) { + regnode *cur; + regnode *first = (regnode *)NULL; + regnode *last = (regnode *)NULL; + regnode *tail = scan; + U8 trietype = 0; + U32 count=0; + +#ifdef DEBUGGING + SV * const mysv = sv_newmortal(); /* for dumping */ +#endif + /* var tail is used because there may be a TAIL + regop in the way. Ie, the exacts will point to the + thing following the TAIL, but the last branch will + point at the TAIL. So we advance tail. If we + have nested (?:) we may have to move through several + tails. + */ + + while ( OP( tail ) == TAIL ) { + /* this is the TAIL generated by (?:) */ + tail = regnext( tail ); + } + + + DEBUG_OPTIMISE_r({ + regprop(RExC_rx, mysv, tail ); + PerlIO_printf( Perl_debug_log, "%*s%s%s\n", + (int)depth * 2 + 2, "", + "Looking for TRIE'able sequences. Tail node is: ", + SvPV_nolen_const( mysv ) + ); + }); + + /* + + Step through the branches + cur represents each branch, + noper is the first thing to be matched as part of that branch + noper_next is the regnext() of that node. 
+ + We normally handle a case like this /FOO[xyz]|BAR[pqr]/ + via a "jump trie" but we also support building with NOJUMPTRIE, + which restricts the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is a possible optimization + target. If we are building under NOJUMPTRIE then we require that noper_next + is the same as scan (our current position in the regex program). + + Once we have two or more consecutive such branches we can create a + trie of the EXACT's contents and stitch it in place into the program. + + If the sequence represents all of the branches in the alternation we + replace the entire thing with a single TRIE node. + + Otherwise when it is a subsequence we need to stitch it in place and + replace only the relevant branches. This means the first branch has + to remain as it is used by the alternation logic, and its next pointer, + and needs to be repointed at the item on the branch chain following + the last branch we have optimized away. + + This could be either a BRANCH, in which case the subsequence is internal, + or it could be the item following the branch sequence in which case the + subsequence is at the end (which does not necessarily mean the first node + is the start of the alternation). + + TRIE_TYPE(X) is a define which maps the optype to a trietype. + + optype | trietype + ----------------+----------- + NOTHING | NOTHING + EXACT | EXACT + EXACTFU | EXACTFU + EXACTFU_SS | EXACTFU + EXACTFU_TRICKYFOLD | EXACTFU + EXACTFA | 0 + + + */ +#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ + ( EXACT == (X) ) ? EXACT : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? 
EXACTFU : \ + 0 ) + + /* dont use tail as the end marker for this traverse */ + for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { + regnode * const noper = NEXTOPER( cur ); + U8 noper_type = OP( noper ); + U8 noper_trietype = TRIE_TYPE( noper_type ); +#if defined(DEBUGGING) || defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); +#endif + + DEBUG_OPTIMISE_r({ + regprop(RExC_rx, mysv, cur); + PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", + (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); + + regprop(RExC_rx, mysv, noper); + PerlIO_printf( Perl_debug_log, " -> %s", + SvPV_nolen_const(mysv)); + + if ( noper_next ) { + regprop(RExC_rx, mysv, noper_next ); + PerlIO_printf( Perl_debug_log,"\t=> %s\t", + SvPV_nolen_const(mysv)); + } + PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n", + REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) ); + }); + + /* Is noper a trieable nodetype that can be merged with the + * current trie (if there is one)? */ + if ( noper_trietype + && + ( + /* XXX: Currently we cannot allow a NOTHING node to be the first element + * of a TRIEABLE sequence, Otherwise we will overwrite the regop following + * the NOTHING with the TRIE regop later on. This is because a NOTHING node + * is only one regnode wide, and a TRIE is two regnodes. An example of a + * problematic pattern is: "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/ + * At a later point of time we can somewhat workaround this by handling + * NOTHING -> EXACT sequences as generated by /(?:)A|(?:)B/ type patterns, + * as we can effectively ignore the NOTHING regop in that case. + * This clause, which allows NOTHING to start a sequence is left commented + * out as a reference. 
+ * - Yves + + ( noper_trietype == NOTHING) + || ( trietype == NOTHING ) + */ + ( noper_trietype == NOTHING && trietype ) + || ( trietype == noper_trietype ) + ) +#ifdef NOJUMPTRIE + && noper_next == tail +#endif + && count < U16_MAX) + { + /* Handle mergable triable node + * Either we are the first node in a new trieable sequence, + * in which case we do some bookkeeping, otherwise we update + * the end pointer. */ + count++; + if ( !first ) { + first = cur; + trietype = noper_trietype; + } else { + if ( trietype == NOTHING ) + trietype = noper_trietype; + last = cur; + } + } /* end handle mergable triable node */ + else { + /* handle unmergable node - + * noper may either be a triable node which can not be tried + * together with the current trie, or a non triable node */ + if ( last ) { + /* If last is set and trietype is not NOTHING then we have found + * at least two triable branch sequences in a row of a similar + * trietype so we can turn them into a trie. If/when we + * allow NOTHING to start a trie sequence this condition will be + * required, and it isn't expensive so we leave it in for now. */ + if ( trietype != NOTHING ) + make_trie( pRExC_state, + startbranch, first, cur, tail, count, + trietype, depth+1 ); + last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ + } + if ( noper_trietype +#ifdef NOJUMPTRIE + && noper_next == tail +#endif + ){ + /* noper is triable, so we can start a new trie sequence */ + count = 1; + first = cur; + trietype = noper_trietype; + } else if (first) { + /* if we already saw a first but the current node is not triable then we have + * to reset the first information. 
*/ + count = 0; + first = NULL; + trietype = 0; + } + } /* end handle unmergable node */ + } /* loop over branches */ + DEBUG_OPTIMISE_r({ + regprop(RExC_rx, mysv, cur); + PerlIO_printf( Perl_debug_log, + "%*s- %s (%d) \n", (int)depth * 2 + 2, + "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + + }); + if ( last && trietype != NOTHING ) { + /* the last branch of the sequence was part of a trie, + * so we have to construct it here outside of the loop + */ + made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); +#ifdef TRIE_STUDY_OPT + if ( ((made == MADE_EXACT_TRIE && + startbranch == first) + || ( first_non_open == first )) && + depth==0 ) { + flags |= SCF_TRIE_RESTUDY; + if ( startbranch == first + && scan == tail ) + { + RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + } + } +#endif + } /* end if ( last) */ + } /* TRIE_MAXBUF is non zero */ + + } /* do trie */ + + } + else if ( code == BRANCHJ ) { /* single branch is optimized. */ + scan = NEXTOPER(NEXTOPER(scan)); + } else /* single branch is optimized. 
*/ + scan = NEXTOPER(scan); + continue; + } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) { + scan_frame *newframe = NULL; + I32 paren; + regnode *start; + regnode *end; + + if (OP(scan) != SUSPEND) { + /* set the pointer */ + if (OP(scan) == GOSUB) { + paren = ARG(scan); + RExC_recurse[ARG2L(scan)] = scan; + start = RExC_open_parens[paren-1]; + end = RExC_close_parens[paren-1]; + } else { + paren = 0; + start = RExC_rxi->program + 1; + end = RExC_opend; + } + if (!recursed) { + Newxz(recursed, (((RExC_npar)>>3) +1), U8); + SAVEFREEPV(recursed); + } + if (!PAREN_TEST(recursed,paren+1)) { + PAREN_SET(recursed,paren+1); + Newx(newframe,1,scan_frame); + } else { + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + cl_anything(pRExC_state, data->start_class); + flags &= ~SCF_DO_STCLASS; + } + } else { + Newx(newframe,1,scan_frame); + paren = stopparen; + start = scan+2; + end = regnext(scan); + } + if (newframe) { + assert(start); + assert(end); + SAVEFREEPV(newframe); + newframe->next = regnext(scan); + newframe->last = last; + newframe->stop = stopparen; + newframe->prev = frame; + + frame = newframe; + scan = start; + stopparen = paren; + last = end; + + continue; + } + } + else if (OP(scan) == EXACT) { + I32 l = STR_LEN(scan); + UV uc; + if (UTF) { + const U8 * const s = (U8*)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + l, NULL); + l = utf8_length(s, s + l); + } else { + uc = *((U8*)STRING(scan)); + } + min += l; + if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ + /* The code below prefers earlier match for fixed + offset, later match for variable offset. */ + if (data->last_end == -1) { /* Update the start info. */ + data->last_start_min = data->pos_min; + data->last_start_max = is_inf + ? 
I32_MAX : data->pos_min + data->pos_delta; + } + sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); + if (UTF) + SvUTF8_on(data->last_found); + { + SV * const sv = data->last_found; + MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += utf8_length((U8*)STRING(scan), + (U8*)STRING(scan)+STR_LEN(scan)); + } + data->last_end = data->pos_min + l; + data->pos_min += l; /* As in the first entry. */ + data->flags &= ~SF_BEFORE_EOL; + } + if (flags & SCF_DO_STCLASS_AND) { + /* Check whether it is compatible with what we know already! */ + int compat = 1; + + + /* If compatible, we or it in below. It is compatible if is + * in the bitmp and either 1) its bit or its fold is set, or 2) + * it's for a locale. Even if there isn't unicode semantics + * here, at runtime there may be because of matching against a + * utf8 string, so accept a possible false positive for + * latin1-range folds */ + if (uc >= 0x100 || + (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) + && !ANYOF_BITMAP_TEST(data->start_class, uc) + && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) + || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) + ) + { + compat = 0; + } + ANYOF_CLASS_ZERO(data->start_class); + ANYOF_BITMAP_ZERO(data->start_class); + if (compat) + ANYOF_BITMAP_SET(data->start_class, uc); + else if (uc >= 0x100) { + int i; + + /* Some Unicode code points fold to the Latin1 range; as + * XXX temporary code, instead of figuring out if this is + * one, just assume it is and set all the start class bits + * that could be some such above 255 code point's fold + * which will generate fals positives. 
As the code + * elsewhere that does compute the fold settles down, it + * can be extracted out and re-used here */ + for (i = 0; i < 256; i++){ + if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) { + ANYOF_BITMAP_SET(data->start_class, i); + } + } + } + data->start_class->flags &= ~ANYOF_EOS; + if (uc < 0x100) + data->start_class->flags &= ~ANYOF_UNICODE_ALL; + } + else if (flags & SCF_DO_STCLASS_OR) { + /* false positive possible if the class is case-folded */ + if (uc < 0x100) + ANYOF_BITMAP_SET(data->start_class, uc); + else + data->start_class->flags |= ANYOF_UNICODE_ALL; + data->start_class->flags &= ~ANYOF_EOS; + cl_and(data->start_class, and_withp); + } + flags &= ~SCF_DO_STCLASS; + } + else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ + I32 l = STR_LEN(scan); + UV uc = *((U8*)STRING(scan)); + + /* Search for fixed substrings supports EXACT only. */ + if (flags & SCF_DO_SUBSTR) { + assert(data); + SCAN_COMMIT(pRExC_state, data, minlenp); + } + if (UTF) { + const U8 * const s = (U8 *)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + l, NULL); + l = utf8_length(s, s + l); + } + else if (has_exactf_sharp_s) { + RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + } + min += l - min_subtract; + if (min < 0) { + min = 0; + } + delta += min_subtract; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += l - min_subtract; + if (data->pos_min < 0) { + data->pos_min = 0; + } + data->pos_delta += min_subtract; + if (min_subtract) { + data->longest = &(data->longest_float); + } + } + if (flags & SCF_DO_STCLASS_AND) { + /* Check whether it is compatible with what we know already! 
*/ + int compat = 1; + if (uc >= 0x100 || + (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) + && !ANYOF_BITMAP_TEST(data->start_class, uc) + && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) + { + compat = 0; + } + ANYOF_CLASS_ZERO(data->start_class); + ANYOF_BITMAP_ZERO(data->start_class); + if (compat) { + ANYOF_BITMAP_SET(data->start_class, uc); + data->start_class->flags &= ~ANYOF_EOS; + data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD; + if (OP(scan) == EXACTFL) { + /* XXX This set is probably no longer necessary, and + * probably wrong as LOCALE now is on in the initial + * state */ + data->start_class->flags |= ANYOF_LOCALE; + } + else { + + /* Also set the other member of the fold pair. In case + * that unicode semantics is called for at runtime, use + * the full latin1 fold. (Can't do this for locale, + * because not known until runtime) */ + ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); + + /* All other (EXACTFL handled above) folds except under + * /iaa that include s, S, and sharp_s also may include + * the others */ + if (OP(scan) != EXACTFA) { + if (uc == 's' || uc == 'S') { + ANYOF_BITMAP_SET(data->start_class, + LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + ANYOF_BITMAP_SET(data->start_class, 's'); + ANYOF_BITMAP_SET(data->start_class, 'S'); + } + } + } + } + else if (uc >= 0x100) { + int i; + for (i = 0; i < 256; i++){ + if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) { + ANYOF_BITMAP_SET(data->start_class, i); + } + } + } + } + else if (flags & SCF_DO_STCLASS_OR) { + if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) { + /* false positive possible if the class is case-folded. + Assume that the locale settings are the same... 
*/ + if (uc < 0x100) { + ANYOF_BITMAP_SET(data->start_class, uc); + if (OP(scan) != EXACTFL) { + + /* And set the other member of the fold pair, but + * can't do that in locale because not known until + * run-time */ + ANYOF_BITMAP_SET(data->start_class, + PL_fold_latin1[uc]); + + /* All folds except under /iaa that include s, S, + * and sharp_s also may include the others */ + if (OP(scan) != EXACTFA) { + if (uc == 's' || uc == 'S') { + ANYOF_BITMAP_SET(data->start_class, + LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + ANYOF_BITMAP_SET(data->start_class, 's'); + ANYOF_BITMAP_SET(data->start_class, 'S'); + } + } + } + } + data->start_class->flags &= ~ANYOF_EOS; + } + cl_and(data->start_class, and_withp); + } + flags &= ~SCF_DO_STCLASS; + } + else if (REGNODE_VARIES(OP(scan))) { + I32 mincount, maxcount, minnext, deltanext, fl = 0; + I32 f = flags, pos_before = 0; + regnode * const oscan = scan; + struct regnode_charclass_class this_class; + struct regnode_charclass_class *oclass = NULL; + I32 next_is_eval = 0; + + switch (PL_regkind[OP(scan)]) { + case WHILEM: /* End of (?:...)* . */ + scan = NEXTOPER(scan); + goto finish; + case PLUS: + if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { + next = NEXTOPER(scan); + if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { + mincount = 1; + maxcount = REG_INFTY; + next = regnext(scan); + scan = NEXTOPER(scan); + goto do_curly; + } + } + if (flags & SCF_DO_SUBSTR) + data->pos_min++; + min++; + /* Fall through. 
*/ + case STAR: + if (flags & SCF_DO_STCLASS) { + mincount = 0; + maxcount = REG_INFTY; + next = regnext(scan); + scan = NEXTOPER(scan); + goto do_curly; + } + is_inf = is_inf_internal = 1; + scan = regnext(scan); + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */ + data->longest = &(data->longest_float); + } + goto optimize_curly_tail; + case CURLY: + if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) + && (scan->flags == stopparen)) + { + mincount = 1; + maxcount = 1; + } else { + mincount = ARG1(scan); + maxcount = ARG2(scan); + } + next = regnext(scan); + if (OP(scan) == CURLYX) { + I32 lp = (data ? *(data->last_closep) : 0); + scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); + } + scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; + next_is_eval = (OP(scan) == EVAL); + do_curly: + if (flags & SCF_DO_SUBSTR) { + if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */ + pos_before = data->pos_min; + } + if (data) { + fl = data->flags; + data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); + if (is_inf) + data->flags |= SF_IS_INF; + } + if (flags & SCF_DO_STCLASS) { + cl_init(pRExC_state, &this_class); + oclass = data->start_class; + data->start_class = &this_class; + f |= SCF_DO_STCLASS_AND; + f &= ~SCF_DO_STCLASS_OR; + } + /* Exclude from super-linear cache processing any {n,m} + regops for which the combination of input pos and regex + pos is not enough information to determine if a match + will be possible. + + For example, in the regex /foo(bar\s*){4,8}baz/ with the + regex pos at the \s*, the prospects for a match depend not + only on the input position but also on how many (bar\s*) + repeats into the {4,8} we are. 
*/ + if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) + f &= ~SCF_WHILEM_VISITED_POS; + + /* This will finish on WHILEM, setting scan, or on NULL: */ + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, + last, data, stopparen, recursed, NULL, + (mincount == 0 + ? (f & ~SCF_DO_SUBSTR) : f),depth+1); + + if (flags & SCF_DO_STCLASS) + data->start_class = oclass; + if (mincount == 0 || minnext == 0) { + if (flags & SCF_DO_STCLASS_OR) { + cl_or(pRExC_state, data->start_class, &this_class); + } + else if (flags & SCF_DO_STCLASS_AND) { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, + struct regnode_charclass_class); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&this_class, data->start_class, + struct regnode_charclass_class); + flags |= SCF_DO_STCLASS_OR; + data->start_class->flags |= ANYOF_EOS; + } + } else { /* Non-zero len */ + if (flags & SCF_DO_STCLASS_OR) { + cl_or(pRExC_state, data->start_class, &this_class); + cl_and(data->start_class, and_withp); + } + else if (flags & SCF_DO_STCLASS_AND) + cl_and(data->start_class, &this_class); + flags &= ~SCF_DO_STCLASS; + } + if (!scan) /* It was not CURLYX, but CURLY. */ + scan = next; + if ( /* ? quantifier ok, except for (?{ ... }) */ + (next_is_eval || !(mincount == 0 && maxcount == 1)) + && (minnext == 0) && (deltanext == 0) + && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) + && maxcount <= REG_INFTY/3) /* Complement check for big count */ + { + ckWARNreg(RExC_parse, + "Quantifier unexpected on zero-length expression"); + } + + min += minnext * mincount; + is_inf_internal |= ((maxcount == REG_INFTY + && (minnext + deltanext) > 0) + || deltanext == I32_MAX); + is_inf |= is_inf_internal; + delta += (minnext + deltanext) * maxcount - minnext * mincount; + + /* Try powerful optimization CURLYX => CURLYN. 
*/ + if ( OP(oscan) == CURLYX && data + && data->flags & SF_IN_PAR + && !(data->flags & SF_HAS_EVAL) + && !deltanext && minnext == 1 ) { + /* Try to optimize to CURLYN. */ + regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; + regnode * const nxt1 = nxt; +#ifdef DEBUGGING + regnode *nxt2; +#endif + + /* Skip open. */ + nxt = regnext(nxt); + if (!REGNODE_SIMPLE(OP(nxt)) + && !(PL_regkind[OP(nxt)] == EXACT + && STR_LEN(nxt) == 1)) + goto nogo; +#ifdef DEBUGGING + nxt2 = nxt; +#endif + nxt = regnext(nxt); + if (OP(nxt) != CLOSE) + goto nogo; + if (RExC_open_parens) { + RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ + } + /* Now we know that nxt2 is the only contents: */ + oscan->flags = (U8)ARG(nxt); + OP(oscan) = CURLYN; + OP(nxt1) = NOTHING; /* was OPEN. */ + +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ +#endif + } + nogo: + + /* Try optimization CURLYX => CURLYM. */ + if ( OP(oscan) == CURLYX && data + && !(data->flags & SF_HAS_PAR) + && !(data->flags & SF_HAS_EVAL) + && !deltanext /* atom is fixed width */ + && minnext != 0 /* CURLYM can't handle zero width */ + ) { + /* XXXX How to optimize if data == 0? */ + /* Optimize to a simpler form. */ + regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ + regnode *nxt2; + + OP(oscan) = CURLYM; + while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ + && (OP(nxt2) != WHILEM)) + nxt = nxt2; + OP(nxt2) = SUCCEED; /* Whas WHILEM */ + /* Need to optimize away parenths. */ + if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { + /* Set the parenth number. 
*/ + regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ + + oscan->flags = (U8)ARG(nxt); + if (RExC_open_parens) { + RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ + } + OP(nxt1) = OPTIMIZED; /* was OPEN. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ +#endif +#if 0 + while ( nxt1 && (OP(nxt1) != WHILEM)) { + regnode *nnxt = regnext(nxt1); + if (nnxt == nxt) { + if (reg_off_by_arg[OP(nxt1)]) + ARG_SET(nxt1, nxt2 - nxt1); + else if (nxt2 - nxt1 < U16_MAX) + NEXT_OFF(nxt1) = nxt2 - nxt1; + else + OP(nxt) = NOTHING; /* Cannot beautify */ + } + nxt1 = nnxt; + } +#endif + /* Optimize again: */ + study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, + NULL, stopparen, recursed, NULL, 0,depth+1); + } + else + oscan->flags = 0; + } + else if ((OP(oscan) == CURLYX) + && (flags & SCF_WHILEM_VISITED_POS) + /* See the comment on a similar expression above. + However, this time it's not a subexpression + we care about, but the expression itself. */ + && (maxcount == REG_INFTY) + && data && ++data->whilem_c < 16) { + /* This stays as CURLYX, we can put the count/of pair. */ + /* Find WHILEM (as in regexec.c) */ + regnode *nxt = oscan + NEXT_OFF(oscan); + + if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ + nxt += ARG(nxt); + PREVOPER(nxt)->flags = (U8)(data->whilem_c + | (RExC_whilem_seen << 4)); /* On WHILEM */ + } + if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (flags & SCF_DO_SUBSTR) { + SV *last_str = NULL; + int counted = mincount != 0; + + if (data->last_end > 0 && mincount != 0) { /* Ends with a string. 
*/ +#if defined(SPARC64_GCC_WORKAROUND) + I32 b = 0; + STRLEN l = 0; + const char *s = NULL; + I32 old = 0; + + if (pos_before >= data->last_start_min) + b = pos_before; + else + b = data->last_start_min; + + l = 0; + s = SvPV_const(data->last_found, l); + old = b - data->last_start_min; + +#else + I32 b = pos_before >= data->last_start_min + ? pos_before : data->last_start_min; + STRLEN l; + const char * const s = SvPV_const(data->last_found, l); + I32 old = b - data->last_start_min; +#endif + + if (UTF) + old = utf8_hop((U8*)s, old) - (U8*)s; + l -= old; + /* Get the added string: */ + last_str = newSVpvn_utf8(s + old, l, UTF); + if (deltanext == 0 && pos_before == b) { + /* What was added is a constant string */ + if (mincount > 1) { + SvGROW(last_str, (mincount * l) + 1); + repeatcpy(SvPVX(last_str) + l, + SvPVX_const(last_str), l, mincount - 1); + SvCUR_set(last_str, SvCUR(last_str) * mincount); + /* Add additional parts. */ + SvCUR_set(data->last_found, + SvCUR(data->last_found) - l); + sv_catsv(data->last_found, last_str); + { + SV * sv = data->last_found; + MAGIC *mg = + SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += CHR_SVLEN(last_str) - l; + } + data->last_end += l * (mincount - 1); + } + } else { + /* start offset must point into the last copy */ + data->last_start_min += minnext * (mincount - 1); + data->last_start_max += is_inf ? I32_MAX + : (maxcount - 1) * (minnext + data->pos_delta); + } + } + /* It is counted once already... */ + data->pos_min += minnext * (mincount - counted); + data->pos_delta += - counted * deltanext + + (minnext + deltanext) * maxcount - minnext * mincount; + if (mincount != maxcount) { + /* Cannot extend fixed substrings found inside + the group. */ + SCAN_COMMIT(pRExC_state,data,minlenp); + if (mincount && last_str) { + SV * const sv = data->last_found; + MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? 
+ mg_find(sv, PERL_MAGIC_utf8) : NULL; + + if (mg) + mg->mg_len = -1; + sv_setsv(sv, last_str); + data->last_end = data->pos_min; + data->last_start_min = + data->pos_min - CHR_SVLEN(last_str); + data->last_start_max = is_inf + ? I32_MAX + : data->pos_min + data->pos_delta + - CHR_SVLEN(last_str); + } + data->longest = &(data->longest_float); + } + SvREFCNT_dec(last_str); + } + if (data && (fl & SF_HAS_EVAL)) + data->flags |= SF_HAS_EVAL; + optimize_curly_tail: + if (OP(oscan) != CURLYX) { + while (PL_regkind[OP(next = regnext(oscan))] == NOTHING + && NEXT_OFF(next)) + NEXT_OFF(oscan) += NEXT_OFF(next); + } + continue; + default: /* REF, ANYOFV, and CLUMP only? */ + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) + cl_anything(pRExC_state, data->start_class); + flags &= ~SCF_DO_STCLASS; + break; + } + } + else if (OP(scan) == LNBREAK) { + if (flags & SCF_DO_STCLASS) { + int value = 0; + data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ + if (flags & SCF_DO_STCLASS_AND) { + for (value = 0; value < 256; value++) + if (!is_VERTWS_cp(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + else { + for (value = 0; value < 256; value++) + if (is_VERTWS_cp(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + if (flags & SCF_DO_STCLASS_OR) + cl_and(data->start_class, and_withp); + flags &= ~SCF_DO_STCLASS; + } + min += 1; + delta += 1; + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... 
*/ + data->pos_min += 1; + data->pos_delta += 1; + data->longest = &(data->longest_float); + } + } + else if (REGNODE_SIMPLE(OP(scan))) { + int value = 0; + + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); + data->pos_min++; + } + min++; + if (flags & SCF_DO_STCLASS) { + data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ + + /* Some of the logic below assumes that switching + locale on will only add false positives. */ + switch (PL_regkind[OP(scan)]) { + case SANY: + default: + do_default: + /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + cl_anything(pRExC_state, data->start_class); + break; + case REG_ANY: + if (OP(scan) == SANY) + goto do_default; + if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ + value = (ANYOF_BITMAP_TEST(data->start_class,'\n') + || ANYOF_CLASS_TEST_ANY_SET(data->start_class)); + cl_anything(pRExC_state, data->start_class); + } + if (flags & SCF_DO_STCLASS_AND || !value) + ANYOF_BITMAP_CLEAR(data->start_class,'\n'); + break; + case ANYOF: + if (flags & SCF_DO_STCLASS_AND) + cl_and(data->start_class, + (struct regnode_charclass_class*)scan); + else + cl_or(pRExC_state, data->start_class, + (struct regnode_charclass_class*)scan); + break; + case ALNUM: + if (flags & SCF_DO_STCLASS_AND) { + if (!(data->start_class->flags & ANYOF_LOCALE)) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); + if (OP(scan) == ALNUMU) { + for (value = 0; value < 256; value++) { + if (!isWORDCHAR_L1(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (!isALNUM(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } + } + } + else { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); + + /* Even if under locale, set the bits for non-locale + * in case it isn't a true locale-node. 
This will + * create false positives if it truly is locale */ + if (OP(scan) == ALNUMU) { + for (value = 0; value < 256; value++) { + if (isWORDCHAR_L1(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (isALNUM(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } + } + break; + case NALNUM: + if (flags & SCF_DO_STCLASS_AND) { + if (!(data->start_class->flags & ANYOF_LOCALE)) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); + if (OP(scan) == NALNUMU) { + for (value = 0; value < 256; value++) { + if (isWORDCHAR_L1(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (isALNUM(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } + } + } + else { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); + + /* Even if under locale, set the bits for non-locale in + * case it isn't a true locale-node. This will create + * false positives if it truly is locale */ + if (OP(scan) == NALNUMU) { + for (value = 0; value < 256; value++) { + if (! isWORDCHAR_L1(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (! 
isALNUM(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } + } + break; + case SPACE: + if (flags & SCF_DO_STCLASS_AND) { + if (!(data->start_class->flags & ANYOF_LOCALE)) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); + if (OP(scan) == SPACEU) { + for (value = 0; value < 256; value++) { + if (!isSPACE_L1(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (!isSPACE(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } + } + } + else { + if (data->start_class->flags & ANYOF_LOCALE) { + ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); + } + if (OP(scan) == SPACEU) { + for (value = 0; value < 256; value++) { + if (isSPACE_L1(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (isSPACE(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } + } + break; + case NSPACE: + if (flags & SCF_DO_STCLASS_AND) { + if (!(data->start_class->flags & ANYOF_LOCALE)) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); + if (OP(scan) == NSPACEU) { + for (value = 0; value < 256; value++) { + if (isSPACE_L1(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (isSPACE(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } + } + } + else { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); + if (OP(scan) == NSPACEU) { + for (value = 0; value < 256; value++) { + if (!isSPACE_L1(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } + else { + for (value = 0; value < 256; value++) { + if (!isSPACE(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } + } + break; + case DIGIT: + if (flags & SCF_DO_STCLASS_AND) { + if (!(data->start_class->flags & ANYOF_LOCALE)) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); + for (value = 0; value 
< 256; value++) + if (!isDIGIT(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + else { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); + for (value = 0; value < 256; value++) + if (isDIGIT(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + break; + case NDIGIT: + if (flags & SCF_DO_STCLASS_AND) { + if (!(data->start_class->flags & ANYOF_LOCALE)) + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); + for (value = 0; value < 256; value++) + if (isDIGIT(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + else { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); + for (value = 0; value < 256; value++) + if (!isDIGIT(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + break; + CASE_SYNST_FNC(VERTWS); + CASE_SYNST_FNC(HORIZWS); + + } + if (flags & SCF_DO_STCLASS_OR) + cl_and(data->start_class, and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { + data->flags |= (OP(scan) == MEOL + ? SF_BEFORE_MEOL + : SF_BEFORE_SEOL); + } + else if ( PL_regkind[OP(scan)] == BRANCHJ + /* Lookbehind, or need to calculate parens/evals/stclass: */ + && (scan->flags || data || (flags & SCF_DO_STCLASS)) + && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { + if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY + || OP(scan) == UNLESSM ) + { + /* Negative Lookahead/lookbehind + In this case we can't do fixed string optimisation. 
+ */ + + I32 deltanext, minnext, fake = 0; + regnode *nscan; + struct regnode_charclass_class intrnl; + int f = 0; + + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + data_fake.pos_delta = delta; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + cl_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, + last, &data_fake, stopparen, recursed, NULL, f, depth+1); + if (scan->flags) { + if (deltanext) { + FAIL("Variable length lookbehind not implemented"); + } + else if (minnext > (I32)U8_MAX) { + FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + } + scan->flags = (U8)minnext; + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (f & SCF_DO_STCLASS_AND) { + if (flags & SCF_DO_STCLASS_OR) { + /* OR before, AND after: ideally we would recurse with + * data_fake to get the AND applied by study of the + * remainder of the pattern, and then derecurse; + * *** HACK *** for now just treat as "no information". + * See [perl #56690]. + */ + cl_init(pRExC_state, data->start_class); + } else { + /* AND before and after: combine and continue */ + const int was = (data->start_class->flags & ANYOF_EOS); + + cl_and(data->start_class, &intrnl); + if (was) + data->start_class->flags |= ANYOF_EOS; + } + } + } +#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY + else { + /* Positive Lookahead/lookbehind + In this case we can do fixed string optimisation, + but we must be careful about it. 
Note in the case of + lookbehind the positions will be offset by the minimum + length of the pattern, something we won't know about + until after the recurse. + */ + I32 deltanext, fake = 0; + regnode *nscan; + struct regnode_charclass_class intrnl; + int f = 0; + /* We use SAVEFREEPV so that when the full compile + is finished perl will clean up the allocated + minlens when it's all done. This way we don't + have to worry about freeing them when we know + they wont be used, which would be a pain. + */ + I32 *minnextp; + Newx( minnextp, 1, I32 ); + SAVEFREEPV(minnextp); + + if (data) { + StructCopy(data, &data_fake, scan_data_t); + if ((flags & SCF_DO_SUBSTR) && data->last_found) { + f |= SCF_DO_SUBSTR; + if (scan->flags) + SCAN_COMMIT(pRExC_state, &data_fake,minlenp); + data_fake.last_found=newSVsv(data->last_found); + } + } + else + data_fake.last_closep = &fake; + data_fake.flags = 0; + data_fake.pos_delta = delta; + if (is_inf) + data_fake.flags |= SF_IS_INF; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + cl_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, + last, &data_fake, stopparen, recursed, NULL, f,depth+1); + if (scan->flags) { + if (deltanext) { + FAIL("Variable length lookbehind not implemented"); + } + else if (*minnextp > (I32)U8_MAX) { + FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + } + scan->flags = (U8)*minnextp; + } + + *minnextp += min; + + if (f & SCF_DO_STCLASS_AND) { + const int was = (data->start_class->flags & ANYOF_EOS); + + cl_and(data->start_class, &intrnl); + if (was) + data->start_class->flags |= ANYOF_EOS; + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + 
data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { + if (RExC_rx->minlen<*minnextp) + RExC_rx->minlen=*minnextp; + SCAN_COMMIT(pRExC_state, &data_fake, minnextp); + SvREFCNT_dec(data_fake.last_found); + + if ( data_fake.minlen_fixed != minlenp ) + { + data->offset_fixed= data_fake.offset_fixed; + data->minlen_fixed= data_fake.minlen_fixed; + data->lookbehind_fixed+= scan->flags; + } + if ( data_fake.minlen_float != minlenp ) + { + data->minlen_float= data_fake.minlen_float; + data->offset_float_min=data_fake.offset_float_min; + data->offset_float_max=data_fake.offset_float_max; + data->lookbehind_float+= scan->flags; + } + } + } + + + } +#endif + } + else if (OP(scan) == OPEN) { + if (stopparen != (I32)ARG(scan)) + pars++; + } + else if (OP(scan) == CLOSE) { + if (stopparen == (I32)ARG(scan)) { + break; + } + if ((I32)ARG(scan) == is_par) { + next = regnext(scan); + + if ( next && (OP(next) != WHILEM) && next < last) + is_par = 0; /* Disable optimization */ + } + if (data) + *(data->last_closep) = ARG(scan); + } + else if (OP(scan) == EVAL) { + if (data) + data->flags |= SF_HAS_EVAL; + } + else if ( PL_regkind[OP(scan)] == ENDLIKE ) { + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); + flags &= ~SCF_DO_SUBSTR; + } + if (data && OP(scan)==ACCEPT) { + data->flags |= SCF_SEEN_ACCEPT; + if (stopmin > min) + stopmin = min; + } + } + else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ + { + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + cl_anything(pRExC_state, data->start_class); + flags &= ~SCF_DO_STCLASS; + } + else if (OP(scan) == GPOS) { + if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) && + !(delta || is_inf || (data && data->pos_delta))) + { + if (!(RExC_rx->extflags & RXf_ANCH) && (flags & 
SCF_DO_SUBSTR)) + RExC_rx->extflags |= RXf_ANCH_GPOS; + if (RExC_rx->gofs < (U32)min) + RExC_rx->gofs = min; + } else { + RExC_rx->extflags |= RXf_GPOS_FLOAT; + RExC_rx->gofs = 0; + } + } +#ifdef TRIE_STUDY_OPT +#ifdef FULL_TRIE_STUDY + else if (PL_regkind[OP(scan)] == TRIE) { + /* NOTE - There is similar code to this block above for handling + BRANCH nodes on the initial study. If you change stuff here + check there too. */ + regnode *trie_node= scan; + regnode *tail= regnext(scan); + reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; + I32 max1 = 0, min1 = I32_MAX; + struct regnode_charclass_class accum; + + if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ + SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ + if (flags & SCF_DO_STCLASS) + cl_init_zero(pRExC_state, &accum); + + if (!trie->jump) { + min1= trie->minlen; + max1= trie->maxlen; + } else { + const regnode *nextbranch= NULL; + U32 word; + + for ( word=1 ; word <= trie->wordcount ; word++) + { + I32 deltanext=0, minnext=0, f = 0, fake; + struct regnode_charclass_class this_class; + + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + data_fake.pos_delta = delta; + if (flags & SCF_DO_STCLASS) { + cl_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f = SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + if (trie->jump[word]) { + if (!nextbranch) + nextbranch = trie_node + trie->jump[0]; + scan= trie_node + trie->jump[word]; + /* We go from the jump point to the branch that follows + it. Note this means we need the vestigal unused branches + even though they arent otherwise used. 
+ */ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, (regnode *)nextbranch, &data_fake, + stopparen, recursed, NULL, f,depth+1); + } + if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) + nextbranch= regnext((regnode*)nextbranch); + + if (min1 > (I32)(minnext + trie->minlen)) + min1 = minnext + trie->minlen; + if (max1 < (I32)(minnext + deltanext + trie->maxlen)) + max1 = minnext + deltanext + trie->maxlen; + if (deltanext == I32_MAX) + is_inf = is_inf_internal = 1; + + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > min + min1) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + cl_or(pRExC_state, &accum, &this_class); + } + } + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->longest = &(data->longest_float); + } + min += min1; + delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + cl_or(pRExC_state, data->start_class, &accum); + if (min1) { + cl_and(data->start_class, and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + cl_and(data->start_class, &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, + struct regnode_charclass_class); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, + struct regnode_charclass_class); + flags |= SCF_DO_STCLASS_OR; + data->start_class->flags |= ANYOF_EOS; + } + } + scan= tail; + continue; + } +#else + else if (PL_regkind[OP(scan)] == TRIE) { + reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; + U8*bang=NULL; + + min += trie->minlen; + 
delta += (trie->maxlen - trie->minlen); + flags &= ~SCF_DO_STCLASS; /* xxx */ + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->pos_min += trie->minlen; + data->pos_delta += (trie->maxlen - trie->minlen); + if (trie->maxlen != trie->minlen) + data->longest = &(data->longest_float); + } + if (trie->jump) /* no more substrings -- for now /grr*/ + flags &= ~SCF_DO_SUBSTR; + } +#endif /* old or new */ +#endif /* TRIE_STUDY_OPT */ + + /* Else: zero-length, ignore. */ + scan = regnext(scan); + } + if (frame) { + last = frame->last; + scan = frame->next; + stopparen = frame->stop; + frame = frame->prev; + goto fake_study_recurse; + } + + finish: + assert(!frame); + DEBUG_STUDYDATA("pre-fin:",data,depth); + + *scanp = scan; + *deltap = is_inf_internal ? I32_MAX : delta; + if (flags & SCF_DO_SUBSTR && is_inf) + data->pos_delta = I32_MAX - data->pos_min; + if (is_par > (I32)U8_MAX) + is_par = 0; + if (is_par && pars==1 && data) { + data->flags |= SF_IN_PAR; + data->flags &= ~SF_HAS_PAR; + } + else if (pars && data) { + data->flags |= SF_HAS_PAR; + data->flags &= ~SF_IN_PAR; + } + if (flags & SCF_DO_STCLASS_OR) + cl_and(data->start_class, and_withp); + if (flags & SCF_TRIE_RESTUDY) + data->flags |= SCF_TRIE_RESTUDY; + + DEBUG_STUDYDATA("post-fin:",data,depth); + + return min < stopmin ? min : stopmin; +} + +STATIC U32 +S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) +{ + U32 count = RExC_rxi->data ? 
RExC_rxi->data->count : 0; + + PERL_ARGS_ASSERT_ADD_DATA; + /* Resize the reg_data block so that it holds count + n pointer slots (the struct itself already accounts for one slot, hence the - 1). */ + Renewc(RExC_rxi->data, + sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), + char, struct reg_data); + if(count) + Renew(RExC_rxi->data->what, count + n, U8); + else + Newx(RExC_rxi->data->what, n, U8); + RExC_rxi->data->count = count + n; + Copy(s, RExC_rxi->data->what + count, n, U8); /* s supplies n bytes, one per new entry, appended to the what array */ + return count; /* the count before growing, i.e. the offset of the first entry just added */ +} + /* Perl_reginitcolors: fill PL_colors[0..5] from the PERL_RE_COLORS environment variable. The value is split at tab characters into at most six fields; fields that are missing, or all six when the variable is unset, become the empty string. Sets PL_colorset to 1 when done. */ +/*XXX: todo make this not included in a non debugging perl */ +#ifndef PERL_IN_XSUB_RE +void +Perl_reginitcolors(pTHX) +{ + dVAR; + const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); + if (s) { + char *t = savepv(s); /* NOTE(review): presumably a private copy of the value, so the tabs can be overwritten below - confirm against perlapi */ + int i = 0; + PL_colors[0] = t; + while (++i < 6) { + t = strchr(t, '\t'); + if (t) { + *t = '\0'; + PL_colors[i] = ++t; + } + else + PL_colors[i] = t = (char *)""; /* no more tabs: t now points at an empty string, so every later slot also takes this branch */ + } + } else { + int i = 0; + while (i < 6) + PL_colors[i++] = (char *)""; + } + PL_colorset = 1; /* record that PL_colors has been filled in */ +} +#endif + /* CHECK_RESTUDY_GOTO: when TRIE_STUDY_OPT is defined, jump to the reStudy label if data.flags has SCF_TRIE_RESTUDY set and restudied is still zero; restudied is post-incremented whenever the flag test passes, so the jump is taken at most once. Expands to nothing otherwise. */ + +#ifdef TRIE_STUDY_OPT +#define CHECK_RESTUDY_GOTO \ + if ( \ + (data.flags & SCF_TRIE_RESTUDY) \ + && ! restudied++ \ + ) goto reStudy +#else +#define CHECK_RESTUDY_GOTO +#endif + +/* + - pregcomp - compile a regular expression into internal code + * + * We can't allocate space until we know how big the compiled form will be, + * but we can't compile it (and thus know how big it is) until we've got a + * place to put the code. So we cheat: we compile it twice, once with code + * generation turned off and size counting turned on, and once "for real". + * This also means that we don't allocate space until we are sure that the + * thing really will compile successfully, and we never have to move the + * code and thus invalidate pointers into it. (Note that it has to be in + * one piece because free() must be able to free it all.) [NB: not true in perl] + * + * Beware that the optimization-preparation code in here knows about some + * of the structure of the compiled regexp. [I'll say.] 
+ */ + + + +#ifndef PERL_IN_XSUB_RE +#define RE_ENGINE_PTR &reh_regexp_engine +#else +extern const struct regexp_engine my_reg_engine; +#define RE_ENGINE_PTR &my_reg_engine +#endif + /* Perl_pregcomp: compile pattern with whichever regexp engine is in effect. If the hints hash GvHV(PL_hintgv) exists and holds a non-zero integer under the key regcomp, that integer is converted to a regexp_engine pointer and the compile is delegated to it through CALLREGCOMP_ENG; in every other case the result of Perl_re_compile is returned. */ +#ifndef PERL_IN_XSUB_RE +REGEXP * +Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) +{ + dVAR; + HV * const table = GvHV(PL_hintgv); + + PERL_ARGS_ASSERT_PREGCOMP; + + /* Dispatch a request to compile a regexp to correct + regexp engine. */ + if (table) { + SV **ptr= hv_fetchs(table, "regcomp", FALSE); + GET_RE_DEBUG_FLAGS_DECL; + if (ptr && SvIOK(*ptr) && SvIV(*ptr)) { + const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr)); + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", + (UV)SvIV(*ptr)); /* cast: the UVxf conversion takes an unsigned UV, while SvIV yields a signed IV */ + }); + return CALLREGCOMP_ENG(eng, pattern, flags); + } + } + return Perl_re_compile(aTHX_ pattern, flags); +} +#endif + +REGEXP * +Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) +{ + dVAR; + REGEXP *rx; + struct regexp *r; + register regexp_internal *ri; + STRLEN plen; + char* VOL exp; + char* xend; + regnode *scan; + I32 flags; + I32 minlen = 0; + U32 pm_flags; + + /* these are all flags - maybe they should be turned + * into a single int with different bit masks */ + I32 sawlookahead = 0; + I32 sawplus = 0; + I32 sawopen = 0; + bool used_setjump = FALSE; + regex_charset initial_charset = get_regex_charset(orig_pm_flags); + + U8 jump_ret = 0; + dJMPENV; + scan_data_t data; + RExC_state_t RExC_state; + RExC_state_t * const pRExC_state = &RExC_state; +#ifdef TRIE_STUDY_OPT + int restudied; + RExC_state_t copyRExC_state; +#endif + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_COMPILE; + + DEBUG_r(if (!PL_colorset) reginitcolors()); + +#ifndef PERL_IN_XSUB_RE + /* Initialize these here instead of as-needed, as is quick and avoids + * having to test them each time otherwise */ + if (! 
PL_AboveLatin1) { + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); + PL_ASCII = _new_invlist_C_array(ASCII_invlist); + PL_Latin1 = _new_invlist_C_array(Latin1_invlist); + + PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); + PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); + + PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist); + PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist); + + PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist); + PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist); + + PL_L1Cased = _new_invlist_C_array(L1Cased_invlist); + + PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist); + PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist); + + PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist); + + PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist); + PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist); + + PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); + PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); + + PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist); + PL_PosixLower = _new_invlist_C_array(PosixLower_invlist); + + PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist); + PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist); + + PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist); + PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist); + + PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist); + PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist); + + PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist); + PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist); + + PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist); + PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist); + + PL_VertSpace = _new_invlist_C_array(VertSpace_invlist); + + PL_PosixWord = _new_invlist_C_array(PosixWord_invlist); + PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist); + + 
PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist); + PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist); + } +#endif + + exp = SvPV(pattern, plen); + + if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */ + RExC_utf8 = RExC_orig_utf8 = 0; + } + else { + RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); + } + RExC_uni_semantics = 0; + RExC_contains_locale = 0; + + /****************** LONG JUMP TARGET HERE***********************/ + /* Longjmp back to here if have to switch in midstream to utf8 */ + if (! RExC_orig_utf8) { + JMPENV_PUSH(jump_ret); + used_setjump = TRUE; + } + + if (jump_ret == 0) { /* First time through */ + xend = exp + plen; + + DEBUG_COMPILE_r({ + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RExC_utf8, + dsv, exp, plen, 60); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", + PL_colors[4],PL_colors[5],s); + }); + } + else { /* longjumped back */ + STRLEN len = plen; + + /* If the cause for the longjmp was other than changing to utf8, pop + * our own setjmp, and longjmp to the correct handler */ + if (jump_ret != UTF8_LONGJMP) { + JMPENV_POP; + JMPENV_JUMP(jump_ret); + } + + GET_RE_DEBUG_FLAGS; + + /* It's possible to write a regexp in ascii that represents Unicode + codepoints outside of the byte range, such as via \x{100}. If we + detect such a sequence we have to convert the entire pattern to utf8 + and then recompile, as our sizing calculation will have been based + on 1 byte == 1 character, but we will need to use utf8 to encode + at least some part of the pattern, and therefore must convert the whole + thing. + -- dmq */ + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "UTF8 mismatch! 
Converting to utf8 for resizing and compile\n")); + exp = (char*)Perl_bytes_to_utf8(aTHX_ + (U8*)SvPV_nomg(pattern, plen), + &len); + xend = exp + len; + RExC_orig_utf8 = RExC_utf8 = 1; + SAVEFREEPV(exp); + } + +#ifdef TRIE_STUDY_OPT + restudied = 0; +#endif + + pm_flags = orig_pm_flags; + + if (initial_charset == REGEX_LOCALE_CHARSET) { + RExC_contains_locale = 1; + } + else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + + /* Set to use unicode semantics if the pattern is in utf8 and has the + * 'depends' charset specified, as it means unicode when utf8 */ + set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET); + } + + RExC_precomp = exp; + RExC_flags = pm_flags; + RExC_sawback = 0; + + RExC_seen = 0; + RExC_in_lookbehind = 0; + RExC_seen_zerolen = *exp == '^' ? -1 : 0; + RExC_seen_evals = 0; + RExC_extralen = 0; + RExC_override_recoding = 0; + + /* First pass: determine size, legality. */ + RExC_parse = exp; + RExC_start = exp; + RExC_end = xend; + RExC_naughty = 0; + RExC_npar = 1; + RExC_nestroot = 0; + RExC_size = 0L; + RExC_emit = &PL_regdummy; + RExC_whilem_seen = 0; + RExC_open_parens = NULL; + RExC_close_parens = NULL; + RExC_opend = NULL; + RExC_paren_names = NULL; +#ifdef DEBUGGING + RExC_paren_name_list = NULL; +#endif + RExC_recurse = NULL; + RExC_recurse_count = 0; + +#if 0 /* REGC() is (currently) a NOP at the first pass. + * Clever compilers notice this and complain. --jhi */ + REGC((U8)REG_MAGIC, (char*)RExC_emit); +#endif + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); + RExC_lastnum=0; + RExC_lastparse=NULL; + ); + if (reg(pRExC_state, 0, &flags,1) == NULL) { + RExC_precomp = NULL; + return(NULL); + } + + /* Here, finished first pass. 
Get rid of any added setjmp */ + if (used_setjump) { + JMPENV_POP; + } + + DEBUG_PARSE_r({ + PerlIO_printf(Perl_debug_log, + "Required size %"IVdf" nodes\n" + "Starting second pass (creation)\n", + (IV)RExC_size); + RExC_lastnum=0; + RExC_lastparse=NULL; + }); + + /* The first pass could have found things that force Unicode semantics */ + if ((RExC_utf8 || RExC_uni_semantics) + && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) + { + set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET); + } + + /* Small enough for pointer-storage convention? + If extralen==0, this means that we will not need long jumps. */ + if (RExC_size >= 0x10000L && RExC_extralen) + RExC_size += RExC_extralen; + else + RExC_extralen = 0; + if (RExC_whilem_seen > 15) + RExC_whilem_seen = 15; + + /* Allocate space and zero-initialize. Note, the two step process + of zeroing when in debug mode, thus anything assigned has to + happen after that */ + rx = (REGEXP*) newSV_type(SVt_REGEXP); + r = (struct regexp*)SvANY(rx); + Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char, regexp_internal); + if ( r == NULL || ri == NULL ) + FAIL("Regexp out of space"); +#ifdef DEBUGGING + /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ + Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char); +#else + /* bulk initialize base fields with 0. */ + Zero(ri, sizeof(regexp_internal), char); +#endif + + /* non-zero initialization begins here */ + RXi_SET( r, ri ); + r->engine= RE_ENGINE_PTR; + r->extflags = pm_flags; + { + bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET); + + /* The caret is output if there are any defaults: if not all the STD + * flags are set, or if no character set specifier is needed */ + bool has_default = + (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) + || ! 
has_charset); + bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) + >> RXf_PMf_STD_PMMOD_SHIFT); + const char *fptr = STD_PAT_MODS; /*"msix"*/ + char *p; + /* Allocate for the worst case, which is all the std flags are turned + * on. If more precision is desired, we could do a population count of + * the flags set. This could be done with a small lookup table, or by + * shifting, masking and adding, or even, when available, assembly + * language for a machine-language population count. + * We never output a minus, as all those are defaults, so are + * covered by the caret */ + const STRLEN wraplen = plen + has_p + has_runon + + has_default /* If needs a caret */ + + /* If needs a character set specifier */ + + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) + + (sizeof(STD_PAT_MODS) - 1) + + (sizeof("(?:)") - 1); + + p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */ + SvPOK_on(rx); + SvFLAGS(rx) |= SvUTF8(pattern); + *p++='('; *p++='?'; + + /* If a default, cover it using the caret */ + if (has_default) { + *p++= DEFAULT_PAT_MOD; + } + if (has_charset) { + STRLEN len; + const char* const name = get_regex_charset_name(r->extflags, &len); + Copy(name, p, len, char); + p += len; + } + if (has_p) + *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ + { + char ch; + while((ch = *fptr++)) { + if(reganch & 1) + *p++ = ch; + reganch >>= 1; + } + } + + *p++ = ':'; + Copy(RExC_precomp, p, plen, char); + assert ((RX_WRAPPED(rx) - p) < 16); + r->pre_prefix = p - RX_WRAPPED(rx); + p += plen; + if (has_runon) + *p++ = '\n'; + *p++ = ')'; + *p = 0; + SvCUR_set(rx, p - SvPVX_const(rx)); + } + + r->intflags = 0; + r->nparens = RExC_npar - 1; /* set early to validate backrefs */ + + if (RExC_seen & REG_SEEN_RECURSE) { + Newxz(RExC_open_parens, RExC_npar,regnode *); + SAVEFREEPV(RExC_open_parens); + Newxz(RExC_close_parens,RExC_npar,regnode *); + SAVEFREEPV(RExC_close_parens); + } + + /* Useful 
during FAIL. */ +#ifdef RE_TRACK_PATTERN_OFFSETS + Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ + DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, + "%s %"UVuf" bytes for offset annotations.\n", + ri->u.offsets ? "Got" : "Couldn't get", + (UV)((2*RExC_size+1) * sizeof(U32)))); +#endif + SetProgLen(ri,RExC_size); + RExC_rx_sv = rx; + RExC_rx = r; + RExC_rxi = ri; + REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx); + + /* Second pass: emit code. */ + RExC_flags = pm_flags; /* don't let top level (?i) bleed */ + RExC_parse = exp; + RExC_end = xend; + RExC_naughty = 0; + RExC_npar = 1; + RExC_emit_start = ri->program; + RExC_emit = ri->program; + RExC_emit_bound = ri->program + RExC_size + 1; + + /* Store the count of eval-groups for security checks: */ + RExC_rx->seen_evals = RExC_seen_evals; + REGC((U8)REG_MAGIC, (char*) RExC_emit++); + if (reg(pRExC_state, 0, &flags,1) == NULL) { + ReREFCNT_dec(rx); + return(NULL); + } + /* XXXX To minimize changes to RE engine we always allocate + 3-units-long substrs field. */ + Newx(r->substrs, 1, struct reg_substr_data); + if (RExC_recurse_count) { + Newxz(RExC_recurse,RExC_recurse_count,regnode *); + SAVEFREEPV(RExC_recurse); + } + +reStudy: + r->minlen = minlen = sawlookahead = sawplus = sawopen = 0; + Zero(r->substrs, 1, struct reg_substr_data); + +#ifdef TRIE_STUDY_OPT + if (!restudied) { + StructCopy(&zero_scan_data, &data, scan_data_t); + copyRExC_state = RExC_state; + } else { + U32 seen=RExC_seen; + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); + + RExC_state = copyRExC_state; + if (seen & REG_TOP_LEVEL_BRANCHES) + RExC_seen |= REG_TOP_LEVEL_BRANCHES; + else + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; + if (data.last_found) { + SvREFCNT_dec(data.longest_fixed); + SvREFCNT_dec(data.longest_float); + SvREFCNT_dec(data.last_found); + } + StructCopy(&zero_scan_data, &data, scan_data_t); + } +#else + StructCopy(&zero_scan_data, &data, scan_data_t); +#endif + + /* Dig out information for optimizations. 
*/ + r->extflags = RExC_flags; /* was pm_op */ + /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ + + if (UTF) + SvUTF8_on(rx); /* Unicode in it? */ + ri->regstclass = NULL; + if (RExC_naughty >= 10) /* Probably an expensive pattern. */ + r->intflags |= PREGf_NAUGHTY; + scan = ri->program + 1; /* First BRANCH. */ + + /* testing for BRANCH here tells us whether there is "must appear" + data in the pattern. If there is then we can use it for optimisations */ + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */ + I32 fake; + STRLEN longest_float_length, longest_fixed_length; + struct regnode_charclass_class ch_class; /* pointed to by data */ + int stclass_flag; + I32 last_close = 0; /* pointed to by data */ + regnode *first= scan; + regnode *first_next= regnext(first); + /* + * Skip introductions and multiplicators >= 1 + * so that we can extract the 'meat' of the pattern that must + * match in the large if() sequence following. + * NOTE that EXACT is NOT covered here, as it is normally + * picked up by the optimiser separately. + * + * This is unfortunate as the optimiser isnt handling lookahead + * properly currently. + * + */ + while ((OP(first) == OPEN && (sawopen = 1)) || + /* An OR of *one* alternative - should not happen now. */ + (OP(first) == BRANCH && OP(first_next) != BRANCH) || + /* for now we can't handle lookbehind IFMATCH*/ + (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) || + (OP(first) == PLUS) || + (OP(first) == MINMOD) || + /* An {n,m} with n>0 */ + (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || + (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) + { + /* + * the only op that could be a regnode is PLUS, all the rest + * will be regnode_1 or regnode_2. + * + */ + if (OP(first) == PLUS) + sawplus = 1; + else + first += regarglen[OP(first)]; + + first = NEXTOPER(first); + first_next= regnext(first); + } + + /* Starting-point info. 
*/ + again: + DEBUG_PEEP("first:",first,0); + /* Ignore EXACT as we deal with it later. */ + if (PL_regkind[OP(first)] == EXACT) { + if (OP(first) == EXACT) + NOOP; /* Empty, get anchored substr later. */ + else + ri->regstclass = first; + } +#ifdef TRIE_STCLASS + else if (PL_regkind[OP(first)] == TRIE && + ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) + { + regnode *trie_op; + /* this can happen only on restudy */ + if ( OP(first) == TRIE ) { + struct regnode_1 *trieop = (struct regnode_1 *) + PerlMemShared_calloc(1, sizeof(struct regnode_1)); + StructCopy(first,trieop,struct regnode_1); + trie_op=(regnode *)trieop; + } else { + struct regnode_charclass *trieop = (struct regnode_charclass *) + PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); + StructCopy(first,trieop,struct regnode_charclass); + trie_op=(regnode *)trieop; + } + OP(trie_op)+=2; + make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); + ri->regstclass = trie_op; + } +#endif + else if (REGNODE_SIMPLE(OP(first))) + ri->regstclass = first; + else if (PL_regkind[OP(first)] == BOUND || + PL_regkind[OP(first)] == NBOUND) + ri->regstclass = first; + else if (PL_regkind[OP(first)] == BOL) { + r->extflags |= (OP(first) == MBOL + ? RXf_ANCH_MBOL + : (OP(first) == SBOL + ? RXf_ANCH_SBOL + : RXf_ANCH_BOL)); + first = NEXTOPER(first); + goto again; + } + else if (OP(first) == GPOS) { + r->extflags |= RXf_ANCH_GPOS; + first = NEXTOPER(first); + goto again; + } + else if ((!sawopen || !RExC_sawback) && + (OP(first) == STAR && + PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && + !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL)) + { + /* turn .* into ^.* with an implied $*=1 */ + const int type = + (OP(NEXTOPER(first)) == REG_ANY) + ? 
RXf_ANCH_MBOL + : RXf_ANCH_SBOL; + r->extflags |= type; + r->intflags |= PREGf_IMPLICIT; + first = NEXTOPER(first); + goto again; + } + if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback) + && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ + /* x+ must match at the 1st pos of run of x's */ + r->intflags |= PREGf_SKIP; + + /* Scan is after the zeroth branch, first is atomic matcher. */ +#ifdef TRIE_STUDY_OPT + DEBUG_PARSE_r( + if (!restudied) + PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + (IV)(first - scan + 1)) + ); +#else + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + (IV)(first - scan + 1)) + ); +#endif + + + /* + * If there's something expensive in the r.e., find the + * longest literal string that must appear and make it the + * regmust. Resolve ties in favor of later strings, since + * the regstart check works with the beginning of the r.e. + * and avoiding duplication strengthens checking. Not a + * strong reason, but sufficient in the absence of others. + * [Now we resolve ties in favor of the earlier string if + * it happens that c_offset_min has been invalidated, since the + * earlier string may buy us something the later one won't.] + */ + + data.longest_fixed = newSVpvs(""); + data.longest_float = newSVpvs(""); + data.last_found = newSVpvs(""); + data.longest = &(data.longest_fixed); + first = scan; + if (!ri->regstclass) { + cl_init(pRExC_state, &ch_class); + data.start_class = &ch_class; + stclass_flag = SCF_DO_STCLASS_AND; + } else /* XXXX Check for BOUND? 
*/ + stclass_flag = 0; + data.last_closep = &last_close; + + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ + &data, -1, NULL, NULL, + SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); + + + CHECK_RESTUDY_GOTO; + + + if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) + && data.last_start_min == 0 && data.last_end > 0 + && !RExC_seen_zerolen + && !(RExC_seen & REG_SEEN_VERBARG) + && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) + r->extflags |= RXf_CHECK_ALL; + scan_commit(pRExC_state, &data,&minlen,0); + SvREFCNT_dec(data.last_found); + + /* Note that code very similar to this but for anchored string + follows immediately below, changes may need to be made to both. + Be careful. + */ + longest_float_length = CHR_SVLEN(data.longest_float); + if (longest_float_length + || (data.flags & SF_FL_BEFORE_EOL + && (!(data.flags & SF_FL_BEFORE_MEOL) + || (RExC_flags & RXf_PMf_MULTILINE)))) + { + I32 t,ml; + + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S) + || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) + goto remove_float; /* As in (a)+. */ + + /* copy the information about the longest float from the reg_scan_data + over to the program. */ + if (SvUTF8(data.longest_float)) { + r->float_utf8 = data.longest_float; + r->float_substr = NULL; + } else { + r->float_substr = data.longest_float; + r->float_utf8 = NULL; + } + /* float_end_shift is how many chars that must be matched that + follow this item. We calculate it ahead of time as once the + lookbehind offset is added in we lose the ability to correctly + calculate it.*/ + ml = data.minlen_float ? 
*(data.minlen_float) + : (I32)longest_float_length; + r->float_end_shift = ml - data.offset_float_min + - longest_float_length + (SvTAIL(data.longest_float) != 0) + + data.lookbehind_float; + r->float_min_offset = data.offset_float_min - data.lookbehind_float; + r->float_max_offset = data.offset_float_max; + if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ + r->float_max_offset -= data.lookbehind_float; + + t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ + && (!(data.flags & SF_FL_BEFORE_MEOL) + || (RExC_flags & RXf_PMf_MULTILINE))); + fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); + } + else { + remove_float: + r->float_substr = r->float_utf8 = NULL; + SvREFCNT_dec(data.longest_float); + longest_float_length = 0; + } + + /* Note that code very similar to this but for floating string + is immediately above, changes may need to be made to both. + Be careful. + */ + longest_fixed_length = CHR_SVLEN(data.longest_fixed); + + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) + && (longest_fixed_length + || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ + && (!(data.flags & SF_FIX_BEFORE_MEOL) + || (RExC_flags & RXf_PMf_MULTILINE)))) ) + { + I32 t,ml; + + /* copy the information about the longest fixed + from the reg_scan_data over to the program. */ + if (SvUTF8(data.longest_fixed)) { + r->anchored_utf8 = data.longest_fixed; + r->anchored_substr = NULL; + } else { + r->anchored_substr = data.longest_fixed; + r->anchored_utf8 = NULL; + } + /* fixed_end_shift is how many chars that must be matched that + follow this item. We calculate it ahead of time as once the + lookbehind offset is added in we lose the ability to correctly + calculate it.*/ + ml = data.minlen_fixed ? 
*(data.minlen_fixed) + : (I32)longest_fixed_length; + r->anchored_end_shift = ml - data.offset_fixed + - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0) + + data.lookbehind_fixed; + r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; + + t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ + && (!(data.flags & SF_FIX_BEFORE_MEOL) + || (RExC_flags & RXf_PMf_MULTILINE))); + fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); + } + else { + r->anchored_substr = r->anchored_utf8 = NULL; + SvREFCNT_dec(data.longest_fixed); + longest_fixed_length = 0; + } + if (ri->regstclass + && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) + ri->regstclass = NULL; + + if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) + && stclass_flag + && !(data.start_class->flags & ANYOF_EOS) + && !cl_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, 1, "f"); + data.start_class->flags |= ANYOF_IS_SYNTHETIC; + + Newx(RExC_rxi->data->data[n], 1, + struct regnode_charclass_class); + StructCopy(data.start_class, + (struct regnode_charclass_class*)RExC_rxi->data->data[n], + struct regnode_charclass_class); + ri->regstclass = (regnode*)RExC_rxi->data->data[n]; + r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); + regprop(r, sv, (regnode*)data.start_class); + PerlIO_printf(Perl_debug_log, + "synthetic stclass \"%s\".\n", + SvPVX_const(sv));}); + } + + /* A temporary algorithm prefers floated substr to fixed one to dig more info. 
*/ + if (longest_fixed_length > longest_float_length) { + r->check_end_shift = r->anchored_end_shift; + r->check_substr = r->anchored_substr; + r->check_utf8 = r->anchored_utf8; + r->check_offset_min = r->check_offset_max = r->anchored_offset; + if (r->extflags & RXf_ANCH_SINGLE) + r->extflags |= RXf_NOSCAN; + } + else { + r->check_end_shift = r->float_end_shift; + r->check_substr = r->float_substr; + r->check_utf8 = r->float_utf8; + r->check_offset_min = r->float_min_offset; + r->check_offset_max = r->float_max_offset; + } + /* XXXX Currently intuiting is not compatible with ANCH_GPOS. + This should be changed ASAP! */ + if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) { + r->extflags |= RXf_USE_INTUIT; + if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) + r->extflags |= RXf_INTUIT_TAIL; + } + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) + if ( (STRLEN)minlen < longest_float_length ) + minlen= longest_float_length; + if ( (STRLEN)minlen < longest_fixed_length ) + minlen= longest_fixed_length; + */ + } + else { + /* Several toplevels. Best we can is to set minlen. 
*/ + I32 fake; + struct regnode_charclass_class ch_class; + I32 last_close = 0; + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); + + scan = ri->program + 1; + cl_init(pRExC_state, &ch_class); + data.start_class = &ch_class; + data.last_closep = &last_close; + + + minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, + &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); + + CHECK_RESTUDY_GOTO; + + r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 + = r->float_substr = r->float_utf8 = NULL; + + if (!(data.start_class->flags & ANYOF_EOS) + && !cl_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, 1, "f"); + data.start_class->flags |= ANYOF_IS_SYNTHETIC; + + Newx(RExC_rxi->data->data[n], 1, + struct regnode_charclass_class); + StructCopy(data.start_class, + (struct regnode_charclass_class*)RExC_rxi->data->data[n], + struct regnode_charclass_class); + ri->regstclass = (regnode*)RExC_rxi->data->data[n]; + r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); + regprop(r, sv, (regnode*)data.start_class); + PerlIO_printf(Perl_debug_log, + "synthetic stclass \"%s\".\n", + SvPVX_const(sv));}); + } + } + + /* Guard against an embedded (?=) or (?<=) with a longer minlen than + the "real" pattern. 
*/ + DEBUG_OPTIMISE_r({ + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", + (IV)minlen, (IV)r->minlen); + }); + r->minlenret = minlen; + if (r->minlen < minlen) + r->minlen = minlen; + + if (RExC_seen & REG_SEEN_GPOS) + r->extflags |= RXf_GPOS_SEEN; + if (RExC_seen & REG_SEEN_LOOKBEHIND) + r->extflags |= RXf_LOOKBEHIND_SEEN; + if (RExC_seen & REG_SEEN_EVAL) + r->extflags |= RXf_EVAL_SEEN; + if (RExC_seen & REG_SEEN_CANY) + r->extflags |= RXf_CANY_SEEN; + if (RExC_seen & REG_SEEN_VERBARG) + r->intflags |= PREGf_VERBARG_SEEN; + if (RExC_seen & REG_SEEN_CUTGROUP) + r->intflags |= PREGf_CUTGROUP_SEEN; + if (RExC_paren_names) + RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); + else + RXp_PAREN_NAMES(r) = NULL; + +#ifdef STUPID_PATTERN_CHECKS + if (RX_PRELEN(rx) == 0) + r->extflags |= RXf_NULL; + if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) + r->extflags |= RXf_WHITE; + else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^') + r->extflags |= RXf_START_ONLY; +#else + if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else { + regnode *first = ri->program + 1; + U8 fop = OP(first); + + if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END) + r->extflags |= RXf_NULL; + else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END) + r->extflags |= RXf_START_ONLY; + else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE + && OP(regnext(first)) == END) + r->extflags |= RXf_WHITE; + } +#endif +#ifdef DEBUGGING + if (RExC_paren_names) { + ri->name_list_idx = add_data( pRExC_state, 1, "a" ); + ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); + } else +#endif + ri->name_list_idx = 0; + + if 
(RExC_recurse_count) { + for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { + const regnode *scan = RExC_recurse[RExC_recurse_count-1]; + ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); + } + } + Newxz(r->offs, RExC_npar, regexp_paren_pair); + /* assume we don't need to swap parens around before we match */ + + DEBUG_DUMP_r({ + PerlIO_printf(Perl_debug_log,"Final program:\n"); + regdump(r); + }); +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_OFFSETS_r(if (ri->u.offsets) { + const U32 len = ri->u.offsets[0]; + U32 i; + GET_RE_DEBUG_FLAGS_DECL; + PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); + for (i = 1; i <= len; i++) { + if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) + PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", + (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); + } + PerlIO_printf(Perl_debug_log, "\n"); + }); +#endif + return rx; +} + +#undef RE_ENGINE_PTR + + +SV* +Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF; + + PERL_UNUSED_ARG(value); + + if (flags & RXapif_FETCH) { + return reg_named_buff_fetch(rx, key, flags); + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { + Perl_croak_no_modify(aTHX); + return NULL; + } else if (flags & RXapif_EXISTS) { + return reg_named_buff_exists(rx, key, flags) + ? 
&PL_sv_yes + : &PL_sv_no; + } else if (flags & RXapif_REGNAMES) { + return reg_named_buff_all(rx, flags); + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { + return reg_named_buff_scalar(rx, flags); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); + return NULL; + } +} + +/* Iteration entry point for the named-buffer hash: hands off to the + * firstkey or nextkey routine according to 'flags' and croaks on any other + * flag combination. 'lastkey' is accepted but not used. */ +SV* +Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; + PERL_UNUSED_ARG(lastkey); + + if (flags & RXapif_FIRSTKEY) + return reg_named_buff_firstkey(rx, flags); + else if (flags & RXapif_NEXTKEY) + return reg_named_buff_nextkey(rx, flags); + else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + return NULL; + } +} + +/* Fetch the capture(s) registered under the name 'namesv'. + * + * The entry found in the paren-names hash is read as an array of I32 group + * numbers (its PV) whose element count is its IV. A group counts as matched + * when its number is <= rx->nparens and both its start and end offsets are + * not -1. + * + * Without RXapif_ALL: returns a new SV filled in by CALLREG_NUMBUF_FETCH for + * the first matched group in that array, or NULL if none matched. + * With RXapif_ALL: returns a new reference to an array holding one element + * per listed group - the fetched value, or a copy of undef for a group that + * did not match. + * Returns NULL when the regexp has no named groups or the name is unknown. */ +SV* +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, + const U32 flags) +{ + AV *retarray = NULL; + SV *ret; + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; + + if (rx && RXp_PAREN_NAMES(rx)) { + HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); + if (he_str) { + IV i; + SV* sv_dat=HeVAL(he_str); + I32 *nums=(I32*)SvPVX(sv_dat); + + /* Create the result array only once the name is known to exist, + * so that the NULL returns below never leak it */ + if (flags & RXapif_ALL) + retarray=newAV(); + + for ( i=0; i<SvIVX(sv_dat); i++ ) { + if ((I32)(rx->nparens) >= nums[i] + && rx->offs[nums[i]].start != -1 + && rx->offs[nums[i]].end != -1) + { + ret = newSVpvs(""); + CALLREG_NUMBUF_FETCH(r,nums[i],ret); + if (!retarray) + return ret; + } else { + if (retarray) + ret = newSVsv(&PL_sv_undef); + } + if (retarray) + av_push(retarray, ret); + } + if (retarray) + return newRV_noinc(MUTABLE_SV(retarray)); + } + } + return NULL; +} + +bool +Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, + const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & RXapif_ALL) { + return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); + } else { + SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); +
if (sv) { + SvREFCNT_dec(sv); + return TRUE; + } else { + return FALSE; + } + } + } else { + return FALSE; + } +} + +/* Start iterating over the names of the named capture groups: resets the + * iterator of the paren-names hash and returns whatever the NEXTKEY callback + * yields first (called with RXapif_FIRSTKEY cleared from 'flags'). Returns + * NULL when the regexp has no named groups. */ +SV* +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; + + if ( rx && RXp_PAREN_NAMES(rx) ) { + (void)hv_iterinit(RXp_PAREN_NAMES(rx)); + + return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); + } else { + /* this function returns a pointer, so use NULL like its siblings */ + return NULL; + } +} + +/* Continue the iteration begun by the firstkey routine. Advances the + * paren-names hash iterator and returns a new SV holding the next name that + * has at least one group with number <= rx->lastparen and both offsets set + * (not -1); with RXapif_ALL every name is returned. Returns NULL when the + * hash is exhausted or the regexp has no named groups. */ +SV* +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv = RXp_PAREN_NAMES(rx); + HE *temphe; + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + return newSVhek(HeKEY_hek(temphe)); + } + } + } + return NULL; +} + +/* Scalar value of the named-buffer hash. With RXapif_ALL or + * RXapif_REGNAMES_COUNT returns a new IV SV holding the total number of keys + * in the paren-names hash; with RXapif_ONE returns a new IV SV holding the + * number of names listed by the ALL callback; croaks on other flags. + * Returns &PL_sv_undef when the regexp has no named groups. */ +SV* +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) +{ + SV *ret; + AV *av; + I32 length; + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { + return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); + av = MUTABLE_AV(SvRV(ret)); + length = av_len(av); + SvREFCNT_dec(ret); + return newSViv(length + 1); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + return NULL; + } + } + return &PL_sv_undef; +} + +SV* +Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp
*const rx = (struct regexp *)SvANY(r); + AV *av = newAV(); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv= RXp_PAREN_NAMES(rx); + HE *temphe; + (void)hv_iterinit(hv); + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + av_push(av, newSVhek(HeKEY_hek(temphe))); + } + } + } + + return newRV_noinc(MUTABLE_SV(av)); +} + +void +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, + SV * const sv) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + char *s = NULL; + I32 i = 0; + I32 s1, t1; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; + + if (!rx->subbeg) { + sv_setsv(sv,&PL_sv_undef); + return; + } + else + if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { + /* $` */ + i = rx->offs[0].start; + s = rx->subbeg; + } + else + if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { + /* $' */ + s = rx->subbeg + rx->offs[0].end; + i = rx->sublen - rx->offs[0].end; + } + else + if ( 0 <= paren && paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + /* $& $1 ... */ + i = t1 - s1; + s = rx->subbeg + s1; + } else { + sv_setsv(sv,&PL_sv_undef); + return; + } + assert(rx->sublen >= (s - rx->subbeg) + i ); + if (i >= 0) { + const int oldtainted = PL_tainted; + TAINT_NOT; + sv_setpvn(sv, s, i); + PL_tainted = oldtainted; + if ( (rx->extflags & RXf_CANY_SEEN) + ? 
(RXp_MATCH_UTF8(rx) + && (!i || is_utf8_string((U8*)s, i))) + : (RXp_MATCH_UTF8(rx)) ) + { + SvUTF8_on(sv); + } + else + SvUTF8_off(sv); + if (PL_tainting) { + if (RXp_MATCH_TAINTED(rx)) { + if (SvTYPE(sv) >= SVt_PVMG) { + MAGIC* const mg = SvMAGIC(sv); + MAGIC* mgt; + PL_tainted = 1; + SvMAGIC_set(sv, mg->mg_moremagic); + SvTAINT(sv); + if ((mgt = SvMAGIC(sv))) { + mg->mg_moremagic = mgt; + SvMAGIC_set(sv, mg); + } + } else { + PL_tainted = 1; + SvTAINT(sv); + } + } else + SvTAINTED_off(sv); + } + } else { + sv_setsv(sv,&PL_sv_undef); + return; + } +} + +void +Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, + SV const * const value) +{ + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; + + PERL_UNUSED_ARG(rx); + PERL_UNUSED_ARG(paren); + PERL_UNUSED_ARG(value); + + if (!PL_localizing) + Perl_croak_no_modify(aTHX); +} + +I32 +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, + const I32 paren) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + I32 i; + I32 s1, t1; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + + /* Some of this code was originally in C in F */ + switch (paren) { + /* $` / ${^PREMATCH} */ + case RX_BUFF_IDX_PREMATCH: + if (rx->offs[0].start != -1) { + i = rx->offs[0].start; + if (i > 0) { + s1 = 0; + t1 = i; + goto getlen; + } + } + return 0; + /* $' / ${^POSTMATCH} */ + case RX_BUFF_IDX_POSTMATCH: + if (rx->offs[0].end != -1) { + i = rx->sublen - rx->offs[0].end; + if (i > 0) { + s1 = rx->offs[0].end; + t1 = rx->sublen; + goto getlen; + } + } + return 0; + /* $& / ${^MATCH}, $1, $2, ... 
*/ + default: + if (paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + i = t1 - s1; + goto getlen; + } else { + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit((const SV *)sv); + return 0; + } + } + getlen: + if (i > 0 && RXp_MATCH_UTF8(rx)) { + const char * const s = rx->subbeg + s1; + const U8 *ep; + STRLEN el; + + i = t1 - s1; + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; + } + return i; +} + +SV* +Perl_reg_qr_package(pTHX_ REGEXP * const rx) +{ + PERL_ARGS_ASSERT_REG_QR_PACKAGE; + PERL_UNUSED_ARG(rx); + if (0) + return NULL; + else + return newSVpvs("Regexp"); +} + +/* Scans the name of a named buffer from the pattern. + * If flags is REG_RSN_RETURN_NULL returns null. + * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name + * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding + * to the parsed name as looked up in the RExC_paren_names hash. + * If there is an error throws a vFAIL().. type exception. + */ + +#define REG_RSN_RETURN_NULL 0 +#define REG_RSN_RETURN_NAME 1 +#define REG_RSN_RETURN_DATA 2 + +STATIC SV* +S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) +{ + char *name_start = RExC_parse; + + PERL_ARGS_ASSERT_REG_SCAN_NAME; + + if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + /* skip IDFIRST by using do...while */ + if (UTF) + do { + RExC_parse += UTF8SKIP(RExC_parse); + } while (isALNUM_utf8((U8*)RExC_parse)); + else + do { + RExC_parse++; + } while (isALNUM(*RExC_parse)); + } + + if ( flags ) { + SV* sv_name + = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), + SVs_TEMP | (UTF ? SVf_UTF8 : 0)); + if ( flags == REG_RSN_RETURN_NAME) + return sv_name; + else if (flags==REG_RSN_RETURN_DATA) { + HE *he_str = NULL; + SV *sv_dat = NULL; + if ( ! 
sv_name ) /* should not happen*/ + Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); + if (RExC_paren_names) + he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! sv_dat ) + vFAIL("Reference to nonexistent named group"); + return sv_dat; + } + else { + Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", + (unsigned long) flags); + } + /* NOT REACHED */ + } + return NULL; +} + +#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ + int rem=(int)(RExC_end - RExC_parse); \ + int cut; \ + int num; \ + int iscut=0; \ + if (rem>10) { \ + rem=10; \ + iscut=1; \ + } \ + cut=10-rem; \ + if (RExC_lastparse!=RExC_parse) \ + PerlIO_printf(Perl_debug_log," >%.*s%-*s", \ + rem, RExC_parse, \ + cut + 4, \ + iscut ? "..." : "<" \ + ); \ + else \ + PerlIO_printf(Perl_debug_log,"%16s",""); \ + \ + if (SIZE_ONLY) \ + num = RExC_size + 1; \ + else \ + num=REG_NODE_NUM(RExC_emit); \ + if (RExC_lastnum!=num) \ + PerlIO_printf(Perl_debug_log,"|%4d",num); \ + else \ + PerlIO_printf(Perl_debug_log,"|%4s",""); \ + PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ + (int)((depth*2)), "", \ + (funcname) \ + ); \ + RExC_lastnum=num; \ + RExC_lastparse=RExC_parse; \ +}) + + + +#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ + DEBUG_PARSE_MSG((funcname)); \ + PerlIO_printf(Perl_debug_log,"%4s","\n"); \ +}) +#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ + DEBUG_PARSE_MSG((funcname)); \ + PerlIO_printf(Perl_debug_log,fmt "\n",args); \ +}) + +/* This section of code defines the inversion list object and its methods. The + * interfaces are highly subject to change, so as much as possible is static to + * this file. An inversion list is here implemented as a malloc'd C UV array + * with some added info that is placed as UVs at the beginning in a header + * portion. An inversion list for Unicode is an array of code points, sorted + * by ordinal number. The zeroth element is the first code point in the list. 
+ * The element at index 1 is the first element beyond that not in the list. In other + * words, the first range is + * invlist[0]..(invlist[1]-1) + * The other ranges follow. Thus every element whose index is divisible by two + * marks the beginning of a range that is in the list, and every element whose + * index is not divisible by two marks the beginning of a range not in the list. A single + * element inversion list that contains the single code point N generally + * consists of two elements + * invlist[0] == N + * invlist[1] == N+1 + * (The exception is when N is the highest representable value on the + * machine, in which case the list containing just it would be a single + * element, itself. By extension, if the last range in the list extends to + * infinity, then the first element of that range will be in the inversion list + * at a position that is divisible by two, and is the final element in the + * list.) + * Taking the complement (inverting) an inversion list is quite simple: if the + * first element is 0, remove it; otherwise add a 0 element at the beginning. + * This implementation reserves an element at the beginning of each inversion list + * to contain 0 when the list contains 0, and 1 otherwise. The actual + * beginning of the list is either that element if it is 0, or the next one if it is 1. + * + * More about inversion lists can be found in "Unicode Demystified" + * Chapter 13 by Richard Gillam, published by Addison-Wesley. + * More will be coming when functionality is added later. + * + * The inversion list data structure is currently implemented as an SV pointing + * to an array of UVs that the SV thinks are bytes. This allows us to have an + * array of UV whose memory management is automatically handled by the existing + * facilities for SV's.
+ * + * Some of the methods should always be private to the implementation, and some + * should eventually be made public */ + +#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */ +#define INVLIST_ITER_OFFSET 1 /* Current iteration position */ + +/* This is a combination of a version and data structure type, so that one + * being passed in can be validated to be an inversion list of the correct + * vintage. When the structure of the header is changed, a new random number + * in the range 2**31-1 should be generated and the new() method changed to + * insert that at this location. Then, if an auxiliary program doesn't change + * correspondingly, it will be discovered immediately */ +#define INVLIST_VERSION_ID_OFFSET 2 +#define INVLIST_VERSION_ID 1064334010 + +/* For safety, when adding new elements, remember to #undef them at the end of + * the inversion list code section */ + +#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */ +/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list + * contains the code point U+0000, and begins here. If 1, the inversion list + * doesn't contain U+0000, and it begins at the next UV in the array. + * Inverting an inversion list consists of adding or removing the 0 at the + * beginning of it. By reserving a space for that 0, inversion can be made + * very fast */ + +#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1) + +/* Internally things are UVs. TO_INTERNAL_SIZE converts a count of list + * elements into the number of bytes needed for them plus the header; + * FROM_INTERNAL_SIZE converts a byte count back into the number of list + * elements it can hold besides the header. The argument is parenthesized so + * that an expression passed in is evaluated as a unit */ +#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV)) +#define FROM_INTERNAL_SIZE(x) (((x) / sizeof(UV)) - HEADER_LENGTH) + +#define INVLIST_INITIAL_LEN 10 + +PERL_STATIC_INLINE UV* +S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) +{ + /* Returns a pointer to the first element in the inversion list's array. + * This is called upon initialization of an inversion list. Where the + * array begins depends on whether the list has the code point U+0000 + * in it or not.
The other parameter tells it whether the code that + * follows this call is about to put a 0 in the inversion list or not. + * The first element is either the element with 0, if 0, or the next one, + * if 1 */ + + UV* zero = get_invlist_zero_addr(invlist); + + PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; + + /* Must be empty */ + assert(! *get_invlist_len_addr(invlist)); + + /* 1^1 = 0; 1^0 = 1 */ + *zero = 1 ^ will_have_0; + return zero + *zero; +} + +PERL_STATIC_INLINE UV* +S_invlist_array(pTHX_ SV* const invlist) +{ + /* Returns the pointer to the inversion list's array. Every time the + * length changes, this needs to be called in case malloc or realloc moved + * it */ + + PERL_ARGS_ASSERT_INVLIST_ARRAY; + + /* Must not be empty. If these fail, you probably didn't check for + * being non-zero before trying to get the array */ + assert(*get_invlist_len_addr(invlist)); + assert(*get_invlist_zero_addr(invlist) == 0 + || *get_invlist_zero_addr(invlist) == 1); + + /* The array begins either at the element reserved for zero if the + * list contains 0 (that element will be set to 0), or otherwise the next + * element (in which case the reserved element will be set to 1). 
*/ + return (UV *) (get_invlist_zero_addr(invlist) + + *get_invlist_zero_addr(invlist)); +} + +PERL_STATIC_INLINE UV* +S_get_invlist_len_addr(pTHX_ SV* invlist) +{ + /* Return the address of the UV that contains the current number + * of used elements in the inversion list */ + + PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR; + + return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV))); +} + +PERL_STATIC_INLINE UV +S_invlist_len(pTHX_ SV* const invlist) +{ + /* Returns the current number of elements stored in the inversion list's + * array */ + + PERL_ARGS_ASSERT_INVLIST_LEN; + + return *get_invlist_len_addr(invlist); +} + +PERL_STATIC_INLINE void +S_invlist_set_len(pTHX_ SV* const invlist, const UV len) +{ + /* Sets the current number of elements stored in the inversion list */ + + PERL_ARGS_ASSERT_INVLIST_SET_LEN; + + *get_invlist_len_addr(invlist) = len; + + assert(len <= SvLEN(invlist)); + + SvCUR_set(invlist, TO_INTERNAL_SIZE(len)); + /* If the list contains U+0000, that element is part of the header, + * and should not be counted as part of the array. It will contain + * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and + * subtract: + * SvCUR_set(invlist, + * TO_INTERNAL_SIZE(len + * - (*get_invlist_zero_addr(inv_list) ^ 1))); + * But, this is only valid if len is not 0. The consequences of not doing + * this is that the memory allocation code may think that 1 more UV is + * being used than actually is, and so might do an unnecessary grow. That + * seems worth not bothering to make this the precise amount. 
+ * + * Note that when inverting, SvCUR shouldn't change */ +} + +PERL_STATIC_INLINE UV +S_invlist_max(pTHX_ SV* const invlist) +{ + /* Returns the maximum number of elements storable in the inversion list's + * array, without having to realloc() */ + + PERL_ARGS_ASSERT_INVLIST_MAX; + + return FROM_INTERNAL_SIZE(SvLEN(invlist)); +} + +PERL_STATIC_INLINE UV* +S_get_invlist_zero_addr(pTHX_ SV* invlist) +{ + /* Return the address of the UV that is reserved to hold 0 if the inversion + * list contains 0. This has to be the last element of the header, as the + * list proper starts with either it if 0, or the next element if not. + * (But we force it to contain either 0 or 1) */ + + PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR; + + return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV))); +} + +#ifndef PERL_IN_XSUB_RE +SV* +Perl__new_invlist(pTHX_ IV initial_size) +{ + + /* Return a pointer to a newly constructed inversion list, with enough + * space to store 'initial_size' elements. If that number is negative, a + * system default is used instead */ + + SV* new_list; + + if (initial_size < 0) { + initial_size = INVLIST_INITIAL_LEN; + } + + /* Allocate the initial space */ + new_list = newSV(TO_INTERNAL_SIZE(initial_size)); + invlist_set_len(new_list, 0); + + /* Force iterinit() to be used to get iteration to work */ + *get_invlist_iter_addr(new_list) = UV_MAX; + + /* This should force a segfault if a method doesn't initialize this + * properly */ + *get_invlist_zero_addr(new_list) = UV_MAX; + + *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID; +#if HEADER_LENGTH != 4 +# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length +#endif + + return new_list; +} +#endif + +STATIC SV* +S__new_invlist_C_array(pTHX_ UV* list) +{ + /* Return a pointer to a newly constructed inversion list, initialized to + * point to <list>, which has to be in the exact correct inversion list + * form, including 
internal fields. Thus this is a dangerous routine that + * should not be used in the wrong hands */ + + SV* invlist = newSV_type(SVt_PV); + + PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; + + SvPV_set(invlist, (char *) list); + SvLEN_set(invlist, 0); /* Means we own the contents, and the system + shouldn't touch it */ + SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist))); + + if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + return invlist; +} + +STATIC void +S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) +{ + /* Grow the maximum size of an inversion list */ + + PERL_ARGS_ASSERT_INVLIST_EXTEND; + + SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max)); +} + +PERL_STATIC_INLINE void +S_invlist_trim(pTHX_ SV* const invlist) +{ + PERL_ARGS_ASSERT_INVLIST_TRIM; + + /* Change the length of the inversion list to how many entries it currently + * has */ + + SvPV_shrink_to_cur((SV *) invlist); +} + +/* An element is in an inversion list iff its index is even numbered: 0, 2, 4, + * etc */ +#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) +#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) + +#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) + +STATIC void +S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) +{ + /* Subject to change or removal. Append the range from 'start' to 'end' at + * the end of the inversion list. The range must be above any existing + * ones. */ + + UV* array; + UV max = invlist_max(invlist); + UV len = invlist_len(invlist); + + PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; + + if (len == 0) { /* Empty lists must be initialized */ + array = _invlist_array_init(invlist, start == 0); + } + else { + /* Here, the existing list is non-empty. 
The current max entry in the + * list is generally the first value not in the set, except when the + * set extends to the end of permissible values, in which case it is + * the first entry in that final set, and so this call is an attempt to + * append out-of-order */ + + UV final_element = len - 1; + array = invlist_array(invlist); + if (array[final_element] > start + || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) + { + Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", + array[final_element], start, + ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); + } + + /* Here, it is a legal append. If the new range begins with the first + * value not in the set, it is extending the set, so the new first + * value not in the set is one greater than the newly extended range. + * */ + if (array[final_element] == start) { + if (end != UV_MAX) { + array[final_element] = end + 1; + } + else { + /* But if the end is the maximum representable on the machine, + * just let the range that this would extend to have no end */ + invlist_set_len(invlist, len - 1); + } + return; + } + } + + /* Here the new range doesn't extend any existing set. Add it */ + + len += 2; /* Includes an element each for the start and end of range */ + + /* If overflows the existing space, extend, which may cause the array to be + * moved */ + if (max < len) { + invlist_extend(invlist, len); + invlist_set_len(invlist, len); /* Have to set len here to avoid assert + failure in invlist_array() */ + array = invlist_array(invlist); + } + else { + invlist_set_len(invlist, len); + } + + /* The next item on the list starts the range, the one after that is + * one past the new range. 
*/ + array[len - 2] = start; + if (end != UV_MAX) { + array[len - 1] = end + 1; + } + else { + /* But if the end is the maximum representable on the machine, just let + * the range have no end */ + invlist_set_len(invlist, len - 1); + } +} + +#ifndef PERL_IN_XSUB_RE + +STATIC IV +S_invlist_search(pTHX_ SV* const invlist, const UV cp) +{ + /* Searches the inversion list for the range that contains the input code + * point <cp>. Returns -1 if the list is empty or <cp> is below the list's + * first element. Otherwise, the return value is the index into the list's + * array of the range that contains <cp>; use + * ELEMENT_RANGE_MATCHES_INVLIST() on that index to tell whether that range + * is one that is in the set */ + + IV low = 0; + IV high = invlist_len(invlist); + const UV * array; + + PERL_ARGS_ASSERT_INVLIST_SEARCH; + + /* If list is empty, return failure. This has to be checked before + * fetching the array, because invlist_array() must not be called on an + * empty list */ + if (high == 0) { + return -1; + } + + array = invlist_array(invlist); + + /* Likewise fail if the code point is before the first element */ + if (cp < array[0]) { + return -1; + } + + /* Binary search. What we are looking for is <i> such that + * array[i] <= cp < array[i+1] + * The loop below converges on the i+1. */ + while (low < high) { + IV mid = (low + high) / 2; + if (array[mid] <= cp) { + low = mid + 1; + + /* We could do this extra test to exit the loop early. + if (cp < array[low]) { + return mid; + } + */ + } + else { /* cp < array[mid] */ + high = mid; + } + } + + return high - 1; +} + +void +Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch) +{ + /* populates a swatch of a swash the same way swatch_get() does in utf8.c, + * but is used when the swash has an inversion list. This makes this much + * faster, as it uses a binary search instead of a linear one. This is + * intimately tied to that function, and perhaps should be in utf8.c, + * except it is intimately tied to inversion lists as well. 
It assumes + * that <swatch> is all 0's on input */ + + UV current = start; + const IV len = invlist_len(invlist); + IV i; + const UV * array; + + PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH; + + if (len == 0) { /* Empty inversion list */ + return; + } + + array = invlist_array(invlist); + + /* Find which element <start> falls in; -1 means it is below the first + * element */ + i = invlist_search(invlist, start); + + /* We populate from <start> up to, but not including, <end> */ + while (current < end) { + UV upper; + + /* The inversion list gives the results for every possible code point + * after the first one in the list. Only those ranges whose index is + * even are ones that the inversion list matches. For the odd ones, + * and if the initial code point is not in the list, we have to skip + * forward to the next element */ + if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) { + i++; + if (i >= len) { /* Finished if beyond the end of the array */ + return; + } + current = array[i]; + if (current >= end) { /* Finished if beyond the end of what we + are populating */ + return; + } + } + assert(current >= start); + + /* The current range ends one below the next one, except don't go past + * <end> */ + i++; + upper = (i < len && array[i] < end) ? array[i] : end; + + /* Here we are in a range that matches. Set one bit in <swatch> for + * each code point in it: byte (offset >> 3), bit (offset & 7) */ + for (; current < upper; current++) { + const STRLEN offset = (STRLEN)(current - start); + swatch[offset >> 3] |= 1 << (offset & 7); + } + + /* Quit if at the end of the list */ + if (i >= len) { + + /* But first, have to deal with the highest possible code point on + * the platform. 
The previous code assumes that <end> is one + * beyond where we want to populate, but that is impossible at the + * platform's infinity, so have to handle it specially */ + if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1))) + { + const STRLEN offset = (STRLEN)(end - start); + swatch[offset >> 3] |= 1 << (offset & 7); + } + return; + } + + /* Advance to the next range, which will be for code points not in the + * inversion list */ + current = array[i]; + } + + return; +} + + +void +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output) +{ + /* Take the union of two inversion lists and point <output> to it. *output + * should be defined upon input, and if it points to one of the two lists, + * the reference count to that list will be decremented. The first list, + * <a>, may be NULL, in which case a copy of the second list is returned. + * If <complement_b> is TRUE, the union is taken of the complement + * (inversion) of <b> instead of b itself. + * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. + * + * The algorithm is like a merge sort. + * + * XXX A potential performance improvement is to keep track as we go along + * if only one of the inputs contributes to the result, meaning the other + * is a subset of that one. 
In that case, we can skip the final copy and + * return the larger of the input lists, but then outside code might need + * to keep track of whether to free the input list or not */ + + UV* array_a; /* a's array */ + UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; + + SV* u; /* the resulting union */ + UV* array_u; + UV len_u; + + UV i_a = 0; /* current index into a's array */ + UV i_b = 0; + UV i_u = 0; + + /* running count, as explained in the algorithm source book; items are + * stopped accumulating and are output when the count changes to/from 0. + * The count is incremented when we start a range that's in the set, and + * decremented when we start a range that's not in the set. So its range + * is 0 to 2. Only when the count is zero is something not in the set. + */ + UV count = 0; + + PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + + /* If either one is empty, the union is the other one */ + if (a == NULL || ((len_a = invlist_len(a)) == 0)) { + if (*output == a) { + if (a != NULL) { + SvREFCNT_dec(a); + } + } + if (*output != b) { + *output = invlist_clone(b); + if (complement_b) { + _invlist_invert(*output); + } + } /* else *output already = b; */ + return; + } + else if ((len_b = invlist_len(b)) == 0) { + if (*output == b) { + SvREFCNT_dec(b); + } + + /* The complement of an empty list is a list that has everything in it, + * so the union with includes everything too */ + if (complement_b) { + if (a == *output) { + SvREFCNT_dec(a); + } + *output = _new_invlist(1); + _append_range_to_invlist(*output, 0, UV_MAX); + } + else if (*output != a) { + *output = invlist_clone(a); + } + /* else *output already = a; */ + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the union of 'a' with the complement of b, set it + * up so are looking at b's complement. 
*/ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later, and clear the + * flag as we don't have to do anything else later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } + + /* Size the union for the worst case: that the sets are completely + * disjoint */ + u = _new_invlist(len_a + len_b); + + /* Will contain U+0000 if either component does */ + array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0) + || (len_b > 0 && array_b[0] == 0)); + + /* Go through each list item by item, stopping when exhausted one of + * them */ + while (i_a < len_a && i_b < len_b) { + UV cp; /* The element to potentially add to the union's array */ + bool cp_in_set; /* is it in the the input list's set or not */ + + /* We need to take one or the other of the two inputs for the union. + * Since we are merging two sorted lists, we take the smaller of the + * next items. In case of a tie, we take the one that is in its set + * first. If we took one not in the set first, it would decrement the + * count, possibly to 0 which would cause it to be output as ending the + * range, and the next time through we would take the same number, and + * output it again as beginning the next range. By doing it the + * opposite way, there is no possibility that the count will be + * momentarily decremented to 0, and thus the two adjoining ranges will + * be seamlessly merged. (In a tie and both are in the set or both not + * in the set, it doesn't matter which we take first.) 
*/ + if (array_a[i_a] < array_b[i_b] + || (array_a[i_a] == array_b[i_b] + && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) + { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); + cp= array_a[i_a++]; + } + else { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); + cp= array_b[i_b++]; + } + + /* Here, have chosen which of the two inputs to look at. Only output + * if the running count changes to/from 0, which marks the + * beginning/end of a range in that's in the set */ + if (cp_in_set) { + if (count == 0) { + array_u[i_u++] = cp; + } + count++; + } + else { + count--; + if (count == 0) { + array_u[i_u++] = cp; + } + } + } + + /* Here, we are finished going through at least one of the lists, which + * means there is something remaining in at most one. We check if the list + * that hasn't been exhausted is positioned such that we are in the middle + * of a range in its set or not. (i_a and i_b point to the element beyond + * the one we care about.) If in the set, we decrement 'count'; if 0, there + * is potentially more to output. + * There are four cases: + * 1) Both weren't in their sets, count is 0, and remains 0. What's left + * in the union is entirely from the non-exhausted set. + * 2) Both were in their sets, count is 2. Nothing further should + * be output, as everything that remains will be in the exhausted + * list's set, hence in the union; decrementing to 1 but not 0 insures + * that + * 3) the exhausted was in its set, non-exhausted isn't, count is 1. + * Nothing further should be output because the union includes + * everything from the exhausted set. Not decrementing ensures that. + * 4) the exhausted wasn't in its set, non-exhausted is, count is 1; + * decrementing to 0 insures that we look at the remainder of the + * non-exhausted set */ + if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + { + count--; + } + + /* The final length is what we've output so far, plus what else is about to + * be output. 
(If 'count' is non-zero, then the input list we exhausted + * has everything remaining up to the machine's limit in its set, and hence + * in the union, so there will be no further output.) */ + len_u = i_u; + if (count == 0) { + /* At most one of the subexpressions will be non-zero */ + len_u += (len_a - i_a) + (len_b - i_b); + } + + /* Set result to final length, which can change the pointer to array_u, so + * re-find it */ + if (len_u != invlist_len(u)) { + invlist_set_len(u, len_u); + invlist_trim(u); + array_u = invlist_array(u); + } + + /* When 'count' is 0, the list that was exhausted (if one was shorter than + * the other) ended with everything above it not in its set. That means + * that the remaining part of the union is precisely the same as the + * non-exhausted list, so can just copy it unchanged. (If both lists were + * exhausted at the same time, then both copy counts below will be 0.) + */ + if (count == 0) { + IV copy_count; /* At most one will have a non-zero copy count */ + if ((copy_count = len_a - i_a) > 0) { + Copy(array_a + i_a, array_u + i_u, copy_count, UV); + } + else if ((copy_count = len_b - i_b) > 0) { + Copy(array_b + i_b, array_u + i_u, copy_count, UV); + } + } + + /* If we've changed b, restore it. This must be done before the reference + * count is decremented below, as that could free b (when *output is b), + * and array_b points into b's storage */ + if (complement_b) { + array_b[0] = 1; + } + + /* We may be removing a reference to one of the inputs */ + if (a == *output || b == *output) { + SvREFCNT_dec(*output); + } + + *output = u; + return; +} + +void +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i) +{ + /* Take the intersection of two inversion lists and point <i> to it. *i + * should be defined upon input, and if it points to one of the two lists, + * the reference count to that list will be decremented. + * If <complement_b> is TRUE, the result will be the intersection of <a> + * and the complement (or inversion) of <b> instead of <b> directly. 
+ * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. In fact, it had bugs + * + * The algorithm is like a merge sort, and is essentially the same as the + * union above + */ + + UV* array_a; /* a's array */ + UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; + + SV* r; /* the resulting intersection */ + UV* array_r; + UV len_r; + + UV i_a = 0; /* current index into a's array */ + UV i_b = 0; + UV i_r = 0; + + /* running count, as explained in the algorithm source book; items are + * stopped accumulating and are output when the count changes to/from 2. + * The count is incremented when we start a range that's in the set, and + * decremented when we start a range that's not in the set. So its range + * is 0 to 2. Only when the count is 2 is something in the intersection. + */ + UV count = 0; + + PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + + /* Special case if either one is empty */ + len_a = invlist_len(a); + if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) { + + if (len_a != 0 && complement_b) { + + /* Here, 'a' is not empty, therefore from the above 'if', 'b' must + * be empty. Here, also we are using 'b's complement, which hence + * must be every possible code point. Thus the intersection is + * simply 'a'. */ + if (*i != a) { + + /* Release the caller's reference to 'b' before *i is + * overwritten; once *i holds the clone it can no longer + * compare equal to 'b' */ + if (*i == b) { + SvREFCNT_dec(b); + } + + *i = invlist_clone(a); + } + /* else *i is already 'a' */ + return; + } + + /* Here, 'a' or 'b' is empty and not using the complement of 'b'. 
The + * intersection must be empty */ + if (*i == a) { + SvREFCNT_dec(a); + } + else if (*i == b) { + SvREFCNT_dec(b); + } + *i = _new_invlist(0); + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the intersection of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later, and clear the + * flag as we don't have to do anything else later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } + + /* Size the intersection for the worst case: that the intersection ends up + * fragmenting everything to be completely disjoint */ + r= _new_invlist(len_a + len_b); + + /* Will contain U+0000 iff both components do */ + array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 + && len_b > 0 && array_b[0] == 0); + + /* Go through each list item by item, stopping when exhausted one of + * them */ + while (i_a < len_a && i_b < len_b) { + UV cp; /* The element to potentially add to the intersection's + array */ + bool cp_in_set; /* Is it in the input list's set or not */ + + /* We need to take one or the other of the two inputs for the + * intersection. Since we are merging two sorted lists, we take the + * smaller of the next items. In case of a tie, we take the one that + * is not in its set first (a difference from the union algorithm). 
If + * we took one in the set first, it would increment the count, possibly + * to 2 which would cause it to be output as starting a range in the + * intersection, and the next time through we would take that same + * number, and output it again as ending the set. By doing it the + * opposite of this, there is no possibility that the count will be + * momentarily incremented to 2. (In a tie and both are in the set or + * both not in the set, it doesn't matter which we take first.) */ + if (array_a[i_a] < array_b[i_b] + || (array_a[i_a] == array_b[i_b] + && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) + { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); + cp= array_a[i_a++]; + } + else { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); + cp= array_b[i_b++]; + } + + /* Here, have chosen which of the two inputs to look at. Only output + * if the running count changes to/from 2, which marks the + * beginning/end of a range that's in the intersection */ + if (cp_in_set) { + count++; + if (count == 2) { + array_r[i_r++] = cp; + } + } + else { + if (count == 2) { + array_r[i_r++] = cp; + } + count--; + } + } + + /* Here, we are finished going through at least one of the lists, which + * means there is something remaining in at most one. We check if the list + * that has been exhausted is positioned such that we are in the middle + * of a range in its set or not. (i_a and i_b point to elements 1 beyond + * the ones we care about.) There are four cases: + * 1) Both weren't in their sets, count is 0, and remains 0. There's + * nothing left in the intersection. + * 2) Both were in their sets, count is 2 and perhaps is incremented to + * above 2. What should be output is exactly that which is in the + * non-exhausted set, as everything it has is also in the intersection + * set, and everything it doesn't have can't be in the intersection + * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and + * gets incremented to 2. 
Like the previous case, the intersection is + * everything that remains in the non-exhausted set. + * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and + * remains 1. And the intersection has nothing more. */ + if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + { + count++; + } + + /* The final length is what we've output so far plus what else is in the + * intersection. At most one of the subexpressions below will be non-zero */ + len_r = i_r; + if (count >= 2) { + len_r += (len_a - i_a) + (len_b - i_b); + } + + /* Set result to final length, which can change the pointer to array_r, so + * re-find it */ + if (len_r != invlist_len(r)) { + invlist_set_len(r, len_r); + invlist_trim(r); + array_r = invlist_array(r); + } + + /* Finish outputting any remaining */ + if (count >= 2) { /* At most one will have a non-zero copy count */ + IV copy_count; + if ((copy_count = len_a - i_a) > 0) { + Copy(array_a + i_a, array_r + i_r, copy_count, UV); + } + else if ((copy_count = len_b - i_b) > 0) { + Copy(array_b + i_b, array_r + i_r, copy_count, UV); + } + } + + /* If we've changed b, restore it. This must be done before the reference + * count is decremented below, as that could free b (when *i is b), and + * array_b points into b's storage */ + if (complement_b) { + array_b[0] = 1; + } + + /* We may be removing a reference to one of the inputs */ + if (a == *i || b == *i) { + SvREFCNT_dec(*i); + } + + *i = r; + return; +} + +SV* +Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) +{ + /* Add the range from 'start' to 'end' inclusive to the inversion list's + * set. A pointer to the inversion list is returned. This may actually be + * a new list, in which case the passed in one has been destroyed. 
The + * passed in inversion list can be NULL, in which case a new one is created + * with just the one range in it */ + + SV* range_invlist; + UV len; + + if (invlist == NULL) { + invlist = _new_invlist(2); + len = 0; + } + else { + len = invlist_len(invlist); + } + + /* If the new range comes after the final entry, can just append it to the + * end. That is only legal when the list's final range is not in the set + * (i.e., the set doesn't already extend to the end of the permissible + * values); _append_range_to_invlist() croaks otherwise, so such lists are + * handled by the general union below */ + if (len == 0 + || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1) + && start >= invlist_array(invlist)[len - 1])) + { + _append_range_to_invlist(invlist, start, end); + return invlist; + } + + /* Here, can't just append things, create and return a new inversion list + * which is the union of this range and the existing inversion list */ + range_invlist = _new_invlist(2); + _append_range_to_invlist(range_invlist, start, end); + + _invlist_union(invlist, range_invlist, &invlist); + + /* The temporary can be freed */ + SvREFCNT_dec(range_invlist); + + return invlist; +} + +#endif + +PERL_STATIC_INLINE SV* +S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { + return _add_range_to_invlist(invlist, cp, cp); +} + +#ifndef PERL_IN_XSUB_RE +void +Perl__invlist_invert(pTHX_ SV* const invlist) +{ + /* Complement the input inversion list. This adds a 0 if the list didn't + * have a zero; removes it otherwise. As described above, the data + * structure is set up so that this is very efficient */ + + UV* len_pos = get_invlist_len_addr(invlist); + + PERL_ARGS_ASSERT__INVLIST_INVERT; + + /* The inverse of matching nothing is matching everything */ + if (*len_pos == 0) { + _append_range_to_invlist(invlist, 0, UV_MAX); + return; + } + + /* The exclusive or complements 0 to 1; and 1 to 0. If the result is 1, the + * zero element was a 0, so it is being removed, so the length decrements + * by 1; and vice-versa. 
SvCUR is unaffected */ + if (*get_invlist_zero_addr(invlist) ^= 1) { + (*len_pos)--; + } + else { + (*len_pos)++; + } +} + +void +Perl__invlist_invert_prop(pTHX_ SV* const invlist) +{ + /* Complement the input inversion list (which must be a Unicode property, + * all of which don't match above the Unicode maximum code point.) And + * Perl has chosen to not have the inversion match above that either. This + * adds a 0x110000 if the list didn't end with it, and removes it if it did + */ + + UV len; + UV* array; + + PERL_ARGS_ASSERT__INVLIST_INVERT_PROP; + + _invlist_invert(invlist); + + len = invlist_len(invlist); + + if (len != 0) { /* If empty do nothing */ + array = invlist_array(invlist); + if (array[len - 1] != PERL_UNICODE_MAX + 1) { + /* Add 0x110000. First, grow if necessary */ + len++; + if (invlist_max(invlist) < len) { + invlist_extend(invlist, len); + array = invlist_array(invlist); + } + invlist_set_len(invlist, len); + array[len - 1] = PERL_UNICODE_MAX + 1; + } + else { /* Remove the 0x110000 */ + invlist_set_len(invlist, len - 1); + } + } + + return; +} +#endif + +PERL_STATIC_INLINE SV* +S_invlist_clone(pTHX_ SV* const invlist) +{ + + /* Return a new inversion list that is a copy of the input one, which is + * unchanged */ + + /* Need to allocate extra space to accommodate Perl's addition of a + * trailing NUL to SvPV's, since it thinks they are always strings */ + SV* new_invlist = _new_invlist(invlist_len(invlist) + 1); + STRLEN length = SvCUR(invlist); + + PERL_ARGS_ASSERT_INVLIST_CLONE; + + SvCUR_set(new_invlist, length); /* This isn't done automatically */ + Copy(SvPVX(invlist), SvPVX(new_invlist), length, char); + + return new_invlist; +} + +PERL_STATIC_INLINE UV* +S_get_invlist_iter_addr(pTHX_ SV* invlist) +{ + /* Return the address of the UV that contains the current iteration + * position */ + + PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; + + return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV))); +} + +PERL_STATIC_INLINE UV* 
+S_get_invlist_version_id_addr(pTHX_ SV* invlist) +{ + /* Return the address of the UV that contains the version id. */ + + PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR; + + return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV))); +} + +PERL_STATIC_INLINE void +S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ +{ + PERL_ARGS_ASSERT_INVLIST_ITERINIT; + + *get_invlist_iter_addr(invlist) = 0; +} + +STATIC bool +S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) +{ + /* An invlist_iterinit() call on <invlist> must be used to set this up. + * This call sets in <*start> and <*end>, the next range in <invlist>. + * Returns TRUE if successful and the next call will return the next + * range; FALSE if was already at the end of the list. If the latter, + * <*start> and <*end> are unchanged, and the iteration position is set so + * that further calls keep returning FALSE until invlist_iterinit() is + * called again. A final range that has no upper bound is returned with + * <*end> set to UV_MAX */ + + UV* pos = get_invlist_iter_addr(invlist); + UV len = invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_ITERNEXT; + + if (*pos >= len) { + *pos = UV_MAX; /* Force iterinit() to be required next time */ + return FALSE; + } + + array = invlist_array(invlist); + + *start = array[(*pos)++]; + + if (*pos >= len) { + *end = UV_MAX; + } + else { + *end = array[(*pos)++] - 1; + } + + return TRUE; +} + +#ifndef PERL_IN_XSUB_RE +SV * +Perl__invlist_contents(pTHX_ SV* const invlist) +{ + /* Get the contents of an inversion list into a string SV so that they can + * be printed out. 
It uses the format traditionally done for debug tracing + */ + + UV start, end; + SV* output = newSVpvs("\n"); + + PERL_ARGS_ASSERT__INVLIST_CONTENTS; + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start); + } + else if (end != start) { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n", + start, end); + } + else { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start); + } + } + + return output; +} +#endif + +#if 0 +void +S_invlist_dump(pTHX_ SV* const invlist, const char * const header) +{ + /* Dumps out the ranges in an inversion list. The string 'header' + * if present is output on a line before the first range */ + + UV start, end; + + if (header && strlen(header)) { + PerlIO_printf(Perl_debug_log, "%s\n", header); + } + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start); + } + else { + PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end); + } + } +} +#endif + +#undef HEADER_LENGTH +#undef INVLIST_INITIAL_LENGTH +#undef TO_INTERNAL_SIZE +#undef FROM_INTERNAL_SIZE +#undef INVLIST_LEN_OFFSET +#undef INVLIST_ZERO_OFFSET +#undef INVLIST_ITER_OFFSET +#undef INVLIST_VERSION_ID + +/* End of inversion list object */ + +/* + - reg - regular expression, i.e. main body or parenthesized thing + * + * Caller must absorb opening parenthesis. + * + * Combining parenthesis handling with the base level of regular expression + * is a trifle forced, but the need to tie the tails of the branches to what + * follows makes it hard to avoid. 
+ */ +#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) +#ifdef DEBUGGING +#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) +#else +#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) +#endif + +STATIC regnode * +S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) + /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ +{ + dVAR; + register regnode *ret; /* Will be the head of the group. */ + register regnode *br; + register regnode *lastbr; + register regnode *ender = NULL; + register I32 parno = 0; + I32 flags; + U32 oregflags = RExC_flags; + bool have_branch = 0; + bool is_open = 0; + I32 freeze_paren = 0; + I32 after_freeze = 0; + + /* for (?g), (?gc), and (?o) warnings; warning + about (?c) will warn about (?g) -- japhy */ + +#define WASTED_O 0x01 +#define WASTED_G 0x02 +#define WASTED_C 0x04 +#define WASTED_GC (0x02|0x04) + I32 wastedflags = 0x00; + + char * parse_start = RExC_parse; /* MJD */ + char * const oregcomp_parse = RExC_parse; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG; + DEBUG_PARSE("reg "); + + *flagp = 0; /* Tentatively. */ + + + /* Make an OPEN node, if parenthesized. 
*/ + if (paren) { + if ( *RExC_parse == '*') { /* (*VERB:ARG) */ + char *start_verb = RExC_parse; + STRLEN verb_len = 0; + char *start_arg = NULL; + unsigned char op = 0; + int argok = 1; + int internal_argval = 0; /* internal_argval is only useful if !argok */ + while ( *RExC_parse && *RExC_parse != ')' ) { + if ( *RExC_parse == ':' ) { + start_arg = RExC_parse + 1; + break; + } + RExC_parse++; + } + ++start_verb; + verb_len = RExC_parse - start_verb; + if ( start_arg ) { + RExC_parse++; + while ( *RExC_parse && *RExC_parse != ')' ) + RExC_parse++; + if ( *RExC_parse != ')' ) + vFAIL("Unterminated verb pattern argument"); + if ( RExC_parse == start_arg ) + start_arg = NULL; + } else { + if ( *RExC_parse != ')' ) + vFAIL("Unterminated verb pattern"); + } + + switch ( *start_verb ) { + case 'A': /* (*ACCEPT) */ + if ( memEQs(start_verb,verb_len,"ACCEPT") ) { + op = ACCEPT; + internal_argval = RExC_nestroot; + } + break; + case 'C': /* (*COMMIT) */ + if ( memEQs(start_verb,verb_len,"COMMIT") ) + op = COMMIT; + break; + case 'F': /* (*FAIL) */ + if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { + op = OPFAIL; + argok = 0; + } + break; + case ':': /* (*:NAME) */ + case 'M': /* (*MARK:NAME) */ + if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { + op = MARKPOINT; + argok = -1; + } + break; + case 'P': /* (*PRUNE) */ + if ( memEQs(start_verb,verb_len,"PRUNE") ) + op = PRUNE; + break; + case 'S': /* (*SKIP) */ + if ( memEQs(start_verb,verb_len,"SKIP") ) + op = SKIP; + break; + case 'T': /* (*THEN) */ + /* [19:06] :: is then */ + if ( memEQs(start_verb,verb_len,"THEN") ) { + op = CUTGROUP; + RExC_seen |= REG_SEEN_CUTGROUP; + } + break; + } + if ( ! 
op ) { + RExC_parse++; + vFAIL3("Unknown verb pattern '%.*s'", + verb_len, start_verb); + } + if ( argok ) { + if ( start_arg && internal_argval ) { + vFAIL3("Verb pattern '%.*s' may not have an argument", + verb_len, start_verb); + } else if ( argok < 0 && !start_arg ) { + vFAIL3("Verb pattern '%.*s' has a mandatory argument", + verb_len, start_verb); + } else { + ret = reganode(pRExC_state, op, internal_argval); + if ( ! internal_argval && ! SIZE_ONLY ) { + if (start_arg) { + SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, 1, "S" ); + RExC_rxi->data->data[ARG(ret)]=(void*)sv; + ret->flags = 0; + } else { + ret->flags = 1; + } + } + } + if (!internal_argval) + RExC_seen |= REG_SEEN_VERBARG; + } else if ( start_arg ) { + vFAIL3("Verb pattern '%.*s' may not have an argument", + verb_len, start_verb); + } else { + ret = reg_node(pRExC_state, op); + } + nextchar(pRExC_state); + return ret; + } else + if (*RExC_parse == '?') { /* (?...) */ + bool is_logical = 0; + const char * const seqstart = RExC_parse; + bool has_use_defaults = FALSE; + + RExC_parse++; + paren = *RExC_parse++; + ret = NULL; /* For look-ahead/behind. */ + switch (paren) { + + case 'P': /* (?P...) variants for those used to PCRE/Python */ + paren = *RExC_parse++; + if ( paren == '<') /* (?P<...>) named capture */ + goto named_capture; + else if (paren == '>') { /* (?P>name) named recursion */ + goto named_recursion; + } + else if (paren == '=') { /* (?P=...) named backref */ + /* this pretty much dupes the code for \k in regatom(), if + you change this make sure you change that */ + char* name_start = RExC_parse; + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + if (RExC_parse == name_start || *RExC_parse != ')') + vFAIL2("Sequence %.3s... 
not terminated",parse_start); + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, 1, "S" ); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (MORE_ASCII_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); + *flagp |= HASWIDTH; + + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret); /* MJD */ + + nextchar(pRExC_state); + return ret; + } + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + case '<': /* (?<...) */ + if (*RExC_parse == '!') + paren = ','; + else if (*RExC_parse != '=') + named_capture: + { /* (?<...>) */ + char *name_start; + SV *svname; + paren= '>'; + case '\'': /* (?'...') */ + name_start= RExC_parse; + svname = reg_scan_name(pRExC_state, + SIZE_ONLY ? /* reverse test from the others */ + REG_RSN_RETURN_NAME : + REG_RSN_RETURN_NULL); + if (RExC_parse == name_start) { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } + if (*RExC_parse != paren) + vFAIL2("Sequence (?%c... not terminated", + paren=='>' ? '<' : paren); + if (SIZE_ONLY) { + HE *he_str; + SV *sv_dat = NULL; + if (!svname) /* shouldn't happen */ + Perl_croak(aTHX_ + "panic: reg_scan_name returned NULL"); + if (!RExC_paren_names) { + RExC_paren_names= newHV(); + sv_2mortal(MUTABLE_SV(RExC_paren_names)); +#ifdef DEBUGGING + RExC_paren_name_list= newAV(); + sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); +#endif + } + he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! sv_dat ) { + /* croak baby croak */ + Perl_croak(aTHX_ + "panic: paren_name hash element allocation failed"); + } else if ( SvPOK(sv_dat) ) { + /* (?|...) can mean we have dupes so scan to check + its already been stored. 
Maybe a flag indicating + we are inside such a construct would be useful, + but the arrays are likely to be quite small, so + for now we punt -- dmq */ + IV count = SvIV(sv_dat); + I32 *pv = (I32*)SvPVX(sv_dat); + IV i; + for ( i = 0 ; i < count ; i++ ) { + if ( pv[i] == RExC_npar ) { + count = 0; + break; + } + } + if ( count ) { + pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); + SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); + pv[count] = RExC_npar; + SvIV_set(sv_dat, SvIVX(sv_dat) + 1); + } + } else { + (void)SvUPGRADE(sv_dat,SVt_PVNV); + sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); + SvIOK_on(sv_dat); + SvIV_set(sv_dat, 1); + } +#ifdef DEBUGGING + /* Yes this does cause a memory leak in debugging Perls */ + if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) + SvREFCNT_dec(svname); +#endif + + /*sv_dump(sv_dat);*/ + } + nextchar(pRExC_state); + paren = 1; + goto capturing_parens; + } + RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_in_lookbehind++; + RExC_parse++; + case '=': /* (?=...) */ + RExC_seen_zerolen++; + break; + case '!': /* (?!...) */ + RExC_seen_zerolen++; + if (*RExC_parse == ')') { + ret=reg_node(pRExC_state, OPFAIL); + nextchar(pRExC_state); + return ret; + } + break; + case '|': /* (?|...) */ + /* branch reset, behave like a (?:...) except that + buffers in alternations share the same numbers */ + paren = ':'; + after_freeze = freeze_paren = RExC_npar; + break; + case ':': /* (?:...) */ + case '>': /* (?>...) */ + break; + case '$': /* (?$...) */ + case '@': /* (?@...) */ + vFAIL2("Sequence (?%c...) not implemented", (int)paren); + break; + case '#': /* (?#...) */ + while (*RExC_parse && *RExC_parse != ')') + RExC_parse++; + if (*RExC_parse != ')') + FAIL("Sequence (?#... 
not terminated"); + nextchar(pRExC_state); + *flagp = TRYAGAIN; + return NULL; + case '0' : /* (?0) */ + case 'R' : /* (?R) */ + if (*RExC_parse != ')') + FAIL("Sequence (?R) not terminated"); + ret = reg_node(pRExC_state, GOSTART); + *flagp |= POSTPONED; + nextchar(pRExC_state); + return ret; + /*notreached*/ + { /* named and numeric backreferences */ + I32 num; + case '&': /* (?&NAME) */ + parse_start = RExC_parse - 1; + named_recursion: + { + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + } + goto gen_recurse_regop; + /* NOT REACHED */ + case '+': + if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { + RExC_parse++; + vFAIL("Illegal pattern"); + } + goto parse_recursion; + /* NOT REACHED*/ + case '-': /* (?-1) */ + if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { + RExC_parse--; /* rewind to let it be handled later */ + goto parse_flags; + } + /*FALLTHROUGH */ + case '1': case '2': case '3': case '4': /* (?1) */ + case '5': case '6': case '7': case '8': case '9': + RExC_parse--; + parse_recursion: + num = atoi(RExC_parse); + parse_start = RExC_parse - 1; /* MJD */ + if (*RExC_parse == '-') + RExC_parse++; + while (isDIGIT(*RExC_parse)) + RExC_parse++; + if (*RExC_parse!=')') + vFAIL("Expecting close bracket"); + + gen_recurse_regop: + if ( paren == '-' ) { + /* + Diagram of capture buffer numbering. 
+ Top line is the normal capture buffer numbers + Bottom line is the negative indexing as from + the X (the (?-2)) + + + 1 2 3 4 5 X 6 7 + /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ + - 5 4 3 2 1 X x x + + */ + num = RExC_npar + num; + if (num < 1) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + } else if ( paren == '+' ) { + num = RExC_npar + num - 1; + } + + ret = reganode(pRExC_state, GOSUB, num); + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + ARG2L_SET( ret, RExC_recurse_count++); + RExC_emit++; + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); + } else { + RExC_size++; + } + RExC_seen |= REG_SEEN_RECURSE; + Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ + Set_Node_Offset(ret, parse_start); /* MJD */ + + *flagp |= POSTPONED; + nextchar(pRExC_state); + return ret; + } /* named and numeric backreferences */ + /* NOT REACHED */ + + case '?': /* (??...) */ + is_logical = 1; + if (*RExC_parse != '{') { + RExC_parse++; + vFAIL3("Sequence (%.*s...) 
not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } + *flagp |= POSTPONED; + paren = *RExC_parse++; + /* FALL THROUGH */ + case '{': /* (?{...}) */ + { + I32 count = 1; + U32 n = 0; + char c; + char *s = RExC_parse; + + RExC_seen_zerolen++; + RExC_seen |= REG_SEEN_EVAL; + while (count && (c = *RExC_parse)) { + if (c == '\\') { + if (RExC_parse[1]) + RExC_parse++; + } + else if (c == '{') + count++; + else if (c == '}') + count--; + RExC_parse++; + } + if (*RExC_parse != ')') { + RExC_parse = s; + vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); + } + if (!SIZE_ONLY) { + PAD *pad; + OP_4tree *sop, *rop; + SV * const sv = newSVpvn(s, RExC_parse - 1 - s); + + ENTER; + Perl_save_re_context(aTHX); + rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad); + sop->op_private |= OPpREFCOUNTED; + /* re_dup will OpREFCNT_inc */ + OpREFCNT_set(sop, 1); + LEAVE; + + n = add_data(pRExC_state, 3, "nop"); + RExC_rxi->data->data[n] = (void*)rop; + RExC_rxi->data->data[n+1] = (void*)sop; + RExC_rxi->data->data[n+2] = (void*)pad; + SvREFCNT_dec(sv); + } + else { /* First pass */ + if (PL_reginterp_cnt < ++RExC_seen_evals + && IN_PERL_RUNTIME) + /* No compiled RE interpolated, has runtime + components ===> unsafe. */ + FAIL("Eval-group not allowed at runtime, use re 'eval'"); + if (PL_tainting && PL_tainted) + FAIL("Eval-group in insecure regular expression"); +#if PERL_VERSION > 8 + if (IN_PERL_COMPILETIME) + PL_cv_has_eval = 1; +#endif + } + + nextchar(pRExC_state); + if (is_logical) { + ret = reg_node(pRExC_state, LOGICAL); + if (!SIZE_ONLY) + ret->flags = 2; + REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); + /* deal with the length of this later - MJD */ + return ret; + } + ret = reganode(pRExC_state, EVAL, n); + Set_Node_Length(ret, RExC_parse - parse_start + 1); + Set_Node_Offset(ret, parse_start); + return ret; + } + case '(': /* (?(?{...})...) and (?(?=...)...) 
*/ + { + int is_define= 0; + if (RExC_parse[0] == '?') { /* (?(?...)) */ + if (RExC_parse[1] == '=' || RExC_parse[1] == '!' + || RExC_parse[1] == '<' + || RExC_parse[1] == '{') { /* Lookahead or eval. */ + I32 flag; + + ret = reg_node(pRExC_state, LOGICAL); + if (!SIZE_ONLY) + ret->flags = 1; + REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1)); + goto insert_if; + } + } + else if ( RExC_parse[0] == '<' /* (?()...) */ + || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ + { + char ch = RExC_parse[0] == '<' ? '>' : '\''; + char *name_start= RExC_parse++; + U32 num = 0; + SV *sv_dat=reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + if (RExC_parse == name_start || *RExC_parse != ch) + vFAIL2("Sequence (?(%c... not terminated", + (ch == '>' ? '<' : ch)); + RExC_parse++; + if (!SIZE_ONLY) { + num = add_data( pRExC_state, 1, "S" ); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + ret = reganode(pRExC_state,NGROUPP,num); + goto insert_if_check_paren; + } + else if (RExC_parse[0] == 'D' && + RExC_parse[1] == 'E' && + RExC_parse[2] == 'F' && + RExC_parse[3] == 'I' && + RExC_parse[4] == 'N' && + RExC_parse[5] == 'E') + { + ret = reganode(pRExC_state,DEFINEP,0); + RExC_parse +=6 ; + is_define = 1; + goto insert_if_check_paren; + } + else if (RExC_parse[0] == 'R') { + RExC_parse++; + parno = 0; + if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + parno = atoi(RExC_parse++); + while (isDIGIT(*RExC_parse)) + RExC_parse++; + } else if (RExC_parse[0] == '&') { + SV *sv_dat; + RExC_parse++; + sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + } + ret = reganode(pRExC_state,INSUBP,parno); + goto insert_if_check_paren; + } + else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + /* (?(1)...) 
*/ + char c; + parno = atoi(RExC_parse++); + + while (isDIGIT(*RExC_parse)) + RExC_parse++; + ret = reganode(pRExC_state, GROUPP, parno); + + insert_if_check_paren: + if ((c = *nextchar(pRExC_state)) != ')') + vFAIL("Switch condition not recognized"); + insert_if: + REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); + br = regbranch(pRExC_state, &flags, 1,depth+1); + if (br == NULL) + br = reganode(pRExC_state, LONGJMP, 0); + else + REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); + c = *nextchar(pRExC_state); + if (flags&HASWIDTH) + *flagp |= HASWIDTH; + if (c == '|') { + if (is_define) + vFAIL("(?(DEFINE)....) does not allow branches"); + lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ + regbranch(pRExC_state, &flags, 1,depth+1); + REGTAIL(pRExC_state, ret, lastbr); + if (flags&HASWIDTH) + *flagp |= HASWIDTH; + c = *nextchar(pRExC_state); + } + else + lastbr = NULL; + if (c != ')') + vFAIL("Switch (?(condition)... contains too many branches"); + ender = reg_node(pRExC_state, TAIL); + REGTAIL(pRExC_state, br, ender); + if (lastbr) { + REGTAIL(pRExC_state, lastbr, ender); + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); + } + else + REGTAIL(pRExC_state, ret, ender); + RExC_size++; /* XXX WHY do we need this?!! + For large programs it seems to be required + but I can't figure out why. -- dmq*/ + return ret; + } + else { + vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); + } + } + case 0: + RExC_parse--; /* for vFAIL to print correctly */ + vFAIL("Sequence (? incomplete"); + break; + case DEFAULT_PAT_MOD: /* Use default flags with the exceptions + that follow */ + has_use_defaults = TRUE; + STD_PMMOD_FLAGS_CLEAR(&RExC_flags); + set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics) + ? 
REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET); + goto parse_flags; + default: + --RExC_parse; + parse_flags: /* (?i) */ + { + U32 posflags = 0, negflags = 0; + U32 *flagsp = &posflags; + char has_charset_modifier = '\0'; + regex_charset cs = get_regex_charset(RExC_flags); + if (cs == REGEX_DEPENDS_CHARSET + && (RExC_utf8 || RExC_uni_semantics)) + { + cs = REGEX_UNICODE_CHARSET; + } + + while (*RExC_parse) { + /* && strchr("iogcmsx", *RExC_parse) */ + /* (?g), (?gc) and (?o) are useless here + and must be globally applied -- japhy */ + switch (*RExC_parse) { + CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + case LOCALE_PAT_MOD: + if (has_charset_modifier) { + goto excess_modifier; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + cs = REGEX_LOCALE_CHARSET; + has_charset_modifier = LOCALE_PAT_MOD; + RExC_contains_locale = 1; + break; + case UNICODE_PAT_MOD: + if (has_charset_modifier) { + goto excess_modifier; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + cs = REGEX_UNICODE_CHARSET; + has_charset_modifier = UNICODE_PAT_MOD; + break; + case ASCII_RESTRICT_PAT_MOD: + if (flagsp == &negflags) { + goto neg_modifier; + } + if (has_charset_modifier) { + if (cs != REGEX_ASCII_RESTRICTED_CHARSET) { + goto excess_modifier; + } + /* Doubled modifier implies more restricted */ + cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; + } + else { + cs = REGEX_ASCII_RESTRICTED_CHARSET; + } + has_charset_modifier = ASCII_RESTRICT_PAT_MOD; + break; + case DEPENDS_PAT_MOD: + if (has_use_defaults) { + goto fail_modifiers; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + else if (has_charset_modifier) { + goto excess_modifier; + } + + /* The dual charset means unicode semantics if the + * pattern (or target, not known until runtime) are + * utf8, or something in the pattern indicates unicode + * semantics */ + cs = (RExC_utf8 || RExC_uni_semantics) + ? 
REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET; + has_charset_modifier = DEPENDS_PAT_MOD; + break; + excess_modifier: + RExC_parse++; + if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { + vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); + } + else if (has_charset_modifier == *(RExC_parse - 1)) { + vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1)); + } + else { + vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); + } + /*NOTREACHED*/ + neg_modifier: + RExC_parse++; + vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1)); + /*NOTREACHED*/ + case ONCE_PAT_MOD: /* 'o' */ + case GLOBAL_PAT_MOD: /* 'g' */ + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; + if (! (wastedflags & wflagbit) ) { + wastedflags |= wflagbit; + vWARN5( + RExC_parse + 1, + "Useless (%s%c) - %suse /%c modifier", + flagsp == &negflags ? "?-" : "?", + *RExC_parse, + flagsp == &negflags ? "don't " : "", + *RExC_parse + ); + } + } + break; + + case CONTINUE_PAT_MOD: /* 'c' */ + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + if (! (wastedflags & WASTED_C) ) { + wastedflags |= WASTED_GC; + vWARN3( + RExC_parse + 1, + "Useless (%sc) - %suse /gc modifier", + flagsp == &negflags ? "?-" : "?", + flagsp == &negflags ? "don't " : "" + ); + } + } + break; + case KEEPCOPY_PAT_MOD: /* 'p' */ + if (flagsp == &negflags) { + if (SIZE_ONLY) + ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); + } else { + *flagsp |= RXf_PMf_KEEPCOPY; + } + break; + case '-': + /* A flag is a default iff it is following a minus, so + * if there is a minus, it means will be trying to + * re-specify a default which is an error */ + if (has_use_defaults || flagsp == &negflags) { + fail_modifiers: + RExC_parse++; + vFAIL3("Sequence (%.*s...) 
not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } + flagsp = &negflags; + wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case ':': + paren = ':'; + /*FALLTHROUGH*/ + case ')': + RExC_flags |= posflags; + RExC_flags &= ~negflags; + set_regex_charset(&RExC_flags, cs); + if (paren != ':') { + oregflags |= posflags; + oregflags &= ~negflags; + set_regex_charset(&oregflags, cs); + } + nextchar(pRExC_state); + if (paren != ':') { + *flagp = TRYAGAIN; + return NULL; + } else { + ret = NULL; + goto parse_rest; + } + /*NOTREACHED*/ + default: + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } + ++RExC_parse; + } + }} /* one for the default block, one for the switch */ + } + else { /* (...) */ + capturing_parens: + parno = RExC_npar; + RExC_npar++; + + ret = reganode(pRExC_state, OPEN, parno); + if (!SIZE_ONLY ){ + if (!RExC_nestroot) + RExC_nestroot = parno; + if (RExC_seen & REG_SEEN_RECURSE + && !RExC_open_parens[parno-1]) + { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Setting open paren #%"IVdf" to %d\n", + (IV)parno, REG_NODE_NUM(ret))); + RExC_open_parens[parno-1]= ret; + } + } + Set_Node_Length(ret, 1); /* MJD */ + Set_Node_Offset(ret, RExC_parse); /* MJD */ + is_open = 1; + } + } + else /* ! paren */ + ret = NULL; + + parse_rest: + /* Pick up the branches, linking them together. */ + parse_start = RExC_parse; /* MJD */ + br = regbranch(pRExC_state, &flags, 1,depth+1); + + /* branch_len = (paren != 0); */ + + if (br == NULL) + return(NULL); + if (*RExC_parse == '|') { + if (!SIZE_ONLY && RExC_extralen) { + reginsert(pRExC_state, BRANCHJ, br, depth+1); + } + else { /* MJD */ + reginsert(pRExC_state, BRANCH, br, depth+1); + Set_Node_Length(br, paren != 0); + Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); + } + have_branch = 1; + if (SIZE_ONLY) + RExC_extralen += 1; /* For BRANCHJ-BRANCH. 
*/ + } + else if (paren == ':') { + *flagp |= flags&SIMPLE; + } + if (is_open) { /* Starts with OPEN. */ + REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */ + } + else if (paren != '?') /* Not Conditional */ + ret = br; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); + lastbr = br; + while (*RExC_parse == '|') { + if (!SIZE_ONLY && RExC_extralen) { + ender = reganode(pRExC_state, LONGJMP,0); + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ + } + if (SIZE_ONLY) + RExC_extralen += 2; /* Account for LONGJMP. */ + nextchar(pRExC_state); + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } + br = regbranch(pRExC_state, &flags, 0, depth+1); + + if (br == NULL) + return(NULL); + REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ + lastbr = br; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); + } + + if (have_branch || paren != ':') { + /* Make a closing node, and hook it on the end. */ + switch (paren) { + case ':': + ender = reg_node(pRExC_state, TAIL); + break; + case 1: + ender = reganode(pRExC_state, CLOSE, parno); + if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Setting close paren #%"IVdf" to %d\n", + (IV)parno, REG_NODE_NUM(ender))); + RExC_close_parens[parno-1]= ender; + if (RExC_nestroot == parno) + RExC_nestroot = 0; + } + Set_Node_Offset(ender,RExC_parse+1); /* MJD */ + Set_Node_Length(ender,1); /* MJD */ + break; + case '<': + case ',': + case '=': + case '!': + *flagp &= ~HASWIDTH; + /* FALL THROUGH */ + case '>': + ender = reg_node(pRExC_state, SUCCEED); + break; + case 0: + ender = reg_node(pRExC_state, END); + if (!SIZE_ONLY) { + assert(!RExC_opend); /* there can only be one! 
*/ + RExC_opend = ender; + } + break; + } + REGTAIL(pRExC_state, lastbr, ender); + + if (have_branch && !SIZE_ONLY) { + if (depth==1) + RExC_seen |= REG_TOP_LEVEL_BRANCHES; + + /* Hook the tails of the branches to the closing node. */ + for (br = ret; br; br = regnext(br)) { + const U8 op = PL_regkind[OP(br)]; + if (op == BRANCH) { + REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); + } + else if (op == BRANCHJ) { + REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); + } + } + } + } + + { + const char *p; + static const char parens[] = "=!<,>"; + + if (paren && (p = strchr(parens, paren))) { + U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; + int flag = (p - parens) > 1; + + if (paren == '>') + node = SUSPEND, flag = 0; + reginsert(pRExC_state, node,ret, depth+1); + Set_Node_Cur_Length(ret); + Set_Node_Offset(ret, parse_start + 1); + ret->flags = flag; + REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); + } + } + + /* Check for proper termination. */ + if (paren) { + RExC_flags = oregflags; + if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ("); + } + } + else if (!paren && RExC_parse < RExC_end) { + if (*RExC_parse == ')') { + RExC_parse++; + vFAIL("Unmatched )"); + } + else + FAIL("Junk on end of regexp"); /* "Can't happen". */ + /* NOTREACHED */ + } + + if (RExC_in_lookbehind) { + RExC_in_lookbehind--; + } + if (after_freeze > RExC_npar) + RExC_npar = after_freeze; + return(ret); +} + +/* + - regbranch - one alternative of an | operator + * + * Implements the concatenation operator. 
+ */ +STATIC regnode * +S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) +{ + dVAR; + register regnode *ret; + register regnode *chain = NULL; + register regnode *latest; + I32 flags = 0, c = 0; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGBRANCH; + + DEBUG_PARSE("brnc"); + + if (first) + ret = NULL; + else { + if (!SIZE_ONLY && RExC_extralen) + ret = reganode(pRExC_state, BRANCHJ,0); + else { + ret = reg_node(pRExC_state, BRANCH); + Set_Node_Length(ret, 1); + } + } + + if (!first && SIZE_ONLY) + RExC_extralen += 1; /* BRANCHJ */ + + *flagp = WORST; /* Tentatively. */ + + RExC_parse--; + nextchar(pRExC_state); + while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { + flags &= ~TRYAGAIN; + latest = regpiece(pRExC_state, &flags,depth+1); + if (latest == NULL) { + if (flags & TRYAGAIN) + continue; + return(NULL); + } + else if (ret == NULL) + ret = latest; + *flagp |= flags&(HASWIDTH|POSTPONED); + if (chain == NULL) /* First piece. */ + *flagp |= flags&SPSTART; + else { + RExC_naughty++; + REGTAIL(pRExC_state, chain, latest); + } + chain = latest; + c++; + } + if (chain == NULL) { /* Loop ran zero times. */ + chain = reg_node(pRExC_state, NOTHING); + if (ret == NULL) + ret = chain; + } + if (c == 1) { + *flagp |= flags&SIMPLE; + } + + return ret; +} + +/* + - regpiece - something followed by possible [*+?] + * + * Note that the branching code sequences used for ? and the general cases + * of * and + are somewhat optimized: they use the same NOTHING node as + * both the endmarker for their branch list and the body of the last branch. + * It might seem that this node could be dispensed with entirely, but the + * endmarker role is not redundant. 
+ */ +STATIC regnode * +S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) +{ + dVAR; + register regnode *ret; + register char op; + register char *next; + I32 flags; + const char * const origparse = RExC_parse; /* where this piece starts; only used for the text of the "matches null string" warning */ + I32 min; + I32 max = REG_INFTY; /* stays REG_INFTY for '*' and '+' */ +#ifdef RE_TRACK_PATTERN_OFFSETS + char *parse_start; +#endif + const char *maxpos = NULL; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPIECE; + + DEBUG_PARSE("piec"); + /* Parse the operand first; any quantifier that follows it is handled below. */ + ret = regatom(pRExC_state, &flags,depth+1); + if (ret == NULL) { + if (flags & TRYAGAIN) /* hand regatom's TRYAGAIN request on to our caller */ + *flagp |= TRYAGAIN; + return(NULL); + } + + op = *RExC_parse; + /* Brace quantifier: 'next' walks over digits and at most one comma. */ + if (op == '{' && regcurly(RExC_parse)) { + maxpos = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + parse_start = RExC_parse; /* MJD */ +#endif + next = RExC_parse + 1; + while (isDIGIT(*next) || *next == ',') { + if (*next == ',') { + if (maxpos) + break; /* a second comma ends the scan */ + else + maxpos = next; /* position of the first comma */ + } + next++; + } + if (*next == '}') { /* got one */ + if (!maxpos) + maxpos = next; /* no comma seen: point at the '}' */ + RExC_parse++; + min = atoi(RExC_parse); + if (*maxpos == ',') + maxpos++; /* {n,m}: max starts after the comma */ + else + maxpos = RExC_parse; /* {n}: max is read from the same digits as min */ + max = atoi(maxpos); + if (!max && *maxpos != '0') + max = REG_INFTY; /* meaning "infinity" */ + else if (max >= REG_INFTY) + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + RExC_parse = next; + nextchar(pRExC_state); + + do_curly: /* also entered by goto from the '*', '+' and '?' handling below */ + if ((flags&SIMPLE)) { + RExC_naughty += 2 + RExC_naughty / 2; + reginsert(pRExC_state, CURLY, ret, depth+1); + Set_Node_Offset(ret, parse_start+1); /* MJD */ + Set_Node_Cur_Length(ret); + } + else { + regnode * const w = reg_node(pRExC_state, WHILEM); + + w->flags = 0; + REGTAIL(pRExC_state, ret, w); + if (!SIZE_ONLY && RExC_extralen) { + reginsert(pRExC_state, LONGJMP,ret, depth+1); + reginsert(pRExC_state, NOTHING,ret, depth+1); + NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ + } + reginsert(pRExC_state, CURLYX,ret, depth+1); + /* MJD hk */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Length(ret, + op == '{' ? 
(RExC_parse - parse_start) : 1); + + if (!SIZE_ONLY && RExC_extralen) + NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ + REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); + if (SIZE_ONLY) + RExC_whilem_seen++, RExC_extralen += 3; + RExC_naughty += 4 + RExC_naughty; /* compound interest */ + } + ret->flags = 0; + + if (min > 0) + *flagp = WORST; + if (max > 0) + *flagp |= HASWIDTH; + if (max < min) + vFAIL("Can't do {n,m} with n > m"); + if (!SIZE_ONLY) { /* the bounds are stored as 16-bit values in the node's two arguments */ + ARG1_SET(ret, (U16)min); + ARG2_SET(ret, (U16)max); + } + + goto nest_check; + } + } + + if (!ISMULT1(op)) { /* ISMULT1(op) is false (presumably op is none of '*', '+', '?'): return the atom with its own flags */ + *flagp = flags; + return(ret); + } + +#if 0 /* Now runtime fix should be reliable. */ + + /* if this is reinstated, don't forget to put this back into perldiag: + + =item Regexp *+ operand could be empty at {#} in regex m/%s/ + + (F) The part of the regexp subject to either the * or + quantifier + could match an empty string. The {#} shows in the regular + expression about where the problem was discovered. + + */ + + if (!(flags&HASWIDTH) && op != '?') + vFAIL("Regexp *+ operand could be empty"); +#endif + +#ifdef RE_TRACK_PATTERN_OFFSETS + parse_start = RExC_parse; +#endif + nextchar(pRExC_state); + + *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); + + if (op == '*' && (flags&SIMPLE)) { + reginsert(pRExC_state, STAR, ret, depth+1); + ret->flags = 0; + RExC_naughty += 4; + } + else if (op == '*') { + min = 0; /* non-SIMPLE operand: use the general form; max is still REG_INFTY */ + goto do_curly; + } + else if (op == '+' && (flags&SIMPLE)) { + reginsert(pRExC_state, PLUS, ret, depth+1); + ret->flags = 0; + RExC_naughty += 3; + } + else if (op == '+') { + min = 1; /* as for '*': max is still REG_INFTY */ + goto do_curly; + } + else if (op == '?') { + min = 0; max = 1; + goto do_curly; + } + nest_check: /* every quantifier form ends up here */ + if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { + ckWARN3reg(RExC_parse, + "%.*s matches null string many times", + (int)(RExC_parse >= origparse ? 
RExC_parse - origparse : 0), + origparse); + } + + if (RExC_parse < RExC_end && *RExC_parse == '?') { /* '?' after the quantifier: insert MINMOD in front of the quantified node */ + nextchar(pRExC_state); + reginsert(pRExC_state, MINMOD, ret, depth+1); + REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); + } +#ifndef REG_ALLOW_MINMOD_SUSPEND + else +#endif + if (RExC_parse < RExC_end && *RExC_parse == '+') { /* '+' after the quantifier: end with SUCCEED, wrap in SUSPEND, then append TAIL */ + regnode *ender; + nextchar(pRExC_state); + ender = reg_node(pRExC_state, SUCCEED); + REGTAIL(pRExC_state, ret, ender); + reginsert(pRExC_state, SUSPEND, ret, depth+1); + ret->flags = 0; + ender = reg_node(pRExC_state, TAIL); + REGTAIL(pRExC_state, ret, ender); + /*ret= ender;*/ + } + + if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { + RExC_parse++; + vFAIL("Nested quantifiers"); + } + + return(ret); +} + + +/* reg_namedseq(pRExC_state,UVp, UV depth) + + This is expected to be called by a parser routine that has + recognized '\N' and needs to handle the rest. RExC_parse is + expected to point at the first char following the N at the time + of the call. + + The \N may be inside (indicated by valuep not being NULL) or outside a + character class. + + \N may begin either a named sequence, or if outside a character class, mean + to match a non-newline. For non single-quoted regexes, the tokenizer has + attempted to decide which, and in the case of a named sequence converted it + into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, + where c1... are the characters in the sequence. For single-quoted regexes, + the tokenizer passes the \N sequence through unchanged; this code will not + attempt to determine this nor expand those. The net effect is that if the + beginning of the passed-in pattern isn't '{U+' or there is no '}', it + signals that this \N occurrence means to match a non-newline. + + Only the \N{U+...} form should occur in a character class, for the same + reason that '.' inside a character class means to just match a period: it + just doesn't make sense. 
+ + If valuep is non-null then it is assumed that we are parsing inside + of a charclass definition and the first codepoint in the resolved + string is returned via *valuep and the routine will return NULL. + In this mode if a multichar string is returned from the charnames + handler, a warning will be issued, and only the first char in the + sequence will be examined. If the string returned is zero length + then the value of *valuep is undefined and NON-NULL will + be returned to indicate failure. (This will NOT be a valid pointer + to a regnode.) + + If valuep is null then it is assumed that we are parsing normal text and a + new EXACT node is inserted into the program containing the resolved string, + and a pointer to the new node is returned. But if the string is zero length + a NOTHING node is emitted instead. + + On success RExC_parse is set to the char following the endbrace. + Parsing failures will generate a fatal error via vFAIL(...) + */ +STATIC regnode * +S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth) +{ + char * endbrace; /* '}' following the name */ + regnode *ret = NULL; + char* p; /* RExC_parse, or the result of regwhite() on it under /x */ + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NAMEDSEQ; + + GET_RE_DEBUG_FLAGS; + + /* The [^\n] meaning of \N ignores spaces and comments under the /x + * modifier. The other meaning does not */ + p = (RExC_flags & RXf_PMf_EXTENDED) + ? regwhite( pRExC_state, RExC_parse ) + : RExC_parse; + + /* Disambiguate between \N meaning a named character versus \N meaning + * [^\n]. The former is assumed when it can't be the latter. 
*/ + if (*p != '{' || regcurly(p)) { /* no brace, or the brace starts a quantifier: \N is [^\n] */ + RExC_parse = p; + if (valuep) { + /* no bare \N in a charclass */ + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + nextchar(pRExC_state); + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + RExC_parse--; + Set_Node_Length(ret, 1); /* MJD */ + return ret; + } + + /* Here, we have decided it should be a named sequence */ + + /* The test above made sure that the next real character is a '{', but + * under the /x modifier, it could be separated by space (or a comment and + * \n) and this is not allowed (for consistency with \x{...} and the + * tokenizer handling of \N{NAME}). */ + if (*RExC_parse != '{') { + vFAIL("Missing braces on \\N{}"); + } + + RExC_parse++; /* Skip past the '{' */ + + if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ + || ! (endbrace == RExC_parse /* nothing between the {} */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */ + && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */ + { + if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ + vFAIL("\\N{NAME} must be resolved by the lexer"); + } + + if (endbrace == RExC_parse) { /* empty: \N{} */ + if (! valuep) { + RExC_parse = endbrace + 1; + return reg_node(pRExC_state,NOTHING); + } + + /* NOTE(review): RExC_parse is moved past the '}' only when SIZE_ONLY is true; confirm the caller copes with it still pointing at the '}' otherwise */ + if (SIZE_ONLY) { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class" + ); + RExC_parse = endbrace + 1; + } + *valuep = 0; + return (regnode *) &RExC_parse; /* Invalid regnode pointer */ + } + + REQUIRE_UTF8; /* named sequences imply Unicode semantics */ + RExC_parse += 2; /* Skip past the 'U+' */ + + if (valuep) { /* In a bracketed char class */ + /* We only pay attention to the first char of + multichar strings being returned. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. 
XXX Solution is to recharacterize as + [rest-of-class]|multi1|multi2... */ + + STRLEN length_of_hex; + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); + + char * endchar = RExC_parse + strcspn(RExC_parse, ".}"); /* end of the first code point's hex digits */ + if (endchar < endbrace) { /* stopped at a '.', so more code points follow */ + ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + } + + length_of_hex = (STRLEN)(endchar - RExC_parse); + *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL); + + /* The tokenizer should have guaranteed validity, but it's possible to + * bypass it by using single quoting, so check */ + if (length_of_hex == 0 + || length_of_hex != (STRLEN)(endchar - RExC_parse) ) + { + RExC_parse += length_of_hex; /* Includes all the valid */ + RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ + ? UTF8SKIP(RExC_parse) + : 1; + /* Guard against malformed utf8 */ + if (RExC_parse >= endchar) RExC_parse = endchar; + vFAIL("Invalid hexadecimal number in \\N{U+...}"); + } + + RExC_parse = endbrace + 1; + if (endchar == endbrace) return NULL; /* exactly one code point: the success return described in the header comment */ + + ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */ + } + else { /* Not a char class */ + + /* What is done here is to convert this to a sub-pattern of the form + * (?:\x{char1}\x{char2}...) + * and then call reg recursively. That way, it retains its atomicness, + * while not having to worry about special handling that some code + * points may have. toke.c has converted the original Unicode values + * to native, so that we can just pass on the hex values unchanged. We + * do have to set a flag to keep recoding from happening in the + * recursion */ + + SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); + STRLEN len; + char *endchar; /* Points to '.' or '}' ending cur char in the input + stream */ + char *orig_end = RExC_end; + + while (RExC_parse < endbrace) { + + /* Code points are separated by dots. 
If none, there is only one + * code point, and is terminated by the brace */ + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + + /* Convert to notation the rest of the code understands */ + sv_catpv(substitute_parse, "\\x{"); + sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); + sv_catpv(substitute_parse, "}"); + + /* Point to the beginning of the next character in the sequence. */ + RExC_parse = endchar + 1; + } + sv_catpv(substitute_parse, ")"); + + RExC_parse = SvPV(substitute_parse, len); /* from here on, parse the substitute text instead of the pattern */ + + /* Don't allow empty number: "?:\x{})" with no digits is 7 bytes long */ + if (len < 8) { + vFAIL("Invalid hexadecimal number in \\N{U+...}"); + } + RExC_end = RExC_parse + len; + + /* The values are Unicode, and therefore not subject to recoding */ + RExC_override_recoding = 1; + + ret = reg(pRExC_state, 1, flagp, depth+1); + + /* Switch back to the real pattern, positioned on the closing brace. */ + RExC_parse = endbrace; + RExC_end = orig_end; + RExC_override_recoding = 0; + + nextchar(pRExC_state); + } + + return ret; +} + + +/* + * reg_recode + * + * Recodes the single byte 'value' with the Encode object in *encp (the + * byte is used as is when *encp is NULL) and returns the character value + * of the result. + * value: a code value in the source encoding + * encp: a pointer to an Encode object + * + * If the result from Encode is empty or is not a single character, + * it returns U+FFFD (Replacement character) and sets *encp to NULL. + */ +STATIC UV +S_reg_recode(pTHX_ const char value, SV **encp) +{ + STRLEN numlen = 1; + SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); /* temporary one-byte SV holding 'value' */ + const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); + const STRLEN newlen = SvCUR(sv); + UV uv = UNICODE_REPLACEMENT; + + PERL_ARGS_ASSERT_REG_RECODE; + + if (newlen) + uv = SvUTF8(sv) + ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) + : *(U8*)s; + + if (!newlen || numlen != newlen) { /* nothing came back, or the first character does not span the whole result */ + uv = UNICODE_REPLACEMENT; + *encp = NULL; + } + return uv; +} + + +/* + - regatom - the lowest level + + Try to identify anything special at the start of the pattern. If there + is, then handle it as required. 
This may involve generating a single regop, + such as for an assertion; or it may involve recursing, such as to + handle a () structure. + + If the string doesn't start with something special then we gobble up + as much literal text as we can. + + Once we have been able to handle whatever type of thing started the + sequence, we return. + + Note: we have to be careful with escapes, as they can be both literal + and special, and in the case of \10 and friends can be either, depending + on context. Specifically there are two separate switches for handling + escape sequences, with the one for handling literal escapes requiring + a dummy entry for all of the special escapes that are actually handled + by the other. + + *flagp is reset to WORST on entry and then has HASWIDTH, SIMPLE, SPSTART, + POSTPONED and TRYAGAIN OR-ed in as they apply. NULL is returned with + TRYAGAIN set in *flagp when a nested reg() call asked for a retry and the + pattern has ended, or the next character is '|' or ')'. +*/ + +STATIC regnode * +S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) +{ + dVAR; + register regnode *ret = NULL; + I32 flags = 0; /* must start cleared: the '|' and ')' cases test it, and only a reg() call in the '(' case ever writes it */ + char *parse_start = RExC_parse; + U8 op; + GET_RE_DEBUG_FLAGS_DECL; + DEBUG_PARSE("atom"); + *flagp = WORST; /* Tentatively. */ + + PERL_ARGS_ASSERT_REGATOM; + +tryagain: + switch ((U8)*RExC_parse) { + case '^': + RExC_seen_zerolen++; + nextchar(pRExC_state); + if (RExC_flags & RXf_PMf_MULTILINE) + ret = reg_node(pRExC_state, MBOL); + else if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SBOL); + else + ret = reg_node(pRExC_state, BOL); + Set_Node_Length(ret, 1); /* MJD */ + break; + case '$': + nextchar(pRExC_state); + if (*RExC_parse) + RExC_seen_zerolen++; + if (RExC_flags & RXf_PMf_MULTILINE) + ret = reg_node(pRExC_state, MEOL); + else if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SEOL); + else + ret = reg_node(pRExC_state, EOL); + Set_Node_Length(ret, 1); /* MJD */ + break; + case '.': + nextchar(pRExC_state); + if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SANY); + else + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + Set_Node_Length(ret, 1); /* MJD */ + break; + case '[': + { + char * const oregcomp_parse = 
++RExC_parse; + ret = regclass(pRExC_state,depth+1); + if (*RExC_parse != ']') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ["); + } + nextchar(pRExC_state); + *flagp |= HASWIDTH|SIMPLE; + Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ + break; + } + case '(': + nextchar(pRExC_state); + ret = reg(pRExC_state, 1, &flags,depth+1); + if (ret == NULL) { + if (flags & TRYAGAIN) { + if (RExC_parse == RExC_end) { + /* Make parent create an empty node if needed. */ + *flagp |= TRYAGAIN; + return(NULL); + } + goto tryagain; + } + return(NULL); + } + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); + break; + case '|': + case ')': + if (flags & TRYAGAIN) { /* only set when we came back here through 'goto tryagain' above */ + *flagp |= TRYAGAIN; + return NULL; + } + vFAIL("Internal urp"); + /* Supposed to be caught earlier. */ + break; + case '{': + if (!regcurly(RExC_parse)) { + RExC_parse++; + goto defchar; + } + /* FALL THROUGH */ + case '?': + case '+': + case '*': + RExC_parse++; + vFAIL("Quantifier follows nothing"); + break; + case '\\': + /* Special Escapes + + This switch handles escape sequences that resolve to some kind + of special regop and not to literal text. Escape sequences that + resolve to literal text are handled below in the switch marked + "Literal Escapes". + + Every entry in this switch *must* have a corresponding entry + in the literal escape switch. However, the opposite is not + required, as the default for this switch is to jump to the + literal text handling code. 
+ */ + switch ((U8)*++RExC_parse) { + /* Special Escapes */ + case 'A': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, SBOL); + *flagp |= SIMPLE; + goto finish_meta_pat; + case 'G': + ret = reg_node(pRExC_state, GPOS); + RExC_seen |= REG_SEEN_GPOS; + *flagp |= SIMPLE; + goto finish_meta_pat; + case 'K': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, KEEPS); + *flagp |= SIMPLE; + /* XXX:dmq : disabling in-place substitution seems to + * be necessary here to avoid cases of memory corruption, as + * with: C<$_="x" x 80; s/x\K/y/> -- rgs + */ + RExC_seen |= REG_SEEN_LOOKBEHIND; + goto finish_meta_pat; + case 'Z': + ret = reg_node(pRExC_state, SEOL); + *flagp |= SIMPLE; + RExC_seen_zerolen++; /* Do not optimize RE away */ + goto finish_meta_pat; + case 'z': + ret = reg_node(pRExC_state, EOS); + *flagp |= SIMPLE; + RExC_seen_zerolen++; /* Do not optimize RE away */ + goto finish_meta_pat; + case 'C': + ret = reg_node(pRExC_state, CANY); + RExC_seen |= REG_SEEN_CANY; + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'X': + ret = reg_node(pRExC_state, CLUMP); + *flagp |= HASWIDTH; + goto finish_meta_pat; + case 'w': + switch (get_regex_charset(RExC_flags)) { + case REGEX_LOCALE_CHARSET: + op = ALNUML; + break; + case REGEX_UNICODE_CHARSET: + op = ALNUMU; + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + op = ALNUMA; + break; + case REGEX_DEPENDS_CHARSET: + op = ALNUM; + break; + default: + goto bad_charset; + } + ret = reg_node(pRExC_state, op); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'W': + switch (get_regex_charset(RExC_flags)) { + case REGEX_LOCALE_CHARSET: + op = NALNUML; + break; + case REGEX_UNICODE_CHARSET: + op = NALNUMU; + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + op = NALNUMA; + break; + case REGEX_DEPENDS_CHARSET: + op = NALNUM; + break; + default: + goto bad_charset; + } + ret = reg_node(pRExC_state, op); + *flagp |= 
HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'b': + RExC_seen_zerolen++; + RExC_seen |= REG_SEEN_LOOKBEHIND; + switch (get_regex_charset(RExC_flags)) { + case REGEX_LOCALE_CHARSET: + op = BOUNDL; + break; + case REGEX_UNICODE_CHARSET: + op = BOUNDU; + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + op = BOUNDA; + break; + case REGEX_DEPENDS_CHARSET: + op = BOUND; + break; + default: + goto bad_charset; + } + ret = reg_node(pRExC_state, op); + FLAGS(ret) = get_regex_charset(RExC_flags); + *flagp |= SIMPLE; + if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead"); + } + goto finish_meta_pat; + case 'B': + RExC_seen_zerolen++; + RExC_seen |= REG_SEEN_LOOKBEHIND; + switch (get_regex_charset(RExC_flags)) { + case REGEX_LOCALE_CHARSET: + op = NBOUNDL; + break; + case REGEX_UNICODE_CHARSET: + op = NBOUNDU; + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + op = NBOUNDA; + break; + case REGEX_DEPENDS_CHARSET: + op = NBOUND; + break; + default: + goto bad_charset; + } + ret = reg_node(pRExC_state, op); + FLAGS(ret) = get_regex_charset(RExC_flags); + *flagp |= SIMPLE; + if (! 
SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead"); + } + goto finish_meta_pat; + case 's': + switch (get_regex_charset(RExC_flags)) { + case REGEX_LOCALE_CHARSET: + op = SPACEL; + break; + case REGEX_UNICODE_CHARSET: + op = SPACEU; + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + op = SPACEA; + break; + case REGEX_DEPENDS_CHARSET: + op = SPACE; + break; + default: + goto bad_charset; + } + ret = reg_node(pRExC_state, op); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'S': + switch (get_regex_charset(RExC_flags)) { + case REGEX_LOCALE_CHARSET: + op = NSPACEL; + break; + case REGEX_UNICODE_CHARSET: + op = NSPACEU; + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + op = NSPACEA; + break; + case REGEX_DEPENDS_CHARSET: + op = NSPACE; + break; + default: + goto bad_charset; + } + ret = reg_node(pRExC_state, op); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'd': + switch (get_regex_charset(RExC_flags)) { + case REGEX_LOCALE_CHARSET: + op = DIGITL; + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + op = DIGITA; + break; + case REGEX_DEPENDS_CHARSET: /* No difference between these */ + case REGEX_UNICODE_CHARSET: + op = DIGIT; + break; + default: + goto bad_charset; + } + ret = reg_node(pRExC_state, op); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'D': + switch (get_regex_charset(RExC_flags)) { + case REGEX_LOCALE_CHARSET: + op = NDIGITL; + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + op = NDIGITA; + break; + case REGEX_DEPENDS_CHARSET: /* No difference between these */ + case REGEX_UNICODE_CHARSET: + op = NDIGIT; + break; + default: + goto bad_charset; + } + ret = reg_node(pRExC_state, op); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'R': + ret = 
reg_node(pRExC_state, LNBREAK); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'h': + ret = reg_node(pRExC_state, HORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'H': + ret = reg_node(pRExC_state, NHORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'v': + ret = reg_node(pRExC_state, VERTWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'V': + ret = reg_node(pRExC_state, NVERTWS); + *flagp |= HASWIDTH|SIMPLE; + finish_meta_pat: + nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ + break; + case 'p': + case 'P': + { + char* const oldregxend = RExC_end; +#ifdef DEBUGGING + char* parse_start = RExC_parse - 2; +#endif + + if (RExC_parse[1] == '{') { + /* a lovely hack--pretend we saw [\pX] instead */ + RExC_end = strchr(RExC_parse, '}'); + if (!RExC_end) { + const U8 c = (U8)*RExC_parse; + RExC_parse += 2; + RExC_end = oldregxend; + vFAIL2("Missing right brace on \\%c{}", c); + } + RExC_end++; + } + else { + RExC_end = RExC_parse + 2; + if (RExC_end > oldregxend) + RExC_end = oldregxend; + } + RExC_parse--; + + ret = regclass(pRExC_state,depth+1); + + RExC_end = oldregxend; + RExC_parse--; + + Set_Node_Offset(ret, parse_start + 2); + Set_Node_Cur_Length(ret); + nextchar(pRExC_state); + *flagp |= HASWIDTH|SIMPLE; + } + break; + case 'N': + /* Handle \N and \N{NAME} here and not below because it can be + multicharacter. join_exact() will join them up later on. + Also this makes sure that things like /\N{BLAH}+/ and + \N{BLAH} being multi char Just Happen. dmq*/ + ++RExC_parse; + ret= reg_namedseq(pRExC_state, NULL, flagp, depth); + break; + case 'k': /* Handle \k<NAME>, \k'NAME' and \k{NAME} */ + parse_named_seq: + { + char ch= RExC_parse[1]; + if (ch != '<' && ch != '\'' && ch != '{') { + RExC_parse++; + vFAIL2("Sequence %.2s... not terminated",parse_start); + } else { + /* this pretty much dupes the code for (?P=...) 
in reg(), if + you change this make sure you change that */ + char* name_start = (RExC_parse += 2); + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; + if (RExC_parse == name_start || *RExC_parse != ch) + vFAIL2("Sequence %.3s... not terminated",parse_start); + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, 1, "S" ); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (MORE_ASCII_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); + *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret); /* MJD */ + nextchar(pRExC_state); + + } + break; + } + case 'g': + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + { + I32 num; + bool isg = *RExC_parse == 'g'; + bool isrel = 0; + bool hasbrace = 0; + if (isg) { + RExC_parse++; + if (*RExC_parse == '{') { + RExC_parse++; + hasbrace = 1; + } + if (*RExC_parse == '-') { + RExC_parse++; + isrel = 1; + } + if (hasbrace && !isDIGIT(*RExC_parse)) { + if (isrel) RExC_parse--; + RExC_parse -= 2; + goto parse_named_seq; + } } + num = atoi(RExC_parse); + if (isg && num == 0) + vFAIL("Reference to invalid group 0"); + if (isrel) { + num = RExC_npar - num; + if (num < 1) + vFAIL("Reference to nonexistent or unclosed group"); + } + if (!isg && num > 9 && num >= RExC_npar) + goto defchar; + else { + char * const parse_start = RExC_parse - 1; /* MJD */ + while (isDIGIT(*RExC_parse)) + RExC_parse++; + if (parse_start == RExC_parse - 1) + vFAIL("Unterminated \\g... 
pattern"); + if (hasbrace) { + if (*RExC_parse != '}') + vFAIL("Unterminated \\g{...} pattern"); + RExC_parse++; + } + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) + vFAIL("Reference to nonexistent group"); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? REF + : (MORE_ASCII_RESTRICTED) + ? REFFA + : (AT_LEAST_UNI_SEMANTICS) + ? REFFU + : (LOC) + ? REFFL + : REFF), + num); + *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret); /* MJD */ + RExC_parse--; + nextchar(pRExC_state); + } + } + break; + case '\0': + if (RExC_parse >= RExC_end) + FAIL("Trailing \\"); + /* FALL THROUGH */ + default: + /* Do not generate "unrecognized" warnings here, we fall + back into the quick-grab loop below */ + parse_start--; + goto defchar; + } + break; + + case '#': + if (RExC_flags & RXf_PMf_EXTENDED) { + if ( reg_skipcomment( pRExC_state ) ) + goto tryagain; + } + /* FALL THROUGH */ + + default: + + parse_start = RExC_parse - 1; + + RExC_parse++; + + defchar: { + register STRLEN len; + register UV ender; + register char *p; + char *s; + STRLEN foldlen; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; + U8 node_type; + + /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so, + * it is folded to 'ss' even if not utf8 */ + bool is_exactfu_sharp_s; + + ender = 0; + node_type = ((! FOLD) ? EXACT + : (LOC) + ? EXACTFL + : (MORE_ASCII_RESTRICTED) + ? EXACTFA + : (AT_LEAST_UNI_SEMANTICS) + ? EXACTFU + : EXACTF); + ret = reg_node(pRExC_state, node_type); + s = STRING(ret); + + /* XXX The node can hold up to 255 bytes, yet this only goes to + * 127. I (khw) do not know why. Keeping it somewhat less than + * 255 allows us to not have to worry about overflow due to + * converting to utf8 and fold expansion, but that value is + * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes + * split up by this limit into a single one using the real max of + * 255. 
Even at 127, this breaks under rare circumstances. If + * folding, we do not want to split a node at a character that is a + * non-final in a multi-char fold, as an input string could just + * happen to want to match across the node boundary. The join + * would solve that problem if the join actually happens. But a + * series of more than two nodes in a row each of 127 would cause + * the first join to succeed to get to 254, but then there wouldn't + * be room for the next one, which could be one of those split + * multi-char folds. I don't know of any fool-proof solution. One + * could back off to end with only a code point that isn't such a + * non-final, but it is possible for there not to be any in the + * entire node. */ + for (len = 0, p = RExC_parse - 1; + len < 127 && p < RExC_end; + len++) + { + char * const oldp = p; + + if (RExC_flags & RXf_PMf_EXTENDED) + p = regwhite( pRExC_state, p ); + switch ((U8)*p) { + case '^': + case '$': + case '.': + case '[': + case '(': + case ')': + case '|': + goto loopdone; + case '\\': + /* Literal Escapes Switch + + This switch is meant to handle escape sequences that + resolve to a literal character. + + Every escape sequence that represents something + else, like an assertion or a char class, is handled + in the switch marked 'Special Escapes' above in this + routine, but also has an entry here as anything that + isn't explicitly mentioned here will be treated as + an unescaped equivalent literal. + */ + + switch ((U8)*++p) { + /* These are all the special escapes. */ + case 'A': /* Start assertion */ + case 'b': case 'B': /* Word-boundary assertion*/ + case 'C': /* Single char !DANGEROUS! 
*/ + case 'd': case 'D': /* digit class */ + case 'g': case 'G': /* generic-backref, pos assertion */ + case 'h': case 'H': /* HORIZWS */ + case 'k': case 'K': /* named backref, keep marker */ + case 'N': /* named char sequence */ + case 'p': case 'P': /* Unicode property */ + case 'R': /* LNBREAK */ + case 's': case 'S': /* space class */ + case 'v': case 'V': /* VERTWS */ + case 'w': case 'W': /* word class */ + case 'X': /* eXtended Unicode "combining character sequence" */ + case 'z': case 'Z': /* End of line/string assertion */ + --p; + goto loopdone; + + /* Anything after here is an escape that resolves to a + literal. (Except digits, which may or may not) + */ + case 'n': + ender = '\n'; + p++; + break; + case 'r': + ender = '\r'; + p++; + break; + case 't': + ender = '\t'; + p++; + break; + case 'f': + ender = '\f'; + p++; + break; + case 'e': + ender = ASCII_TO_NATIVE('\033'); + p++; + break; + case 'a': + ender = ASCII_TO_NATIVE('\007'); + p++; + break; + case 'o': + { + STRLEN brace_len = len; + UV result; + const char* error_msg; + + bool valid = grok_bslash_o(p, + &result, + &brace_len, + &error_msg, + 1); + p += brace_len; + if (! 
valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); + } + else + { + ender = result; + } + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + } + case 'x': + if (*++p == '{') { + char* const e = strchr(p, '}'); + + if (!e) { + RExC_parse = p + 1; + vFAIL("Missing right brace on \\x{}"); + } + else { + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX; + STRLEN numlen = e - p - 1; + ender = grok_hex(p + 1, &numlen, &flags, NULL); + if (ender > 0xff) + REQUIRE_UTF8; + p = e + 1; + } + } + else { + I32 flags = PERL_SCAN_DISALLOW_PREFIX; + STRLEN numlen = 2; + ender = grok_hex(p, &numlen, &flags, NULL); + p += numlen; + } + if (PL_encoding && ender < 0x100) + goto recode_encoding; + break; + case 'c': + p++; + ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); + break; + case '0': case '1': case '2': case '3':case '4': + case '5': case '6': case '7': case '8':case '9': + if (*p == '0' || + (isDIGIT(p[1]) && atoi(p) >= RExC_npar)) + { + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; + STRLEN numlen = 3; + ender = grok_oct(p, &numlen, &flags, NULL); + if (ender > 0xff) { + REQUIRE_UTF8; + } + p += numlen; + } + else { + --p; + goto loopdone; + } + if (PL_encoding && ender < 0x100) + goto recode_encoding; + break; + recode_encoding: + if (! RExC_override_recoding) { + SV* enc = PL_encoding; + ender = reg_recode((const char)(U8)ender, &enc); + if (!enc && SIZE_ONLY) + ckWARNreg(p, "Invalid escape in the specified encoding"); + REQUIRE_UTF8; + } + break; + case '\0': + if (p >= RExC_end) + FAIL("Trailing \\"); + /* FALL THROUGH */ + default: + if (!SIZE_ONLY&& isALPHA(*p)) { + /* Include any { following the alpha to emphasize + * that it could be part of an escape at some point + * in the future */ + int len = (*(p + 1) == '{') ? 
2 : 1; + ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p); + } + goto normal_default; + } + break; + default: + normal_default: + if (UTF8_IS_START(*p) && UTF) { + STRLEN numlen; + ender = utf8n_to_uvchr((U8*)p, RExC_end - p, + &numlen, UTF8_ALLOW_DEFAULT); + p += numlen; + } + else + ender = (U8) *p++; + break; + } /* End of switch on the literal */ + + is_exactfu_sharp_s = (node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S); + if ( RExC_flags & RXf_PMf_EXTENDED) + p = regwhite( pRExC_state, p ); + if ((UTF && FOLD) || is_exactfu_sharp_s) { + /* Prime the casefolded buffer. Locale rules, which apply + * only to code points < 256, aren't known until execution, + * so for them, just output the original character using + * utf8. If we start to fold non-UTF patterns, be sure to + * update join_exact() */ + if (LOC && ender < 256) { + if (UNI_IS_INVARIANT(ender)) { + *tmpbuf = (U8) ender; + foldlen = 1; + } else { + *tmpbuf = UTF8_TWO_BYTE_HI(ender); + *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender); + foldlen = 2; + } + } + else if (isASCII(ender)) { /* Note: Here can't also be LOC + */ + ender = toLOWER(ender); + *tmpbuf = (U8) ender; + foldlen = 1; + } + else if (! MORE_ASCII_RESTRICTED && ! LOC) { + + /* Locale and /aa require more selectivity about the + * fold, so are handled below. Otherwise, here, just + * use the fold */ + ender = toFOLD_uni(ender, tmpbuf, &foldlen); + } + else { + /* Under locale rules or /aa we are not to mix, + * respectively, ords < 256 or ASCII with non-. So + * reject folds that mix them, using only the + * non-folded code point. So do the fold to a + * temporary, and inspect each character in it. 
*/ + U8 trialbuf[UTF8_MAXBYTES_CASE+1]; + U8* s = trialbuf; + UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen); + U8* e = s + foldlen; + bool fold_ok = TRUE; + + while (s < e) { + if (isASCII(*s) + || (LOC && (UTF8_IS_INVARIANT(*s) + || UTF8_IS_DOWNGRADEABLE_START(*s)))) + { + fold_ok = FALSE; + break; + } + s += UTF8SKIP(s); + } + if (fold_ok) { + Copy(trialbuf, tmpbuf, foldlen, U8); + ender = tmpender; + } + else { + uvuni_to_utf8(tmpbuf, ender); + foldlen = UNISKIP(ender); + } + } + } + if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ + if (len) + p = oldp; + else if (UTF || is_exactfu_sharp_s) { + if (FOLD) { + /* Emit all the Unicode characters. */ + STRLEN numlen; + for (foldbuf = tmpbuf; + foldlen; + foldlen -= numlen) { + + /* tmpbuf has been constructed by us, so we + * know it is valid utf8 */ + ender = valid_utf8_to_uvchr(foldbuf, &numlen); + if (numlen > 0) { + const STRLEN unilen = reguni(pRExC_state, ender, s); + s += unilen; + len += unilen; + /* In EBCDIC the numlen + * and unilen can differ. */ + foldbuf += numlen; + if (numlen >= foldlen) + break; + } + else + break; /* "Can't happen." */ + } + } + else { + const STRLEN unilen = reguni(pRExC_state, ender, s); + if (unilen > 0) { + s += unilen; + len += unilen; + } + } + } + else { + len++; + REGC((char)ender, s++); + } + break; + } + if (UTF || is_exactfu_sharp_s) { + if (FOLD) { + /* Emit all the Unicode characters. */ + STRLEN numlen; + for (foldbuf = tmpbuf; + foldlen; + foldlen -= numlen) { + ender = valid_utf8_to_uvchr(foldbuf, &numlen); + if (numlen > 0) { + const STRLEN unilen = reguni(pRExC_state, ender, s); + len += unilen; + s += unilen; + /* In EBCDIC the numlen + * and unilen can differ. 
*/ + foldbuf += numlen; + if (numlen >= foldlen) + break; + } + else + break; + } + } + else { + const STRLEN unilen = reguni(pRExC_state, ender, s); + if (unilen > 0) { + s += unilen; + len += unilen; + } + } + len--; + } + else { + REGC((char)ender, s++); + } + } + loopdone: /* Jumped to when encounters something that shouldn't be in + the node */ + RExC_parse = p - 1; + Set_Node_Cur_Length(ret); /* MJD */ + nextchar(pRExC_state); + { + /* len is STRLEN which is unsigned, need to copy to signed */ + IV iv = len; + if (iv < 0) + vFAIL("Internal disaster"); + } + if (len > 0) + *flagp |= HASWIDTH; + if (len == 1 && UNI_IS_INVARIANT(ender)) + *flagp |= SIMPLE; + + if (SIZE_ONLY) + RExC_size += STR_SZ(len); + else { + STR_LEN(ret) = len; + RExC_emit += STR_SZ(len); + } + } + break; + } + + return(ret); + +/* Jumped to when an unrecognized character set is encountered */ +bad_charset: + Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags)); + return(NULL); +} + +/* regwhite - skip white space and '#' comments starting at p, never going + past RExC_end. Returns a pointer to the first character that is neither. + Sets REG_SEEN_RUN_ON_COMMENT in RExC_seen when a comment reaches the end + of the pattern without a terminating newline. */ +STATIC char * +S_regwhite( RExC_state_t *pRExC_state, char *p ) +{ + const char *e = RExC_end; + + PERL_ARGS_ASSERT_REGWHITE; + + while (p < e) { + if (isSPACE(*p)) + ++p; + else if (*p == '#') { + bool ended = 0; + do { + if (*p++ == '\n') { + ended = 1; + break; + } + } while (p < e); + if (!ended) + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + } + else + break; + } + return p; +} + +/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. + Character classes ([:foo:]) can also be negated ([:^foo:]). + Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. + Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, + but trigger failures because they are currently unimplemented. 
*/ + +#define POSIXCC_DONE(c) ((c) == ':') +#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') +#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) + +STATIC I32 +S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) +{ + dVAR; + I32 namedclass = OOB_NAMEDCLASS; + + PERL_ARGS_ASSERT_REGPPOSIXCC; + + if (value == '[' && RExC_parse + 1 < RExC_end && + /* I smell either [: or [= or [. -- POSIX has been here, right? */ + POSIXCC(UCHARAT(RExC_parse))) { + const char c = UCHARAT(RExC_parse); + char* const s = RExC_parse++; + + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) + RExC_parse++; + if (RExC_parse == RExC_end) + /* Grandfather lone [:, [=, [. */ + RExC_parse = s; + else { + const char* const t = RExC_parse++; /* skip over the c */ + assert(*t == c); + + if (UCHARAT(RExC_parse) == ']') { + const char *posixcc = s + 1; + RExC_parse++; /* skip over the ending ] */ + + if (*s == ':') { + const I32 complement = *posixcc == '^' ? *posixcc++ : 0; + const I32 skip = t - posixcc; + + /* Initially switch on the length of the name. */ + switch (skip) { + case 4: + if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */ + namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM; + break; + case 5: + /* Names all of length 5. */ + /* alnum alpha ascii blank cntrl digit graph lower + print punct space upper */ + /* Offset 4 gives the best switch position. */ + switch (posixcc[4]) { + case 'a': + if (memEQ(posixcc, "alph", 4)) /* alpha */ + namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA; + break; + case 'e': + if (memEQ(posixcc, "spac", 4)) /* space */ + namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; + break; + case 'h': + if (memEQ(posixcc, "grap", 4)) /* graph */ + namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH; + break; + case 'i': + if (memEQ(posixcc, "asci", 4)) /* ascii */ + namedclass = complement ? 
ANYOF_NASCII : ANYOF_ASCII; + break; + case 'k': + if (memEQ(posixcc, "blan", 4)) /* blank */ + namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK; + break; + case 'l': + if (memEQ(posixcc, "cntr", 4)) /* cntrl */ + namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL; + break; + case 'm': + if (memEQ(posixcc, "alnu", 4)) /* alnum */ + namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC; + break; + case 'r': + if (memEQ(posixcc, "lowe", 4)) /* lower */ + namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER; + else if (memEQ(posixcc, "uppe", 4)) /* upper */ + namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER; + break; + case 't': + if (memEQ(posixcc, "digi", 4)) /* digit */ + namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT; + else if (memEQ(posixcc, "prin", 4)) /* print */ + namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT; + else if (memEQ(posixcc, "punc", 4)) /* punct */ + namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT; + break; + } + break; + case 6: + if (memEQ(posixcc, "xdigit", 6)) + namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT; + break; + } + + if (namedclass == OOB_NAMEDCLASS) + Simple_vFAIL3("POSIX class [:%.*s:] unknown", + t - s - 1, s + 1); + assert (posixcc[skip] == ':'); + assert (posixcc[skip+1] == ']'); + } else if (!SIZE_ONLY) { + /* [[=foo=]] and [[.foo.]] are still future. 
*/ + + /* adjust RExC_parse so the warning shows after + the class closes */ + while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') + RExC_parse++; + Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + } + } else { + /* Maternal grandfather: + * "[:" ending in ":" but not in ":]" */ + RExC_parse = s; + } + } + } + + return namedclass; +} + +STATIC void +S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) +{ + dVAR; + + PERL_ARGS_ASSERT_CHECKPOSIXCC; + + if (POSIXCC(UCHARAT(RExC_parse))) { + const char *s = RExC_parse; + const char c = *s++; + + while (isALNUM(*s)) + s++; + if (*s && c == *s && s[1] == ']') { + ckWARN3reg(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); + + /* [[=foo=]] and [[.foo.]] are still future. */ + if (POSIXCC_NOTYET(c)) { + /* adjust RExC_parse so the error shows after + the class closes */ + while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']') + NOOP; + Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + } + } + } +} + +/* Generate the code to add a full posix character to the bracketed + * character class given by . ( is needed only under locale rules) + * destlist is the inversion list for non-locale rules that this class is + * to be added to + * sourcelist is the ASCII-range inversion list to add under /a rules + * Xsourcelist is the full Unicode range list to use otherwise. */ +#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \ + if (LOC) { \ + SV* scratch_list = NULL; \ + \ + /* Set this class in the node for runtime matching */ \ + ANYOF_CLASS_SET(node, class); \ + \ + /* For above Latin1 code points, we use the full Unicode range */ \ + _invlist_intersection(PL_AboveLatin1, \ + Xsourcelist, \ + &scratch_list); \ + /* And set the output to it, adding instead if there already is an \ + * output. Checking if is NULL first saves an extra \ + * clone. 
Its reference count will be decremented at the next \ + * union, etc, or if this is the only instance, at the end of the \ + * routine */ \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + } \ + else { \ + /* For non-locale, just add it to any existing list */ \ + _invlist_union(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : Xsourcelist, \ + &destlist); \ + } + +/* Like DO_POSIX, but matches the complement of and . + */ +#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \ + if (LOC) { \ + SV* scratch_list = NULL; \ + ANYOF_CLASS_SET(node, class); \ + _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + } \ + else { \ + _invlist_union_complement_2nd(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : Xsourcelist, \ + &destlist); \ + /* Under /d, everything in the upper half of the Latin1 range \ + * matches this complement */ \ + if (DEPENDS_SEMANTICS) { \ + ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \ + } \ + } + +/* Generate the code to add a posix character to the bracketed + * character class given by . ( is needed only under locale rules) + * destlist is the inversion list for non-locale rules that this class is + * to be added to + * sourcelist is the ASCII-range inversion list to add under /a rules + * l1_sourcelist is the Latin1 range list to use otherwise. + * Xpropertyname is the name to add to of the property to + * specify the code points above Latin1 that will have to be + * determined at run-time + * run_time_list is a SV* that contains text names of properties that are to + * be computed at run time. 
This concatenates + * to it, apppropriately + * This is essentially DO_POSIX, but we know only the Latin1 values at compile + * time */ +#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ + l1_sourcelist, Xpropertyname, run_time_list) \ + /* If not /a matching, there are going to be code points we will have \ + * to defer to runtime to look-up */ \ + if (! AT_LEAST_ASCII_RESTRICTED) { \ + Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \ + } \ + if (LOC) { \ + ANYOF_CLASS_SET(node, class); \ + } \ + else { \ + _invlist_union(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : l1_sourcelist, \ + &destlist); \ + } + +/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of + * this and DO_N_POSIX */ +#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ + l1_sourcelist, Xpropertyname, run_time_list) \ + if (AT_LEAST_ASCII_RESTRICTED) { \ + _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \ + } \ + else { \ + Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \ + if (LOC) { \ + ANYOF_CLASS_SET(node, namedclass); \ + } \ + else { \ + SV* scratch_list = NULL; \ + _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + if (DEPENDS_SEMANTICS) { \ + ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \ + } \ + } \ + } + +STATIC U8 +S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr) +{ + + /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes. + * Locale folding is done at run-time, so this function should not be + * called for nodes that are for locales. + * + * This function sets the bit corresponding to the fold of the input + * 'value', if not already set. 
The fold of 'f' is 'F', and the fold of + * 'F' is 'f'. + * + * It also knows about the characters that are in the bitmap that have + * folds that are matchable only outside it, and sets the appropriate lists + * and flags. + * + * It returns the number of bits that actually changed from 0 to 1 */ + + U8 stored = 0; + U8 fold; + + PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD; + + fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value] + : PL_fold[value]; + + /* It assumes the bit for 'value' has already been set */ + if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) { + ANYOF_BITMAP_SET(node, fold); + stored++; + } + if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) { + /* Certain Latin1 characters have matches outside the bitmap. To get + * here, 'value' is one of those characters. None of these matches is + * valid for ASCII characters under /aa, which have been excluded by + * the 'if' above. The matches fall into three categories: + * 1) They are singly folded-to or -from an above 255 character, as + * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y + * WITH DIAERESIS; + * 2) They are part of a multi-char fold with another character in the + * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill; + * 3) They are part of a multi-char fold with a character not in the + * bitmap, such as various ligatures. + * We aren't dealing fully with multi-char folds, except we do deal + * with the pattern containing a character that has a multi-char fold + * (not so much the inverse). + * For types 1) and 3), the matches only happen when the target string + * is utf8; that's not true for 2), and we set a flag for it. + * + * The code below adds to the passed in inversion list the single fold + * closures for 'value'. 
The values are hard-coded here so that an + * innocent-looking character class, like /[ks]/i won't have to go out + * to disk to find the possible matches. XXX It would be better to + * generate these via regen, in case a new version of the Unicode + * standard adds new mappings, though that is not really likely. */ + switch (value) { + case 'k': + case 'K': + /* KELVIN SIGN */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A); + break; + case 's': + case 'S': + /* LATIN SMALL LETTER LONG S */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F); + break; + case MICRO_SIGN: + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + GREEK_SMALL_LETTER_MU); + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + GREEK_CAPITAL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + /* ANGSTROM SIGN */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B); + if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + PL_fold_latin1[value]); + } + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + LATIN_CAPITAL_LETTER_SHARP_S); + + /* Under /a, /d, and /u, this can match the two chars "ss" */ + if (! 
MORE_ASCII_RESTRICTED) { + add_alternate(alternate_ptr, (U8 *) "ss", 2); + + /* And under /u or /a, it can match even if the target is + * not utf8 */ + if (AT_LEAST_UNI_SEMANTICS) { + ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8; + } + } + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character folds from code + * points that require UTF8 to express, so they can't match + * unless the target string is in UTF-8, so no action here is + * necessary, as regexec.c properly handles the general case + * for UTF-8 matching */ + break; + default: + /* Use deprecated warning to increase the chances of this + * being output */ + ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value); + break; + } + } + else if (DEPENDS_SEMANTICS + && ! isASCII(value) + && PL_fold_latin1[value] != value) + { + /* Under DEPENDS rules, non-ASCII Latin1 characters match their + * folds only when the target string is in UTF-8. We add the fold + * here to the list of things to match outside the bitmap, which + * won't be looked at unless it is UTF8 (or else if something else + * says to look even if not utf8, but those things better not happen + * under DEPENDS semantics. 
*/ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]); + } + + return stored; +} + + +PERL_STATIC_INLINE U8 +S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr) +{ + /* This inline function sets a bit in the bitmap if not already set, and if + * appropriate, its fold, returning the number of bits that actually + * changed from 0 to 1 */ + + U8 stored; + + PERL_ARGS_ASSERT_SET_REGCLASS_BIT; + + if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */ + return 0; + } + + ANYOF_BITMAP_SET(node, value); + stored = 1; + + if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */ + stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr); + } + + return stored; +} + +STATIC void +S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) +{ + /* Adds input 'string' with length 'len' to the ANYOF node's unicode + * alternate list, pointed to by 'alternate_ptr'. This is an array of + * the multi-character folds of characters in the node */ + SV *sv; + + PERL_ARGS_ASSERT_ADD_ALTERNATE; + + if (! *alternate_ptr) { + *alternate_ptr = newAV(); + } + sv = newSVpvn_utf8((char*)string, len, TRUE); + av_push(*alternate_ptr, sv); + return; +} + +/* + parse a class specification and produce either an ANYOF node that + matches the pattern or perhaps will be optimized into an EXACTish node + instead. The node contains a bit map for the first 256 characters, with the + corresponding bit set if that character is in the list. 
For characters + above 255, a range list is used */ + +STATIC regnode * +S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) +{ + dVAR; + register UV nextvalue; + register IV prevvalue = OOB_UNICODE; + register IV range = 0; + UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ + register regnode *ret; + STRLEN numlen; + IV namedclass; + char *rangebegin = NULL; + bool need_class = 0; + bool allow_full_fold = TRUE; /* Assume wants multi-char folding */ + SV *listsv = NULL; + STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more + than just initialized. */ + SV* properties = NULL; /* Code points that match \p{} \P{} */ + UV element_count = 0; /* Number of distinct elements in the class. + Optimizations may be possible if this is tiny */ + UV n; + + /* Unicode properties are stored in a swash; this holds the current one + * being parsed. If this swash is the only above-latin1 component of the + * character class, an optimization is to pass it directly on to the + * execution engine. Otherwise, it is set to NULL to indicate that there + * are other things in the class that have to be dealt with at execution + * time */ + SV* swash = NULL; /* Code points that match \p{} \P{} */ + + /* Set if a component of this character class is user-defined; just passed + * on to the engine */ + UV has_user_defined_property = 0; + + /* code points this node matches that can't be stored in the bitmap */ + SV* nonbitmap = NULL; + + /* The items that are to match that aren't stored in the bitmap, but are a + * result of things that are stored there. This is the fold closure of + * such a character, either because it has DEPENDS semantics and shouldn't + * be matched unless the target string is utf8, or is a code point that is + * too large for the bit map, as for example, the fold of the MICRO SIGN is + * above 255. This all is solely for performance reasons. 
By having this + * code know the outside-the-bitmap folds that the bitmapped characters are + * involved with, we don't have to go out to disk to find the list of + * matches, unless the character class includes code points that aren't + * storable in the bit map. That means that a character class with an 's' + * in it, for example, doesn't need to go out to disk to find everything + * that matches. A 2nd list is used so that the 'nonbitmap' list is kept + * empty unless there is something whose fold we don't know about, and will + * have to go out to the disk to find. */ + SV* l1_fold_invlist = NULL; + + /* List of multi-character folds that are matched by this node */ + AV* unicode_alternate = NULL; +#ifdef EBCDIC + UV literal_endpoint = 0; +#endif + UV stored = 0; /* how many chars stored in the bitmap */ + + regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in + case we need to change the emitted regop to an EXACT. */ + const char * orig_parse = RExC_parse; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCLASS; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + DEBUG_PARSE("clas"); + + /* Assume we are going to generate an ANYOF node. */ + ret = reganode(pRExC_state, ANYOF, 0); + + + if (!SIZE_ONLY) { + ANYOF_FLAGS(ret) = 0; + } + + if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ + RExC_naughty++; + RExC_parse++; + if (!SIZE_ONLY) + ANYOF_FLAGS(ret) |= ANYOF_INVERT; + + /* We have decided to not allow multi-char folds in inverted character + * classes, due to the confusion that can happen, especially with + * classes that are designed for a non-Unicode world: You have the + * peculiar case that: + "s s" =~ /^[^\xDF]+$/i => Y + "ss" =~ /^[^\xDF]+$/i => N + * + * See [perl #89750] */ + allow_full_fold = FALSE; + } + + if (SIZE_ONLY) { + RExC_size += ANYOF_SKIP; + listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. 
*/ + } + else { + RExC_emit += ANYOF_SKIP; + if (LOC) { + ANYOF_FLAGS(ret) |= ANYOF_LOCALE; + } + ANYOF_BITMAP_ZERO(ret); + listsv = newSVpvs("# comment\n"); + initial_listsv_len = SvCUR(listsv); + } + + nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; + + if (!SIZE_ONLY && POSIXCC(nextvalue)) + checkposixcc(pRExC_state); + + /* allow 1st char to be ] (allowing it to be - is dealt with later) */ + if (UCHARAT(RExC_parse) == ']') + goto charclassloop; + +parseit: + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { + + charclassloop: + + namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ + + if (!range) { + rangebegin = RExC_parse; + element_count++; + } + if (UTF) { + value = utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); + + nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; + if (value == '[' && POSIXCC(nextvalue)) + namedclass = regpposixcc(pRExC_state, value); + else if (value == '\\') { + if (UTF) { + value = utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); + /* Some compilers cannot handle switching on 64-bit integer + * values, therefore value cannot be an UV. Yes, this will + * be a problem later if we want switch on Unicode. + * A similar issue a little bit later when switching on + * namedclass. 
--jhi */ + switch ((I32)value) { + case 'w': namedclass = ANYOF_ALNUM; break; + case 'W': namedclass = ANYOF_NALNUM; break; + case 's': namedclass = ANYOF_SPACE; break; + case 'S': namedclass = ANYOF_NSPACE; break; + case 'd': namedclass = ANYOF_DIGIT; break; + case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; + case 'N': /* Handle \N{NAME} in class */ + { + /* We only pay attention to the first char of + multichar strings being returned. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. */ + UV v; /* value is register so we cant & it /grrr */ + if (reg_namedseq(pRExC_state, &v, NULL, depth)) { + goto parseit; + } + value= v; + } + break; + case 'p': + case 'P': + { + char *e; + if (RExC_parse >= RExC_end) + vFAIL2("Empty \\%c{}", (U8)value); + if (*RExC_parse == '{') { + const U8 c = (U8)value; + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL2("Missing right brace on \\%c{}", c); + while (isSPACE(UCHARAT(RExC_parse))) + RExC_parse++; + if (e == RExC_parse) + vFAIL2("Empty \\%c{}", c); + n = e - RExC_parse; + while (isSPACE(UCHARAT(RExC_parse + n - 1))) + n--; + } + else { + e = RExC_parse; + n = 1; + } + if (!SIZE_ONLY) { + SV** invlistsvp; + SV* invlist; + char* name; + if (UCHARAT(RExC_parse) == '^') { + RExC_parse++; + n--; + value = value == 'p' ? 'P' : 'p'; /* toggle */ + while (isSPACE(UCHARAT(RExC_parse))) { + RExC_parse++; + n--; + } + } + /* Try to get the definition of the property into + * . If /i is in effect, the effective property + * will have its name be <__NAME_i>. The design is + * discussed in commit + * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ + Newx(name, n + sizeof("_i__\n"), char); + + sprintf(name, "%s%.*s%s\n", + (FOLD) ? "__" : "", + (int)n, + RExC_parse, + (FOLD) ? 
"_i" : "" + ); + + /* Look up the property name, and get its swash and + * inversion list, if the property is found */ + if (swash) { + SvREFCNT_dec(swash); + } + swash = _core_swash_init("utf8", name, &PL_sv_undef, + 1, /* binary */ + 0, /* not tr/// */ + TRUE, /* this routine will handle + undefined properties */ + NULL, FALSE /* No inversion list */ + ); + if ( ! swash + || ! SvROK(swash) + || ! SvTYPE(SvRV(swash)) == SVt_PVHV + || ! (invlistsvp = + hv_fetchs(MUTABLE_HV(SvRV(swash)), + "INVLIST", FALSE)) + || ! (invlist = *invlistsvp)) + { + if (swash) { + SvREFCNT_dec(swash); + swash = NULL; + } + + /* Here didn't find it. It could be a user-defined + * property that will be available at run-time. Add it + * to the list to look up then */ + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", + (value == 'p' ? '+' : '!'), + name); + has_user_defined_property = 1; + + /* We don't know yet, so have to assume that the + * property could match something in the Latin1 range, + * hence something that isn't utf8 */ + ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; + } + else { + + /* Here, did get the swash and its inversion list. 
If + * the swash is from a user-defined property, then this + * whole character class should be regarded as such */ + SV** user_defined_svp = + hv_fetchs(MUTABLE_HV(SvRV(swash)), + "USER_DEFINED", FALSE); + if (user_defined_svp) { + has_user_defined_property + |= SvUV(*user_defined_svp); + } + + /* Invert if asking for the complement */ + if (value == 'P') { + _invlist_union_complement_2nd(properties, invlist, &properties); + + /* The swash can't be used as-is, because we've + * inverted things; delay removing it to here after + * have copied its invlist above */ + SvREFCNT_dec(swash); + swash = NULL; + } + else { + _invlist_union(properties, invlist, &properties); + } + } + Safefree(name); + } + RExC_parse = e + 1; + namedclass = ANYOF_MAX; /* no official name, but it's named */ + + /* \p means they want Unicode semantics */ + RExC_uni_semantics = 1; + } + break; + case 'n': value = '\n'; break; + case 'r': value = '\r'; break; + case 't': value = '\t'; break; + case 'f': value = '\f'; break; + case 'b': value = '\b'; break; + case 'e': value = ASCII_TO_NATIVE('\033');break; + case 'a': value = ASCII_TO_NATIVE('\007');break; + case 'o': + RExC_parse--; /* function expects to be pointed at the 'o' */ + { + const char* error_msg; + bool valid = grok_bslash_o(RExC_parse, + &value, + &numlen, + &error_msg, + SIZE_ONLY); + RExC_parse += numlen; + if (! 
valid) { + vFAIL(error_msg); + } + } + if (PL_encoding && value < 0x100) { + goto recode_encoding; + } + break; + case 'x': + if (*RExC_parse == '{') { + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX; + char * const e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL("Missing right brace on \\x{}"); + + numlen = e - RExC_parse; + value = grok_hex(RExC_parse, &numlen, &flags, NULL); + RExC_parse = e + 1; + } + else { + I32 flags = PERL_SCAN_DISALLOW_PREFIX; + numlen = 2; + value = grok_hex(RExC_parse, &numlen, &flags, NULL); + RExC_parse += numlen; + } + if (PL_encoding && value < 0x100) + goto recode_encoding; + break; + case 'c': + value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY); + break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + { + /* Take 1-3 octal digits */ + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; + numlen = 3; + value = grok_oct(--RExC_parse, &numlen, &flags, NULL); + RExC_parse += numlen; + if (PL_encoding && value < 0x100) + goto recode_encoding; + break; + } + recode_encoding: + if (! RExC_override_recoding) { + SV* enc = PL_encoding; + value = reg_recode((const char)(U8)value, &enc); + if (!enc && SIZE_ONLY) + ckWARNreg(RExC_parse, + "Invalid escape in the specified encoding"); + break; + } + default: + /* Allow \_ to not give an error */ + if (!SIZE_ONLY && isALNUM(value) && value != '_') { + ckWARN2reg(RExC_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); + } + break; + } + } /* end of \blah */ +#ifdef EBCDIC + else + literal_endpoint++; +#endif + + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + + /* What matches in a locale is not known until runtime, so need to + * (one time per class) allocate extra space to pass to regexec. + * The space will contain a bit for each named class that is to be + * matched against. 
This isn't needed for \p{} and pseudo-classes, + * as they are not affected by locale, and hence are dealt with + * separately */ + if (LOC && namedclass < ANYOF_MAX && ! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP; + ANYOF_CLASS_ZERO(ret); + } + ANYOF_FLAGS(ret) |= ANYOF_CLASS; + } + + /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a + * literal, as is the character that began the false range, i.e. + * the 'a' in the examples */ + if (range) { + if (!SIZE_ONLY) { + const int w = + RExC_parse >= rangebegin ? + RExC_parse - rangebegin : 0; + ckWARN4reg(RExC_parse, + "False [] range \"%*.*s\"", + w, w, rangebegin); + + stored += + set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate); + if (prevvalue < 256) { + stored += + set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate); + } + else { + nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue); + } + } + + range = 0; /* this was not a true range */ + } + + if (!SIZE_ONLY) { + + /* Possible truncation here but in some 64-bit environments + * the compiler gets heartburn about switch on 64-bit values. + * A similar issue a little earlier when switching on value. 
+ * --jhi */ + switch ((I32)namedclass) { + int i; /* loop counter */ + + case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + break; + case ANYOF_NALNUMC: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + break; + case ANYOF_ALPHA: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + break; + case ANYOF_NALPHA: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + break; + case ANYOF_ASCII: + if (LOC) { + ANYOF_CLASS_SET(ret, namedclass); + } + else { + _invlist_union(properties, PL_ASCII, &properties); + } + break; + case ANYOF_NASCII: + if (LOC) { + ANYOF_CLASS_SET(ret, namedclass); + } + else { + _invlist_union_complement_2nd(properties, + PL_ASCII, &properties); + if (DEPENDS_SEMANTICS) { + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; + } + } + break; + case ANYOF_BLANK: + DO_POSIX(ret, namedclass, properties, + PL_PosixBlank, PL_XPosixBlank); + break; + case ANYOF_NBLANK: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixBlank, PL_XPosixBlank); + break; + case ANYOF_CNTRL: + DO_POSIX(ret, namedclass, properties, + PL_PosixCntrl, PL_XPosixCntrl); + break; + case ANYOF_NCNTRL: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixCntrl, PL_XPosixCntrl); + break; + case ANYOF_DIGIT: + /* Ignore the compiler warning for this macro, planned to + * be eliminated later */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + break; + case ANYOF_NDIGIT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + break; + case ANYOF_GRAPH: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixGraph, PL_L1PosixGraph, 
"XPosixGraph", listsv); + break; + case ANYOF_NGRAPH: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + break; + case ANYOF_HORIZWS: + /* NBSP matches this, and needs to be added unconditionally + * to the bit map as it matches even under /d, unlike all + * the rest of the Posix-like classes (\v doesn't have any + * matches in the Latin1 range, so it is unaffected.) which + * Otherwise, we use the nonbitmap, as /d doesn't make a + * difference in what these match. It turns out that \h is + * just a synonym for XPosixBlank */ + _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap); + stored += set_regclass_bit(pRExC_state, ret, + UNI_TO_NATIVE(0xA0), + &l1_fold_invlist, + &unicode_alternate); + + break; + case ANYOF_NHORIZWS: + _invlist_union_complement_2nd(nonbitmap, + PL_XPosixBlank, &nonbitmap); + for (i = 128; i < 256; i++) { + if (i == 0xA0) { + continue; + } + stored += set_regclass_bit(pRExC_state, ret, + UNI_TO_NATIVE(i), + &l1_fold_invlist, + &unicode_alternate); + } + break; + case ANYOF_LOWER: + case ANYOF_NLOWER: + { /* These require special handling, as they differ under + folding, matching Cased there (which in the ASCII range + is the same as Alpha */ + + SV* ascii_source; + SV* l1_source; + const char *Xname; + + if (FOLD && ! 
LOC) { + ascii_source = PL_PosixAlpha; + l1_source = PL_L1Cased; + Xname = "Cased"; + } + else { + ascii_source = PL_PosixLower; + l1_source = PL_L1PosixLower; + Xname = "XPosixLower"; + } + if (namedclass == ANYOF_LOWER) { + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + ascii_source, l1_source, Xname, listsv); + } + else { + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, + properties, ascii_source, l1_source, Xname, listsv); + } + break; + } + case ANYOF_PRINT: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + break; + case ANYOF_NPRINT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + break; + case ANYOF_PUNCT: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + break; + case ANYOF_NPUNCT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + break; + case ANYOF_PSXSPC: + DO_POSIX(ret, namedclass, properties, + PL_PosixSpace, PL_XPosixSpace); + break; + case ANYOF_NPSXSPC: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixSpace, PL_XPosixSpace); + break; + case ANYOF_SPACE: + DO_POSIX(ret, namedclass, properties, + PL_PerlSpace, PL_XPerlSpace); + break; + case ANYOF_NSPACE: + DO_N_POSIX(ret, namedclass, properties, + PL_PerlSpace, PL_XPerlSpace); + break; + case ANYOF_UPPER: /* Same as LOWER, above */ + case ANYOF_NUPPER: + { + SV* ascii_source; + SV* l1_source; + const char *Xname; + + if (FOLD && ! 
LOC) { + ascii_source = PL_PosixAlpha; + l1_source = PL_L1Cased; + Xname = "Cased"; + } + else { + ascii_source = PL_PosixUpper; + l1_source = PL_L1PosixUpper; + Xname = "XPosixUpper"; + } + if (namedclass == ANYOF_UPPER) { + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + ascii_source, l1_source, Xname, listsv); + } + else { + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, + properties, ascii_source, l1_source, Xname, listsv); + } + break; + } + case ANYOF_ALNUM: /* Really is 'Word' */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + break; + case ANYOF_NALNUM: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + break; + case ANYOF_VERTWS: + /* For these, we use the nonbitmap, as /d doesn't make a + * difference in what these match. There would be problems + * if these characters had folds other than themselves, as + * nonbitmap is subject to folding */ + _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap); + break; + case ANYOF_NVERTWS: + _invlist_union_complement_2nd(nonbitmap, + PL_VertSpace, &nonbitmap); + break; + case ANYOF_XDIGIT: + DO_POSIX(ret, namedclass, properties, + PL_PosixXDigit, PL_XPosixXDigit); + break; + case ANYOF_NXDIGIT: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixXDigit, PL_XPosixXDigit); + break; + case ANYOF_MAX: + /* this is to handle \p and \P */ + break; + default: + vFAIL("Invalid [::] class"); + break; + } + + continue; + } + } /* end of namedclass \blah */ + + if (range) { + if (prevvalue > (IV)value) /* b-a */ { + const int w = RExC_parse - rangebegin; + Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); + range = 0; /* not a valid range */ + } + } + else { + prevvalue = value; /* save the beginning of the range */ + if (RExC_parse+1 < RExC_end + && *RExC_parse == '-' + && RExC_parse[1] != ']') + { + RExC_parse++; + + /* a bad range like \w-, [:word:]- ? 
*/ + if (namedclass > OOB_NAMEDCLASS) { + if (ckWARN(WARN_REGEXP)) { + const int w = + RExC_parse >= rangebegin ? + RExC_parse - rangebegin : 0; + vWARN4(RExC_parse, + "False [] range \"%*.*s\"", + w, w, rangebegin); + } + if (!SIZE_ONLY) + stored += + set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate); + } else + range = 1; /* yeah, it's a range! */ + continue; /* but do it the next time */ + } + } + + /* non-Latin1 code point implies unicode semantics. Must be set in + * pass1 so is there for the whole of pass 2 */ + if (value > 255) { + RExC_uni_semantics = 1; + } + + /* now is the next time */ + if (!SIZE_ONLY) { + if (prevvalue < 256) { + const IV ceilvalue = value < 256 ? value : 255; + IV i; +#ifdef EBCDIC + /* In EBCDIC [\x89-\x91] should include + * the \x8e but [i-j] should not. */ + if (literal_endpoint == 2 && + ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || + (isUPPER(prevvalue) && isUPPER(ceilvalue)))) + { + if (isLOWER(prevvalue)) { + for (i = prevvalue; i <= ceilvalue; i++) + if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored += + set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); + } + } else { + for (i = prevvalue; i <= ceilvalue; i++) + if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored += + set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); + } + } + } + else +#endif + for (i = prevvalue; i <= ceilvalue; i++) { + stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); + } + } + if (value > 255) { + const UV prevnatvalue = NATIVE_TO_UNI(prevvalue); + const UV natvalue = NATIVE_TO_UNI(value); + nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue); + } +#ifdef EBCDIC + literal_endpoint = 0; +#endif + } + + range = 0; /* this range (if it was one) is done now */ + } + + + + if (SIZE_ONLY) + return ret; + /****** !SIZE_ONLY AFTER HERE *********/ + + /* If folding and there are code points above 255, we 
calculate all + * characters that could fold to or from the ones already on the list */ + if (FOLD && nonbitmap) { + UV start, end; /* End points of code point ranges */ + + SV* fold_intersection = NULL; + + /* This is a list of all the characters that participate in folds + * (except marks, etc in multi-char folds */ + if (! PL_utf8_foldable) { + SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0); + PL_utf8_foldable = _swash_to_invlist(swash); + SvREFCNT_dec(swash); + } + + /* This is a hash that for a particular fold gives all characters + * that are involved in it */ + if (! PL_utf8_foldclosures) { + + /* If we were unable to find any folds, then we likely won't be + * able to find the closures. So just create an empty list. + * Folding will effectively be restricted to the non-Unicode rules + * hard-coded into Perl. (This case happens legitimately during + * compilation of Perl itself before the Unicode tables are + * generated) */ + if (invlist_len(PL_utf8_foldable) == 0) { + PL_utf8_foldclosures = newHV(); + } else { + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + STRLEN dummy_len; + + /* This particular string is above \xff in both UTF-8 and + * UTFEBCDIC */ + to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len); + assert(PL_utf8_tofold); /* Verify that worked */ + } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); + } + } + + /* Only the characters in this class that participate in folds need be + * checked. Get the intersection of this class and all the possible + * characters that are foldable. 
This can quickly narrow down a large + * class */ + _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection); + + /* Now look at the foldable characters in this class individually */ + invlist_iterinit(fold_intersection); + while (invlist_iternext(fold_intersection, &start, &end)) { + UV j; + + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + + /* Get its fold */ + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + const UV f = + _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold); + + if (foldlen > (STRLEN)UNISKIP(f)) { + + /* Any multicharacter foldings (disallowed in lookbehind + * patterns) require the following transform: [ABCDEF] -> + * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F + * folds into "rst", all other characters fold to single + * characters. We save away these multicharacter foldings, + * to be later saved as part of the additional "s" data. */ + if (! RExC_in_lookbehind) { + U8* loc = foldbuf; + U8* e = foldbuf + foldlen; + + /* If any of the folded characters of this are in the + * Latin1 range, tell the regex engine that this can + * match a non-utf8 target string. The only multi-byte + * fold whose source is in the Latin1 range (U+00DF) + * applies only when the target string is utf8, or + * under unicode rules */ + if (j > 255 || AT_LEAST_UNI_SEMANTICS) { + while (loc < e) { + + /* Can't mix ascii with non- under /aa */ + if (MORE_ASCII_RESTRICTED + && (isASCII(*loc) != isASCII(j))) + { + goto end_multi_fold; + } + if (UTF8_IS_INVARIANT(*loc) + || UTF8_IS_DOWNGRADEABLE_START(*loc)) + { + /* Can't mix above and below 256 under LOC + */ + if (LOC) { + goto end_multi_fold; + } + ANYOF_FLAGS(ret) + |= ANYOF_NONBITMAP_NON_UTF8; + break; + } + loc += UTF8SKIP(loc); + } + } + + add_alternate(&unicode_alternate, foldbuf, foldlen); + end_multi_fold: ; + } + + /* This is special-cased, as it is the only letter which + * has both a multi-fold and single-fold in Latin1. 
All + * the other chars that have single and multi-folds are + * always in utf8, and the utf8 folding algorithm catches + * them */ + if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) { + stored += set_regclass_bit(pRExC_state, + ret, + LATIN_SMALL_LETTER_SHARP_S, + &l1_fold_invlist, &unicode_alternate); + } + } + else { + /* Single character fold. Add everything in its fold + * closure to the list that this node should match */ + SV** listp; + + /* The fold closures data structure is a hash with the keys + * being every character that is folded to, like 'k', and + * the values each an array of everything that folds to its + * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_len(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non-; + * /l doesn't allow them between above and below + * 256 */ + if ((MORE_ASCII_RESTRICTED + && (isASCII(c) != isASCII(j))) + || (LOC && ((c < 256) != (j < 256)))) + { + continue; + } + + if (c < 256 && AT_LEAST_UNI_SEMANTICS) { + stored += set_regclass_bit(pRExC_state, + ret, + (U8) c, + &l1_fold_invlist, &unicode_alternate); + } + /* It may be that the code point is already in + * this range or already in the bitmap, in + * which case we need do nothing */ + else if ((c < start || c > end) + && (c > 255 + || ! ANYOF_BITMAP_TEST(ret, c))) + { + nonbitmap = add_cp_to_invlist(nonbitmap, c); + } + } + } + } + } + } + SvREFCNT_dec(fold_intersection); + } + + /* Combine the two lists into one. 
*/ + if (l1_fold_invlist) { + if (nonbitmap) { + _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap); + SvREFCNT_dec(l1_fold_invlist); + } + else { + nonbitmap = l1_fold_invlist; + } + } + + /* And combine the result (if any) with any inversion list from properties. + * The lists are kept separate up to now because we don't want to fold the + * properties */ + if (properties) { + if (nonbitmap) { + _invlist_union(nonbitmap, properties, &nonbitmap); + SvREFCNT_dec(properties); + } + else { + nonbitmap = properties; + } + } + + /* Here, contains all the code points we can determine at + * compile time that we haven't put into the bitmap. Go through it, and + * for things that belong in the bitmap, put them there, and delete from + * */ + if (nonbitmap) { + + /* Above-ASCII code points in /d have to stay in , as they + * possibly only should match when the target string is UTF-8 */ + UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255; + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through */ + invlist_iterinit(nonbitmap); + while (invlist_iternext(nonbitmap, &start, &end)) { + UV high; + int i; + + /* Quit if are above what we should change */ + if (start > max_cp_to_set) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < max_cp_to_set) ? end : max_cp_to_set; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(ret, i)) { + ANYOF_BITMAP_SET(ret, i); + stored++; + prevvalue = value; + value = i; + } + } + } + + /* Done with loop; remove any code points that are in the bitmap from + * */ + if (change_invlist) { + _invlist_subtract(nonbitmap, + (DEPENDS_SEMANTICS) + ? 
PL_ASCII + : PL_Latin1, + &nonbitmap); + } + + /* If have completely emptied it, remove it completely */ + if (invlist_len(nonbitmap) == 0) { + SvREFCNT_dec(nonbitmap); + nonbitmap = NULL; + } + } + + /* Here, we have calculated what code points should be in the character + * class. does not overlap the bitmap except possibly in the + * case of DEPENDS rules. + * + * Now we can see about various optimizations. Fold calculation (which we + * did above) needs to take place before inversion. Otherwise /[^k]/i + * would invert to include K, which under /i would match k, which it + * shouldn't. */ + + /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't + * set the FOLD flag yet, so this does optimize those. It doesn't + * optimize locale. Doing so perhaps could be done as long as there is + * nothing like \w in it; some thought also would have to be given to the + * interaction with above 0x100 chars */ + if ((ANYOF_FLAGS(ret) & ANYOF_INVERT) + && ! LOC + && ! unicode_alternate + /* In case of /d, there are some things that should match only when in + * not in the bitmap, i.e., they require UTF8 to match. These are + * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this + * case, they don't require UTF8, so can invert here */ + && (! nonbitmap + || ! DEPENDS_SEMANTICS + || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) + && SvCUR(listsv) == initial_listsv_len) + { + int i; + if (! nonbitmap) { + for (i = 0; i < 256; ++i) { + if (ANYOF_BITMAP_TEST(ret, i)) { + ANYOF_BITMAP_CLEAR(ret, i); + } + else { + ANYOF_BITMAP_SET(ret, i); + prevvalue = value; + value = i; + } + } + /* The inversion means that everything above 255 is matched */ + ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; + } + else { + /* Here, also has things outside the bitmap that may overlap with + * the bitmap. We have to sync them up, so that they get inverted + * in both places. 
Earlier, we removed all overlaps except in the + * case of /d rules, so no syncing is needed except for this case + */ + SV *remove_list = NULL; + + if (DEPENDS_SEMANTICS) { + UV start, end; + + /* Set the bits that correspond to the ones that aren't in the + * bitmap. Otherwise, when we invert, we'll miss these. + * Earlier, we removed from the nonbitmap all code points + * < 128, so there is no extra work here */ + invlist_iterinit(nonbitmap); + while (invlist_iternext(nonbitmap, &start, &end)) { + if (start > 255) { /* The bit map goes to 255 */ + break; + } + if (end > 255) { + end = 255; + } + for (i = start; i <= (int) end; ++i) { + ANYOF_BITMAP_SET(ret, i); + prevvalue = value; + value = i; + } + } + } + + /* Now invert both the bitmap and the nonbitmap. Anything in the + * bitmap has to also be removed from the non-bitmap, but again, + * there should not be overlap unless is /d rules. */ + _invlist_invert(nonbitmap); + + /* Any swash can't be used as-is, because we've inverted things */ + if (swash) { + SvREFCNT_dec(swash); + swash = NULL; + } + + for (i = 0; i < 256; ++i) { + if (ANYOF_BITMAP_TEST(ret, i)) { + ANYOF_BITMAP_CLEAR(ret, i); + if (DEPENDS_SEMANTICS) { + if (! 
remove_list) { + remove_list = _new_invlist(2); + } + remove_list = add_cp_to_invlist(remove_list, i); + } + } + else { + ANYOF_BITMAP_SET(ret, i); + prevvalue = value; + value = i; + } + } + + /* And do the removal */ + if (DEPENDS_SEMANTICS) { + if (remove_list) { + _invlist_subtract(nonbitmap, remove_list, &nonbitmap); + SvREFCNT_dec(remove_list); + } + } + else { + /* There is no overlap for non-/d, so just delete anything + * below 256 */ + _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap); + } + } + + stored = 256 - stored; + + /* Clear the invert flag since have just done it here */ + ANYOF_FLAGS(ret) &= ~ANYOF_INVERT; + } + + /* Folding in the bitmap is taken care of above, but not for locale (for + * which we have to wait to see what folding is in effect at runtime), and + * for some things not in the bitmap (only the upper latin folds in this + * case, as all other single-char folding has been set above). Set + * run-time fold flag for these */ + if (FOLD && (LOC + || (DEPENDS_SEMANTICS + && nonbitmap + && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) + || unicode_alternate)) + { + ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; + } + + /* A single character class can be "optimized" into an EXACTish node. + * Note that since we don't currently count how many characters there are + * outside the bitmap, we are XXX missing optimization possibilities for + * them. This optimization can't happen unless this is a truly single + * character class, which means that it can't be an inversion into a + * many-character class, and there must be no possibility of there being + * things outside the bitmap. 
'stored' (only) for locales doesn't include + * \w, etc, so have to make a special test that they aren't present + * + * Similarly A 2-character class of the very special form like [bB] can be + * optimized into an EXACTFish node, but only for non-locales, and for + * characters which only have the two folds; so things like 'fF' and 'Ii' + * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE + * FI'. */ + if (! nonbitmap + && ! unicode_alternate + && SvCUR(listsv) == initial_listsv_len + && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL)) + && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) + || (! ANYOF_CLASS_TEST_ANY_SET(ret))))) + || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) + && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)) + /* If the latest code point has a fold whose + * bit is set, it must be the only other one */ + && ((prevvalue = PL_fold_latin1[value]) != (IV)value) + && ANYOF_BITMAP_TEST(ret, prevvalue))))) + { + /* Note that the information needed to decide to do this optimization + * is not currently available until the 2nd pass, and that the actually + * used EXACTish node takes less space than the calculated ANYOF node, + * and hence the amount of space calculated in the first pass is larger + * than actually used, so this optimization doesn't gain us any space. + * But an EXACT node is faster than an ANYOF node, and can be combined + * with any adjacent EXACT nodes later by the optimizer for further + * gains. 
The speed of executing an EXACTF is similar to an ANYOF + * node, so the optimization advantage comes from the ability to join + * it to adjacent EXACT nodes */ + + const char * cur_parse= RExC_parse; + U8 op; + RExC_emit = (regnode *)orig_emit; + RExC_parse = (char *)orig_parse; + + if (stored == 1) { + + /* A locale node with one point can be folded; all the other cases + * with folding will have two points, since we calculate them above + */ + if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) { + op = EXACTFL; + } + else { + op = EXACT; + } + } + else { /* else 2 chars in the bit map: the folds of each other */ + + /* Use the folded value, which for the cases where we get here, + * is just the lower case of the current one (which may resolve to + * itself, or to the other one */ + value = toLOWER_LATIN1(value); + + /* To join adjacent nodes, they must be the exact EXACTish type. + * Try to use the most likely type, by using EXACTFA if possible, + * then EXACTFU if the regex calls for it, or is required because + * the character is non-ASCII. (If is ASCII, its fold is + * also ASCII for the cases where we get here.) */ + if (MORE_ASCII_RESTRICTED && isASCII(value)) { + op = EXACTFA; + } + else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) { + op = EXACTFU; + } + else { /* Otherwise, more likely to be EXACTF type */ + op = EXACTF; + } + } + + ret = reg_node(pRExC_state, op); + RExC_parse = (char *)cur_parse; + if (UTF && ! NATIVE_IS_INVARIANT(value)) { + *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value); + *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value); + STR_LEN(ret)= 2; + RExC_emit += STR_SZ(2); + } + else { + *STRING(ret)= (char)value; + STR_LEN(ret)= 1; + RExC_emit += STR_SZ(1); + } + SvREFCNT_dec(listsv); + return ret; + } + + /* If there is a swash and more than one element, we can't use the swash in + * the optimization below. */ + if (swash && element_count > 1) { + SvREFCNT_dec(swash); + swash = NULL; + } + if (! 
nonbitmap + && SvCUR(listsv) == initial_listsv_len + && ! unicode_alternate) + { + ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); + SvREFCNT_dec(listsv); + SvREFCNT_dec(unicode_alternate); + } + else { + /* av[0] stores the character class description in its textual form: + * used later (regexec.c:Perl_regclass_swash()) to initialize the + * appropriate swash, and is also useful for dumping the regnode. + * av[1] if NULL, is a placeholder to later contain the swash computed + * from av[0]. But if no further computation need be done, the + * swash is stored there now. + * av[2] stores the multicharacter foldings, used later in + * regexec.c:S_reginclass(). + * av[3] stores the nonbitmap inversion list for use in addition or + * instead of av[0]; not used if av[1] isn't NULL + * av[4] is set if any component of the class is from a user-defined + * property; not used if av[1] isn't NULL */ + AV * const av = newAV(); + SV *rv; + + av_store(av, 0, (SvCUR(listsv) == initial_listsv_len) + ? &PL_sv_undef + : listsv); + if (swash) { + av_store(av, 1, swash); + SvREFCNT_dec(nonbitmap); + } + else { + av_store(av, 1, NULL); + if (nonbitmap) { + av_store(av, 3, nonbitmap); + av_store(av, 4, newSVuv(has_user_defined_property)); + } + } + + /* Store any computed multi-char folds only if we are allowing + * them */ + if (allow_full_fold) { + av_store(av, 2, MUTABLE_SV(unicode_alternate)); + if (unicode_alternate) { /* This node is variable length */ + OP(ret) = ANYOFV; + } + } + else { + av_store(av, 2, NULL); + } + rv = newRV_noinc(MUTABLE_SV(av)); + n = add_data(pRExC_state, 1, "s"); + RExC_rxi->data->data[n] = (void*)rv; + ARG_SET(ret, n); + } + return ret; +} + + +/* reg_skipcomment() + + Absorbs an /x style # comments from the input stream. + Returns true if there is more text remaining in the stream. + Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment + terminates the pattern without including a newline. 
+ + Note its the callers responsibility to ensure that we are + actually in /x mode + +*/ + +STATIC bool +S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) +{ + bool ended = 0; + + PERL_ARGS_ASSERT_REG_SKIPCOMMENT; + + while (RExC_parse < RExC_end) + if (*RExC_parse++ == '\n') { + ended = 1; + break; + } + if (!ended) { + /* we ran off the end of the pattern without ending + the comment, so we have to add an \n when wrapping */ + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + return 0; + } else + return 1; +} + +/* nextchar() + + Advances the parse position, and optionally absorbs + "whitespace" from the inputstream. + + Without /x "whitespace" means (?#...) style comments only, + with /x this means (?#...) and # comments and whitespace proper. + + Returns the RExC_parse point from BEFORE the scan occurs. + + This is the /x friendly way of saying RExC_parse++. +*/ + +STATIC char* +S_nextchar(pTHX_ RExC_state_t *pRExC_state) +{ + char* const retval = RExC_parse++; + + PERL_ARGS_ASSERT_NEXTCHAR; + + for (;;) { + if (RExC_end - RExC_parse >= 3 + && *RExC_parse == '(' + && RExC_parse[1] == '?' + && RExC_parse[2] == '#') + { + while (*RExC_parse != ')') { + if (RExC_parse == RExC_end) + FAIL("Sequence (?#... not terminated"); + RExC_parse++; + } + RExC_parse++; + continue; + } + if (RExC_flags & RXf_PMf_EXTENDED) { + if (isSPACE(*RExC_parse)) { + RExC_parse++; + continue; + } + else if (*RExC_parse == '#') { + if ( reg_skipcomment( pRExC_state ) ) + continue; + } + } + return retval; + } +} + +/* +- reg_node - emit a node +*/ +STATIC regnode * /* Location. 
*/ +S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) +{ + dVAR; + register regnode *ptr; + regnode * const ret = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NODE; + + if (SIZE_ONLY) { + SIZE_ALIGN(RExC_size); + RExC_size += 1; + return(ret); + } + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, RExC_emit, RExC_emit_bound); + + NODE_ALIGN_FILL(ret); + ptr = ret; + FILL_ADVANCE_NODE(ptr, op); + REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", + "reg_node", __LINE__, + PL_reg_name[op], + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(RExC_emit - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); + } +#endif + RExC_emit = ptr; + return(ret); +} + +/* +- reganode - emit a node with an argument +*/ +STATIC regnode * /* Location. */ +S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) +{ + dVAR; + register regnode *ptr; + regnode * const ret = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGANODE; + + if (SIZE_ONLY) { + SIZE_ALIGN(RExC_size); + RExC_size += 2; + /* + We can't do this: + + assert(2==regarglen[op]+1); + + Anything larger than this has to allocate the extra amount. + If we changed this to be: + + RExC_size += (1 + regarglen[op]); + + then it wouldn't matter. Its not clear what side effect + might come from that so its not done so far. 
+ -- dmq + */ + return(ret); + } + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, RExC_emit, RExC_emit_bound); + + NODE_ALIGN_FILL(ret); + ptr = ret; + FILL_ADVANCE_NODE_ARG(ptr, op, arg); + REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + "reganode", + __LINE__, + PL_reg_name[op], + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? + "Overwriting end of array!\n" : "OK", + (UV)(RExC_emit - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Cur_Node_Offset; + } +#endif + RExC_emit = ptr; + return(ret); +} + +/* +- reguni - emit (if appropriate) a Unicode character +*/ +STATIC STRLEN +S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) +{ + dVAR; + + PERL_ARGS_ASSERT_REGUNI; + + return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); +} + +/* +- reginsert - insert an operator in front of already-emitted operand +* +* Means relocating the operand. +*/ +STATIC void +S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) +{ + dVAR; + register regnode *src; + register regnode *dst; + register regnode *place; + const int offset = regarglen[(U8)op]; + const int size = NODE_STEP_REGNODE + offset; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGINSERT; + PERL_UNUSED_ARG(depth); +/* (PL_regkind[(U8)op] == CURLY ? 
EXTRA_STEP_2ARGS : 0); */ + DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); + if (SIZE_ONLY) { + RExC_size += size; + return; + } + + src = RExC_emit; + RExC_emit += size; + dst = RExC_emit; + if (RExC_open_parens) { + int paren; + /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ + for ( paren=0 ; paren < RExC_npar ; paren++ ) { + if ( RExC_open_parens[paren] >= opnd ) { + /*DEBUG_PARSE_FMT("open"," - %d",size);*/ + RExC_open_parens[paren] += size; + } else { + /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ + } + if ( RExC_close_parens[paren] >= opnd ) { + /*DEBUG_PARSE_FMT("close"," - %d",size);*/ + RExC_close_parens[paren] += size; + } else { + /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ + } + } + } + + while (src > opnd) { + StructCopy(--src, --dst, regnode); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD 20010112 */ + MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", + "reg_insert", + __LINE__, + PL_reg_name[op], + (UV)(dst - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(src - RExC_emit_start), + (UV)(dst - RExC_emit_start), + (UV)RExC_offsets[0])); + Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); + Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); + } +#endif + } + + + place = opnd; /* Op node, where operand used to be. */ +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + "reginsert", + __LINE__, + PL_reg_name[op], + (UV)(place - RExC_emit_start) > RExC_offsets[0] + ? 
"Overwriting end of array!\n" : "OK", + (UV)(place - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Node_Offset(place, RExC_parse); + Set_Node_Length(place, 1); + } +#endif + src = NEXTOPER(place); + FILL_ADVANCE_NODE(place, op); + REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1); + Zero(src, offset, regnode); +} + +/* +- regtail - set the next-pointer at the end of a node chain of p to val. +- SEE ALSO: regtail_study +*/ +/* TODO: All three parms should be const */ +STATIC void +S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +{ + dVAR; + register regnode *scan; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + if (SIZE_ONLY) + return; + + /* Find last node. */ + scan = p; + for (;;) { + regnode * const temp = regnext(scan); + DEBUG_PARSE_r({ + SV * const mysv=sv_newmortal(); + DEBUG_PARSE_MSG((scan==p ? "tail" : "")); + regprop(RExC_rx, mysv, scan); + PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", + SvPV_nolen_const(mysv), REG_NODE_NUM(scan), + (temp == NULL ? "->" : ""), + (temp == NULL ? PL_reg_name[OP(val)] : "") + ); + }); + if (temp == NULL) + break; + scan = temp; + } + + if (reg_off_by_arg[OP(scan)]) { + ARG_SET(scan, val - scan); + } + else { + NEXT_OFF(scan) = val - scan; + } +} + +#ifdef DEBUGGING +/* +- regtail_study - set the next-pointer at the end of a node chain of p to val. +- Look for optimizable sequences at the same time. +- currently only looks for EXACT chains. + +This is experimental code. The idea is to use this routine to perform +in place optimizations on branches and groups as they are constructed, +with the long term intention of removing optimization from study_chunk so +that it is purely analytical. + +Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used +to control which is which. 
+
+Return value: PSEUDO when no EXACT-ish node was met (this includes the
+sizing pass, where the chain is not examined at all); the opcode shared by
+the EXACT-ish nodes when every node on the chain is either that one
+EXACT-ish type or NOTHING; 0 otherwise.  When EXPERIMENTAL_INPLACESCAN is
+defined, EXACT is returned as soon as join_exact() reports success.
+
+*/
+/* TODO: All four parms should be const */
+
+STATIC U8
+S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
+{
+    dVAR;
+    register regnode *scan;
+    U8 exact = PSEUDO;      /* PSEUDO means "no EXACT-ish node seen yet" */
+#ifdef EXPERIMENTAL_INPLACESCAN
+    I32 min = 0;
+#endif
+    GET_RE_DEBUG_FLAGS_DECL;
+
+    PERL_ARGS_ASSERT_REGTAIL_STUDY;
+
+
+    if (SIZE_ONLY)
+        return exact;
+
+    /* Find last node. */
+
+    scan = p;
+    for (;;) {
+        regnode * const temp = regnext(scan);
+#ifdef EXPERIMENTAL_INPLACESCAN
+        if (PL_regkind[OP(scan)] == EXACT) {
+            bool has_exactf_sharp_s;	/* Unexamined in this routine */
+            if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
+                return EXACT;
+        }
+#endif
+        /* Once exact has dropped to 0 the chain is known to be mixed, so
+         * the remaining nodes are no longer classified */
+        if ( exact ) {
+            switch (OP(scan)) {
+                case EXACT:
+                case EXACTF:
+                case EXACTFA:
+                case EXACTFU:
+                case EXACTFU_SS:
+                case EXACTFU_TRICKYFOLD:
+                case EXACTFL:
+                        /* Remember the first EXACT-ish type met; a later
+                         * node of a different type resets the result */
+                        if( exact == PSEUDO )
+                            exact= OP(scan);
+                        else if ( exact != OP(scan) )
+                            exact= 0;
+                        /* FALLTHROUGH */
+                case NOTHING:
+                    break;
+                default:
+                    exact= 0;
+            }
+        }
+        DEBUG_PARSE_r({
+            SV * const mysv=sv_newmortal();
+            DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
+            regprop(RExC_rx, mysv, scan);
+            PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
+                SvPV_nolen_const(mysv),
+                REG_NODE_NUM(scan),
+                PL_reg_name[exact]);
+        });
+        if (temp == NULL)
+            break;
+        scan = temp;
+    }
+    DEBUG_PARSE_r({
+        SV * const mysv_val=sv_newmortal();
+        DEBUG_PARSE_MSG("");
+        regprop(RExC_rx, mysv_val, val);
+        PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
+              SvPV_nolen_const(mysv_val),
+              (IV)REG_NODE_NUM(val),
+              (IV)(val - scan)
+        );
+    });
+    /* Link the last node of the chain to val, exactly as regtail() does */
+    if (reg_off_by_arg[OP(scan)]) {
+        ARG_SET(scan, val - scan);
+    }
+    else {
+        NEXT_OFF(scan) = val - scan;
+    }
+
+    return exact;
+}
+#endif
+
+/*
+ - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
+ */
+#ifdef DEBUGGING
+static void
+S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
+{
+    int bit;
+    int set=0;
+    regex_charset cs;
+
+    for (bit=0; bit<32; bit++) {
+        /* NOTE(review): text appears to have been lost from the next line
+         * when this diff was extracted -- everything between "(1<" and
+         * "program" is missing, which looks like the rest of this function
+         * and the opening of the one that follows.  TODO: restore it from
+         * the upstream file before relying on this hunk. */
+        if (flags & (1<program, ri->program + 1, NULL, NULL, sv, 0, 0);
+
+    /* Header fields of interest. 
*/ + if (r->anchored_substr) { + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), + RE_SV_DUMPLEN(r->anchored_substr), 30); + PerlIO_printf(Perl_debug_log, + "anchored %s%s at %"IVdf" ", + s, RE_SV_TAIL(r->anchored_substr), + (IV)r->anchored_offset); + } else if (r->anchored_utf8) { + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), + RE_SV_DUMPLEN(r->anchored_utf8), 30); + PerlIO_printf(Perl_debug_log, + "anchored utf8 %s%s at %"IVdf" ", + s, RE_SV_TAIL(r->anchored_utf8), + (IV)r->anchored_offset); + } + if (r->float_substr) { + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), + RE_SV_DUMPLEN(r->float_substr), 30); + PerlIO_printf(Perl_debug_log, + "floating %s%s at %"IVdf"..%"UVuf" ", + s, RE_SV_TAIL(r->float_substr), + (IV)r->float_min_offset, (UV)r->float_max_offset); + } else if (r->float_utf8) { + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), + RE_SV_DUMPLEN(r->float_utf8), 30); + PerlIO_printf(Perl_debug_log, + "floating utf8 %s%s at %"IVdf"..%"UVuf" ", + s, RE_SV_TAIL(r->float_utf8), + (IV)r->float_min_offset, (UV)r->float_max_offset); + } + if (r->check_substr || r->check_utf8) + PerlIO_printf(Perl_debug_log, + (const char *) + (r->check_substr == r->float_substr + && r->check_utf8 == r->float_utf8 + ? 
"(checking floating" : "(checking anchored")); + if (r->extflags & RXf_NOSCAN) + PerlIO_printf(Perl_debug_log, " noscan"); + if (r->extflags & RXf_CHECK_ALL) + PerlIO_printf(Perl_debug_log, " isall"); + if (r->check_substr || r->check_utf8) + PerlIO_printf(Perl_debug_log, ") "); + + if (ri->regstclass) { + regprop(r, sv, ri->regstclass); + PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); + } + if (r->extflags & RXf_ANCH) { + PerlIO_printf(Perl_debug_log, "anchored"); + if (r->extflags & RXf_ANCH_BOL) + PerlIO_printf(Perl_debug_log, "(BOL)"); + if (r->extflags & RXf_ANCH_MBOL) + PerlIO_printf(Perl_debug_log, "(MBOL)"); + if (r->extflags & RXf_ANCH_SBOL) + PerlIO_printf(Perl_debug_log, "(SBOL)"); + if (r->extflags & RXf_ANCH_GPOS) + PerlIO_printf(Perl_debug_log, "(GPOS)"); + PerlIO_putc(Perl_debug_log, ' '); + } + if (r->extflags & RXf_GPOS_SEEN) + PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); + if (r->intflags & PREGf_SKIP) + PerlIO_printf(Perl_debug_log, "plus "); + if (r->intflags & PREGf_IMPLICIT) + PerlIO_printf(Perl_debug_log, "implicit "); + PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen); + if (r->extflags & RXf_EVAL_SEEN) + PerlIO_printf(Perl_debug_log, "with eval "); + PerlIO_printf(Perl_debug_log, "\n"); + DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); +#else + PERL_ARGS_ASSERT_REGDUMP; + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(r); +#endif /* DEBUGGING */ +} + +/* +- regprop - printable representation of opcode +*/ +#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ +STMT_START { \ + if (do_sep) { \ + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ + if (flags & ANYOF_INVERT) \ + /*make sure the invert info is in each */ \ + sv_catpvs(sv, "^"); \ + do_sep = 0; \ + } \ +} STMT_END + +void +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) +{ +#ifdef DEBUGGING + dVAR; + register int k; + RXi_GET_DECL(prog,progi); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPROP; 
+ + sv_setpvs(sv, ""); + + if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ + /* It would be nice to FAIL() here, but this may be called from + regexec.c, and it would be hard to supply pRExC_state. */ + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); + sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ + + k = PL_regkind[OP(o)]; + + if (k == EXACT) { + sv_catpvs(sv, " "); + /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) + * is a crude hack but it may be the best for now since + * we have no flag "this EXACTish node was UTF-8" + * --jhi */ + pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], + PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_ESCAPE_NONASCII | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT | + PERL_PV_PRETTY_NOCLEAR + ); + } else if (k == TRIE) { + /* print the details of the trie in dumpuntil instead, as + * progi->data isn't available here */ + const char op = OP(o); + const U32 n = ARG(o); + const reg_ac_data * const ac = IS_TRIE_AC(op) ? + (reg_ac_data *)progi->data->data[n] : + NULL; + const reg_trie_data * const trie + = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; + + Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); + DEBUG_TRIE_COMPILE_r( + Perl_sv_catpvf(aTHX_ sv, + "", + (UV)trie->startstate, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ) + ); + if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { + int i; + int rangestart = -1; + U8* bitmap = IS_ANYOF_TRIE(op) ? 
(U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); + sv_catpvs(sv, "["); + for (i = 0; i <= 256; i++) { + if (i < 256 && BITMAP_TEST(bitmap,i)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) + put_byte(sv, rangestart); + else { + put_byte(sv, rangestart); + sv_catpvs(sv, "-"); + put_byte(sv, i - 1); + } + rangestart = -1; + } + } + sv_catpvs(sv, "]"); + } + + } else if (k == CURLY) { + if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ + Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); + } + else if (k == WHILEM && o->flags) /* Ordinal/of */ + Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); + else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { + Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ + if ( RXp_PAREN_NAMES(prog) ) { + if ( k != REF || (OP(o) < NREF)) { + AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); + SV **name= av_fetch(list, ARG(o), 0 ); + if (name) + Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); + } + else { + AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); + SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); + I32 *nums=(I32*)SvPVX(sv_dat); + SV **name= av_fetch(list, nums[0], 0 ); + I32 n; + if (name) { + for ( n=0; nflags) + Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); + } else if (k == LOGICAL) + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + else if (k == ANYOF) { + int i, rangestart = -1; + const U8 flags = ANYOF_FLAGS(o); + int do_sep = 0; + + /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ + static const char * const anyofs[] = { + "\\w", + "\\W", + "\\s", + "\\S", + "\\d", + "\\D", + "[:alnum:]", + "[:^alnum:]", + "[:alpha:]", + "[:^alpha:]", + "[:ascii:]", + "[:^ascii:]", + 
"[:cntrl:]", + "[:^cntrl:]", + "[:graph:]", + "[:^graph:]", + "[:lower:]", + "[:^lower:]", + "[:print:]", + "[:^print:]", + "[:punct:]", + "[:^punct:]", + "[:upper:]", + "[:^upper:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:blank:]", + "[:^blank:]" + }; + + if (flags & ANYOF_LOCALE) + sv_catpvs(sv, "{loc}"); + if (flags & ANYOF_LOC_NONBITMAP_FOLD) + sv_catpvs(sv, "{i}"); + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + if (flags & ANYOF_INVERT) + sv_catpvs(sv, "^"); + + /* output what the standard cp 0-255 bitmap matches */ + for (i = 0; i <= 256; i++) { + if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) + put_byte(sv, rangestart); + else { + put_byte(sv, rangestart); + sv_catpvs(sv, "-"); + put_byte(sv, i - 1); + } + do_sep = 1; + rangestart = -1; + } + } + + EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); + /* output any special charclass tests (used entirely under use locale) */ + if (ANYOF_CLASS_TEST_ANY_SET(o)) + for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) + if (ANYOF_CLASS_TEST(o,i)) { + sv_catpv(sv, anyofs[i]); + do_sep = 1; + } + + EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); + + if (flags & ANYOF_NON_UTF8_LATIN1_ALL) { + sv_catpvs(sv, "{non-utf8-latin1-all}"); + } + + /* output information about the unicode matching */ + if (flags & ANYOF_UNICODE_ALL) + sv_catpvs(sv, "{unicode_all}"); + else if (ANYOF_NONBITMAP(o)) + sv_catpvs(sv, "{unicode}"); + if (flags & ANYOF_NONBITMAP_NON_UTF8) + sv_catpvs(sv, "{outside bitmap}"); + + if (ANYOF_NONBITMAP(o)) { + SV *lv; /* Set if there is something outside the bit map */ + SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0); + bool byte_output = FALSE; /* If something in the bitmap has been + output */ + + if (lv && lv != &PL_sv_undef) { + if (sw) { + U8 s[UTF8_MAXBYTES_CASE+1]; + + for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */ + 
uvchr_to_utf8(s, i); + + if (i < 256 + && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate + things already + output as part + of the bitmap */ + && swash_fetch(sw, s, TRUE)) + { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + byte_output = TRUE; + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) { + put_byte(sv, rangestart); + } + else { + put_byte(sv, rangestart); + sv_catpvs(sv, "-"); + put_byte(sv, i-1); + } + rangestart = -1; + } + } + } + + { + char *s = savesvpv(lv); + char * const origs = s; + + while (*s && *s != '\n') + s++; + + if (*s == '\n') { + const char * const t = ++s; + + if (byte_output) { + sv_catpvs(sv, " "); + } + + while (*s) { + if (*s == '\n') { + + /* Truncate very long output */ + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } + *s = ' '; + } + else if (*s == '\t') { + *s = '-'; + } + s++; + } + if (s[-1] == ' ') + s[-1] = 0; + + sv_catpv(sv, t); + } + + out_dump: + + Safefree(origs); + } + SvREFCNT_dec(lv); + } + } + + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + } + else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) + Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); +#else + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(sv); + PERL_UNUSED_ARG(o); + PERL_UNUSED_ARG(prog); +#endif /* DEBUGGING */ +} + +SV * +Perl_re_intuit_string(pTHX_ REGEXP * const r) +{ /* Assume that RE_INTUIT is set */ + dVAR; + struct regexp *const prog = (struct regexp *)SvANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_STRING; + PERL_UNUSED_CONTEXT; + + DEBUG_COMPILE_r( + { + const char * const s = SvPV_nolen_const(prog->check_substr + ? prog->check_substr : prog->check_utf8); + + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, + "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", + PL_colors[4], + prog->check_substr ? "" : "utf8 ", + PL_colors[5],PL_colors[0], + s, + PL_colors[1], + (strlen(s) > 60 ? "..." 
: "")); + } ); + + return prog->check_substr ? prog->check_substr : prog->check_utf8; +} + +/* + pregfree() + + handles refcounting and freeing the perl core regexp structure. When + it is necessary to actually free the structure the first thing it + does is call the 'free' method of the regexp_engine associated to + the regexp, allowing the handling of the void *pprivate; member + first. (This routine is not overridable by extensions, which is why + the extensions free is called first.) + + See regdupe and regdupe_internal if you change anything here. +*/ +#ifndef PERL_IN_XSUB_RE +void +Perl_pregfree(pTHX_ REGEXP *r) +{ + SvREFCNT_dec(r); +} + +void +Perl_pregfree2(pTHX_ REGEXP *rx) +{ + dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_PREGFREE2; + + if (r->mother_re) { + ReREFCNT_dec(r->mother_re); + } else { + CALLREGFREE_PVT(rx); /* free the private data */ + SvREFCNT_dec(RXp_PAREN_NAMES(r)); + } + if (r->substrs) { + SvREFCNT_dec(r->anchored_substr); + SvREFCNT_dec(r->anchored_utf8); + SvREFCNT_dec(r->float_substr); + SvREFCNT_dec(r->float_utf8); + Safefree(r->substrs); + } + RX_MATCH_COPY_FREE(rx); +#ifdef PERL_OLD_COPY_ON_WRITE + SvREFCNT_dec(r->saved_copy); +#endif + Safefree(r->offs); +} + +/* reg_temp_copy() + + This is a hacky workaround to the structural issue of match results + being stored in the regexp structure which is in turn stored in + PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern + could be PL_curpm in multiple contexts, and could require multiple + result sets being associated with the pattern simultaneously, such + as when doing a recursive match with (??{$qr}) + + The solution is to make a lightweight copy of the regexp structure + when a qr// is returned from the code executed by (??{$qr}) this + lightweight copy doesn't actually own any of its data except for + the starp/end and the actual regexp structure itself. 
+ +*/ + + +REGEXP * +Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) +{ + struct regexp *ret; + struct regexp *const r = (struct regexp *)SvANY(rx); + register const I32 npar = r->nparens+1; + + PERL_ARGS_ASSERT_REG_TEMP_COPY; + + if (!ret_x) + ret_x = (REGEXP*) newSV_type(SVt_REGEXP); + ret = (struct regexp *)SvANY(ret_x); + + (void)ReREFCNT_inc(rx); + /* We can take advantage of the existing "copied buffer" mechanism in SVs + by pointing directly at the buffer, but flagging that the allocated + space in the copy is zero. As we've just done a struct copy, it's now + a case of zero-ing that, rather than copying the current length. */ + SvPV_set(ret_x, RX_WRAPPED(rx)); + SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8); + memcpy(&(ret->xpv_cur), &(r->xpv_cur), + sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); + SvLEN_set(ret_x, 0); + SvSTASH_set(ret_x, NULL); + SvMAGIC_set(ret_x, NULL); + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + if (r->substrs) { + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + SvREFCNT_inc_void(ret->anchored_substr); + SvREFCNT_inc_void(ret->anchored_utf8); + SvREFCNT_inc_void(ret->float_substr); + SvREFCNT_inc_void(ret->float_utf8); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + } + RX_MATCH_COPIED_off(ret_x); +#ifdef PERL_OLD_COPY_ON_WRITE + ret->saved_copy = NULL; +#endif + ret->mother_re = rx; + + return ret_x; +} +#endif + +/* regfree_internal() + + Free the private data in a regexp. This is overloadable by + extensions. Perl takes care of the regexp structure in pregfree(), + this covers the *pprivate pointer which technically perl doesn't + know about, however of course we have to handle the + regexp_internal structure when no extension is in use. 
+ + Note this is called before freeing anything in the regexp + structure. + */ + +void +Perl_regfree_internal(pTHX_ REGEXP * const rx) +{ + dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); + RXi_GET_DECL(r,ri); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGFREE_INTERNAL; + + DEBUG_COMPILE_r({ + if (!PL_colorset) + reginitcolors(); + { + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RX_UTF8(rx), + dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); + PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", + PL_colors[4],PL_colors[5],s); + } + }); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) + Safefree(ri->u.offsets); /* 20010421 MJD */ +#endif + if (ri->data) { + int n = ri->data->count; + PAD* new_comppad = NULL; + PAD* old_comppad; + PADOFFSET refcnt; + + while (--n >= 0) { + /* If you add a ->what type here, update the comment in regcomp.h */ + switch (ri->data->what[n]) { + case 'a': + case 's': + case 'S': + case 'u': + SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); + break; + case 'f': + Safefree(ri->data->data[n]); + break; + case 'p': + new_comppad = MUTABLE_AV(ri->data->data[n]); + break; + case 'o': + if (new_comppad == NULL) + Perl_croak(aTHX_ "panic: pregfree comppad"); + PAD_SAVE_LOCAL(old_comppad, + /* Watch out for global destruction's random ordering. */ + (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL + ); + OP_REFCNT_LOCK; + refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]); + OP_REFCNT_UNLOCK; + if (!refcnt) + op_free((OP_4tree*)ri->data->data[n]); + + PAD_RESTORE_LOCAL(old_comppad); + SvREFCNT_dec(MUTABLE_SV(new_comppad)); + new_comppad = NULL; + break; + case 'n': + break; + case 'T': + { /* Aho Corasick add-on structure for a trie node. + Used in stclass optimization only */ + U32 refcount; + reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; + OP_REFCNT_LOCK; + refcount = --aho->refcount; + OP_REFCNT_UNLOCK; + if ( !refcount ) { + PerlMemShared_free(aho->states); + PerlMemShared_free(aho->fail); + /* do this last!!!! 
*/ + PerlMemShared_free(ri->data->data[n]); + PerlMemShared_free(ri->regstclass); + } + } + break; + case 't': + { + /* trie structure. */ + U32 refcount; + reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; + OP_REFCNT_LOCK; + refcount = --trie->refcount; + OP_REFCNT_UNLOCK; + if ( !refcount ) { + PerlMemShared_free(trie->charmap); + PerlMemShared_free(trie->states); + PerlMemShared_free(trie->trans); + if (trie->bitmap) + PerlMemShared_free(trie->bitmap); + if (trie->jump) + PerlMemShared_free(trie->jump); + PerlMemShared_free(trie->wordinfo); + /* do this last!!!! */ + PerlMemShared_free(ri->data->data[n]); + } + } + break; + default: + Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]); + } + } + Safefree(ri->data->what); + Safefree(ri->data); + } + + Safefree(ri); +} + +#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t)) +#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t)) +#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) + +/* + re_dup - duplicate a regexp. + + This routine is expected to clone a given regexp structure. It is only + compiled under USE_ITHREADS. + + After all of the core data stored in struct regexp is duplicated + the regexp_engine.dupe method is used to copy any private data + stored in the *pprivate pointer. This allows extensions to handle + any duplication it needs to do. + + See pregfree() and regfree_internal() if you change anything here. 
+*/ +#if defined(USE_ITHREADS) +#ifndef PERL_IN_XSUB_RE +void +Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) +{ + dVAR; + I32 npar; + const struct regexp *r = (const struct regexp *)SvANY(sstr); + struct regexp *ret = (struct regexp *)SvANY(dstr); + + PERL_ARGS_ASSERT_RE_DUP_GUTS; + + npar = r->nparens+1; + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + if(ret->swap) { + /* no need to copy these */ + Newx(ret->swap, npar, regexp_paren_pair); + } + + if (ret->substrs) { + /* Do it this way to avoid reading from *r after the StructCopy(). + That way, if any of the sv_dup_inc()s dislodge *r from the L1 + cache, it doesn't matter. */ + const bool anchored = r->check_substr + ? r->check_substr == r->anchored_substr + : r->check_utf8 == r->anchored_utf8; + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); + ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); + ret->float_substr = sv_dup_inc(ret->float_substr, param); + ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. 
*/ + + if (ret->check_substr) { + if (anchored) { + assert(r->check_utf8 == r->anchored_utf8); + ret->check_substr = ret->anchored_substr; + ret->check_utf8 = ret->anchored_utf8; + } else { + assert(r->check_substr == r->float_substr); + assert(r->check_utf8 == r->float_utf8); + ret->check_substr = ret->float_substr; + ret->check_utf8 = ret->float_utf8; + } + } else if (ret->check_utf8) { + if (anchored) { + ret->check_utf8 = ret->anchored_utf8; + } else { + ret->check_utf8 = ret->float_utf8; + } + } + } + + RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); + + if (ret->pprivate) + RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); + + if (RX_MATCH_COPIED(dstr)) + ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); + else + ret->subbeg = NULL; +#ifdef PERL_OLD_COPY_ON_WRITE + ret->saved_copy = NULL; +#endif + + if (ret->mother_re) { + if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) { + /* Our storage points directly to our mother regexp, but that's + 1: a buffer in a different thread + 2: something we no longer hold a reference on + so we need to copy it locally. */ + /* Note we need to use SvCUR(), rather than + SvLEN(), on our mother_re, because it, in + turn, may well be pointing to its own mother_re. */ + SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re), + SvCUR(ret->mother_re)+1)); + SvLEN_set(dstr, SvCUR(ret->mother_re)+1); + } + ret->mother_re = NULL; + } + ret->gofs = 0; +} +#endif /* PERL_IN_XSUB_RE */ + +/* + regdupe_internal() + + This is the internal complement to regdupe() which is used to copy + the structure pointed to by the *pprivate pointer in the regexp. + This is the core version of the extension overridable cloning hook. + The regexp structure being duplicated will be copied by perl prior + to this and will be provided as the regexp *r argument, however + with the /old/ structures pprivate pointer value. Thus this routine + may override any copying normally done by perl. + + It returns a pointer to the new regexp_internal structure. 
+*/ + +void * +Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) +{ + dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); + regexp_internal *reti; + int len; + RXi_GET_DECL(r,ri); + + PERL_ARGS_ASSERT_REGDUPE_INTERNAL; + + len = ProgLen(ri); + + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); + Copy(ri->program, reti->program, len+1, regnode); + + + reti->regstclass = NULL; + + if (ri->data) { + struct reg_data *d; + const int count = ri->data->count; + int i; + + Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), + char, struct reg_data); + Newx(d->what, count, U8); + + d->count = count; + for (i = 0; i < count; i++) { + d->what[i] = ri->data->what[i]; + switch (d->what[i]) { + /* legal options are one of: sSfpontTua + see also regcomp.h and pregfree() */ + case 'a': /* actually an AV, but the dup function is identical. */ + case 's': + case 'S': + case 'p': /* actually an AV, but the dup function is identical. */ + case 'u': /* actually an HV, but the dup function is identical. */ + d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); + break; + case 'f': + /* This is cheating. */ + Newx(d->data[i], 1, struct regnode_charclass_class); + StructCopy(ri->data->data[i], d->data[i], + struct regnode_charclass_class); + reti->regstclass = (regnode*)d->data[i]; + break; + case 'o': + /* Compiled op trees are readonly and in shared memory, + and can thus be shared without duplication. */ + OP_REFCNT_LOCK; + d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]); + OP_REFCNT_UNLOCK; + break; + case 'T': + /* Trie stclasses are readonly and can thus be shared + * without duplication. We free the stclass in pregfree + * when the corresponding reg_ac_data struct is freed. 
+ */ + reti->regstclass= ri->regstclass; + /* Fall through */ + case 't': + OP_REFCNT_LOCK; + ((reg_trie_data*)ri->data->data[i])->refcount++; + OP_REFCNT_UNLOCK; + /* Fall through */ + case 'n': + d->data[i] = ri->data->data[i]; + break; + default: + Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]); + } + } + + reti->data = d; + } + else + reti->data = NULL; + + reti->name_list_idx = ri->name_list_idx; + +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) { + Newx(reti->u.offsets, 2*len+1, U32); + Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); + } +#else + SetProgLen(reti,len); +#endif + + return (void*)reti; +} + +#endif /* USE_ITHREADS */ + +#ifndef PERL_IN_XSUB_RE + +/* + - regnext - dig the "next" pointer out of a node + */ +regnode * +Perl_regnext(pTHX_ register regnode *p) +{ + dVAR; + register I32 offset; + + if (!p) + return(NULL); + + if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX); + } + + offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); + if (offset == 0) + return(NULL); + + return(p+offset); +} +#endif + +STATIC void +S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) +{ + va_list args; + STRLEN l1 = strlen(pat1); + STRLEN l2 = strlen(pat2); + char buf[512]; + SV *msv; + const char *message; + + PERL_ARGS_ASSERT_RE_CROAK2; + + if (l1 > 510) + l1 = 510; + if (l1 + l2 > 510) + l2 = 510 - l1; + Copy(pat1, buf, l1 , char); + Copy(pat2, buf + l1, l2 , char); + buf[l1 + l2] = '\n'; + buf[l1 + l2 + 1] = '\0'; +#ifdef I_STDARG + /* ANSI variant takes additional second argument */ + va_start(args, pat2); +#else + va_start(args); +#endif + msv = vmess(buf, &args); + va_end(args); + message = SvPV_const(msv,l1); + if (l1 > 512) + l1 = 512; + Copy(message, buf, l1 , char); + buf[l1-1] = '\0'; /* Overwrite \n */ + Perl_croak(aTHX_ "%s", buf); +} + +/* XXX Here's a total kludge. But we need to re-enter for swash routines. 
*/ + +#ifndef PERL_IN_XSUB_RE +void +Perl_save_re_context(pTHX) +{ + dVAR; + + struct re_save_state *state; + + SAVEVPTR(PL_curcop); + SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1); + + state = (struct re_save_state *)(PL_savestack + PL_savestack_ix); + PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE; + SSPUSHUV(SAVEt_RE_STATE); + + Copy(&PL_reg_state, state, 1, struct re_save_state); + + PL_reg_start_tmp = 0; + PL_reg_start_tmpl = 0; + PL_reg_oldsaved = NULL; + PL_reg_oldsavedlen = 0; + PL_reg_maxiter = 0; + PL_reg_leftiter = 0; + PL_reg_poscache = NULL; + PL_reg_poscache_size = 0; +#ifdef PERL_OLD_COPY_ON_WRITE + PL_nrs = NULL; +#endif + + /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ + if (PL_curpm) { + const REGEXP * const rx = PM_GETRE(PL_curpm); + if (rx) { + U32 i; + for (i = 1; i <= RX_NPARENS(rx); i++) { + char digits[TYPE_CHARS(long)]; + const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); + GV *const *const gvp + = (GV**)hv_fetch(PL_defstash, digits, len, 0); + + if (gvp) { + GV * const gv = *gvp; + if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) + save_scalar(gv); + } + } + } + } +} +#endif + +static void +clear_re(pTHX_ void *r) +{ + dVAR; + ReREFCNT_dec((REGEXP *)r); +} + +#ifdef DEBUGGING + +STATIC void +S_put_byte(pTHX_ SV *sv, int c) +{ + PERL_ARGS_ASSERT_PUT_BYTE; + + /* Our definition of isPRINT() ignores locales, so only bytes that are + not part of UTF-8 are considered printable. I assume that the same + holds for UTF-EBCDIC. + Also, code point 255 is not printable in either (it's E0 in EBCDIC, + which Wikipedia says: + + EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all + ones (binary 1111 1111, hexadecimal FF). It is similar, but not + identical, to the ASCII delete (DEL) or rubout control character. 
+ ) So the old condition can be simplified to !isPRINT(c) */ + if (!isPRINT(c)) { + if (c < 256) { + Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c); + } + else { + Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + } + } + else { + const char string = c; + if (c == '-' || c == ']' || c == '\\' || c == '^') + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } +} + + +#define CLEAR_OPTSTART \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ + } STMT_END + +#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); + +STATIC const regnode * +S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, + const regnode *last, const regnode *plast, + SV* sv, I32 indent, U32 depth) +{ + dVAR; + register U8 op = PSEUDO; /* Arbitrary non-END op. */ + register const regnode *next; + const regnode *optstart= NULL; + + RXi_GET_DECL(r,ri); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMPUNTIL; + +#ifdef DEBUG_DUMPUNTIL + PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, + last ? last-start : 0,plast ? plast-start : 0); +#endif + + if (plast && plast < last) + last= plast; + + while (PL_regkind[op] != END && (!last || node < last)) { + /* While that wasn't END last time... */ + NODE_ALIGN(node); + op = OP(node); + if (op == CLOSE || op == WHILEM) + indent--; + next = regnext((regnode *)node); + + /* Where, what. */ + if (OP(node) == OPTIMIZED) { + if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) + optstart = node; + else + goto after_print; + } else + CLEAR_OPTSTART; + + regprop(r, sv, node); + PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), + (int)(2*indent + 1), "", SvPVX_const(sv)); + + if (OP(node) != OPTIMIZED) { + if (next == NULL) /* Next ptr. 
*/ + PerlIO_printf(Perl_debug_log, " (0)"); + else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) + PerlIO_printf(Perl_debug_log, " (FAIL)"); + else + PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); + (void)PerlIO_putc(Perl_debug_log, '\n'); + } + + after_print: + if (PL_regkind[(U8)op] == BRANCHJ) { + assert(next); + { + register const regnode *nnode = (OP(next) == LONGJMP + ? regnext((regnode *)next) + : next); + if (last && nnode > last) + nnode = last; + DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); + } + } + else if (PL_regkind[(U8)op] == BRANCH) { + assert(next); + DUMPUNTIL(NEXTOPER(node), next); + } + else if ( PL_regkind[(U8)op] == TRIE ) { + const regnode *this_trie = node; + const char op = OP(node); + const U32 n = ARG(node); + const reg_ac_data * const ac = op>=AHOCORASICK ? + (reg_ac_data *)ri->data->data[n] : + NULL; + const reg_trie_data * const trie = + (reg_trie_data*)ri->data->data[optrie]; +#ifdef DEBUGGING + AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); +#endif + const regnode *nextbranch= NULL; + I32 word_idx; + sv_setpvs(sv, ""); + for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { + SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); + + PerlIO_printf(Perl_debug_log, "%*s%s ", + (int)(2*(indent+3)), "", + elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, + PL_colors[0], PL_colors[1], + (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT + ) + : "???" + ); + if (trie->jump) { + U16 dist= trie->jump[word_idx+1]; + PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", + (UV)((dist ? 
this_trie + dist : next) - start)); + if (dist) { + if (!nextbranch) + nextbranch= this_trie + trie->jump[0]; + DUMPUNTIL(this_trie + dist, nextbranch); + } + if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) + nextbranch= regnext((regnode *)nextbranch); + } else { + PerlIO_printf(Perl_debug_log, "\n"); + } + } + if (last && next > last) + node= last; + else + node= next; + } + else if ( op == CURLY ) { /* "next" might be very big: optimizer */ + DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, + NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); + } + else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { + assert(next); + DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); + } + else if ( op == PLUS || op == STAR) { + DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); + } + else if (PL_regkind[(U8)op] == ANYOF) { + /* arglen 1 + class block */ + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS) + ? ANYOF_CLASS_SKIP : ANYOF_SKIP); + node = NEXTOPER(node); + } + else if (PL_regkind[(U8)op] == EXACT) { + /* Literal string, where present. */ + node += NODE_SZ_STR(node) - 1; + node = NEXTOPER(node); + } + else { + node = NEXTOPER(node); + node += regarglen[(U8)op]; + } + if (op == CURLYX || op == OPEN) + indent++; + } + CLEAR_OPTSTART; +#ifdef DEBUG_DUMPUNTIL + PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); +#endif + return node; +} + +#endif /* DEBUGGING */ + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: t + * End: + * + * ex: set ts=8 sts=4 sw=4 noet: + */