From: Vincent Pit Date: Mon, 1 Sep 2014 16:04:34 +0000 (+0200) Subject: Add support for perl 5.18.2, 5.20.0, and 5.21.[0123] X-Git-Url: http://git.vpit.fr/?a=commitdiff_plain;h=819b78c9396701a0ef5fe7334e4054dd53c7ef93;p=perl%2Fmodules%2Fre-engine-Hooks.git Add support for perl 5.18.2, 5.20.0, and 5.21.[0123] The 5.19 development branch has also been removed. --- diff --git a/MANIFEST b/MANIFEST index 3d12264..1c2b067 100644 --- a/MANIFEST +++ b/MANIFEST @@ -61,22 +61,30 @@ src/5018001/dquote_static.c src/5018001/inline_invlist.c src/5018001/regcomp.c src/5018001/regexec.c -src/5019000/dquote_static.c -src/5019000/inline_invlist.c -src/5019000/regcomp.c -src/5019000/regexec.c -src/5019001/dquote_static.c -src/5019001/inline_invlist.c -src/5019001/regcomp.c -src/5019001/regexec.c -src/5019002/dquote_static.c -src/5019002/inline_invlist.c -src/5019002/regcomp.c -src/5019002/regexec.c -src/5019003/dquote_static.c -src/5019003/inline_invlist.c -src/5019003/regcomp.c -src/5019003/regexec.c +src/5018002/dquote_static.c +src/5018002/inline_invlist.c +src/5018002/regcomp.c +src/5018002/regexec.c +src/5020000/dquote_static.c +src/5020000/inline_invlist.c +src/5020000/regcomp.c +src/5020000/regexec.c +src/5021000/dquote_static.c +src/5021000/inline_invlist.c +src/5021000/regcomp.c +src/5021000/regexec.c +src/5021001/dquote_static.c +src/5021001/inline_invlist.c +src/5021001/regcomp.c +src/5021001/regexec.c +src/5021002/dquote_static.c +src/5021002/inline_invlist.c +src/5021002/regcomp.c +src/5021002/regexec.c +src/5021003/dquote_static.c +src/5021003/inline_invlist.c +src/5021003/regcomp.c +src/5021003/regexec.c src/update.pl t/00-load.t t/05-keys.t diff --git a/Makefile.PL b/Makefile.PL index da35198..abf2386 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -44,7 +44,7 @@ my %META = ( }, ); -my $latest_dev_rev = 19; +my $latest_dev_rev = 21; sub is_outdated_dev_perl { my ($rev) = "$]" =~ /^5\.([0-9]{2}[13579])/; diff --git a/src/5019000/dquote_static.c b/src/5018002/dquote_static.c similarity index 100% rename from src/5019000/dquote_static.c rename to src/5018002/dquote_static.c diff --git a/src/5019000/inline_invlist.c b/src/5018002/inline_invlist.c similarity index 100% rename from src/5019000/inline_invlist.c rename to src/5018002/inline_invlist.c diff --git a/src/5019000/orig/dquote_static.c b/src/5018002/orig/dquote_static.c similarity index 100% rename from src/5019000/orig/dquote_static.c rename to src/5018002/orig/dquote_static.c diff --git a/src/5019000/orig/inline_invlist.c b/src/5018002/orig/inline_invlist.c similarity index 100% rename from src/5019000/orig/inline_invlist.c rename to src/5018002/orig/inline_invlist.c diff --git a/src/5019000/orig/regcomp.c b/src/5018002/orig/regcomp.c similarity index 99% rename from src/5019000/orig/regcomp.c rename to src/5018002/orig/regcomp.c index fb52d47..0841f17 100644 --- a/src/5019000/orig/regcomp.c +++ b/src/5018002/orig/regcomp.c @@ -5088,16 +5088,15 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, * sv_catsv_nomg(pat, msv); * that allows us to adjust code block indices if * needed */ - STRLEN slen, dlen; + STRLEN dlen; char *dst = SvPV_force_nomg(pat, dlen); - const char *src = SvPV_flags_const(msv, slen, 0); orig_patlen = dlen; if (SvUTF8(msv) && !SvUTF8(pat)) { S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); sv_setpvn(pat, dst, dlen); SvUTF8_on(pat); } - sv_catpvn_nomg(pat, src, slen); + sv_catsv_nomg(pat, msv); rx = msv; } else @@ -5817,7 +5816,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, pRExC_state->num_code_blocks); goto redo_first_pass; } - Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags); } if (code_blocksv) SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ @@ -5989,7 +5988,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, REGC((U8)REG_MAGIC, (char*) RExC_emit++); if (reg(pRExC_state, 0, &flags,1) == NULL) { ReREFCNT_dec(rx); - Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags); } /* XXXX To minimize changes to RE engine we always allocate 3-units-long substrs field. */ @@ -6689,13 +6688,23 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - if ( ( n == RX_BUFF_IDX_CARET_PREMATCH + if ( n == RX_BUFF_IDX_CARET_PREMATCH || n == RX_BUFF_IDX_CARET_FULLMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH - ) - && !(rx->extflags & RXf_PMf_KEEPCOPY) - ) - goto ret_undef; + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } if (!rx->subbeg) goto ret_undef; @@ -6801,13 +6810,27 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + /* Some of this code was originally in C in F */ switch (paren) { case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - case RX_BUFF_IDX_PREMATCH: /* $` */ if (rx->offs[0].start != -1) { i = rx->offs[0].start; @@ -6820,8 +6843,6 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, return 0; case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; case RX_BUFF_IDX_POSTMATCH: /* $' */ if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; @@ -6833,13 +6854,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } return 0; - case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - - /* $& / ${^MATCH}, $1, $2, ... */ - default: + default: /* $& / ${^MATCH}, $1, $2, ... */ if (paren <= (I32)rx->nparens && (s1 = rx->offs[paren].start) != -1 && (t1 = rx->offs[paren].end) != -1) @@ -9176,8 +9191,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", - (UV) flags); + FAIL2("panic: regbranch returned NULL, flags=%#X", + flags); } else REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); c = *nextchar(pRExC_state); @@ -9192,8 +9207,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", - (UV) flags); + FAIL2("panic: regbranch returned NULL, flags=%#X", + flags); } REGTAIL(pRExC_state, ret, lastbr); if (flags&HASWIDTH) @@ -9281,7 +9296,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: regbranch returned NULL, flags=%#X", flags); } if (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { @@ -9326,7 +9341,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: regbranch returned NULL, flags=%#X", flags); } REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ lastbr = br; @@ -9531,7 +9546,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: regpiece returned NULL, flags=%#X", flags); } else if (ret == NULL) ret = latest; @@ -9601,7 +9616,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (flags & (TRYAGAIN|RESTART_UTF8)) *flagp |= flags & (TRYAGAIN|RESTART_UTF8); else - FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: regatom returned NULL, flags=%#X", flags); return(NULL); } @@ -9655,16 +9670,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, OPFAIL); return ret; } - else if (max == 0) { /* replace {0} with a nothing node */ - if (SIZE_ONLY) { - RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING]; - } - else { - RExC_emit = orig_emit; - } - ret = reg_node(pRExC_state, NOTHING); - return ret; - } do_curly: if ((flags&SIMPLE)) { @@ -10043,8 +10048,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I *flagp = RESTART_UTF8; return FALSE; } - FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", - (UV) flags); + FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X", + flags); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); @@ -10329,8 +10334,8 @@ tryagain: if (ret == NULL) { if (*flagp & RESTART_UTF8) return NULL; - FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", - (UV) *flagp); + FAIL2("panic: regclass returned NULL to regatom, flags=%#X", + *flagp); } nextchar(pRExC_state); Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ @@ -10352,7 +10357,7 @@ tryagain: *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; @@ -10547,8 +10552,8 @@ tryagain: /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. */ if (!ret) - FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", - (UV) *flagp); + FAIL2("panic: regclass returned NULL to regatom, flags=%#X", + *flagp); RExC_parse--; @@ -10961,6 +10966,12 @@ tryagain: if (PL_encoding && ender < 0x100) goto recode_encoding; break; + case '8': case '9': /* These are illegal unless backrefs */ + if (atoi(p) <= RExC_npar) { + --p; /* backup to backslash; handle as backref */ + goto loopdone; + } + goto unrecognized; recode_encoding: if (! RExC_override_recoding) { SV* enc = PL_encoding; @@ -10975,6 +10986,7 @@ tryagain: FAIL("Trailing \\"); /* FALL THROUGH */ default: + unrecognized: if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { /* Include any { following the alpha to emphasize * that it could be part of an escape at some point @@ -11671,8 +11683,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f FALSE, /* don't allow multi-char folds */ TRUE, /* silence non-portable warnings. */ ¤t)) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X", + *flagp); /* function call leaves parse pointing to the ']', except * if we faked it */ @@ -11740,7 +11752,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f * been parsed and evaluated to a single operand (or else is a syntax * error), and is handled as a regular operand */ - stack = newAV(); + sv_2mortal((SV *)(stack = newAV())); while (RExC_parse < RExC_end) { I32 top_index = av_tindex(stack); @@ -11837,8 +11849,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ ¤t)) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X", + *flagp); /* regclass() will return with parsing just the \ sequence, * leaving the parse pointer at the next thing to parse */ RExC_parse--; @@ -11860,8 +11872,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ ¤t)) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X", + *flagp); /* function call leaves parse pointing to the ']', except if we * faked it */ if (is_posix_class) { @@ -11912,6 +11924,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f || IS_OPERAND(lparen) || SvUV(lparen) != '(') { + SvREFCNT_dec(current); RExC_parse++; vFAIL("Unexpected ')'"); } @@ -11930,9 +11943,12 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f } else { SV* top = av_pop(stack); + SV *prev = NULL; char current_operator; if (IS_OPERAND(top)) { + SvREFCNT_dec_NN(top); + SvREFCNT_dec_NN(current); vFAIL("Operand with no preceding operator"); } current_operator = (char) SvUV(top); @@ -11959,7 +11975,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f goto handle_operand; case '&': - _invlist_intersection(av_pop(stack), + prev = av_pop(stack); + _invlist_intersection(prev, current, ¤t); av_push(stack, current); @@ -11967,12 +11984,14 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f case '|': case '+': - _invlist_union(av_pop(stack), current, ¤t); + prev = av_pop(stack); + _invlist_union(prev, current, ¤t); av_push(stack, current); break; case '-': - _invlist_subtract(av_pop(stack), current, ¤t); + prev = av_pop(stack);; + _invlist_subtract(prev, current, ¤t); av_push(stack, current); break; @@ -11982,9 +12001,12 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f SV* u = NULL; SV* element; - element = av_pop(stack); - _invlist_union(element, current, &u); - _invlist_intersection(element, current, &i); + prev = av_pop(stack); + _invlist_union(prev, current, &u); + _invlist_intersection(prev, current, &i); + /* _invlist_subtract will overwrite current + without freeing what it already contains */ + element = current; _invlist_subtract(u, i, ¤t); av_push(stack, current); SvREFCNT_dec_NN(i); @@ -11997,6 +12019,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); } SvREFCNT_dec_NN(top); + SvREFCNT_dec(prev); } } @@ -12060,7 +12083,6 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f RExC_end = save_end; SvREFCNT_dec_NN(final); SvREFCNT_dec_NN(result_string); - SvREFCNT_dec_NN(stack); nextchar(pRExC_state); Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ @@ -12665,12 +12687,21 @@ parseit: } else #endif /* Not isascii(); just use the hard-coded definition for it */ + { _invlist_union_maybe_complement_2nd( posixes, PL_ASCII, cBOOL(namedclass % 2), /* Complement if odd (NASCII) */ &posixes); + + /* The code points 128-255 added above will be + * subtracted out below under /d, so the flag needs to + * be set */ + if (namedclass == ANYOF_NASCII && DEPENDS_SEMANTICS) { + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; + } + } } else { /* Garden variety class */ @@ -13681,6 +13712,7 @@ parseit: if (ret_invlist) { *ret_invlist = cp_list; + SvREFCNT_dec(swash); /* Discard the generated node */ if (SIZE_ONLY) { diff --git a/src/5019000/orig/regexec.c b/src/5018002/orig/regexec.c similarity index 99% rename from src/5019000/orig/regexec.c rename to src/5018002/orig/regexec.c index bc38839..b865b46 100644 --- a/src/5019000/orig/regexec.c +++ b/src/5018002/orig/regexec.c @@ -6662,7 +6662,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, scan = *startposp; if (max == REG_INFTY) max = I32_MAX; - else if (! utf8_target && scan + max < loceol) + else if (! utf8_target && loceol - scan > max) loceol = scan + max; /* Here, for the case of a non-UTF-8 target we have adjusted down @@ -6711,7 +6711,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, scan = loceol; break; case CANY: /* Move forward bytes, unless goes off end */ - if (utf8_target && scan + max < loceol) { + if (utf8_target && loceol - scan > max) { /* hadn't been adjusted in the UTF-8 case */ scan += max; @@ -6730,7 +6730,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's * true iff it doesn't matter if the argument is in UTF-8 or not */ if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! is_utf8_pat)) { - if (utf8_target && scan + max < loceol) { + if (utf8_target && loceol - scan > max) { /* We didn't adjust because is UTF-8, but ok to do so, * since here, to match at all, 1 char == 1 byte */ loceol = scan + max; @@ -6910,7 +6910,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* FALLTHROUGH */ case POSIXA: - if (utf8_target && scan + max < loceol) { + if (utf8_target && loceol - scan > max) { /* We didn't adjust at the beginning of this routine * because is UTF-8, but it is actually ok to do so, since here, to diff --git a/src/5019000/regcomp.c b/src/5018002/regcomp.c similarity index 99% rename from src/5019000/regcomp.c rename to src/5018002/regcomp.c index cc7b914..7975140 100644 --- a/src/5019000/regcomp.c +++ b/src/5018002/regcomp.c @@ -5089,16 +5089,15 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, * sv_catsv_nomg(pat, msv); * that allows us to adjust code block indices if * needed */ - STRLEN slen, dlen; + STRLEN dlen; char *dst = SvPV_force_nomg(pat, dlen); - const char *src = SvPV_flags_const(msv, slen, 0); orig_patlen = dlen; if (SvUTF8(msv) && !SvUTF8(pat)) { S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); sv_setpvn(pat, dst, dlen); SvUTF8_on(pat); } - sv_catpvn_nomg(pat, src, slen); + sv_catsv_nomg(pat, msv); rx = msv; } else @@ -5818,7 +5817,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, pRExC_state->num_code_blocks); goto redo_first_pass; } - Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags); } if (code_blocksv) SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ @@ -5991,7 +5990,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, REGC((U8)REG_MAGIC, (char*) RExC_emit++); if (reg(pRExC_state, 0, &flags,1) == NULL) { ReREFCNT_dec(rx); - Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags); } /* XXXX To minimize changes to RE engine we always allocate 3-units-long substrs field. */ @@ -6691,13 +6690,23 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - if ( ( n == RX_BUFF_IDX_CARET_PREMATCH + if ( n == RX_BUFF_IDX_CARET_PREMATCH || n == RX_BUFF_IDX_CARET_FULLMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH - ) - && !(rx->extflags & RXf_PMf_KEEPCOPY) ) - goto ret_undef; + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } if (!rx->subbeg) goto ret_undef; @@ -6803,13 +6812,27 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + /* Some of this code was originally in C in F */ switch (paren) { case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - case RX_BUFF_IDX_PREMATCH: /* $` */ if (rx->offs[0].start != -1) { i = rx->offs[0].start; @@ -6822,8 +6845,6 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, return 0; case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; case RX_BUFF_IDX_POSTMATCH: /* $' */ if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; @@ -6835,13 +6856,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } return 0; - case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - - /* $& / ${^MATCH}, $1, $2, ... */ - default: + default: /* $& / ${^MATCH}, $1, $2, ... */ if (paren <= (I32)rx->nparens && (s1 = rx->offs[paren].start) != -1 && (t1 = rx->offs[paren].end) != -1) @@ -9178,8 +9193,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", - (UV) flags); + FAIL2("panic: regbranch returned NULL, flags=%#X", + flags); } else REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); c = *nextchar(pRExC_state); @@ -9194,8 +9209,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", - (UV) flags); + FAIL2("panic: regbranch returned NULL, flags=%#X", + flags); } REGTAIL(pRExC_state, ret, lastbr); if (flags&HASWIDTH) @@ -9283,7 +9298,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: regbranch returned NULL, flags=%#X", flags); } if (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { @@ -9328,7 +9343,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: regbranch returned NULL, flags=%#X", flags); } REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ lastbr = br; @@ -9533,7 +9548,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: regpiece returned NULL, flags=%#X", flags); } else if (ret == NULL) ret = latest; @@ -9603,7 +9618,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (flags & (TRYAGAIN|RESTART_UTF8)) *flagp |= flags & (TRYAGAIN|RESTART_UTF8); else - FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: regatom returned NULL, flags=%#X", flags); return(NULL); } @@ -9657,16 +9672,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, OPFAIL); return ret; } - else if (max == 0) { /* replace {0} with a nothing node */ - if (SIZE_ONLY) { - RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING]; - } - else { - RExC_emit = orig_emit; - } - ret = reg_node(pRExC_state, NOTHING); - return ret; - } do_curly: if ((flags&SIMPLE)) { @@ -10045,8 +10050,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I *flagp = RESTART_UTF8; return FALSE; } - FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", - (UV) flags); + FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X", + flags); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); @@ -10331,8 +10336,8 @@ tryagain: if (ret == NULL) { if (*flagp & RESTART_UTF8) return NULL; - FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", - (UV) *flagp); + FAIL2("panic: regclass returned NULL to regatom, flags=%#X", + *flagp); } nextchar(pRExC_state); Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ @@ -10354,7 +10359,7 @@ tryagain: *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; @@ -10549,8 +10554,8 @@ tryagain: /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. */ if (!ret) - FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", - (UV) *flagp); + FAIL2("panic: regclass returned NULL to regatom, flags=%#X", + *flagp); RExC_parse--; @@ -10963,6 +10968,12 @@ tryagain: if (PL_encoding && ender < 0x100) goto recode_encoding; break; + case '8': case '9': /* These are illegal unless backrefs */ + if (atoi(p) <= RExC_npar) { + --p; /* backup to backslash; handle as backref */ + goto loopdone; + } + goto unrecognized; recode_encoding: if (! RExC_override_recoding) { SV* enc = PL_encoding; @@ -10977,6 +10988,7 @@ tryagain: FAIL("Trailing \\"); /* FALL THROUGH */ default: + unrecognized: if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { /* Include any { following the alpha to emphasize * that it could be part of an escape at some point @@ -11673,8 +11685,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f FALSE, /* don't allow multi-char folds */ TRUE, /* silence non-portable warnings. */ ¤t)) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X", + *flagp); /* function call leaves parse pointing to the ']', except * if we faked it */ @@ -11742,7 +11754,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f * been parsed and evaluated to a single operand (or else is a syntax * error), and is handled as a regular operand */ - stack = newAV(); + sv_2mortal((SV *)(stack = newAV())); while (RExC_parse < RExC_end) { I32 top_index = av_tindex(stack); @@ -11839,8 +11851,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ ¤t)) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X", + *flagp); /* regclass() will return with parsing just the \ sequence, * leaving the parse pointer at the next thing to parse */ RExC_parse--; @@ -11862,8 +11874,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ ¤t)) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X", + *flagp); /* function call leaves parse pointing to the ']', except if we * faked it */ if (is_posix_class) { @@ -11914,6 +11926,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f || IS_OPERAND(lparen) || SvUV(lparen) != '(') { + SvREFCNT_dec(current); RExC_parse++; vFAIL("Unexpected ')'"); } @@ -11932,9 +11945,12 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f } else { SV* top = av_pop(stack); + SV *prev = NULL; char current_operator; if (IS_OPERAND(top)) { + SvREFCNT_dec_NN(top); + SvREFCNT_dec_NN(current); vFAIL("Operand with no preceding operator"); } current_operator = (char) SvUV(top); @@ -11961,7 +11977,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f goto handle_operand; case '&': - _invlist_intersection(av_pop(stack), + prev = av_pop(stack); + _invlist_intersection(prev, current, ¤t); av_push(stack, current); @@ -11969,12 +11986,14 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f case '|': case '+': - _invlist_union(av_pop(stack), current, ¤t); + prev = av_pop(stack); + _invlist_union(prev, current, ¤t); av_push(stack, current); break; case '-': - _invlist_subtract(av_pop(stack), current, ¤t); + prev = av_pop(stack);; + _invlist_subtract(prev, current, ¤t); av_push(stack, current); break; @@ -11984,9 +12003,12 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f SV* u = NULL; SV* element; - element = av_pop(stack); - _invlist_union(element, current, &u); - _invlist_intersection(element, current, &i); + prev = av_pop(stack); + _invlist_union(prev, current, &u); + _invlist_intersection(prev, current, &i); + /* _invlist_subtract will overwrite current + without freeing what it already contains */ + element = current; _invlist_subtract(u, i, ¤t); av_push(stack, current); SvREFCNT_dec_NN(i); @@ -11999,6 +12021,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); } SvREFCNT_dec_NN(top); + SvREFCNT_dec(prev); } } @@ -12062,7 +12085,6 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f RExC_end = save_end; SvREFCNT_dec_NN(final); SvREFCNT_dec_NN(result_string); - SvREFCNT_dec_NN(stack); nextchar(pRExC_state); Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ @@ -12667,12 +12689,21 @@ parseit: } else #endif /* Not isascii(); just use the hard-coded definition for it */ + { _invlist_union_maybe_complement_2nd( posixes, PL_ASCII, cBOOL(namedclass % 2), /* Complement if odd (NASCII) */ &posixes); + + /* The code points 128-255 added above will be + * subtracted out below under /d, so the flag needs to + * be set */ + if (namedclass == ANYOF_NASCII && DEPENDS_SEMANTICS) { + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; + } + } } else { /* Garden variety class */ @@ -13683,6 +13714,7 @@ parseit: if (ret_invlist) { *ret_invlist = cp_list; + SvREFCNT_dec(swash); /* Discard the generated node */ if (SIZE_ONLY) { diff --git a/src/5019000/regexec.c b/src/5018002/regexec.c similarity index 99% rename from src/5019000/regexec.c rename to src/5018002/regexec.c index f2ea7bb..7c307bd 100644 --- a/src/5019000/regexec.c +++ b/src/5018002/regexec.c @@ -6664,7 +6664,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, scan = *startposp; if (max == REG_INFTY) max = I32_MAX; - else if (! utf8_target && scan + max < loceol) + else if (! utf8_target && loceol - scan > max) loceol = scan + max; /* Here, for the case of a non-UTF-8 target we have adjusted down @@ -6713,7 +6713,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, scan = loceol; break; case CANY: /* Move forward bytes, unless goes off end */ - if (utf8_target && scan + max < loceol) { + if (utf8_target && loceol - scan > max) { /* hadn't been adjusted in the UTF-8 case */ scan += max; @@ -6732,7 +6732,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's * true iff it doesn't matter if the argument is in UTF-8 or not */ if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! is_utf8_pat)) { - if (utf8_target && scan + max < loceol) { + if (utf8_target && loceol - scan > max) { /* We didn't adjust because is UTF-8, but ok to do so, * since here, to match at all, 1 char == 1 byte */ loceol = scan + max; @@ -6912,7 +6912,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* FALLTHROUGH */ case POSIXA: - if (utf8_target && scan + max < loceol) { + if (utf8_target && loceol - scan > max) { /* We didn't adjust at the beginning of this routine * because is UTF-8, but it is actually ok to do so, since here, to diff --git a/src/5019001/inline_invlist.c b/src/5019001/inline_invlist.c deleted file mode 100644 index 332a3d8..0000000 --- a/src/5019001/inline_invlist.c +++ /dev/null @@ -1,77 +0,0 @@ -/* inline_invlist.c - * - * Copyright (C) 2012 by Larry Wall and others - * - * You may distribute under the terms of either the GNU General Public - * License or the Artistic License, as specified in the README file. - */ - -#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) - -#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */ -#define INVLIST_ITER_OFFSET 1 /* Current iteration position */ -#define INVLIST_PREVIOUS_INDEX_OFFSET 2 /* Place to cache index of previous - result */ - -/* This is a combination of a version and data structure type, so that one - * being passed in can be validated to be an inversion list of the correct - * vintage. When the structure of the header is changed, a new random number - * in the range 2**31-1 should be generated and the new() method changed to - * insert that at this location. Then, if an auxiliary program doesn't change - * correspondingly, it will be discovered immediately */ -#define INVLIST_VERSION_ID_OFFSET 3 -#define INVLIST_VERSION_ID 290655244 - -/* For safety, when adding new elements, remember to #undef them at the end of - * the inversion list code section */ - -#define INVLIST_ZERO_OFFSET 4 /* 0 or 1; must be last element in header */ -/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list - * contains the code point U+00000, and begins here. If 1, the inversion list - * doesn't contain U+0000, and it begins at the next UV in the array. - * Inverting an inversion list consists of adding or removing the 0 at the - * beginning of it. By reserving a space for that 0, inversion can be made - * very fast */ - -#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1) - -/* An element is in an inversion list iff its index is even numbered: 0, 2, 4, - * etc */ -#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) -#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) - -PERL_STATIC_INLINE UV* -S__get_invlist_len_addr(pTHX_ SV* invlist) -{ - /* Return the address of the UV that contains the current number - * of used elements in the inversion list */ - - PERL_ARGS_ASSERT__GET_INVLIST_LEN_ADDR; - - return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV))); -} - -PERL_STATIC_INLINE UV -S__invlist_len(pTHX_ SV* const invlist) -{ - /* Returns the current number of elements stored in the inversion list's - * array */ - - PERL_ARGS_ASSERT__INVLIST_LEN; - - return *_get_invlist_len_addr(invlist); -} - -PERL_STATIC_INLINE bool -S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp) -{ - /* Does contain code point as part of the set? */ - - IV index = _invlist_search(invlist, cp); - - PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP; - - return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index); -} - -#endif diff --git a/src/5019001/orig/inline_invlist.c b/src/5019001/orig/inline_invlist.c deleted file mode 100644 index b56ce60..0000000 --- a/src/5019001/orig/inline_invlist.c +++ /dev/null @@ -1,77 +0,0 @@ -/* inline_invlist.c - * - * Copyright (C) 2012 by Larry Wall and others - * - * You may distribute under the terms of either the GNU General Public - * License or the Artistic License, as specified in the README file. - */ - -#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) - -#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */ -#define INVLIST_ITER_OFFSET 1 /* Current iteration position */ -#define INVLIST_PREVIOUS_INDEX_OFFSET 2 /* Place to cache index of previous - result */ - -/* This is a combination of a version and data structure type, so that one - * being passed in can be validated to be an inversion list of the correct - * vintage. When the structure of the header is changed, a new random number - * in the range 2**31-1 should be generated and the new() method changed to - * insert that at this location. Then, if an auxiliary program doesn't change - * correspondingly, it will be discovered immediately */ -#define INVLIST_VERSION_ID_OFFSET 3 -#define INVLIST_VERSION_ID 290655244 - -/* For safety, when adding new elements, remember to #undef them at the end of - * the inversion list code section */ - -#define INVLIST_ZERO_OFFSET 4 /* 0 or 1; must be last element in header */ -/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list - * contains the code point U+00000, and begins here. If 1, the inversion list - * doesn't contain U+0000, and it begins at the next UV in the array. - * Inverting an inversion list consists of adding or removing the 0 at the - * beginning of it. By reserving a space for that 0, inversion can be made - * very fast */ - -#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1) - -/* An element is in an inversion list iff its index is even numbered: 0, 2, 4, - * etc */ -#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) -#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) - -PERL_STATIC_INLINE UV* -S__get_invlist_len_addr(pTHX_ SV* invlist) -{ - /* Return the address of the UV that contains the current number - * of used elements in the inversion list */ - - PERL_ARGS_ASSERT__GET_INVLIST_LEN_ADDR; - - return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV))); -} - -PERL_STATIC_INLINE UV -S__invlist_len(pTHX_ SV* const invlist) -{ - /* Returns the current number of elements stored in the inversion list's - * array */ - - PERL_ARGS_ASSERT__INVLIST_LEN; - - return *_get_invlist_len_addr(invlist); -} - -PERL_STATIC_INLINE bool -S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp) -{ - /* Does contain code point as part of the set? */ - - IV index = _invlist_search(invlist, cp); - - PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP; - - return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index); -} - -#endif diff --git a/src/5019001/dquote_static.c b/src/5020000/dquote_static.c similarity index 90% rename from src/5019001/dquote_static.c rename to src/5020000/dquote_static.c index d5241ca..752b399 100644 --- a/src/5019001/dquote_static.c +++ b/src/5020000/dquote_static.c @@ -46,44 +46,44 @@ S_regcurly(pTHX_ const char *s, */ STATIC char -S_grok_bslash_c(pTHX_ const char source, const bool utf8, const bool output_warning) +S_grok_bslash_c(pTHX_ const char source, const bool output_warning) { U8 result; - if (utf8) { - /* Trying to deprecate non-ASCII usages. This construct has never - * worked for a utf8 variant. So, even though are accepting non-ASCII - * Latin1 in 5.14, no need to make them work under utf8 */ + if (! isPRINT_A(source)) { + const char msg[] = "Character following \"\\c\" must be printable ASCII"; if (! isASCII(source)) { - Perl_croak(aTHX_ "Character following \"\\c\" must be ASCII"); + Perl_croak(aTHX_ "%s", msg); + } + else if (output_warning) { /* Unprintables can be removed in v5.22 */ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), "%s", + msg); } } + else if (source == '{') { + assert(isPRINT_A(toCTRL('{'))); - result = toCTRL(source); - if (! isASCII(source)) { - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), - "Character following \"\\c\" must be ASCII"); + /* diag_listed_as: Use "%s" instead of "%s" */ + Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", toCTRL('{')); } - else if (! isCNTRL(result) && output_warning) { - if (source == '{') { - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), - "\"\\c{\" is deprecated and is more clearly written as \";\""); - } - else { - U8 clearer[3]; - U8 i = 0; - if (! isWORDCHAR(result)) { - clearer[i++] = '\\'; - } - clearer[i++] = result; - clearer[i++] = '\0'; - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "\"\\c%c\" is more clearly written simply as \"%s\"", - source, - clearer); + result = toCTRL(source); + if (output_warning && ! isCNTRL_L1(result)) { + /* We use isCNTRL_L1 above and not simply isCNTRL, because on EBCDIC + * machines, things like \cT map into a C1 control. */ + U8 clearer[3]; + U8 i = 0; + if (! isWORDCHAR(result)) { + clearer[i++] = '\\'; } + clearer[i++] = result; + clearer[i++] = '\0'; + + Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), + "\"\\c%c\" is more clearly written simply as \"%s\"", + source, + clearer); } return result; diff --git a/src/5019003/inline_invlist.c b/src/5020000/inline_invlist.c similarity index 100% rename from src/5019003/inline_invlist.c rename to src/5020000/inline_invlist.c diff --git a/src/5019003/orig/dquote_static.c b/src/5020000/orig/dquote_static.c similarity index 88% rename from src/5019003/orig/dquote_static.c rename to src/5020000/orig/dquote_static.c index da1b5b9..bb1bd4a 100644 --- a/src/5019003/orig/dquote_static.c +++ b/src/5020000/orig/dquote_static.c @@ -46,44 +46,44 @@ S_regcurly(pTHX_ const char *s, */ STATIC char -S_grok_bslash_c(pTHX_ const char source, const bool utf8, const bool output_warning) +S_grok_bslash_c(pTHX_ const char source, const bool output_warning) { U8 result; - if (utf8) { - /* Trying to deprecate non-ASCII usages. This construct has never - * worked for a utf8 variant. So, even though are accepting non-ASCII - * Latin1 in 5.14, no need to make them work under utf8 */ - if (! isASCII(source)) { - Perl_croak(aTHX_ "Character following \"\\c\" must be ASCII"); + if (! isPRINT_A(source)) { + const char msg[] = "Character following \"\\c\" must be printable ASCII"; + if (! isASCII(source)) { + Perl_croak(aTHX_ "%s", msg); + } + else if (output_warning) { /* Unprintables can be removed in v5.22 */ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), "%s", + msg); } } + else if (source == '{') { + assert(isPRINT_A(toCTRL('{'))); - result = toCTRL(source); - if (! isASCII(source)) { - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), - "Character following \"\\c\" must be ASCII"); + /* diag_listed_as: Use "%s" instead of "%s" */ + Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", toCTRL('{')); } - else if (! isCNTRL(result) && output_warning) { - if (source == '{') { - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), - "\"\\c{\" is deprecated and is more clearly written as \";\""); - } - else { - U8 clearer[3]; - U8 i = 0; - if (! isWORDCHAR(result)) { - clearer[i++] = '\\'; - } - clearer[i++] = result; - clearer[i++] = '\0'; - - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "\"\\c%c\" is more clearly written simply as \"%s\"", - source, - clearer); - } + + result = toCTRL(source); + if (output_warning && ! isCNTRL_L1(result)) { + /* We use isCNTRL_L1 above and not simply isCNTRL, because on EBCDIC + * machines, things like \cT map into a C1 control. */ + U8 clearer[3]; + U8 i = 0; + if (! isWORDCHAR(result)) { + clearer[i++] = '\\'; + } + clearer[i++] = result; + clearer[i++] = '\0'; + + Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), + "\"\\c%c\" is more clearly written simply as \"%s\"", + source, + clearer); } return result; diff --git a/src/5019003/orig/inline_invlist.c b/src/5020000/orig/inline_invlist.c similarity index 100% rename from src/5019003/orig/inline_invlist.c rename to src/5020000/orig/inline_invlist.c diff --git a/src/5019003/orig/regcomp.c b/src/5020000/orig/regcomp.c similarity index 74% rename from src/5019003/orig/regcomp.c rename to src/5020000/orig/regcomp.c index 659d51f..eaee604 100644 --- a/src/5019003/orig/regcomp.c +++ b/src/5020000/orig/regcomp.c @@ -81,7 +81,7 @@ #define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" -extern const struct regexp_engine my_reg_engine; +EXTERN_C const struct regexp_engine my_reg_engine; #else # include "regcomp.h" #endif @@ -91,51 +91,46 @@ extern const struct regexp_engine my_reg_engine; #include "inline_invlist.c" #include "unicode_constants.h" -#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) -#ifdef op -#undef op -#endif /* op */ - -#ifdef MSDOS -# if defined(BUGGY_MSC6) - /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ -# pragma optimize("a",off) - /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ -# pragma optimize("w",on ) -# endif /* BUGGY_MSC6 */ -#endif /* MSDOS */ - #ifndef STATIC #define STATIC static #endif -typedef struct RExC_state_t { +struct RExC_state_t { U32 flags; /* RXf_* are we folding, multilining? */ U32 pm_flags; /* PMf_* stuff from the calling PMOP */ char *precomp; /* uncompiled string. */ REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ - regexp_internal *rxi; /* internal data for regexp object pprivate field */ + regexp_internal *rxi; /* internal data for regexp object + pprivate field */ char *start; /* Start of input for compile */ char *end; /* End of input for compile */ char *parse; /* Input-scan pointer. */ - I32 whilem_seen; /* number of WHILEM in this expr */ + SSize_t whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ - regnode *emit_bound; /* First regnode outside of the allocated space */ + regnode *emit_bound; /* First regnode outside of the + allocated space */ regnode *emit; /* Code-emit pointer; if = &emit_dummy, implies compiling, so don't emit */ - regnode emit_dummy; /* placeholder for emit to point to */ + regnode_ssc emit_dummy; /* placeholder for emit to point to; + large enough for the largest + non-EXACTish node, so can use it as + scratch in pass1 */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ U32 seen; - I32 size; /* Code size. */ - I32 npar; /* Capture buffer count, (OPEN). */ - I32 cpar; /* Capture buffer count, (CLOSE). */ - I32 nestroot; /* root parens we are in - used by accept */ + SSize_t size; /* Code size. */ + I32 npar; /* Capture buffer count, (OPEN) plus + one. ("par" 0 is the whole + pattern)*/ + I32 nestroot; /* root parens we are in - used by + accept */ I32 extralen; I32 seen_zerolen; regnode **open_parens; /* pointers to open parens */ @@ -149,18 +144,23 @@ typedef struct RExC_state_t { rules, even if the pattern is not in utf8 */ HV *paren_names; /* Paren names */ - + regnode **recurse; /* Recurse regops */ I32 recurse_count; /* Number of recurse regops */ + U8 *study_chunk_recursed; /* bitmap of which parens we have moved + through */ + U32 study_chunk_recursed_bytes; /* bytes in bitmap */ I32 in_lookbehind; I32 contains_locale; + I32 contains_i; I32 override_recoding; I32 in_multi_char_class; struct reg_code_block *code_blocks; /* positions of literal (?{}) within pattern */ int num_code_blocks; /* size of code_blocks[] */ int code_index; /* next code_blocks[] slot */ -#if ADD_TO_REGEXEC + SSize_t maxlen; /* mininum possible number of chars in string to match */ +#ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif @@ -173,7 +173,7 @@ typedef struct RExC_state_t { #define RExC_lastnum (pRExC_state->lastnum) #define RExC_paren_name_list (pRExC_state->paren_name_list) #endif -} RExC_state_t; +}; #define RExC_flags (pRExC_state->flags) #define RExC_pm_flags (pRExC_state->pm_flags) @@ -186,7 +186,8 @@ typedef struct RExC_state_t { #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) #ifdef RE_TRACK_PATTERN_OFFSETS -#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the + others */ #endif #define RExC_emit (pRExC_state->emit) #define RExC_emit_dummy (pRExC_state->emit_dummy) @@ -196,6 +197,7 @@ typedef struct RExC_state_t { #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) #define RExC_npar (pRExC_state->npar) #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) @@ -209,8 +211,12 @@ typedef struct RExC_state_t { #define RExC_paren_names (pRExC_state->paren_names) #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) +#define RExC_study_chunk_recursed_bytes \ + (pRExC_state->study_chunk_recursed_bytes) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) #define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_contains_i (pRExC_state->contains_i) #define RExC_override_recoding (pRExC_state->override_recoding) #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) @@ -219,9 +225,6 @@ typedef struct RExC_state_t { #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s, FALSE))) -#ifdef SPSTART -#undef SPSTART /* dratted cpp namespace... */ -#endif /* * Flags to be passed up and down. */ @@ -267,104 +270,109 @@ typedef struct RExC_state_t { #define namedclass_to_classnum(class) ((int) ((class) / 2)) #define classnum_to_namedclass(classnum) ((classnum) * 2) +#define _invlist_union_complement_2nd(a, b, output) \ + _invlist_union_maybe_complement_2nd(a, b, TRUE, output) +#define _invlist_intersection_complement_2nd(a, b, output) \ + _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) + /* About scan_data_t. During optimisation we recurse through the regexp program performing various inplace (keyhole style) optimisations. In addition study_chunk and scan_commit populate this data structure with information about - what strings MUST appear in the pattern. We look for the longest + what strings MUST appear in the pattern. We look for the longest string that must appear at a fixed location, and we look for the longest string that may appear at a floating location. So for instance in the pattern: - + /FOO[xX]A.*B[xX]BAR/ - + Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating strings (because they follow a .* construct). study_chunk will identify both FOO and BAR as being the longest fixed and floating strings respectively. - + The strings can be composites, for instance - + /(f)(o)(o)/ - + will result in a composite fixed substring 'foo'. - + For each string some basic information is maintained: - + - offset or min_offset This is the position the string must appear at, or not before. It also implicitly (when combined with minlenp) tells us how many characters must match before the string we are searching for. Likewise when combined with minlenp and the length of the string it - tells us how many characters must appear after the string we have + tells us how many characters must appear after the string we have found. - + - max_offset Only used for floating strings. This is the rightmost point that - the string can appear at. If set to I32 max it indicates that the + the string can appear at. If set to SSize_t_MAX it indicates that the string can occur infinitely far to the right. - + - minlenp A pointer to the minimum number of characters of the pattern that the string was found inside. This is important as in the case of positive - lookahead or positive lookbehind we can have multiple patterns + lookahead or positive lookbehind we can have multiple patterns involved. Consider - + /(?=FOO).*F/ - + The minimum length of the pattern overall is 3, the minimum length of the lookahead part is 3, but the minimum length of the part that - will actually match is 1. So 'FOO's minimum length is 3, but the + will actually match is 1. So 'FOO's minimum length is 3, but the minimum length for the F is 1. This is important as the minimum length - is used to determine offsets in front of and behind the string being + is used to determine offsets in front of and behind the string being looked for. Since strings can be composites this is the length of the pattern at the time it was committed with a scan_commit. Note that the length is calculated by study_chunk, so that the minimum lengths - are not known until the full pattern has been compiled, thus the + are not known until the full pattern has been compiled, thus the pointer to the value. - + - lookbehind - + In the case of lookbehind the string being searched for can be - offset past the start point of the final matching string. + offset past the start point of the final matching string. If this value was just blithely removed from the min_offset it would invalidate some of the calculations for how many chars must match before or after (as they are derived from min_offset and minlen and - the length of the string being searched for). + the length of the string being searched for). When the final pattern is compiled and the data is moved from the scan_data_t structure into the regexp structure the information - about lookbehind is factored in, with the information that would - have been lost precalculated in the end_shift field for the + about lookbehind is factored in, with the information that would + have been lost precalculated in the end_shift field for the associated string. The fields pos_min and pos_delta are used to store the minimum offset - and the delta to the maximum offset at the current point in the pattern. + and the delta to the maximum offset at the current point in the pattern. */ typedef struct scan_data_t { /*I32 len_min; unused */ /*I32 len_delta; unused */ - I32 pos_min; - I32 pos_delta; + SSize_t pos_min; + SSize_t pos_delta; SV *last_found; - I32 last_end; /* min value, <0 unless valid. */ - I32 last_start_min; - I32 last_start_max; + SSize_t last_end; /* min value, <0 unless valid. */ + SSize_t last_start_min; + SSize_t last_start_max; SV **longest; /* Either &l_fixed, or &l_float. */ SV *longest_fixed; /* longest fixed string found in pattern */ - I32 offset_fixed; /* offset where it starts */ - I32 *minlen_fixed; /* pointer to the minlen relevant to the string */ + SSize_t offset_fixed; /* offset where it starts */ + SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ I32 lookbehind_fixed; /* is the position of the string modfied by LB */ SV *longest_float; /* longest floating string found in pattern */ - I32 offset_float_min; /* earliest point in string it can appear */ - I32 offset_float_max; /* latest point in string it can appear */ - I32 *minlen_float; /* pointer to the minlen relevant to the string */ - I32 lookbehind_float; /* is the position of the string modified by LB */ + SSize_t offset_float_min; /* earliest point in string it can appear */ + SSize_t offset_float_max; /* latest point in string it can appear */ + SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ + SSize_t lookbehind_float; /* is the pos of the string modified by LB */ I32 flags; I32 whilem_c; - I32 *last_closep; - struct regnode_charclass_class *start_class; + SSize_t *last_closep; + regnode_ssc *start_class; } scan_data_t; /* The below is perhaps overboard, but this allows us to save a test at the @@ -398,13 +406,8 @@ static const scan_data_t zero_scan_data = #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) -#ifdef NO_UNARY_PLUS -# define SF_FIX_SHIFT_EOL (0+2) -# define SF_FL_SHIFT_EOL (0+4) -#else -# define SF_FIX_SHIFT_EOL (+2) -# define SF_FL_SHIFT_EOL (+4) -#endif +#define SF_FIX_SHIFT_EOL (+2) +#define SF_FL_SHIFT_EOL (+4) #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) @@ -422,22 +425,32 @@ static const scan_data_t zero_scan_data = #define SCF_WHILEM_VISITED_POS 0x2000 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ -#define SCF_SEEN_ACCEPT 0x8000 +#define SCF_SEEN_ACCEPT 0x8000 #define SCF_TRIE_DOING_RESTUDY 0x10000 #define UTF cBOOL(RExC_utf8) /* The enums for all these are ordered so things work out correctly */ #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) -#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ + == REGEX_DEPENDS_CHARSET) #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) -#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) -#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) -#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) -#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ + >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) +/* For programs that want to be strictly Unicode compatible by dying if any + * attempt is made to match a non-Unicode code point against a Unicode + * property. */ +#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) + #define OOB_NAMEDCLASS -1 /* There is no code point that is out-of-bounds, so this is problematic. But @@ -460,7 +473,12 @@ static const scan_data_t zero_scan_data = #define MARKER1 "<-- HERE" /* marker as it appears in the description */ #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ -#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" +#define REPORT_LOCATION " in regex; marked by " MARKER1 \ + " in m/%"UTF8f MARKER2 "%"UTF8f"/" + +#define REPORT_LOCATION_ARGS(offset) \ + UTF8fARG(UTF, offset, RExC_precomp), \ + UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) /* * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given @@ -482,12 +500,12 @@ static const scan_data_t zero_scan_data = } STMT_END #define FAIL(msg) _FAIL( \ - Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ - msg, (int)len, RExC_precomp, ellipses)) + Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ + msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) #define FAIL2(msg,arg) _FAIL( \ - Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ - arg, (int)len, RExC_precomp, ellipses)) + Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ + arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) /* * Simple_vFAIL -- like FAIL, but marks the current location in the scan @@ -495,7 +513,7 @@ static const scan_data_t zero_scan_data = #define Simple_vFAIL(m) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -512,8 +530,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL2(m,a1) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -531,8 +549,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL3(m, a1, a2) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -549,8 +567,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vFAIL4(m,a1,a2,a3) STMT_START { \ @@ -559,80 +577,90 @@ static const scan_data_t zero_scan_data = Simple_vFAIL4(m, a1, a2, a3); \ } STMT_END +/* A specialized version of vFAIL2 that works with UTF8f */ +#define vFAIL2utf8f(m, a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + /* m is not necessarily a "literal string", in this macro */ #define reg_warn_non_literal_string(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNreg(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN_dep(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNregdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN2reg_d(loc,m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ m REPORT_LOCATION, \ - a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN2reg(loc, m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN3(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN3reg(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ } STMT_END @@ -641,7 +669,7 @@ static const scan_data_t zero_scan_data = if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ } STMT_END -/* Macros for recording node offsets. 20001227 mjd@plover.com +/* Macros for recording node offsets. 20001227 mjd@plover.com * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in * element 2*n-1 of the array. Element #2n holds the byte length node #n. * Element 0 holds the number n. @@ -654,8 +682,8 @@ static const scan_data_t zero_scan_data = #define Set_Node_Length_To_R(node,len) #define Set_Node_Length(node,len) #define Set_Node_Cur_Length(node,start) -#define Node_Offset(n) -#define Node_Length(n) +#define Node_Offset(n) +#define Node_Length(n) #define Set_Node_Offset_Length(node,offset,len) #define ProgLen(ri) ri->u.proglen #define SetProgLen(ri,x) ri->u.proglen = x @@ -667,7 +695,8 @@ static const scan_data_t zero_scan_data = MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ __LINE__, (int)(node), (int)(byte))); \ if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", \ + (int)(node)); \ } else { \ RExC_offsets[2*(node)-1] = (byte); \ } \ @@ -683,7 +712,8 @@ static const scan_data_t zero_scan_data = MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ __LINE__, (int)(node), (int)(len))); \ if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \ + Perl_croak(aTHX_ "value of node is %d in Length macro", \ + (int)(node)); \ } else { \ RExC_offsets[2*(node)] = (len); \ } \ @@ -709,6 +739,49 @@ static const scan_data_t zero_scan_data = #define EXPERIMENTAL_INPLACESCAN #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ +#define DEBUG_RExC_seen() \ + DEBUG_OPTIMISE_MORE_r({ \ + PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ + \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ + \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ + \ + if (RExC_seen & REG_GPOS_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ + \ + if (RExC_seen & REG_CANY_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ + \ + if (RExC_seen & REG_RECURSE_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ + \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ + \ + if (RExC_seen & REG_VERBARG_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ + \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ + \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ + \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ + \ + if (RExC_seen & REG_GOSTART_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ + PerlIO_printf(Perl_debug_log,"\n"); \ + }); + #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ PerlIO_printf(Perl_debug_log, \ @@ -748,7 +821,8 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ floating substrings if needed. */ STATIC void -S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf) +S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, + SSize_t *minlenp, int is_inf) { const STRLEN l = CHR_SVLEN(data->last_found); const STRLEN old_l = CHR_SVLEN(*data->longest); @@ -772,9 +846,12 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min data->offset_float_min = l ? data->last_start_min : data->pos_min; data->offset_float_max = (l ? data->last_start_max - : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta)); - if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX) - data->offset_float_max = I32_MAX; + : (data->pos_delta == SSize_t_MAX + ? SSize_t_MAX + : data->pos_min + data->pos_delta)); + if (is_inf + || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) + data->offset_float_max = SSize_t_MAX; if (data->flags & SF_BEFORE_EOL) data->flags |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); @@ -798,299 +875,592 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min DEBUG_STUDYDATA("commit: ",data,0); } -/* These macros set, clear and test whether the synthetic start class ('ssc', - * given by the parameter) matches an empty string (EOS). This uses the - * 'next_off' field in the node, to save a bit in the flags field. The ssc - * stands alone, so there is never a next_off, so this field is otherwise - * unused. The EOS information is used only for compilation, but theoretically - * it could be passed on to the execution code. This could be used to store - * more than one bit of information, but only this one is currently used. */ -#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END -#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END -#define TEST_SSC_EOS(node) cBOOL((node)->next_off) - -/* Can match anything (initialization) */ +/* An SSC is just a regnode_charclass_posix with an extra field: the inversion + * list that describes which code points it matches */ + +STATIC void +S_ssc_anything(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to match an empty string or any code point */ + + PERL_ARGS_ASSERT_SSC_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ + _append_range_to_invlist(ssc->invlist, 0, UV_MAX); + ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ +} + +STATIC int +S_ssc_is_anything(pTHX_ const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' can match the empty string and any code + * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys + * us anything: if the function returns TRUE, 'ssc' hasn't been restricted + * in any way, so there's no point in using it */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { + return FALSE; + } + + /* See if the list consists solely of the range 0 - Infinity */ + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (ret) { + return TRUE; + } + + /* If e.g., both \w and \W are set, matches everything */ + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { + return TRUE; + } + } + } + + return FALSE; +} + STATIC void -S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) { - PERL_ARGS_ASSERT_CL_ANYTHING; + /* Initializes the SSC 'ssc'. This includes setting it to match an empty + * string, any code point, or any posix class under locale */ + + PERL_ARGS_ASSERT_SSC_INIT; - ANYOF_BITMAP_SETALL(cl); - cl->flags = ANYOF_UNICODE_ALL; - SET_SSC_EOS(cl); + Zero(ssc, 1, regnode_ssc); + set_ANYOF_SYNTHETIC(ssc); + ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); + ssc_anything(ssc); /* If any portion of the regex is to operate under locale rules, * initialization includes it. The reason this isn't done for all regexes * is that the optimizer was written under the assumption that locale was * all-or-nothing. Given the complexity and lack of documentation in the - * optimizer, and that there are inadequate test cases for locale, so many + * optimizer, and that there are inadequate test cases for locale, many * parts of it may not work properly, it is safest to avoid locale unless * necessary. */ if (RExC_contains_locale) { - ANYOF_CLASS_SETALL(cl); /* /l uses class */ - cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD; + ANYOF_POSIXL_SETALL(ssc); } else { - ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */ + ANYOF_POSIXL_ZERO(ssc); } } -/* Can match anything (initialization) */ STATIC int -S_cl_is_anything(const struct regnode_charclass_class *cl) +S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, + const regnode_ssc *ssc) { - int value; + /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only + * to the list of code points matched, and locale posix classes; hence does + * not check its flags) */ - PERL_ARGS_ASSERT_CL_IS_ANYTHING; + UV start, end; + bool ret; - for (value = 0; value < ANYOF_MAX; value += 2) - if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) - return 1; - if (!(cl->flags & ANYOF_UNICODE_ALL)) - return 0; - if (!ANYOF_BITMAP_TESTALLSET((const void*)cl)) - return 0; - return 1; + PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (! ret) { + return FALSE; + } + + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { + return FALSE; + } + + return TRUE; } -/* Can match anything (initialization) */ -STATIC void -S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +STATIC SV* +S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, + const regnode_charclass* const node) { - PERL_ARGS_ASSERT_CL_INIT; + /* Returns a mortal inversion list defining which code points are matched + * by 'node', which is of type ANYOF. Handles complementing the result if + * appropriate. If some code points aren't knowable at this time, the + * returned list must, and will, contain every code point that is a + * possibility. */ + + SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; + unsigned int i; + const U32 n = ARG(node); + bool new_node_has_latin1 = FALSE; + + PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; + + /* Look at the data structure created by S_set_ANYOF_arg() */ + if (n != ANYOF_NONBITMAP_EMPTY) { + SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + assert(RExC_rxi->data->what[n] == 's'); + + if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ + invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]))); + } + else if (ary[0] && ary[0] != &PL_sv_undef) { + + /* Here, no compile-time swash, and there are things that won't be + * known until runtime -- we have to assume it could be anything */ + return _add_range_to_invlist(invlist, 0, UV_MAX); + } + else if (ary[3] && ary[3] != &PL_sv_undef) { + + /* Here no compile-time swash, and no run-time only data. Use the + * node's inversion list */ + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; + } + } + + /* An ANYOF node contains a bitmap for the first 256 code points, and an + * inversion list for the others, but if there are code points that should + * match only conditionally on the target string being UTF-8, those are + * placed in the inversion list, and not the bitmap. Since there are + * circumstances under which they could match, they are included in the + * SSC. But if the ANYOF node is to be inverted, we have to exclude them + * here, so that when we invert below, the end result actually does include + * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here + * before we add the unconditionally matched code points */ + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_intersection_complement_2nd(invlist, + PL_UpperLatin1, + &invlist); + } + + /* Add in the points from the bit map */ + for (i = 0; i < 256; i++) { + if (ANYOF_BITMAP_TEST(node, i)) { + invlist = add_cp_to_invlist(invlist, i); + new_node_has_latin1 = TRUE; + } + } + + /* If this can match all upper Latin1 code points, have to add them + * as well */ + if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + _invlist_union(invlist, PL_UpperLatin1, &invlist); + } + + /* Similarly for these */ + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + } + + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_invert(invlist); + } + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { - Zero(cl, 1, struct regnode_charclass_class); - cl->type = ANYOF; - cl_anything(pRExC_state, cl); - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); + /* Under /li, any 0-255 could fold to any other 0-255, depending on the + * locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { + _invlist_union_maybe_complement_2nd(invlist, + only_utf8_locale_invlist, + ANYOF_FLAGS(node) & ANYOF_INVERT, + &invlist); + } + + return invlist; } /* These two functions currently do the exact same thing */ -#define cl_init_zero cl_init +#define ssc_init_zero ssc_init + +#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) +#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) + +/* 'AND' a given class with another one. Can create false positives. 'ssc' + * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ -/* 'AND' a given class with another one. Can create false positives. 'cl' - * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if - * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void -S_cl_and(struct regnode_charclass_class *cl, - const struct regnode_charclass_class *and_with) +S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *and_with) { - PERL_ARGS_ASSERT_CL_AND; + /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either + * another SSC or a regular ANYOF class. Can create false positives. */ + + SV* anded_cp_list; + U8 anded_flags; + + PERL_ARGS_ASSERT_SSC_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(and_with)) { + anded_cp_list = ((regnode_ssc *)and_with)->invlist; + anded_flags = ANYOF_FLAGS(and_with); + + /* XXX This is a kludge around what appears to be deficiencies in the + * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, + * there are paths through the optimizer where it doesn't get weeded + * out when it should. And if we don't make some extra provision for + * it like the code just below, it doesn't get added when it should. + * This solution is to add it only when AND'ing, which is here, and + * only when what is being AND'ed is the pristine, original node + * matching anything. Thus it is like adding it to ssc_anything() but + * only when the result is to be AND'ed. Probably the same solution + * could be adopted for the same problem we have with /l matching, + * which is solved differently in S_ssc_init(), and that would lead to + * fewer false positives than that solution has. But if this solution + * creates bugs, the consequences are only that a warning isn't raised + * that should be; while the consequences for having /l bugs is + * incorrect matches */ + if (ssc_is_anything((regnode_ssc *)and_with)) { + anded_flags |= ANYOF_WARN_SUPER; + } + } + else { + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } - assert(PL_regkind[and_with->type] == ANYOF); + ANYOF_FLAGS(ssc) &= anded_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'and-with'; P2, its posix classes. + * 'and_with' may be inverted. When not inverted, we have the situation of + * computing: + * (C1 | P1) & (C2 | P2) + * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) + * <= ((C1 & C2) | P1 | P2) + * Alternatively, the last few steps could be: + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) + * <= (C1 | C2 | (P1 & P2)) + * We favor the second approach if either P1 or P2 is non-empty. This is + * because these components are a barrier to doing optimizations, as what + * they match cannot be known until the moment of matching as they are + * dependent on the current locale, 'AND"ing them likely will reduce or + * eliminate them. + * But we can do better if we know that C1,P1 are in their initial state (a + * frequent occurrence), each matching everything: + * () & (C2 | P2) = C2 | P2 + * Similarly, if C2,P2 are in their initial state (again a frequent + * occurrence), the result is a no-op + * (C1 | P1) & () = C1 | P1 + * + * Inverted, we have + * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) + * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) + * <= (C1 & ~C2) | (P1 & ~P2) + * */ - /* I (khw) am not sure all these restrictions are necessary XXX */ - if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) - && !(ANYOF_CLASS_TEST_ANY_SET(cl)) - && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(and_with->flags & ANYOF_LOC_FOLD) - && !(cl->flags & ANYOF_LOC_FOLD)) { - int i; + if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(and_with)) + { + unsigned int i; - if (and_with->flags & ANYOF_INVERT) - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] &= ~and_with->bitmap[i]; - else - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] &= and_with->bitmap[i]; - } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ - - if (and_with->flags & ANYOF_INVERT) { - - /* Here, the and'ed node is inverted. Get the AND of the flags that - * aren't affected by the inversion. Those that are affected are - * handled individually below */ - U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS; - cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS); - cl->flags |= affected_flags; - - /* We currently don't know how to deal with things that aren't in the - * bitmap, but we know that the intersection is no greater than what - * is already in cl, so let there be false positives that get sorted - * out after the synthetic start class succeeds, and the node is - * matched for real. */ - - /* The inversion of these two flags indicate that the resulting - * intersection doesn't have them */ - if (and_with->flags & ANYOF_UNICODE_ALL) { - cl->flags &= ~ANYOF_UNICODE_ALL; - } - if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) { - cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; - } - } - else { /* and'd node is not inverted */ - U8 outside_bitmap_but_not_utf8; /* Temp variable */ - - if (! ANYOF_NONBITMAP(and_with)) { - - /* Here 'and_with' doesn't match anything outside the bitmap - * (except possibly ANYOF_UNICODE_ALL), which means the - * intersection can't either, except for ANYOF_UNICODE_ALL, in - * which case we don't know what the intersection is, but it's no - * greater than what cl already has, so can just leave it alone, - * with possible false positives */ - if (! (and_with->flags & ANYOF_UNICODE_ALL)) { - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); - cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8; - } - } - else if (! ANYOF_NONBITMAP(cl)) { - - /* Here, 'and_with' does match something outside the bitmap, and cl - * doesn't have a list of things to match outside the bitmap. If - * cl can match all code points above 255, the intersection will - * be those above-255 code points that 'and_with' matches. If cl - * can't match all Unicode code points, it means that it can't - * match anything outside the bitmap (since the 'if' that got us - * into this block tested for that), so we leave the bitmap empty. - */ - if (cl->flags & ANYOF_UNICODE_ALL) { - ARG_SET(cl, ARG(and_with)); + ssc_intersection(ssc, + anded_cp_list, + FALSE /* Has already been inverted */ + ); - /* and_with's ARG may match things that don't require UTF8. - * And now cl's will too, in spite of this being an 'and'. See - * the comments below about the kludge */ - cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8; - } - } - else { - /* Here, both 'and_with' and cl match something outside the - * bitmap. Currently we do not do the intersection, so just match - * whatever cl had at the beginning. */ - } - - - /* Take the intersection of the two sets of flags. However, the - * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a - * kludge around the fact that this flag is not treated like the others - * which are initialized in cl_anything(). The way the optimizer works - * is that the synthetic start class (SSC) is initialized to match - * anything, and then the first time a real node is encountered, its - * values are AND'd with the SSC's with the result being the values of - * the real node. However, there are paths through the optimizer where - * the AND never gets called, so those initialized bits are set - * inappropriately, which is not usually a big deal, as they just cause - * false positives in the SSC, which will just mean a probably - * imperceptible slow down in execution. However this bit has a - * higher false positive consequence in that it can cause utf8.pm, - * utf8_heavy.pl ... to be loaded when not necessary, which is a much - * bigger slowdown and also causes significant extra memory to be used. - * In order to prevent this, the code now takes a different tack. The - * bit isn't set unless some part of the regular expression needs it, - * but once set it won't get cleared. This means that these extra - * modules won't get loaded unless there was some path through the - * pattern that would have required them anyway, and so any false - * positives that occur by not ANDing them out when they could be - * aren't as severe as they would be if we treated this bit like all - * the others */ - outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags) - & ANYOF_NONBITMAP_NON_UTF8; - cl->flags &= and_with->flags; - cl->flags |= outside_bitmap_but_not_utf8; + /* If either P1 or P2 is empty, the intersection will be also; can skip + * the loop */ + if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { + ANYOF_POSIXL_ZERO(ssc); + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + + /* Note that the Posix class component P from 'and_with' actually + * looks like: + * P = Pa | Pb | ... | Pn + * where each component is one posix class, such as in [\w\s]. + * Thus + * ~P = ~(Pa | Pb | ... | Pn) + * = ~Pa & ~Pb & ... & ~Pn + * <= ~Pa | ~Pb | ... | ~Pn + * The last is something we can easily calculate, but unfortunately + * is likely to have many false positives. We could do better + * in some (but certainly not all) instances if two classes in + * P have known relationships. For example + * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: + * So + * :lower: & :print: = :lower: + * And similarly for classes that must be disjoint. For example, + * since \s and \w can have no elements in common based on rules in + * the POSIX standard, + * \w & ^\S = nothing + * Unfortunately, some vendor locales do not meet the Posix + * standard, in particular almost everything by Microsoft. + * The loop below just changes e.g., \w into \W and vice versa */ + + regnode_charclass_posixl temp; + int add = 1; /* To calculate the index of the complement */ + + ANYOF_POSIXL_ZERO(&temp); + for (i = 0; i < ANYOF_MAX; i++) { + assert(i % 2 != 0 + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); + + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { + ANYOF_POSIXL_SET(&temp, i + add); + } + add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ + } + ANYOF_POSIXL_AND(&temp, ssc); + + } /* else ssc already has no posixes */ + } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC + in its initial state */ + else if (! is_ANYOF_SYNTHETIC(and_with) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) + { + /* But if 'ssc' is in its initial state, the result is just 'and_with'; + * copy it over 'ssc' */ + if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { + if (is_ANYOF_SYNTHETIC(and_with)) { + StructCopy(and_with, ssc, regnode_ssc); + } + else { + ssc->invlist = anded_cp_list; + ANYOF_POSIXL_ZERO(ssc); + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); + } + } + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + { + /* One or the other of P1, P2 is non-empty. */ + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } + ssc_union(ssc, anded_cp_list, FALSE); + } + else { /* P1 = P2 = empty */ + ssc_intersection(ssc, anded_cp_list, FALSE); + } } } -/* 'OR' a given class with another one. Can create false positives. 'cl' - * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if - * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void -S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) +S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *or_with) { - PERL_ARGS_ASSERT_CL_OR; - - if (or_with->flags & ANYOF_INVERT) { - - /* Here, the or'd node is to be inverted. This means we take the - * complement of everything not in the bitmap, but currently we don't - * know what that is, so give up and match anything */ - if (ANYOF_NONBITMAP(or_with)) { - cl_anything(pRExC_state, cl); - } - /* We do not use - * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) - * <= (B1 | !B2) | (CL1 | !CL2) - * which is wasteful if CL2 is small, but we ignore CL2: - * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 - * XXXX Can we handle case-fold? Unclear: - * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = - * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) - */ - else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(or_with->flags & ANYOF_LOC_FOLD) - && !(cl->flags & ANYOF_LOC_FOLD) ) { - int i; + /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either + * another SSC or a regular ANYOF class. Can create false positives if + * 'or_with' is to be inverted. */ - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] |= ~or_with->bitmap[i]; - } /* XXXX: logic is complicated otherwise */ - else { - cl_anything(pRExC_state, cl); - } + SV* ored_cp_list; + U8 ored_flags; - /* And, we can just take the union of the flags that aren't affected - * by the inversion */ - cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS; + PERL_ARGS_ASSERT_SSC_OR; - /* For the remaining flags: - ANYOF_UNICODE_ALL and inverted means to not match anything above - 255, which means that the union with cl should just be - what cl has in it, so can ignore this flag - ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord - is 127-255 to match them, but then invert that, so the - union with cl should just be what cl has in it, so can - ignore this flag - */ - } else { /* 'or_with' is not inverted */ - /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ - if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && (!(or_with->flags & ANYOF_LOC_FOLD) - || (cl->flags & ANYOF_LOC_FOLD)) ) { - int i; + assert(is_ANYOF_SYNTHETIC(ssc)); - /* OR char bitmap and class bitmap separately */ - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] |= or_with->bitmap[i]; - if (or_with->flags & ANYOF_CLASS) { - ANYOF_CLASS_OR(or_with, cl); + /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(or_with)) { + ored_cp_list = ((regnode_ssc*) or_with)->invlist; + ored_flags = ANYOF_FLAGS(or_with); + } + else { + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); + ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + } + + ANYOF_FLAGS(ssc) |= ored_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'or-with'; P2, its posix classes. + * 'or_with' may be inverted. When not inverted, we have the simple + * situation of computing: + * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) + * If P1|P2 yields a situation with both a class and its complement are + * set, like having both \w and \W, this matches all code points, and we + * can delete these from the P component of the ssc going forward. XXX We + * might be able to delete all the P components, but I (khw) am not certain + * about this, and it is better to be safe. + * + * Inverted, we have + * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) + * <= (C1 | P1) | ~C2 + * <= (C1 | ~C2) | P1 + * (which results in actually simpler code than the non-inverted case) + * */ + + if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(or_with)) + { + /* We ignore P2, leaving P1 going forward */ + } /* else Not inverted */ + else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + unsigned int i; + for (i = 0; i < ANYOF_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) + { + ssc_match_all_cp(ssc); + ANYOF_POSIXL_CLEAR(ssc, i); + ANYOF_POSIXL_CLEAR(ssc, i+1); + } } - } - else { /* XXXX: logic is complicated, leave it along for a moment. */ - cl_anything(pRExC_state, cl); - } + } + } - if (ANYOF_NONBITMAP(or_with)) { + ssc_union(ssc, + ored_cp_list, + FALSE /* Already has been inverted */ + ); +} - /* Use the added node's outside-the-bit-map match if there isn't a - * conflict. If there is a conflict (both nodes match something - * outside the bitmap, but what they match outside is not the same - * pointer, and hence not easily compared until XXX we extend - * inversion lists this far), give up and allow the start class to - * match everything outside the bitmap. If that stuff is all above - * 255, can just set UNICODE_ALL, otherwise caould be anything. */ - if (! ANYOF_NONBITMAP(cl)) { - ARG_SET(cl, ARG(or_with)); - } - else if (ARG(cl) != ARG(or_with)) { +PERL_STATIC_INLINE void +S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_UNION; - if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) { - cl_anything(pRExC_state, cl); - } - else { - cl->flags |= ANYOF_UNICODE_ALL; - } - } - } + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_union_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_intersection(pTHX_ regnode_ssc *ssc, + SV* const invlist, + const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_INTERSECTION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_intersection_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) +{ + PERL_ARGS_ASSERT_SSC_ADD_RANGE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); +} + +PERL_STATIC_INLINE void +S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) +{ + /* AND just the single code point 'cp' into the SSC 'ssc' */ + + SV* cp_list = _new_invlist(2); + + PERL_ARGS_ASSERT_SSC_CP_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + cp_list = add_cp_to_invlist(cp_list, cp); + ssc_intersection(ssc, cp_list, + FALSE /* Not inverted */ + ); + SvREFCNT_dec_NN(cp_list); +} + +PERL_STATIC_INLINE void +S_ssc_clear_locale(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to not match any locale things */ + + PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ANYOF_POSIXL_ZERO(ssc); + ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; +} + +STATIC void +S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* The inversion list in the SSC is marked mortal; now we need a more + * permanent copy, which is stored the same way that is done in a regular + * ANYOF node, with the first 256 code points in a bit map */ + + SV* invlist = invlist_clone(ssc->invlist); - /* Take the union */ - cl->flags |= or_with->flags; + PERL_ARGS_ASSERT_SSC_FINALIZE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* The code in this file assumes that all but these flags aren't relevant + * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the + * time we reach here */ + assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + + populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); + + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); + + /* Make sure is clone-safe */ + ssc->invlist = NULL; + + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; } + + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); } #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) -#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 ) +#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ + ? (TRIE_LIST_CUR( idx ) - 1) \ + : 0 ) #ifdef DEBUGGING @@ -1133,13 +1503,13 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, for( state = 0 ; state < trie->uniquecharcount ; state++ ) { SV ** const tmp = av_fetch( revcharmap, state, 0); if ( tmp ) { - PerlIO_printf( Perl_debug_log, "%*s", + PerlIO_printf( Perl_debug_log, "%*s", colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, PL_colors[0], PL_colors[1], (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR - ) + PERL_PV_ESCAPE_FIRSTCHAR + ) ); } } @@ -1153,10 +1523,12 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, for( state = 1 ; state < trie->statecount ; state++ ) { const U32 base = trie->states[ state ].trans.base; - PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); + PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", + (int)depth * 2 + 2,"", (UV)state); if ( trie->states[ state ].wordnum ) { - PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum ); + PerlIO_printf( Perl_debug_log, " W%4X", + trie->states[ state ].wordnum ); } else { PerlIO_printf( Perl_debug_log, "%6s", "" ); } @@ -1168,19 +1540,23 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, while( ( base + ofs < trie->uniquecharcount ) || ( base + ofs - trie->uniquecharcount < trie->lasttrans - && trie->trans[ base + ofs - trie->uniquecharcount ].check != state)) + && trie->trans[ base + ofs - trie->uniquecharcount ].check + != state)) ofs++; PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { - if ( ( base + ofs >= trie->uniquecharcount ) && - ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && - trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + if ( ( base + ofs >= trie->uniquecharcount ) + && ( base + ofs - trie->uniquecharcount + < trie->lasttrans ) + && trie->trans[ base + ofs + - trie->uniquecharcount ].check == state ) { PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, - (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next ); + (UV)trie->trans[ base + ofs + - trie->uniquecharcount ].next ); } else { PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); } @@ -1191,17 +1567,18 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, } PerlIO_printf( Perl_debug_log, "\n" ); } - PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, ""); + PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", + (int)depth*2, ""); for (word=1; word <= trie->wordcount; word++) { PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", (int)word, (int)(trie->wordinfo[word].prev), (int)(trie->wordinfo[word].len)); } PerlIO_printf(Perl_debug_log, "\n" ); -} +} /* Dumps a fully constructed but uncompressed trie in list form. - List tries normally only are used for construction when the number of + List tries normally only are used for construction when the number of possible chars (trie->uniquecharcount) is very high. Used for debugging make_trie(). */ @@ -1221,10 +1598,10 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", "------:-----+-----------------\n" ); - + for( state=1 ; state < next_alloc ; state ++ ) { U16 charid; - + PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", (int)depth * 2 + 2,"", (UV)state ); if ( ! trie->states[ state ].wordnum ) { @@ -1235,31 +1612,33 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, ); } for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { - SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); + SV ** const tmp = av_fetch( revcharmap, + TRIE_LIST_ITEM(state,charid).forid, 0); if ( tmp ) { PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, - PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), + colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + | PERL_PV_ESCAPE_FIRSTCHAR ) , TRIE_LIST_ITEM(state,charid).forid, (UV)TRIE_LIST_ITEM(state,charid).newstate ); - if (!(charid % 10)) + if (!(charid % 10)) PerlIO_printf(Perl_debug_log, "\n%*s| ", (int)((depth * 2) + 14), ""); } } PerlIO_printf( Perl_debug_log, "\n"); } -} +} /* Dumps a fully constructed but uncompressed trie in table form. - This is the normal DFA style state transition table, with a few - twists to facilitate compression later. + This is the normal DFA style state transition table, with a few + twists to facilitate compression later. Used for debugging make_trie(). */ STATIC void @@ -1274,24 +1653,24 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; - + /* print out the table precompression so that we can do a visual check that they are identical. */ - + PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { SV ** const tmp = av_fetch( revcharmap, charid, 0); if ( tmp ) { - PerlIO_printf( Perl_debug_log, "%*s", + PerlIO_printf( Perl_debug_log, "%*s", colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, PL_colors[0], PL_colors[1], (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR - ) + PERL_PV_ESCAPE_FIRSTCHAR + ) ); } } @@ -1306,7 +1685,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { - PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", + PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", (int)depth * 2 + 2,"", (UV)TRIE_NODENUM( state ) ); @@ -1318,9 +1697,11 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); } if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { - PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); + PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", + (UV)trie->trans[ state ].check ); } else { - PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, + PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", + (UV)trie->trans[ state ].check, trie->states[ TRIE_NODENUM( state ) ].wordnum ); } } @@ -1439,7 +1820,7 @@ and would end up looking like: 8: EXACT (10) 10: END(0) - d = uvuni_to_utf8_flags(d, uv, 0); + d = uvchr_to_utf8_flags(d, uv, 0); is the recommended Unicode-aware way of saying @@ -1451,7 +1832,7 @@ is the recommended Unicode-aware way of saying if (UTF) { \ SV *zlopp = newSV(7); /* XXX: optimize me */ \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \ + unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ SvPOK_on(zlopp); \ SvUTF8_on(zlopp); \ @@ -1462,31 +1843,28 @@ is the recommended Unicode-aware way of saying } \ } STMT_END -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - /* if it is UTF then it is either already folded, or does not need folding */ \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \ - } \ - else if (folder == PL_fold_latin1) { \ - /* if we use this folder we have to obey unicode rules on latin-1 data */ \ - if ( foldlen > 0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - scan += len; \ - len = 0; \ - } else { \ - len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ - skiplen = UNISKIP(uvc); \ - foldlen -= skiplen; \ - scan = foldbuf + skiplen; \ - } \ - } else { \ - /* raw data, will be folded later if needed */ \ - uvc = (U32)*uc; \ - len = 1; \ - } \ +/* This gets the next character from the input, folding it if not already + * folded. */ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need \ + * folding */ \ + uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* This folder implies Unicode rules, which in the range expressible \ + * by not UTF is the lower case, with the two exceptions, one of \ + * which should have been taken care of before calling this */ \ + assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ + uvc = toLOWER_L1(*uc); \ + if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ + len = 1; \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ } STMT_END @@ -1529,7 +1907,8 @@ is the recommended Unicode-aware way of saying \ if ( noper_next < tail ) { \ if (!trie->jump) \ - trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ trie->jump[curword] = (U16)(noper_next - convert); \ if (!jumper) \ jumper = noper_next; \ @@ -1564,7 +1943,9 @@ is the recommended Unicode-aware way of saying #define MADE_EXACT_TRIE 4 STATIC I32 -S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) +S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + regnode *first, regnode *last, regnode *tail, + U32 word_count, U32 flags, U32 depth) { dVAR; /* first pass, loop through and scan words */ @@ -1572,7 +1953,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs HV *widecharmap = NULL; AV *revcharmap = newAV(); regnode *cur; - const U32 uniflags = UTF8_ALLOW_DEFAULT; STRLEN len = 0; UV uvc = 0; U16 curword = 0; @@ -1585,13 +1965,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 * folder = NULL; #ifdef DEBUGGING - const U32 data_slot = add_data( pRExC_state, 4, "tuuu" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu")); AV *trie_words = NULL; /* along with revcharmap, this only used during construction but both are * useful during debugging so we store them in the struct when debugging. */ #else - const U32 data_slot = add_data( pRExC_state, 2, "tu" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); STRLEN trie_charcount=0; #endif SV *re_trie_maxbuff; @@ -1606,10 +1986,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs case EXACT: break; case EXACTFA: case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; - case EXACTFL: folder = PL_fold_locale; break; default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); } @@ -1633,14 +2011,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); } DEBUG_TRIE_COMPILE_r({ - PerlIO_printf( Perl_debug_log, - "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", - (int)depth * 2 + 2, "", - REG_NODE_NUM(startbranch),REG_NODE_NUM(first), - REG_NODE_NUM(last), REG_NODE_NUM(tail), - (int)depth); + PerlIO_printf( Perl_debug_log, + "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + (int)depth * 2 + 2, "", + REG_NODE_NUM(startbranch),REG_NODE_NUM(first), + REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); }); - + /* Find the node we are going to overwrite */ if ( first == startbranch && OP( last ) != BRANCH ) { /* whole branch chain */ @@ -1649,7 +2026,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* branch sub-chain */ convert = NEXTOPER( first ); } - + /* -- First loop and Setup -- We first traverse the branches and scan each word to determine if it @@ -1658,9 +2035,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs have unique chars. We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store Unicode characters. We use the - native representation of the character value as the key and IV's for the - coded index. + (trie->charmap) and we use a an HV* to store Unicode characters. We use + the native representation of the character value as the key and IV's for + the coded index. *TODO* If we keep track of how many times each character is used we can remap the columns so that the table compression later on is more @@ -1677,13 +2054,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regnode *noper = NEXTOPER( cur ); const U8 *uc = (U8*)STRING( noper ); const U8 *e = uc + STR_LEN( noper ); - STRLEN foldlen = 0; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - STRLEN skiplen = 0; - const U8 *scan = (U8*)NULL; + int foldlen = 0; U32 wordlen = 0; /* required init */ - STRLEN chars = 0; - bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ + STRLEN minchars = 0; + STRLEN maxchars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the + bitmap?*/ if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -1703,13 +2079,77 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regardless of encoding */ if (OP( noper ) == EXACTFU_SS) { /* false positives are ok, so just set this */ - TRIE_BITMAP_SET(trie,0xDF); + TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); } } - for ( ; uc < e ; uc += len ) { + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; - chars++; + + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; + } + else { + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because for + * non-UTF, the latin1_safe macro is smart enough to account + * for all the unfolded characters, and because for UTF, the + * string will already have been folded earlier in the + * compilation process */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); + } + } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } + } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ if ( uvc < 256 ) { if ( folder ) { U8 folded= folder[ (U8) uvc ]; @@ -1733,13 +2173,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !UTF ) { /* store first byte of utf8 representation of variant codepoints */ - if (! UNI_IS_INVARIANT(uvc)) { + if (! UVCHR_IS_INVARIANT(uvc)) { TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); } } set_bit = 0; /* We've done our bit :-) */ } } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + SV** svpp; if ( !widecharmap ) widecharmap = newHV(); @@ -1754,30 +2202,22 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs TRIE_STORE_REVCHAR(uvc); } } - } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ if( cur == first ) { - trie->minlen = chars; - trie->maxlen = chars; - } else if (chars < trie->minlen) { - trie->minlen = chars; - } else if (chars > trie->maxlen) { - trie->maxlen = chars; - } - if (OP( noper ) == EXACTFU_SS) { - /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/ - if (trie->minlen > 1) - trie->minlen= 1; - } - if (OP( noper ) == EXACTFU_TRICKYFOLD) { - /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" - * - We assume that any such sequence might match a 2 byte string */ - if (trie->minlen > 2 ) - trie->minlen= 2; + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; } - } /* end first pass */ DEBUG_TRIE_COMPILE_r( - PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + PerlIO_printf( Perl_debug_log, + "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", (int)depth * 2 + 2,"", ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, @@ -1809,7 +2249,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); prev_states[1] = 0; - if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) { + if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) + > SvIV(re_trie_maxbuff) ) + { /* Second Pass -- Array Of Lists Representation @@ -1824,7 +2266,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STRLEN transcount = 1; - DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, "%*sCompiling trie using list compiler\n", (int)depth * 2 + 2, "")); @@ -1841,11 +2283,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ U16 charid = 0; /* sanity init */ - U8 *scan = (U8*)NULL; /* sanity init */ - STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - STRLEN skiplen = 0; if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -1864,14 +2302,18 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( uvc < 256 ) { charid = trie->charmap[ uvc ]; } else { - SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + SV** const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); if ( !svpp ) { charid = 0; } else { charid=(U16)SvIV( *svpp ); } } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ if ( charid ) { U16 check; @@ -1881,8 +2323,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !trie->states[ state ].trans.list ) { TRIE_LIST_NEW( state ); } - for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { - if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { + for ( check = 1; + check <= TRIE_LIST_USED( state ); + check++ ) + { + if ( TRIE_LIST_ITEM( state, check ).forid + == charid ) + { newstate = TRIE_LIST_ITEM( state, check ).newstate; break; } @@ -1904,7 +2351,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } /* end second pass */ /* next alloc is the NEXT state to be allocated */ - trie->statecount = next_alloc; + trie->statecount = next_alloc; trie->states = (reg_trie_state *) PerlMemShared_realloc( trie->states, next_alloc @@ -1952,7 +2399,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->trans, transcount * sizeof(reg_trie_trans) ); - Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans ); + Zero( trie->trans + (transcount / 2), + transcount / 2, + reg_trie_trans ); } base = trie->uniquecharcount + tp - minid; if ( maxid == minid ) { @@ -1960,22 +2409,27 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs for ( ; zp < tp ; zp++ ) { if ( ! trie->trans[ zp ].next ) { base = trie->uniquecharcount + zp - minid; - trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ zp ].next = TRIE_LIST_ITEM( state, + 1).newstate; trie->trans[ zp ].check = state; set = 1; break; } } if ( !set ) { - trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ tp ].next = TRIE_LIST_ITEM( state, + 1).newstate; trie->trans[ tp ].check = state; tp++; zp = tp; } } else { for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { - const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid; - trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate; + const U32 tid = base + - trie->uniquecharcount + + TRIE_LIST_ITEM( state, idx ).forid; + trie->trans[ tid ].next = TRIE_LIST_ITEM( state, + idx ).newstate; trie->trans[ tid ].check = state; } tp += ( maxid - minid + 1 ); @@ -1995,26 +2449,26 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* Second Pass -- Flat Table Representation. - we dont use the 0 slot of either trans[] or states[] so we add 1 to each. - We know that we will need Charcount+1 trans at most to store the data - (one row per char at worst case) So we preallocate both structures - assuming worst case. + we dont use the 0 slot of either trans[] or states[] so we add 1 to + each. We know that we will need Charcount+1 trans at most to store + the data (one row per char at worst case) So we preallocate both + structures assuming worst case. We then construct the trie using only the .next slots of the entry structs. - We use the .check field of the first entry of the node temporarily to - make compression both faster and easier by keeping track of how many non - zero fields are in the node. + We use the .check field of the first entry of the node temporarily + to make compression both faster and easier by keeping track of how + many non zero fields are in the node. Since trans are numbered from 1 any 0 pointer in the table is a FAIL transition. - There are two terms at use here: state as a TRIE_NODEIDX() which is a - number representing the first entry of the node, and state as a - TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and - TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there - are 2 entrys per node. eg: + There are two terms at use here: state as a TRIE_NODEIDX() which is + a number representing the first entry of the node, and state as a + TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) + and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) + if there are 2 entrys per node. eg: A B A B 1. 2 4 1. 3 7 @@ -2022,12 +2476,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs 3. 0 0 5. 0 0 4. 0 0 7. 0 0 - The table is internally in the right hand, idx form. However as we also - have to deal with the states array which is indexed by nodenum we have to - use TRIE_NODENUM() to convert. + The table is internally in the right hand, idx form. However as we + also have to deal with the states array which is indexed by nodenum + we have to use TRIE_NODENUM() to convert. */ - DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, "%*sCompiling trie using table compiler\n", (int)depth * 2 + 2, "")); @@ -2051,12 +2505,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U16 charid = 0; /* sanity init */ U32 accept_state = 0; /* sanity init */ - U8 *scan = (U8*)NULL; /* sanity init */ - STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ - STRLEN skiplen = 0; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -2075,7 +2525,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( uvc < 256 ) { charid = trie->charmap[ uvc ]; } else { - SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + SV* const * const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); charid = svpp ? (U16)SvIV(*svpp) : 0; } if ( charid ) { @@ -2091,7 +2544,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } else { Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ } } accept_state = TRIE_NODENUM( state ); @@ -2178,7 +2632,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U32 used = trie->trans[ stateidx ].check; trie->trans[ stateidx ].check = 0; - for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) { + for ( charid = 0; + used && charid < trie->uniquecharcount; + charid++ ) + { if ( flag || trie->trans[ stateidx + charid ].next ) { if ( trie->trans[ stateidx + charid ].next ) { if (o_used == 1) { @@ -2187,8 +2644,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs break; } } - trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ; - trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->states[ state ].trans.base + = zp + + trie->uniquecharcount + - charid ; + trie->trans[ zp ].next + = SAFE_TRIE_NODENUM( trie->trans[ stateidx + + charid ].next ); trie->trans[ zp ].check = state; if ( ++zp > pos ) pos = zp; break; @@ -2197,9 +2659,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( !flag ) { flag = 1; - trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ; + trie->states[ state ].trans.base + = pos + trie->uniquecharcount - charid ; } - trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->trans[ pos ].next + = SAFE_TRIE_NODENUM( + trie->trans[ stateidx + charid ].next ); trie->trans[ pos ].check = state; pos++; } @@ -2210,19 +2675,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->states, laststate * sizeof(reg_trie_state) ); DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf( Perl_debug_log, - "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", - (int)depth * 2 + 2,"", - (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), - (IV)next_alloc, - (IV)pos, - ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); + PerlIO_printf( Perl_debug_log, + "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + (int)depth * 2 + 2,"", + (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + + 1 ), + (IV)next_alloc, + (IV)pos, + ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); ); } /* end table compress */ } DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", + PerlIO_printf(Perl_debug_log, + "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", (int)depth * 2 + 2, "", (UV)trie->statecount, (UV)trie->lasttrans) @@ -2232,10 +2699,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->trans, trie->lasttrans * sizeof(reg_trie_trans) ); - { /* Modify the program and insert the new TRIE node */ + { /* Modify the program and insert the new TRIE node */ U8 nodetype =(U8)(flags & 0xFF); char *str=NULL; - + #ifdef DEBUGGING regnode *optimize = NULL; #ifdef RE_TRACK_PATTERN_OFFSETS @@ -2273,12 +2740,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs }); } DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", + PerlIO_printf(Perl_debug_log, + "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", (int)depth * 2 + 2, "", (UV)mjd_offset, (UV)mjd_nodelen) ); #endif - /* But first we check to see if there is a common prefix we can + /* But first we check to see if there is a common prefix we can split out as an EXACT and put in front of the TRIE node. */ trie->startstate= 1; if ( trie->bitmap && !widecharmap && !trie->jump ) { @@ -2337,11 +2805,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlIO_printf( Perl_debug_log, "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", (int)depth * 2 + 2, "", - (UV)state, (UV)idx, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, + (UV)state, (UV)idx, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, PL_colors[0], PL_colors[1], (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR + PERL_PV_ESCAPE_FIRSTCHAR ) ); }); @@ -2354,7 +2822,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs while (len--) *str++ = *ch++; } else { -#ifdef DEBUGGING +#ifdef DEBUGGING if (state>1) DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); #endif @@ -2405,17 +2873,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } } } - if (!jumper) - jumper = last; + if (!jumper) + jumper = last; if ( trie->maxlen ) { NEXT_OFF( convert ) = (U16)(tail - convert); ARG_SET( convert, data_slot ); - /* Store the offset to the first unabsorbed branch in - jump[0], which is otherwise unused by the jump logic. + /* Store the offset to the first unabsorbed branch in + jump[0], which is otherwise unused by the jump logic. We use this when dumping a trie and during optimisation. */ - if (trie->jump) + if (trie->jump) trie->jump[0] = (U16)(nextbranch - convert); - + /* If the start state is not accepting (meaning there is no empty string/NOTHING) * and there is a bitmap * and the first "jump target" node we found leaves enough room @@ -2430,17 +2898,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); PerlMemShared_free(trie->bitmap); trie->bitmap= NULL; - } else + } else OP( convert ) = TRIE; /* store the type in the flags */ convert->flags = nodetype; DEBUG_r({ - optimize = convert - + NODE_STEP_REGNODE + optimize = convert + + NODE_STEP_REGNODE + regarglen[ OP( convert ) ]; }); - /* XXX We really should free up the resource in trie now, + /* XXX We really should free up the resource in trie now, as we won't use them - (which resources?) dmq */ } /* needed for dumping*/ @@ -2450,8 +2918,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs while ( ++opt < optimize) { Set_Node_Offset_Length(opt,0,0); } - /* - Try to clean up some of the debris left after the + /* + Try to clean up some of the debris left after the optimisation. */ while( optimize < jumper ) { @@ -2506,32 +2974,37 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #else SvREFCNT_dec_NN(revcharmap); #endif - return trie->jump - ? MADE_JUMP_TRIE - : trie->startstate>1 - ? MADE_EXACT_TRIE + return trie->jump + ? MADE_JUMP_TRIE + : trie->startstate>1 + ? MADE_EXACT_TRIE : MADE_TRIE; } STATIC void S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) { -/* The Trie is constructed and compressed now so we can build a fail array if it's needed +/* The Trie is constructed and compressed now so we can build a fail array if + * it's needed - This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the - "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and + 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, + Ullman 1985/88 ISBN 0-201-10088-6 - We find the fail state for each state in the trie, this state is the longest proper - suffix of the current state's 'word' that is also a proper prefix of another word in our - trie. State 1 represents the word '' and is thus the default fail state. This allows - the DFA not to have to restart after its tried and failed a word at a given point, it - simply continues as though it had been matching the other word in the first place. + We find the fail state for each state in the trie, this state is the longest + proper suffix of the current state's 'word' that is also a proper prefix of + another word in our trie. State 1 represents the word '' and is thus the + default fail state. This allows the DFA not to have to restart after its + tried and failed a word at a given point, it simply continues as though it + had been matching the other word in the first place. Consider 'abcdgu'=~/abcdefg|cdgu/ - When we get to 'd' we are still matching the first word, we would encounter 'g' which would - fail, which would bring us to the state representing 'd' in the second word where we would - try 'g' and succeed, proceeding to match 'cdgu'. + When we get to 'd' we are still matching the first word, we would encounter + 'g' which would fail, which would bring us to the state representing 'd' in + the second word where we would try 'g' and succeed, proceeding to match + 'cdgu'. */ /* add a fail transition */ const U32 trie_offset = ARG(source); @@ -2546,7 +3019,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode U32 base = trie->states[ 1 ].trans.base; U32 *fail; reg_ac_data *aho; - const U32 data_slot = add_data( pRExC_state, 1, "T" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; @@ -2610,7 +3083,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode fail[ 0 ] = fail[ 1 ] = 0; DEBUG_TRIE_COMPILE_r({ PerlIO_printf(Perl_debug_log, - "%*sStclass Failtable (%"UVuf" states): 0", + "%*sStclass Failtable (%"UVuf" states): 0", (int)(depth * 2), "", (UV)numstates ); for( q_read=1; q_read%3d: %s (%d)\n", \ (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ Next ? (REG_NODE_NUM(Next)) : 0 ); \ @@ -2657,49 +3119,58 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * * If a node is to match under /i (folded), the number of characters it matches * can be different than its character length if it contains a multi-character - * fold. *min_subtract is set to the total delta of the input nodes. + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. * - * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF - * and contains LATIN SMALL LETTER SHARP S + * And *unfolded_multi_char is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when whether the fold is valid or + * not won't be known until runtime; namely for EXACTF nodes that contain LATIN + * SMALL LETTER SHARP S, as only if the target string being matched against + * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose + * folding rules depend on the locale in force at runtime. (Multi-char folds + * whose components are all above the Latin1 range are not run-time locale + * dependent, and have already been folded by the time this function is + * called.) * * This is as good a place as any to discuss the design of handling these * multi-character fold sequences. It's been wrong in Perl for a very long * time. There are three code points in Unicode whose multi-character folds * were long ago discovered to mess things up. The previous designs for * dealing with these involved assigning a special node for them. This - * approach doesn't work, as evidenced by this example: + * approach doesn't always work, as evidenced by this example: * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches - * Both these fold to "sss", but if the pattern is parsed to create a node that + * Both sides fold to "sss", but if the pattern is parsed to create a node that * would match just the \xDF, it won't be able to handle the case where a * successful match would have to cross the node's boundary. The new approach * that hopefully generally solves the problem generates an EXACTFU_SS node - * that is "sss". + * that is "sss" in this case. * * It turns out that there are problems with all multi-character folds, and not - * just these three. Now the code is general, for all such cases, but the - * three still have some special handling. The approach taken is: + * just these three. Now the code is general, for all such cases. The + * approach taken is: * 1) This routine examines each EXACTFish node that could contain multi- - * character fold sequences. It returns in *min_subtract how much to - * subtract from the the actual length of the string to get a real minimum - * match length; it is 0 if there are no multi-char folds. This delta is - * used by the caller to adjust the min length of the match, and the delta - * between min and max, so that the optimizer doesn't reject these - * possibilities based on size constraints. - * 2) Certain of these sequences require special handling by the trie code, - * so, if found, this code changes the joined node type to special ops: - * EXACTFU_TRICKYFOLD and EXACTFU_SS. - * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS + * character folded sequences. Since a single character can fold into + * such a sequence, the minimum match length for this node is less than + * the number of characters in the node. This routine returns in + * *min_subtract how many characters to subtract from the the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. + * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS * is used for an EXACTFU node that contains at least one "ss" sequence in * it. For non-UTF-8 patterns and strings, this is the only case where * there is a possible fold length change. That means that a regular * EXACTFU node without UTF-8 involvement doesn't have to concern itself * with length changes, and so can be processed faster. regexec.c takes * advantage of this. Generally, an EXACTFish node that is in UTF-8 is - * pre-folded by regcomp.c. This saves effort in regex matching. - * However, the pre-folding isn't done for non-UTF8 patterns because the - * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things - * down by forcing the pattern into UTF8 unless necessary. Also what - * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold + * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't + * known until runtime). This saves effort in regex matching. However, + * the pre-folding isn't done for non-UTF8 patterns because the fold of + * the MICRO SIGN requires UTF-8, and we don't want to slow things down by + * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, + * again, EXACTFL) nodes fold to isn't known until runtime. The fold * possibilities for the non-UTF8 patterns are quite simple, except for * the sharp s. All the ones that don't involve a UTF-8 target string are * members of a fold-pair, and arrays are set up for all of them so that @@ -2707,45 +3178,63 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * this file makes sure that in EXACTFU nodes, the sharp s gets folded to * 'ss', even if the pattern isn't UTF-8. This avoids the issues * described in the next item. - * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the - * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a - * UTF-8 pattern.) An assumption that the optimizer part of regexec.c - * (probably unwittingly, in Perl_regexec_flags()) makes is that a - * character in the pattern corresponds to at most a single character in - * the target string. (And I do mean character, and not byte here, unlike - * other parts of the documentation that have never been updated to - * account for multibyte Unicode.) sharp s in EXACTF nodes can match the - * two character string 'ss'; in EXACTFA nodes it can match - * "\x{17F}\x{17F}". These violate the assumption, and they are the only - * instances where it is violated. I'm reluctant to try to change the - * assumption, as the code involved is impenetrable to me (khw), so - * instead the code here punts. This routine examines (when the pattern - * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a - * boolean indicating whether or not the node contains a sharp s. When it - * is true, the caller sets a flag that later causes the optimizer in this - * file to not set values for the floating and fixed string lengths, and - * thus avoids the optimizer code in regexec.c that makes the invalid + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes + * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL + * nodes, violate the assumption, and they are the only instances where it + * is violated. I'm reluctant to try to change the assumption, as the + * code involved is impenetrable to me (khw), so instead the code here + * punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid * assumption. Thus, there is no optimization based on string lengths for - * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s. - * (The reason the assumption is wrong only in these two cases is that all - * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all - * other folds to their expanded versions. We can't prefold sharp s to - * 'ss' in EXACTF nodes because we don't know at compile time if it - * actually matches 'ss' or not. It will match iff the target string is - * in UTF-8, unlike the EXACTFU nodes, where it always matches; and - * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8 - * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem; - * but in a non-UTF8 pattern, folding it to that above-Latin1 string would - * require the pattern to be forced into UTF-8, the overhead of which we - * want to avoid.) - */ - -#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFA where it never does. In an EXACTFA node in + * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) + * + * Similarly, the code that generates tries doesn't currently handle + * not-already-folded multi-char folds, and it looks like a pain to change + * that. Therefore, trie generation of EXACTFA nodes with the sharp s + * doesn't work. Instead, such an EXACTFA is turned into a new regnode, + * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people + * using /iaa matching will be doing so almost entirely with ASCII + * strings, so this should rarely be encountered in practice */ + +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) { +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, + UV *min_subtract, bool *unfolded_multi_char, + U32 flags,regnode *val, U32 depth) +{ /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; @@ -2774,7 +3263,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b && NEXT_OFF(n) && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { - + if (OP(n) == TAIL || n > next) stringok = 0; if (PL_regkind[OP(n)] == NOTHING) { @@ -2791,12 +3280,13 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b const unsigned int oldl = STR_LEN(scan); regnode * const nnext = regnext(n); - /* XXX I (khw) kind of doubt that this works on platforms where - * U8_MAX is above 255 because of lots of other assumptions */ + /* XXX I (khw) kind of doubt that this works on platforms (should + * Perl ever run on one) where U8_MAX is above 255 because of lots + * of other assumptions */ /* Don't join if the sum can't fit into a single node */ if (oldl + STR_LEN(n) > U8_MAX) break; - + DEBUG_PEEP("merg",n,depth); merged++; @@ -2827,7 +3317,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b } *min_subtract = 0; - *has_exactf_sharp_s = FALSE; + *unfolded_multi_char = FALSE; /* Here, all the adjacent mergeable EXACTish nodes have been merged. We * can now analyze for sequences of problematic code points. (Prior to @@ -2835,15 +3325,68 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ if (OP(scan) != EXACT) { - const U8 * const s0 = (U8*) STRING(scan); - const U8 * s = s0; - const U8 * const s_end = s0 + STR_LEN(scan); + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ /* One pass is made over the node's string looking for all the - * possibilities. to avoid some tests in the loop, there are two main + * possibilities. To avoid some tests in the loop, there are two main * cases, for UTF-8 patterns (which can't have EXACTF nodes) and * non-UTF-8 */ if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *unfolded_multi_char = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ /* Examine the string for a multi-character fold sequence. UTF-8 * patterns have all characters pre-folded by the time this code is @@ -2851,60 +3394,32 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b while (s < s_end - 1) /* Can stop 1 before the end, as minimum length sequence we are looking for is 2 */ { - int count = 0; + int count = 0; /* How many characters in a multi-char fold */ int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); if (! len) { /* Not a multi-char fold: get next char */ s += UTF8SKIP(s); continue; } - /* Nodes with 'ss' require special handling, except for EXACTFL - * and EXACTFA for which there is no multi-char fold to this */ + /* Nodes with 'ss' require special handling, except for + * EXACTFA-ish for which there is no multi-char fold to this */ if (len == 2 && *s == 's' && *(s+1) == 's' - && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + && OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE) { count = 2; - OP(scan) = EXACTFU_SS; - s += 2; - } - else if (len == 6 /* len is the same in both ASCII and EBCDIC - for these */ - && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8 - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8, - 6) - || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8 - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8, - 6))) - { - count = 3; - - /* These two folds require special handling by trie's, so - * change the node type to indicate this. If EXACTFA and - * EXACTFL were ever to be handled by trie's, this would - * have to be changed. If this node has already been - * changed to EXACTFU_SS in this loop, leave it as is. (I - * (khw) think it doesn't matter in regexec.c for UTF - * patterns, but no need to change it */ - if (OP(scan) == EXACTFU) { - OP(scan) = EXACTFU_TRICKYFOLD; + if (OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; } - s += 6; + s += 2; } else { /* Here is a generic multi-char fold. */ - const U8* multi_end = s + len; - - /* Count how many characters in it. In the case of /l and - * /aa, no folds which contain ASCII code points are - * allowed, so check for those, and skip if found. (In - * EXACTFL, no folds are allowed to any Latin1 code point, - * not just ASCII. But there aren't any of these - * currently, nor ever likely, so don't take the time to - * test for them. The code that generates the - * is_MULTI_foo() macros croaks should one actually get put - * into Unicode .) */ - if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + U8* multi_end = s + len; + + /* Count how many characters in it. In the case of /aa, no + * folds which contain ASCII code points are allowed, so + * check for those, and skip if found. */ + if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { count = utf8_length(s, multi_end); s = multi_end; } @@ -2924,44 +3439,61 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b /* The delta is how long the sequence is minus 1 (1 is how long * the character that folds to the sequence is) */ - *min_subtract += count - 1; - next_iteration: ; + total_count_delta += count - 1; + next_iteration: ; } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); } else if (OP(scan) == EXACTFA) { /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char * fold to the ASCII range (and there are no existing ones in the * upper latin1 range). But, as outlined in the comments preceding - * this function, we need to flag any occurrences of the sharp s */ + * this function, we need to flag any occurrences of the sharp s. + * This character forbids trie formation (because of added + * complexity) */ while (s < s_end) { if (*s == LATIN_SMALL_LETTER_SHARP_S) { - *has_exactf_sharp_s = TRUE; + OP(scan) = EXACTFA_NO_TRIE; + *unfolded_multi_char = TRUE; break; } s++; continue; } } - else if (OP(scan) != EXACTFL) { + else { - /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the - * multi-char folds that are all Latin1. (This code knows that - * there are no current multi-char folds possible with EXACTFL, - * relying on fold_grind.t to catch any errors if the very unlikely - * event happens that some get added in future Unicode versions.) - * As explained in the comments preceding this function, we look - * also for the sharp s in EXACTF nodes; it can be in the final - * position. Otherwise we can stop looking 1 byte earlier because - * have to find at least two characters for a multi-fold */ - const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; + /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; while (s < upper) { int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); if (! len) { /* Not a multi-char fold. */ - if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) { - *has_exactf_sharp_s = TRUE; + *unfolded_multi_char = TRUE; } s++; continue; @@ -2976,8 +3508,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * changed so that a sharp s in the string can match this * ss in the pattern, but they remain EXACTF nodes, as they * won't match this unless the target string is is UTF-8, - * which we don't know until runtime */ - if (OP(scan) != EXACTF) { + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { OP(scan) = EXACTFU_SS; } } @@ -3011,7 +3544,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b #define INIT_AND_WITHP \ assert(!and_withp); \ - Newx(and_withp,1,struct regnode_charclass_class); \ + Newx(and_withp,1, regnode_ssc); \ SAVEFREEPV(and_withp) /* this is a chain of data about sub patterns we are processing that @@ -3022,20 +3555,19 @@ typedef struct scan_frame { regnode *last; /* last node to process in this frame */ regnode *next; /* next node to process when last is reached */ struct scan_frame *prev; /*previous frame*/ + U32 prev_recursed_depth; I32 stop; /* what stopparen do we use */ } scan_frame; -#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) - -STATIC I32 +STATIC SSize_t S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, - I32 *minlenp, I32 *deltap, + SSize_t *minlenp, SSize_t *deltap, regnode *last, scan_data_t *data, I32 stopparen, - U8* recursed, - struct regnode_charclass_class *and_withp, + U32 recursed_depth, + regnode_ssc *and_withp, U32 flags, U32 depth) /* scanp: Start here (read-write). */ /* deltap: Write maxlen-minlen here. */ @@ -3046,17 +3578,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { dVAR; - I32 min = 0; /* There must be at least this number of characters to match */ + /* There must be at least this number of characters to match */ + SSize_t min = 0; I32 pars = 0, code; regnode *scan = *scanp, *next; - I32 delta = 0; + SSize_t delta = 0; int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); int is_inf_internal = 0; /* The studied chunk is infinite */ I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; scan_data_t data_fake; SV *re_trie_maxbuff = NULL; regnode *first_non_open = scan; - I32 stopmin = I32_MAX; + SSize_t stopmin = SSize_t_MAX; scan_frame *frame = NULL; GET_RE_DEBUG_FLAGS_DECL; @@ -3065,7 +3598,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #ifdef DEBUGGING StructCopy(&zero_scan_data, &data_fake, scan_data_t); #endif - if ( depth == 0 ) { while (first_non_open && OP(first_non_open) == OPEN) first_non_open=regnext(first_non_open); @@ -3077,15 +3609,40 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, UV min_subtract = 0; /* How mmany chars to subtract from the minimum node length to get a real minimum (because the folded version may be shorter) */ - bool has_exactf_sharp_s = FALSE; + bool unfolded_multi_char = FALSE; /* Peephole optimizer: */ - DEBUG_STUDYDATA("Peep:", data,depth); - DEBUG_PEEP("Peep",scan,depth); + DEBUG_OPTIMISE_MORE_r( + { + PerlIO_printf(Perl_debug_log, + "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ", + ((int) depth*2), "", (long)stopparen, + (unsigned long)depth, (unsigned long)recursed_depth); + if (recursed_depth) { + U32 i; + U32 j; + for ( j = 0 ; j < recursed_depth ; j++ ) { + PerlIO_printf(Perl_debug_log,"["); + for ( i = 0 ; i < (U32)RExC_npar ; i++ ) + PerlIO_printf(Perl_debug_log,"%d", + PAREN_TEST(RExC_study_chunk_recursed + + (j * RExC_study_chunk_recursed_bytes), i) + ? 1 : 0 + ); + PerlIO_printf(Perl_debug_log,"]"); + } + } + PerlIO_printf(Perl_debug_log,"\n"); + } + ); + DEBUG_STUDYDATA("Peep:", data, depth); + DEBUG_PEEP("Peep", scan, depth); - /* Its not clear to khw or hv why this is done here, and not in the - * clauses that deal with EXACT nodes. khw's guess is that it's - * because of a previous design */ - JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); + + /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ + * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled + * by a different invocation of reg() -- Yves + */ + JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ @@ -3118,24 +3675,29 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, || OP(scan) == IFTHEN) { next = regnext(scan); code = OP(scan); - /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ + /* demq: the op(next)==code check is to see if we have + * "branch-branch" AFAICT */ if (OP(next) == code || code == IFTHEN) { - /* NOTE - There is similar code to this block below for handling - TRIE nodes on a re-study. If you change stuff here check there - too. */ - I32 max1 = 0, min1 = I32_MAX, num = 0; - struct regnode_charclass_class accum; + /* NOTE - There is similar code to this block below for + * handling TRIE nodes on a re-study. If you change stuff here + * check there too. */ + SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; + regnode_ssc accum; regnode * const startbranch=scan; - if (flags & SCF_DO_SUBSTR) - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ - if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); while (OP(scan) == code) { - I32 deltanext, minnext, f = 0, fake; - struct regnode_charclass_class this_class; + SSize_t deltanext, minnext, fake; + I32 f = 0; + regnode_ssc this_class; num++; data_fake.flags = 0; @@ -3152,7 +3714,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (code != BRANCH) scan = NEXTOPER(scan); if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } @@ -3160,21 +3722,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, f |= SCF_WHILEM_VISITED_POS; /* we suppose the run is continuous, last=next...*/ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - next, &data_fake, - stopparen, recursed, NULL, f,depth+1); + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f,depth+1); if (min1 > minnext) min1 = minnext; - if (deltanext == I32_MAX) { + if (deltanext == SSize_t_MAX) { is_inf = is_inf_internal = 1; - max1 = I32_MAX; + max1 = SSize_t_MAX; } else if (max1 < minnext + deltanext) max1 = minnext + deltanext; scan = next; if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; if (data_fake.flags & SCF_SEEN_ACCEPT) { - if ( stopmin > minnext) + if ( stopmin > minnext) stopmin = min + min1; flags &= ~SCF_DO_SUBSTR; if (data) @@ -3186,63 +3748,64 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); } if (code == IFTHEN && num < 2) /* Empty ELSE branch */ min1 = 0; if (flags & SCF_DO_SUBSTR) { data->pos_min += min1; - if (data->pos_delta >= I32_MAX - (max1 - min1)) - data->pos_delta = I32_MAX; + if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) + data->pos_delta = SSize_t_MAX; else data->pos_delta += max1 - min1; if (max1 != min1 || is_inf) data->longest = &(data->longest_float); } min += min1; - if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0) - delta = I32_MAX; + if (delta == SSize_t_MAX + || SSize_t_MAX - delta - (max1 - min1) < 0) + delta = SSize_t_MAX; else delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); if (min1) { - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - cl_and(data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, - struct regnode_charclass_class); + StructCopy(&accum, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); } } - if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) { + if (PERL_ENABLE_TRIE_OPTIMISATION && + OP( startbranch ) == BRANCH ) + { /* demq. - Assuming this was/is a branch we are dealing with: 'scan' now - points at the item that follows the branch sequence, whatever - it is. We now start at the beginning of the sequence and look - for subsequences of + Assuming this was/is a branch we are dealing with: 'scan' + now points at the item that follows the branch sequence, + whatever it is. We now start at the beginning of the + sequence and look for subsequences of BRANCH->EXACT=>x1 BRANCH->EXACT=>x2 tail - which would be constructed from a pattern like /A|LIST|OF|WORDS/ + which would be constructed from a pattern like + /A|LIST|OF|WORDS/ If we can find such a subsequence we need to turn the first element into a trie and then add the subsequent branch exact @@ -3250,7 +3813,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, We have two cases - 1. patterns where the whole set of branches can be converted. + 1. patterns where the whole set of branches can be + converted. 2. patterns where only a subset can be converted. @@ -3261,7 +3825,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, 'BRANCH EXACT; BRANCH EXACT; BRANCH X' becomes BRANCH TRIE; BRANCH X; - There is an additional case, that being where there is a + There is an additional case, that being where there is a common prefix, which gets split out into an EXACT like node preceding the TRIE node. @@ -3287,7 +3851,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, U32 count=0; #ifdef DEBUGGING - SV * const mysv = sv_newmortal(); /* for dumping */ + SV * const mysv = sv_newmortal(); /* for dumping */ #endif /* var tail is used because there may be a TAIL regop in the way. Ie, the exacts will point to the @@ -3302,49 +3866,60 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, tail = regnext( tail ); } - + DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, tail ); + regprop(RExC_rx, mysv, tail, NULL); PerlIO_printf( Perl_debug_log, "%*s%s%s\n", - (int)depth * 2 + 2, "", - "Looking for TRIE'able sequences. Tail node is: ", - SvPV_nolen_const( mysv ) + (int)depth * 2 + 2, "", + "Looking for TRIE'able sequences. Tail node is: ", + SvPV_nolen_const( mysv ) ); }); - + /* Step through the branches cur represents each branch, - noper is the first thing to be matched as part of that branch + noper is the first thing to be matched as part + of that branch noper_next is the regnext() of that node. - We normally handle a case like this /FOO[xyz]|BAR[pqr]/ - via a "jump trie" but we also support building with NOJUMPTRIE, - which restricts the trie logic to structures like /FOO|BAR/. - - If noper is a trieable nodetype then the branch is a possible optimization - target. If we are building under NOJUMPTRIE then we require that noper_next - is the same as scan (our current position in the regex program). - - Once we have two or more consecutive such branches we can create a - trie of the EXACT's contents and stitch it in place into the program. - - If the sequence represents all of the branches in the alternation we - replace the entire thing with a single TRIE node. - - Otherwise when it is a subsequence we need to stitch it in place and - replace only the relevant branches. This means the first branch has - to remain as it is used by the alternation logic, and its next pointer, - and needs to be repointed at the item on the branch chain following - the last branch we have optimized away. - - This could be either a BRANCH, in which case the subsequence is internal, - or it could be the item following the branch sequence in which case the - subsequence is at the end (which does not necessarily mean the first node - is the start of the alternation). - - TRIE_TYPE(X) is a define which maps the optype to a trietype. + We normally handle a case like this + /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also + support building with NOJUMPTRIE, which restricts + the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is + a possible optimization target. If we are building + under NOJUMPTRIE then we require that noper_next is + the same as scan (our current position in the regex + program). + + Once we have two or more consecutive such branches + we can create a trie of the EXACT's contents and + stitch it in place into the program. + + If the sequence represents all of the branches in + the alternation we replace the entire thing with a + single TRIE node. + + Otherwise when it is a subsequence we need to + stitch it in place and replace only the relevant + branches. This means the first branch has to remain + as it is used by the alternation logic, and its + next pointer, and needs to be repointed at the item + on the branch chain following the last branch we + have optimized away. + + This could be either a BRANCH, in which case the + subsequence is internal, or it could be the item + following the branch sequence in which case the + subsequence is at the end (which does not + necessarily mean the first node is the start of the + alternation). + + TRIE_TYPE(X) is a define which maps the optype to a + trietype. optype | trietype ----------------+----------- @@ -3352,14 +3927,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, EXACT | EXACT EXACTFU | EXACTFU EXACTFU_SS | EXACTFU - EXACTFU_TRICKYFOLD | EXACTFU - EXACTFA | 0 + EXACTFA | EXACTFA */ #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ ( EXACT == (X) ) ? EXACT : \ - ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ + ( EXACTFA == (X) ) ? EXACTFA : \ 0 ) /* dont use tail as the end marker for this traverse */ @@ -3374,27 +3949,27 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); - regprop(RExC_rx, mysv, noper); + regprop(RExC_rx, mysv, noper, NULL); PerlIO_printf( Perl_debug_log, " -> %s", SvPV_nolen_const(mysv)); if ( noper_next ) { - regprop(RExC_rx, mysv, noper_next ); + regprop(RExC_rx, mysv, noper_next, NULL); PerlIO_printf( Perl_debug_log,"\t=> %s\t", SvPV_nolen_const(mysv)); } PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), - PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] + PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] ); }); - /* Is noper a trieable nodetype that can be merged with the - * current trie (if there is one)? */ + /* Is noper a trieable nodetype that can be merged + * with the current trie (if there is one)? */ if ( noper_trietype && ( @@ -3407,10 +3982,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif && count < U16_MAX) { - /* Handle mergable triable node - * Either we are the first node in a new trieable sequence, - * in which case we do some bookkeeping, otherwise we update - * the end pointer. */ + /* Handle mergable triable node Either we are + * the first node in a new trieable sequence, + * in which case we do some bookkeeping, + * otherwise we update the end pointer. */ if ( !first ) { first = cur; if ( noper_trietype == NOTHING ) { @@ -3423,8 +3998,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( noper_next_trietype ) { trietype = noper_next_trietype; } else if (noper_next_type) { - /* a NOTHING regop is 1 regop wide. We need at least two - * for a trie so we can't merge this in */ + /* a NOTHING regop is 1 regop wide. + * We need at least two for a trie + * so we can't merge this in */ first = NULL; } } else { @@ -3440,31 +4016,39 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle mergable triable node */ else { /* handle unmergable node - - * noper may either be a triable node which can not be tried - * together with the current trie, or a non triable node */ + * noper may either be a triable node which can + * not be tried together with the current trie, + * or a non triable node */ if ( last ) { - /* If last is set and trietype is not NOTHING then we have found - * at least two triable branch sequences in a row of a similar - * trietype so we can turn them into a trie. If/when we - * allow NOTHING to start a trie sequence this condition will be - * required, and it isn't expensive so we leave it in for now. */ + /* If last is set and trietype is not + * NOTHING then we have found at least two + * triable branch sequences in a row of a + * similar trietype so we can turn them + * into a trie. If/when we allow NOTHING to + * start a trie sequence this condition + * will be required, and it isn't expensive + * so we leave it in for now. */ if ( trietype && trietype != NOTHING ) make_trie( pRExC_state, - startbranch, first, cur, tail, count, - trietype, depth+1 ); - last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ + startbranch, first, cur, tail, + count, trietype, depth+1 ); + last = NULL; /* note: we clear/update + first, trietype etc below, + so we dont do it here */ } if ( noper_trietype #ifdef NOJUMPTRIE && noper_next == tail #endif ){ - /* noper is triable, so we can start a new trie sequence */ + /* noper is triable, so we can start a new + * trie sequence */ count = 1; first = cur; trietype = noper_trietype; } else if (first) { - /* if we already saw a first but the current node is not triable then we have + /* if we already saw a first but the + * current node is not triable then we have * to reset the first information. */ count = 0; first = NULL; @@ -3473,18 +4057,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle unmergable node */ } /* loop over branches */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, - "%*s- %s (%d) \n", (int)depth * 2 + 2, + "%*s- %s (%d) \n", + (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); if ( last && trietype ) { if ( trietype != NOTHING ) { - /* the last branch of the sequence was part of a trie, - * so we have to construct it here outside of the loop - */ - made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); + /* the last branch of the sequence was part of + * a trie, so we have to construct it here + * outside of the loop */ + made= make_trie( pRExC_state, startbranch, + first, scan, tail, count, + trietype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && startbranch == first) @@ -3494,20 +4081,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( startbranch == first && scan == tail ) { - RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; } } #endif } else { - /* at this point we know whatever we have is a NOTHING sequence/branch - * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING + /* at this point we know whatever we have is a + * NOTHING sequence/branch AND if 'startbranch' + * is 'first' then we can turn the whole thing + * into a NOTHING */ if ( startbranch == first ) { regnode *opt; - /* the entire thing is a NOTHING sequence, something like this: - * (?:|) So we can turn it into a plain NOTHING op. */ + /* the entire thing is a NOTHING sequence, + * something like this: (?:|) So we can + * turn it into a plain NOTHING op. */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d) \n", (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); @@ -3521,9 +4111,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } /* end if ( last) */ } /* TRIE_MAXBUF is non zero */ - + } /* do trie */ - + } else if ( code == BRANCHJ ) { /* single branch is optimized. */ scan = NEXTOPER(NEXTOPER(scan)); @@ -3535,9 +4125,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 paren; regnode *start; regnode *end; + U32 my_recursed_depth= recursed_depth; if (OP(scan) != SUSPEND) { - /* set the pointer */ + /* set the pointer */ if (OP(scan) == GOSUB) { paren = ARG(scan); RExC_recurse[ARG2L(scan)] = scan; @@ -3548,21 +4139,33 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, start = RExC_rxi->program + 1; end = RExC_opend; } - if (!recursed) { - Newxz(recursed, (((RExC_npar)>>3) +1), U8); - SAVEFREEPV(recursed); - } - if (!PAREN_TEST(recursed,paren+1)) { - PAREN_SET(recursed,paren+1); + if (!recursed_depth + || + !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) + ) { + if (!recursed_depth) { + Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); + } else { + Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed_bytes, U8); + } + /* we havent recursed into this paren yet, so recurse into it */ + DEBUG_STUDYDATA("set:", data,depth); + PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); + my_recursed_depth= recursed_depth + 1; Newx(newframe,1,scan_frame); } else { + DEBUG_STUDYDATA("inf:", data,depth); + /* some form of infinite recursion, assume infinite length + * */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } } else { @@ -3579,17 +4182,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, newframe->last = last; newframe->stop = stopparen; newframe->prev = frame; + newframe->prev_recursed_depth = recursed_depth; + + DEBUG_STUDYDATA("frame-new:",data,depth); + DEBUG_PEEP("fnew", scan, depth); frame = newframe; scan = start; stopparen = paren; last = end; + depth = depth + 1; + recursed_depth= my_recursed_depth; continue; } } else if (OP(scan) == EXACT) { - I32 l = STR_LEN(scan); + SSize_t l = STR_LEN(scan); UV uc; if (UTF) { const U8 * const s = (U8*)STRING(scan); @@ -3605,7 +4214,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data->last_end == -1) { /* Update the start info. */ data->last_start_min = data->pos_min; data->last_start_max = is_inf - ? I32_MAX : data->pos_min + data->pos_delta; + ? SSize_t_MAX : data->pos_min + data->pos_delta; } sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); if (UTF) @@ -3616,83 +4225,47 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) mg->mg_len += utf8_length((U8*)STRING(scan), - (U8*)STRING(scan)+STR_LEN(scan)); + (U8*)STRING(scan)+STR_LEN(scan)); } data->last_end = data->pos_min + l; data->pos_min += l; /* As in the first entry. */ data->flags &= ~SF_BEFORE_EOL; } + + /* ANDing the code point leaves at most it, and not in locale, and + * can't match null string */ if (flags & SCF_DO_STCLASS_AND) { - /* Check whether it is compatible with what we know already! */ - int compat = 1; - - - /* If compatible, we or it in below. It is compatible if is - * in the bitmp and either 1) its bit or its fold is set, or 2) - * it's for a locale. Even if there isn't unicode semantics - * here, at runtime there may be because of matching against a - * utf8 string, so accept a possible false positive for - * latin1-range folds */ - if (uc >= 0x100 || - (!(data->start_class->flags & ANYOF_LOCALE) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && (!(data->start_class->flags & ANYOF_LOC_FOLD) - || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) - ) - { - compat = 0; - } - ANYOF_CLASS_ZERO(data->start_class); - ANYOF_BITMAP_ZERO(data->start_class); - if (compat) - ANYOF_BITMAP_SET(data->start_class, uc); - else if (uc >= 0x100) { - int i; - - /* Some Unicode code points fold to the Latin1 range; as - * XXX temporary code, instead of figuring out if this is - * one, just assume it is and set all the start class bits - * that could be some such above 255 code point's fold - * which will generate fals positives. As the code - * elsewhere that does compute the fold settles down, it - * can be extracted out and re-used here */ - for (i = 0; i < 256; i++){ - if (HAS_NONLATIN1_FOLD_CLOSURE(i)) { - ANYOF_BITMAP_SET(data->start_class, i); - } - } - } - CLEAR_SSC_EOS(data->start_class); - if (uc < 0x100) - data->start_class->flags &= ~ANYOF_UNICODE_ALL; + ssc_cp_and(data->start_class, uc); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ssc_clear_locale(data->start_class); } else if (flags & SCF_DO_STCLASS_OR) { - /* false positive possible if the class is case-folded */ - if (uc < 0x100) - ANYOF_BITMAP_SET(data->start_class, uc); - else - data->start_class->flags |= ANYOF_UNICODE_ALL; - CLEAR_SSC_EOS(data->start_class); - cl_and(data->start_class, and_withp); + ssc_add_cp(data->start_class, uc); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; } else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ - I32 l = STR_LEN(scan); + SSize_t l = STR_LEN(scan); UV uc = *((U8*)STRING(scan)); + SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 + separate code points */ /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) { assert(data); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } if (UTF) { const U8 * const s = (U8 *)STRING(scan); uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } - if (has_exactf_sharp_s) { - RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + if (unfolded_multi_char) { + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; } min += l - min_subtract; assert (min >= 0); @@ -3707,99 +4280,95 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - if (flags & SCF_DO_STCLASS_AND) { - /* Check whether it is compatible with what we know already! */ - int compat = 1; - if (uc >= 0x100 || - (!(data->start_class->flags & ANYOF_LOCALE) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) - { - compat = 0; - } - ANYOF_CLASS_ZERO(data->start_class); - ANYOF_BITMAP_ZERO(data->start_class); - if (compat) { - ANYOF_BITMAP_SET(data->start_class, uc); - CLEAR_SSC_EOS(data->start_class); - if (OP(scan) == EXACTFL) { - /* XXX This set is probably no longer necessary, and - * probably wrong as LOCALE now is on in the initial - * state */ - data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD; - } - else { + if (OP(scan) == EXACTFL) { + + /* We don't know what the folds are; it could be anything. XXX + * Actually, we only support UTF-8 encoding for code points + * above Latin1, so we could know what those folds are. */ + EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, + 0, + UV_MAX); + } + else { /* Non-locale EXACTFish */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (flags & SCF_DO_STCLASS_AND) { + ssc_clear_locale(data->start_class); + } + if (uc < 256) { /* We know what the Latin1 folds are ... */ + if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we + know if anything folds + with this */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, + PL_fold_latin1[uc]); + if (OP(scan) != EXACTFA) { /* The folds below aren't + legal under /iaa */ + if (isARG2_lower_or_UPPER_ARG1('s', uc)) { + EXACTF_invlist + = add_cp_to_invlist(EXACTF_invlist, + LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + EXACTF_invlist + = add_cp_to_invlist(EXACTF_invlist, 's'); + EXACTF_invlist + = add_cp_to_invlist(EXACTF_invlist, 'S'); + } + } - /* Also set the other member of the fold pair. In case - * that unicode semantics is called for at runtime, use - * the full latin1 fold. (Can't do this for locale, - * because not known until runtime) */ - ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); - - /* All other (EXACTFL handled above) folds except under - * /iaa that include s, S, and sharp_s also may include - * the others */ - if (OP(scan) != EXACTFA) { - if (uc == 's' || uc == 'S') { - ANYOF_BITMAP_SET(data->start_class, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - ANYOF_BITMAP_SET(data->start_class, 's'); - ANYOF_BITMAP_SET(data->start_class, 'S'); - } - } - } - } - else if (uc >= 0x100) { - int i; - for (i = 0; i < 256; i++){ - if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) { - ANYOF_BITMAP_SET(data->start_class, i); - } - } - } + /* We also know if there are above-Latin1 code points + * that fold to this (none legal for ASCII and /iaa) */ + if ((! isASCII(uc) || OP(scan) != EXACTFA) + && HAS_NONLATIN1_FOLD_CLOSURE(uc)) + { + /* XXX We could know exactly what does fold to this + * if the reverse folds are loaded, as currently in + * S_regclass() */ + _invlist_union(EXACTF_invlist, + PL_AboveLatin1, + &EXACTF_invlist); + } + } + } + else { /* Non-locale, above Latin1. XXX We don't currently + know what participates in folds with this, so have + to assume anything could */ + + /* XXX We could know exactly what does fold to this if the + * reverse folds are loaded, as currently in S_regclass(). + * But we do know that under /iaa nothing in the ASCII + * range can participate */ + if (OP(scan) == EXACTFA) { + _invlist_union_complement_2nd(EXACTF_invlist, + PL_XPosix_ptrs[_CC_ASCII], + &EXACTF_invlist); + } + else { + EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, + 0, UV_MAX); + } + } + } + if (flags & SCF_DO_STCLASS_AND) { + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); } else if (flags & SCF_DO_STCLASS_OR) { - if (data->start_class->flags & ANYOF_LOC_FOLD) { - /* false positive possible if the class is case-folded. - Assume that the locale settings are the same... */ - if (uc < 0x100) { - ANYOF_BITMAP_SET(data->start_class, uc); - if (OP(scan) != EXACTFL) { - - /* And set the other member of the fold pair, but - * can't do that in locale because not known until - * run-time */ - ANYOF_BITMAP_SET(data->start_class, - PL_fold_latin1[uc]); - - /* All folds except under /iaa that include s, S, - * and sharp_s also may include the others */ - if (OP(scan) != EXACTFA) { - if (uc == 's' || uc == 'S') { - ANYOF_BITMAP_SET(data->start_class, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - ANYOF_BITMAP_SET(data->start_class, 's'); - ANYOF_BITMAP_SET(data->start_class, 'S'); - } - } - } - } - CLEAR_SSC_EOS(data->start_class); - } - cl_and(data->start_class, and_withp); + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); } else if (REGNODE_VARIES(OP(scan))) { - I32 mincount, maxcount, minnext, deltanext, fl = 0; - I32 f = flags, pos_before = 0; + SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; + I32 fl = 0, f = flags; regnode * const oscan = scan; - struct regnode_charclass_class this_class; - struct regnode_charclass_class *oclass = NULL; + regnode_ssc this_class; + regnode_ssc *oclass = NULL; I32 next_is_eval = 0; switch (PL_regkind[OP(scan)]) { @@ -3829,12 +4398,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan = NEXTOPER(scan); goto do_curly; } - is_inf = is_inf_internal = 1; - scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */ + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ data->longest = &(data->longest_float); } + is_inf = is_inf_internal = 1; + scan = regnext(scan); goto optimize_curly_tail; case CURLY: if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) @@ -3855,7 +4425,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, next_is_eval = (OP(scan) == EVAL); do_curly: if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */ + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ pos_before = data->pos_min; } if (data) { @@ -3865,7 +4437,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->flags |= SF_IS_INF; } if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); oclass = data->start_class; data->start_class = &this_class; f |= SCF_DO_STCLASS_AND; @@ -3884,36 +4456,36 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, f &= ~SCF_WHILEM_VISITED_POS; /* This will finish on WHILEM, setting scan, or on NULL: */ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - last, data, stopparen, recursed, NULL, - (mincount == 0 - ? (f & ~SCF_DO_SUBSTR) : f),depth+1); + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, + last, data, stopparen, recursed_depth, NULL, + (mincount == 0 + ? (f & ~SCF_DO_SUBSTR) + : f) + ,depth+1); if (flags & SCF_DO_STCLASS) data->start_class = oclass; if (mincount == 0 || minnext == 0) { if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); } else if (flags & SCF_DO_STCLASS_AND) { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&this_class, data->start_class, - struct regnode_charclass_class); + StructCopy(&this_class, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); - cl_and(data->start_class, and_withp); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); } else if (flags & SCF_DO_STCLASS_AND) - cl_and(data->start_class, &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); flags &= ~SCF_DO_STCLASS; } if (!scan) /* It was not CURLYX, but CURLY. */ @@ -3923,24 +4495,26 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && (next_is_eval || !(mincount == 0 && maxcount == 1)) && (minnext == 0) && (deltanext == 0) && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) - && maxcount <= REG_INFTY/3) /* Complement check for big count */ + && maxcount <= REG_INFTY/3) /* Complement check for big + count */ { /* Fatal warnings may leak the regexp without this: */ SAVEFREESV(RExC_rx_sv); ckWARNreg(RExC_parse, - "Quantifier unexpected on zero-length expression"); + "Quantifier unexpected on zero-length expression"); (void)ReREFCNT_inc(RExC_rx_sv); } min += minnext * mincount; - is_inf_internal |= deltanext == I32_MAX - || (maxcount == REG_INFTY && minnext + deltanext > 0); + is_inf_internal |= deltanext == SSize_t_MAX + || (maxcount == REG_INFTY && minnext + deltanext > 0); is_inf |= is_inf_internal; - if (is_inf) - delta = I32_MAX; - else - delta += (minnext + deltanext) * maxcount - minnext * mincount; - + if (is_inf) { + delta = SSize_t_MAX; + } else { + delta += (minnext + deltanext) * maxcount + - minnext * mincount; + } /* Try powerful optimization CURLYX => CURLYN. */ if ( OP(oscan) == CURLYX && data && data->flags & SF_IN_PAR @@ -3991,7 +4565,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && !(data->flags & SF_HAS_EVAL) && !deltanext /* atom is fixed width */ && minnext != 0 /* CURLYM can't handle zero width */ - && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */ + + /* Nor characters whose fold at run-time may be + * multi-character */ + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) ) { /* XXXX How to optimize if data == 0? */ /* Optimize to a simpler form. */ @@ -4038,7 +4615,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif /* Optimize again: */ study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, - NULL, stopparen, recursed, NULL, 0,depth+1); + NULL, stopparen, recursed_depth, NULL, 0,depth+1); } else oscan->flags = 0; @@ -4063,43 +4640,32 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, pars++; if (flags & SCF_DO_SUBSTR) { SV *last_str = NULL; + STRLEN last_chrs = 0; int counted = mincount != 0; - if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ -#if defined(SPARC64_GCC_WORKAROUND) - I32 b = 0; - STRLEN l = 0; - const char *s = NULL; - I32 old = 0; - - if (pos_before >= data->last_start_min) - b = pos_before; - else - b = data->last_start_min; - - l = 0; - s = SvPV_const(data->last_found, l); - old = b - data->last_start_min; - -#else - I32 b = pos_before >= data->last_start_min + if (data->last_end > 0 && mincount != 0) { /* Ends with a + string. */ + SSize_t b = pos_before >= data->last_start_min ? pos_before : data->last_start_min; STRLEN l; const char * const s = SvPV_const(data->last_found, l); - I32 old = b - data->last_start_min; -#endif + SSize_t old = b - data->last_start_min; if (UTF) old = utf8_hop((U8*)s, old) - (U8*)s; l -= old; /* Get the added string: */ last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { + SvGROW(last_str, (mincount * l) + 1); repeatcpy(SvPVX(last_str) + l, - SvPVX_const(last_str), l, mincount - 1); + SvPVX_const(last_str), l, + mincount - 1); SvCUR_set(last_str, SvCUR(last_str) * mincount); /* Add additional parts. */ SvCUR_set(data->last_found, @@ -4111,34 +4677,41 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str) - l; + mg->mg_len += last_chrs * (mincount-1); } + last_chrs *= mincount; data->last_end += l * (mincount - 1); } } else { /* start offset must point into the last copy */ data->last_start_min += minnext * (mincount - 1); - data->last_start_max += is_inf ? I32_MAX + data->last_start_max += is_inf ? SSize_t_MAX : (maxcount - 1) * (minnext + data->pos_delta); } } /* It is counted once already... */ data->pos_min += minnext * (mincount - counted); #if 0 -PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n", - counted, deltanext, I32_MAX, minnext, maxcount, mincount); -if (deltanext != I32_MAX) -PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta); +PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf + " SSize_t_MAX=%"UVdf" minnext=%"UVdf + " maxcount=%"UVdf" mincount=%"UVdf"\n", + (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, + (UV)mincount); +if (deltanext != SSize_t_MAX) +PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", + (UV)(-counted * deltanext + (minnext + deltanext) * maxcount + - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); #endif - if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta) - data->pos_delta = I32_MAX; + if (deltanext == SSize_t_MAX + || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) + data->pos_delta = SSize_t_MAX; else data->pos_delta += - counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount; if (mincount != maxcount) { /* Cannot extend fixed substrings found inside the group. */ - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); if (mincount && last_str) { SV * const sv = data->last_found; MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? @@ -4148,12 +4721,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext mg->mg_len = -1; sv_setsv(sv, last_str); data->last_end = data->pos_min; - data->last_start_min = - data->pos_min - CHR_SVLEN(last_str); + data->last_start_min = data->pos_min - last_chrs; data->last_start_max = is_inf - ? I32_MAX - : data->pos_min + data->pos_delta - - CHR_SVLEN(last_str); + ? SSize_t_MAX + : data->pos_min + data->pos_delta - last_chrs; } data->longest = &(data->longest_float); } @@ -4168,164 +4739,212 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext NEXT_OFF(oscan) += NEXT_OFF(next); } continue; - default: /* REF, and CLUMP only? */ + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", + OP(scan)); +#endif + case REF: + case CLUMP: if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; - if (flags & SCF_DO_STCLASS_OR) - cl_anything(pRExC_state, data->start_class); + if (flags & SCF_DO_STCLASS_OR) { + if (OP(scan) == CLUMP) { + /* Actually is any start char, but very few code points + * aren't start characters */ + ssc_match_all_cp(data->start_class); + } + else { + ssc_anything(data->start_class); + } + } flags &= ~SCF_DO_STCLASS; break; } } else if (OP(scan) == LNBREAK) { if (flags & SCF_DO_STCLASS) { - int value = 0; - CLEAR_SSC_EOS(data->start_class); /* No match on empty */ if (flags & SCF_DO_STCLASS_AND) { - for (value = 0; value < 256; value++) - if (!is_VERTWS_cp(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + ssc_intersection(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } - else { - for (value = 0; value < 256; value++) - if (is_VERTWS_cp(value)) - ANYOF_BITMAP_SET(data->start_class, value); + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], + FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg for + * 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } - if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; } min++; delta++; /* Because of the 2 char string cr-lf */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += 1; data->pos_delta += 1; data->longest = &(data->longest_float); } } else if (REGNODE_SIMPLE(OP(scan))) { - int value = 0; if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min++; } min++; if (flags & SCF_DO_STCLASS) { - int loop_max = 256; - CLEAR_SSC_EOS(data->start_class); /* No match on empty */ + bool invert = 0; + SV* my_invlist = sv_2mortal(_new_invlist(0)); + U8 namedclass; + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; /* Some of the logic below assumes that switching locale on will only add false positives. */ - switch (PL_regkind[OP(scan)]) { - U8 classnum; + switch (OP(scan)) { - case SANY: default: #ifdef DEBUGGING - Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); + Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", + OP(scan)); #endif - do_default: + case CANY: + case SANY: if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_match_all_cp(data->start_class); break; + case REG_ANY: - if (OP(scan) == SANY) - goto do_default; - if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ - value = (ANYOF_BITMAP_TEST(data->start_class,'\n') - || ANYOF_CLASS_TEST_ANY_SET(data->start_class)); - cl_anything(pRExC_state, data->start_class); + { + SV* REG_ANY_invlist = _new_invlist(2); + REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, + '\n'); + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert, hence all but \n + */ + ); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert */ + ); + ssc_clear_locale(data->start_class); + } + SvREFCNT_dec_NN(REG_ANY_invlist); } - if (flags & SCF_DO_STCLASS_AND || !value) - ANYOF_BITMAP_CLEAR(data->start_class,'\n'); break; - case ANYOF: + + case ANYOF: if (flags & SCF_DO_STCLASS_AND) - cl_and(data->start_class, - (struct regnode_charclass_class*)scan); + ssc_and(pRExC_state, data->start_class, + (regnode_charclass *) scan); else - cl_or(pRExC_state, data->start_class, - (struct regnode_charclass_class*)scan); + ssc_or(pRExC_state, data->start_class, + (regnode_charclass *) scan); break; - case POSIXA: - loop_max = 128; + + case NPOSIXL: + invert = 1; /* FALL THROUGH */ + case POSIXL: - case POSIXD: - case POSIXU: - classnum = FLAGS(scan); - if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1); - for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); - } - } - } - } - else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum)); + namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; + if (flags & SCF_DO_STCLASS_AND) { + bool was_there = cBOOL( + ANYOF_POSIXL_TEST(data->start_class, + namedclass)); + ANYOF_POSIXL_ZERO(data->start_class); + if (was_there) { /* Do an AND */ + ANYOF_POSIXL_SET(data->start_class, namedclass); } - else { - - /* Even if under locale, set the bits for non-locale - * in case it isn't a true locale-node. This will - * create false positives if it truly is locale */ - for (value = 0; value < loop_max; value++) { - if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); - } + /* No individual code points can now match */ + data->start_class->invlist + = sv_2mortal(_new_invlist(0)); + } + else { + int complement = namedclass + ((invert) ? -1 : 1); + + assert(flags & SCF_DO_STCLASS_OR); + + /* If the complement of this class was already there, + * the result is that they match all code points, + * (\d + \D == everything). Remove the classes from + * future consideration. Locale is not relevant in + * this case */ + if (ANYOF_POSIXL_TEST(data->start_class, complement)) { + ssc_match_all_cp(data->start_class); + ANYOF_POSIXL_CLEAR(data->start_class, namedclass); + ANYOF_POSIXL_CLEAR(data->start_class, complement); } + else { /* The usual case; just add this class to the + existing set */ + ANYOF_POSIXL_SET(data->start_class, namedclass); } - } - break; - case NPOSIXA: - loop_max = 128; + } + break; + + case NPOSIXA: /* For these, we always know the exact set of + what's matched */ + invert = 1; /* FALL THROUGH */ - case NPOSIXL: - case NPOSIXU: + case POSIXA: + if (FLAGS(scan) == _CC_ASCII) { + my_invlist = PL_XPosix_ptrs[_CC_ASCII]; + } + else { + _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)], + PL_XPosix_ptrs[_CC_ASCII], + &my_invlist); + } + goto join_posix; + case NPOSIXD: - classnum = FLAGS(scan); - if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum)); - for (value = 0; value < loop_max; value++) { - if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); - } - } - } - } - else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1); - } - else { + case NPOSIXU: + invert = 1; + /* FALL THROUGH */ + case POSIXD: + case POSIXU: + my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); + + /* NPOSIXD matches all upper Latin1 code points unless the + * target string being matched is UTF-8, which is + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); + } - /* Even if under locale, set the bits for non-locale in - * case it isn't a true locale-node. This will create - * false positives if it truly is locale */ - for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); - } - } - if (PL_regkind[OP(scan)] == NPOSIXD) { - data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - } - break; + join_posix: + + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, my_invlist, invert); + ssc_clear_locale(data->start_class); + } + else { + assert(flags & SCF_DO_STCLASS_OR); + ssc_union(data->start_class, my_invlist, invert); + } } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } @@ -4333,7 +4952,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->flags |= (OP(scan) == MEOL ? SF_BEFORE_MEOL : SF_BEFORE_SEOL); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } else if ( PL_regkind[OP(scan)] == BRANCHJ @@ -4352,11 +4971,12 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext DEBUG_STUDYDATA("OPFAIL",data,depth); /*DEBUG_PARSE_MSG("opfail");*/ - regprop(RExC_rx, mysv_val, upto); - PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", - SvPV_nolen_const(mysv_val), - (IV)REG_NODE_NUM(upto), - (IV)(upto - scan) + regprop(RExC_rx, mysv_val, upto, NULL); + PerlIO_printf(Perl_debug_log, + "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(upto), + (IV)(upto - scan) ); }); OP(scan) = OPFAIL; @@ -4366,16 +4986,16 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext scan= upto; continue; } - if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY + if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY || OP(scan) == UNLESSM ) { /* Negative Lookahead/lookbehind In this case we can't do fixed string optimisation. */ - I32 deltanext, minnext, fake = 0; + SSize_t deltanext, minnext, fake = 0; regnode *nscan; - struct regnode_charclass_class intrnl; + regnode_ssc intrnl; int f = 0; data_fake.flags = 0; @@ -4388,7 +5008,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.pos_delta = delta; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + ssc_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4396,14 +5016,16 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext f |= SCF_WHILEM_VISITED_POS; next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, - last, &data_fake, stopparen, recursed, NULL, f, depth+1); + minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, + last, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1); if (scan->flags) { if (deltanext) { FAIL("Variable length lookbehind not implemented"); } else if (minnext > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); } scan->flags = (U8)minnext; } @@ -4422,14 +5044,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext * *** HACK *** for now just treat as "no information". * See [perl #56690]. */ - cl_init(pRExC_state, data->start_class); + ssc_init(pRExC_state, data->start_class); } else { /* AND before and after: combine and continue */ - const int was = TEST_SSC_EOS(data->start_class); - - cl_and(data->start_class, &intrnl); - if (was) - SET_SSC_EOS(data->start_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } } } @@ -4442,26 +5060,26 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext length of the pattern, something we won't know about until after the recurse. */ - I32 deltanext, fake = 0; + SSize_t deltanext, fake = 0; regnode *nscan; - struct regnode_charclass_class intrnl; + regnode_ssc intrnl; int f = 0; - /* We use SAVEFREEPV so that when the full compile - is finished perl will clean up the allocated + /* We use SAVEFREEPV so that when the full compile + is finished perl will clean up the allocated minlens when it's all done. This way we don't have to worry about freeing them when we know they wont be used, which would be a pain. */ - I32 *minnextp; - Newx( minnextp, 1, I32 ); + SSize_t *minnextp; + Newx( minnextp, 1, SSize_t ); SAVEFREEPV(minnextp); if (data) { StructCopy(data, &data_fake, scan_data_t); if ((flags & SCF_DO_SUBSTR) && data->last_found) { f |= SCF_DO_SUBSTR; - if (scan->flags) - SCAN_COMMIT(pRExC_state, &data_fake,minlenp); + if (scan->flags) + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); data_fake.last_found=newSVsv(data->last_found); } } @@ -4473,7 +5091,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.flags |= SF_IS_INF; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + ssc_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4482,14 +5100,17 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, - last, &data_fake, stopparen, recursed, NULL, f,depth+1); + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, + &deltanext, last, &data_fake, + stopparen, recursed_depth, NULL, + f,depth+1); if (scan->flags) { if (deltanext) { FAIL("Variable length lookbehind not implemented"); } else if (*minnextp > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); } scan->flags = (U8)*minnextp; } @@ -4497,11 +5118,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext *minnextp += min; if (f & SCF_DO_STCLASS_AND) { - const int was = TEST_SSC_EOS(data.start_class); - - cl_and(data->start_class, &intrnl); - if (was) - SET_SSC_EOS(data->start_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -4512,10 +5129,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { if (RExC_rx->minlen<*minnextp) RExC_rx->minlen=*minnextp; - SCAN_COMMIT(pRExC_state, &data_fake, minnextp); + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); SvREFCNT_dec_NN(data_fake.last_found); - - if ( data_fake.minlen_fixed != minlenp ) + + if ( data_fake.minlen_fixed != minlenp ) { data->offset_fixed= data_fake.offset_fixed; data->minlen_fixed= data_fake.minlen_fixed; @@ -4556,7 +5173,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext } else if ( PL_regkind[OP(scan)] == ENDLIKE ) { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); flags &= ~SCF_DO_SUBSTR; } if (data && OP(scan)==ACCEPT) { @@ -4568,26 +5185,26 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } else if (OP(scan) == GPOS) { - if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) && - !(delta || is_inf || (data && data->pos_delta))) + if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && + !(delta || is_inf || (data && data->pos_delta))) { - if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR)) - RExC_rx->extflags |= RXf_ANCH_GPOS; - if (RExC_rx->gofs < (U32)min) + if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->intflags |= PREGf_ANCH_GPOS; + if (RExC_rx->gofs < (STRLEN)min) RExC_rx->gofs = min; } else { - RExC_rx->extflags |= RXf_GPOS_FLOAT; + RExC_rx->intflags |= PREGf_GPOS_FLOAT; RExC_rx->gofs = 0; - } + } } #ifdef TRIE_STUDY_OPT #ifdef FULL_TRIE_STUDY @@ -4598,26 +5215,28 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext regnode *trie_node= scan; regnode *tail= regnext(scan); reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; - I32 max1 = 0, min1 = I32_MAX; - struct regnode_charclass_class accum; + SSize_t max1 = 0, min1 = SSize_t_MAX; + regnode_ssc accum; - if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ - SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); - + ssc_init_zero(pRExC_state, &accum); + if (!trie->jump) { min1= trie->minlen; max1= trie->maxlen; } else { const regnode *nextbranch= NULL; U32 word; - - for ( word=1 ; word <= trie->wordcount ; word++) + + for ( word=1 ; word <= trie->wordcount ; word++) { - I32 deltanext=0, minnext=0, f = 0, fake; - struct regnode_charclass_class this_class; - + SSize_t deltanext=0, minnext=0, f = 0, fake; + regnode_ssc this_class; + data_fake.flags = 0; if (data) { data_fake.whilem_c = data->whilem_c; @@ -4627,40 +5246,39 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.last_closep = &fake; data_fake.pos_delta = delta; if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } if (flags & SCF_WHILEM_VISITED_POS) f |= SCF_WHILEM_VISITED_POS; - + if (trie->jump[word]) { if (!nextbranch) nextbranch = trie_node + trie->jump[0]; scan= trie_node + trie->jump[word]; /* We go from the jump point to the branch that follows - it. Note this means we need the vestigal unused branches - even though they arent otherwise used. - */ - minnext = study_chunk(pRExC_state, &scan, minlenp, - &deltanext, (regnode *)nextbranch, &data_fake, - stopparen, recursed, NULL, f,depth+1); + it. Note this means we need the vestigal unused + branches even though they arent otherwise used. */ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, (regnode *)nextbranch, &data_fake, + stopparen, recursed_depth, NULL, f,depth+1); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) nextbranch= regnext((regnode*)nextbranch); - - if (min1 > (I32)(minnext + trie->minlen)) + + if (min1 > (SSize_t)(minnext + trie->minlen)) min1 = minnext + trie->minlen; - if (deltanext == I32_MAX) { + if (deltanext == SSize_t_MAX) { is_inf = is_inf_internal = 1; - max1 = I32_MAX; - } else if (max1 < (I32)(minnext + deltanext + trie->maxlen)) + max1 = SSize_t_MAX; + } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) max1 = minnext + deltanext + trie->maxlen; - + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; if (data_fake.flags & SCF_SEEN_ACCEPT) { - if ( stopmin > min + min1) + if ( stopmin > min + min1) stopmin = min + min1; flags &= ~SCF_DO_SUBSTR; if (data) @@ -4672,7 +5290,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); } } if (flags & SCF_DO_SUBSTR) { @@ -4684,28 +5302,25 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); if (min1) { - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - cl_and(data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, - struct regnode_charclass_class); + StructCopy(&accum, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); } } scan= tail; @@ -4715,19 +5330,20 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext else if (PL_regkind[OP(scan)] == TRIE) { reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; U8*bang=NULL; - + min += trie->minlen; delta += (trie->maxlen - trie->minlen); flags &= ~SCF_DO_STCLASS; /* xxx */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += trie->minlen; data->pos_delta += (trie->maxlen - trie->minlen); if (trie->maxlen != trie->minlen) data->longest = &(data->longest_float); } if (trie->jump) /* no more substrings -- for now /grr*/ - flags &= ~SCF_DO_SUBSTR; + flags &= ~SCF_DO_SUBSTR; } #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ @@ -4735,10 +5351,24 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext /* Else: zero-length, ignore. */ scan = regnext(scan); } + /* If we are exiting a recursion we can unset its recursed bit + * and allow ourselves to enter it again - no danger of an + * infinite loop there. + if (stopparen > -1 && recursed) { + DEBUG_STUDYDATA("unset:", data,depth); + PAREN_UNSET( recursed, stopparen); + } + */ if (frame) { + DEBUG_STUDYDATA("frame-end:",data,depth); + DEBUG_PEEP("fend", scan, depth); + /* restore previous context */ last = frame->last; scan = frame->next; stopparen = frame->stop; + recursed_depth = frame->prev_recursed_depth; + depth = depth - 1; + frame = frame->prev; goto fake_study_recurse; } @@ -4748,9 +5378,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext DEBUG_STUDYDATA("pre-fin:",data,depth); *scanp = scan; - *deltap = is_inf_internal ? I32_MAX : delta; + *deltap = is_inf_internal ? SSize_t_MAX : delta; + if (flags & SCF_DO_SUBSTR && is_inf) - data->pos_delta = I32_MAX - data->pos_min; + data->pos_delta = SSize_t_MAX - data->pos_min; if (is_par > (I32)U8_MAX) is_par = 0; if (is_par && pars==1 && data) { @@ -4762,17 +5393,25 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->flags &= ~SF_IN_PAR; } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); if (flags & SCF_TRIE_RESTUDY) data->flags |= SCF_TRIE_RESTUDY; - + DEBUG_STUDYDATA("post-fin:",data,depth); - - return min < stopmin ? min : stopmin; + + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { + RExC_maxlen = final_minlen + delta; + } + return final_minlen; + } + /* not-reached */ } STATIC U32 -S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) +S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) { U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; @@ -4833,7 +5472,7 @@ Perl_reginitcolors(pTHX) } STMT_END #else #define CHECK_RESTUDY_GOTO_butfirst -#endif +#endif /* * pregcomp - compile a regular expression into internal code @@ -4842,7 +5481,7 @@ Perl_reginitcolors(pTHX) * scope */ -#ifndef PERL_IN_XSUB_RE +#ifndef PERL_IN_XSUB_RE /* return the currently in-scope regex engine (or the default if none) */ @@ -4855,7 +5494,7 @@ Perl_current_re_engine(pTHX) HV * const table = GvHV(PL_hintgv); SV **ptr; - if (!table) + if (!table || !(PL_hints & HINT_LOCALIZE_HH)) return &PL_core_reg_engine; ptr = hv_fetchs(table, "regcomp", FALSE); if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) @@ -4936,12 +5575,11 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, Newx(dst, *plen_p * 2 + 1, U8); while (s < *plen_p) { - const UV uv = NATIVE_TO_ASCII(src[s]); - if (UNI_IS_INVARIANT(uv)) - dst[d] = (U8)UTF_TO_NATIVE(uv); + if (NATIVE_BYTE_IS_INVARIANT(src[s])) + dst[d] = src[s]; else { - dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv); - dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv); + dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); + dst[d] = UTF8_EIGHT_BIT_LO(src[s]); } if (n < num_code_blocks) { if (!do_end && pRExC_state->code_blocks[n].start == s) { @@ -5005,6 +5643,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, STRLEN orig_patlen = 0; bool code = 0; SV *msv = use_delim ? delim : *svp; + if (!msv) msv = &PL_sv_undef; /* if we've got a delimiter, we go round the loop twice for each * svp slot (except the last), using the delimiter the second @@ -5023,21 +5662,21 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, * The code in this block is based on S_pushav() */ AV *const av = (AV*)msv; - const I32 maxarg = AvFILL(av) + 1; + const SSize_t maxarg = AvFILL(av) + 1; SV **array; if (oplist) { assert(oplist->op_type == OP_PADAV - || oplist->op_type == OP_RV2AV); + || oplist->op_type == OP_RV2AV); oplist = oplist->op_sibling;; } if (SvRMAGICAL(av)) { - U32 i; + SSize_t i; Newx(array, maxarg, SV*); SAVEFREEPV(array); - for (i=0; i < (U32)maxarg; i++) { + for (i=0; i < maxarg; i++) { SV ** const svp = av_fetch(av, i, FALSE); array[i] = svp ? *svp : &PL_sv_undef; } @@ -5327,7 +5966,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, { Safefree(pRExC_state->code_blocks); /* use croak_sv ? */ - Perl_croak_nocontext("%s", SvPV_nolen_const(errsv)); + Perl_croak_nocontext("%"SVf, SVfARG(errsv)); } } assert(SvROK(qr_ref)); @@ -5418,20 +6057,24 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, STATIC bool -S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol) +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, + SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, + SSize_t lookbehind, SSize_t offset, SSize_t *minlen, + STRLEN longest_length, bool eol, bool meol) { /* This is the common code for setting up the floating and fixed length * string data extracted from Perl_re_op_compile() below. Returns a boolean * as to whether succeeded or not */ - I32 t,ml; + I32 t; + SSize_t ml; if (! (longest_length || (eol /* Can't have SEOL and MULTI */ && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) ) - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - || (RExC_seen & REG_SEEN_EXACTF_SHARP_S)) + /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ + || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) { return FALSE; } @@ -5449,7 +6092,7 @@ S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, S follow this item. We calculate it ahead of time as once the lookbehind offset is added in we lose the ability to correctly calculate it.*/ - ml = minlen ? *(minlen) : (I32)longest_length; + ml = minlen ? *(minlen) : (SSize_t)longest_length; *rx_end_shift = ml - offset - longest_length + (SvTAIL(sv_longest) != 0) + lookbehind; @@ -5518,7 +6161,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, char *exp; regnode *scan; I32 flags; - I32 minlen = 0; + SSize_t minlen = 0; U32 rx_flags; SV *pat; SV *code_blocksv = NULL; @@ -5537,10 +6180,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, scan_data_t data; RExC_state_t RExC_state; RExC_state_t * const pRExC_state = &RExC_state; -#ifdef TRIE_STUDY_OPT +#ifdef TRIE_STUDY_OPT int restudied = 0; RExC_state_t copyRExC_state; -#endif +#endif GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_OP_COMPILE; @@ -5552,61 +6195,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, * having to test them each time otherwise */ if (! PL_AboveLatin1) { PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); - PL_ASCII = _new_invlist_C_array(ASCII_invlist); PL_Latin1 = _new_invlist_C_array(Latin1_invlist); - - PL_L1Posix_ptrs[_CC_ALPHANUMERIC] - = _new_invlist_C_array(L1PosixAlnum_invlist); - PL_Posix_ptrs[_CC_ALPHANUMERIC] - = _new_invlist_C_array(PosixAlnum_invlist); - - PL_L1Posix_ptrs[_CC_ALPHA] - = _new_invlist_C_array(L1PosixAlpha_invlist); - PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist); - - PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist); - PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist); - - /* Cased is the same as Alpha in the ASCII range */ - PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist); - PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist); - - PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist); - PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist); - - PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); - PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); - - PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist); - PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist); - - PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist); - PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist); - - PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist); - PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist); - - PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist); - PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist); - - PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist); - PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist); - PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist); - PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist); - - PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist); - PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist); - - PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist); - - PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist); - PL_L1Posix_ptrs[_CC_WORDCHAR] - = _new_invlist_C_array(L1PosixWord_invlist); - - PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist); - PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist); - - PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist); + PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); + PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); + PL_HasMultiCharFold = + _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); } #endif @@ -5722,6 +6315,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); RExC_uni_semantics = 0; RExC_contains_locale = 0; + RExC_contains_i = 0; pRExC_state->runtime_code_qr = NULL; DEBUG_COMPILE_r({ @@ -5743,11 +6337,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); /* return old regex if pattern hasn't changed */ - /* XXX: note in the below we have to check the flags as well as the pattern. + /* XXX: note in the below we have to check the flags as well as the + * pattern. * - * Things get a touch tricky as we have to compare the utf8 flag independently - * from the compile flags. - */ + * Things get a touch tricky as we have to compare the utf8 flag + * independently from the compile flags. */ if ( old_re && !recompile @@ -5764,10 +6358,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, rx_flags = orig_rx_flags; - if (initial_charset == REGEX_LOCALE_CHARSET) { - RExC_contains_locale = 1; + if (rx_flags & PMf_FOLD) { + RExC_contains_i = 1; } - else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ @@ -5795,6 +6389,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_sawback = 0; RExC_seen = 0; + RExC_maxlen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; @@ -5809,7 +6404,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_npar = 1; RExC_nestroot = 0; RExC_size = 0L; - RExC_emit = &RExC_emit_dummy; + RExC_emit = (regnode *) &RExC_emit_dummy; RExC_whilem_seen = 0; RExC_open_parens = NULL; RExC_close_parens = NULL; @@ -5819,6 +6414,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_paren_name_list = NULL; #endif RExC_recurse = NULL; + RExC_study_chunk_recursed = NULL; + RExC_study_chunk_recursed_bytes= 0; RExC_recurse_count = 0; pRExC_state->code_index = 0; @@ -5862,12 +6459,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ DEBUG_PARSE_r({ - PerlIO_printf(Perl_debug_log, + PerlIO_printf(Perl_debug_log, "Required size %"IVdf" nodes\n" - "Starting second pass (creation)\n", + "Starting second pass (creation)\n", (IV)RExC_size); - RExC_lastnum=0; - RExC_lastparse=NULL; + RExC_lastnum=0; + RExC_lastparse=NULL; }); /* The first pass could have found things that force Unicode semantics */ @@ -5886,8 +6483,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (RExC_whilem_seen > 15) RExC_whilem_seen = 15; - /* Allocate space and zero-initialize. Note, the two step process - of zeroing when in debug mode, thus anything assigned has to + /* Allocate space and zero-initialize. Note, the two step process + of zeroing when in debug mode, thus anything assigned has to happen after that */ rx = (REGEXP*) newSV_type(SVt_REGEXP); r = ReANY(rx); @@ -5897,10 +6494,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, FAIL("Regexp out of space"); #ifdef DEBUGGING /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ - Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char); -#else + Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char); +#else /* bulk initialize base fields with 0. */ - Zero(ri, sizeof(regexp_internal), char); + Zero(ri, sizeof(regexp_internal), char); #endif /* non-zero initialization begins here */ @@ -5924,14 +6522,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); - bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET); + bool has_charset = (get_regex_charset(r->extflags) + != REGEX_DEPENDS_CHARSET); /* The caret is output if there are any defaults: if not all the STD * flags are set, or if no character set specifier is needed */ bool has_default = (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) || ! has_charset); - bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) + == REG_RUN_ON_COMMENT_SEEN); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ @@ -5992,13 +6592,24 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ - - if (RExC_seen & REG_SEEN_RECURSE) { + + /* setup various meta data about recursion, this all requires + * RExC_npar to be correctly set, and a bit later on we clear it */ + if (RExC_seen & REG_RECURSE_SEEN) { Newxz(RExC_open_parens, RExC_npar,regnode *); SAVEFREEPV(RExC_open_parens); Newxz(RExC_close_parens,RExC_npar,regnode *); SAVEFREEPV(RExC_close_parens); } + if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { + /* Note, RExC_npar is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + + ((RExC_npar & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } /* Useful during FAIL. */ #ifdef RE_TRACK_PATTERN_OFFSETS @@ -6027,7 +6638,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, REGC((U8)REG_MAGIC, (char*) RExC_emit++); if (reg(pRExC_state, 0, &flags,1) == NULL) { - ReREFCNT_dec(rx); + ReREFCNT_dec(rx); Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); } /* XXXX To minimize changes to RE engine we always allocate @@ -6041,6 +6652,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, reStudy: r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; Zero(r->substrs, 1, struct reg_substr_data); + if (RExC_study_chunk_recursed) + Zero(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); #ifdef TRIE_STUDY_OPT if (!restudied) { @@ -6049,22 +6663,22 @@ reStudy: } else { U32 seen=RExC_seen; DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); - + RExC_state = copyRExC_state; - if (seen & REG_TOP_LEVEL_BRANCHES) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; else - RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; StructCopy(&zero_scan_data, &data, scan_data_t); } #else StructCopy(&zero_scan_data, &data, scan_data_t); -#endif +#endif /* Dig out information for optimizations. */ r->extflags = RExC_flags; /* was pm_op */ /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ - + if (UTF) SvUTF8_on(rx); /* Unicode in it? */ ri->regstclass = NULL; @@ -6074,20 +6688,21 @@ reStudy: /* testing for BRANCH here tells us whether there is "must appear" data in the pattern. If there is then we can use it for optimisations */ - if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */ - I32 fake; + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. + */ + SSize_t fake; STRLEN longest_float_length, longest_fixed_length; - struct regnode_charclass_class ch_class; /* pointed to by data */ + regnode_ssc ch_class; /* pointed to by data */ int stclass_flag; - I32 last_close = 0; /* pointed to by data */ + SSize_t last_close = 0; /* pointed to by data */ regnode *first= scan; regnode *first_next= regnext(first); /* * Skip introductions and multiplicators >= 1 - * so that we can extract the 'meat' of the pattern that must + * so that we can extract the 'meat' of the pattern that must * match in the large if() sequence following. * NOTE that EXACT is NOT covered here, as it is normally - * picked up by the optimiser separately. + * picked up by the optimiser separately. * * This is unfortunate as the optimiser isnt handling lookahead * properly currently. @@ -6104,7 +6719,7 @@ reStudy: (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) { - /* + /* * the only op that could be a regnode is PLUS, all the rest * will be regnode_1 or regnode_2. * @@ -6133,7 +6748,7 @@ reStudy: } #ifdef TRIE_STCLASS else if (PL_regkind[OP(first)] == TRIE && - ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) + ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) { regnode *trie_op; /* this can happen only on restudy */ @@ -6159,35 +6774,35 @@ reStudy: PL_regkind[OP(first)] == NBOUND) ri->regstclass = first; else if (PL_regkind[OP(first)] == BOL) { - r->extflags |= (OP(first) == MBOL - ? RXf_ANCH_MBOL + r->intflags |= (OP(first) == MBOL + ? PREGf_ANCH_MBOL : (OP(first) == SBOL - ? RXf_ANCH_SBOL - : RXf_ANCH_BOL)); + ? PREGf_ANCH_SBOL + : PREGf_ANCH_BOL)); first = NEXTOPER(first); goto again; } else if (OP(first) == GPOS) { - r->extflags |= RXf_ANCH_GPOS; + r->intflags |= PREGf_ANCH_GPOS; first = NEXTOPER(first); goto again; } else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks) + !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) { /* turn .* into ^.* with an implied $*=1 */ const int type = (OP(NEXTOPER(first)) == REG_ANY) - ? RXf_ANCH_MBOL - : RXf_ANCH_SBOL; - r->extflags |= type; - r->intflags |= PREGf_IMPLICIT; + ? PREGf_ANCH_MBOL + : PREGf_ANCH_SBOL; + r->intflags |= (type | PREGf_IMPLICIT); first = NEXTOPER(first); goto again; } - if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback) + if (sawplus && !sawminmod && !sawlookahead + && (!sawopen || !RExC_sawback) && !pRExC_state->num_code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -6229,15 +6844,17 @@ reStudy: SAVEFREESV(data.last_found); first = scan; if (!ri->regstclass) { - cl_init(pRExC_state, &ch_class); + ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; stclass_flag = SCF_DO_STCLASS_AND; } else /* XXXX Check for BOUND? */ stclass_flag = 0; data.last_closep = &last_close; - - minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ - &data, -1, NULL, NULL, + + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, + scan + RExC_size, /* Up to end */ + &data, -1, 0, NULL, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), 0); @@ -6249,9 +6866,11 @@ reStudy: if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen - && !(RExC_seen & REG_SEEN_VERBARG) - && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) + && !(RExC_seen & REG_VERBARG_SEEN) + && !(RExC_seen & REG_GPOS_SEEN) + ){ r->extflags |= RXf_CHECK_ALL; + } scan_commit(pRExC_state, &data,&minlen,0); longest_float_length = CHR_SVLEN(data.longest_float); @@ -6273,7 +6892,7 @@ reStudy: { r->float_min_offset = data.offset_float_min - data.lookbehind_float; r->float_max_offset = data.offset_float_max; - if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ + if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ r->float_max_offset -= data.lookbehind_float; SvREFCNT_inc_simple_void_NN(data.longest_float); } @@ -6311,36 +6930,40 @@ reStudy: if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag - && ! TEST_SSC_EOS(data.start_class) - && !cl_is_anything(data.start_class)) + && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && !ssc_is_anything(data.start_class)) { - const U32 n = add_data(pRExC_state, 1, "f"); - OP(data.start_class) = ANYOF_SYNTHETIC; + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); - Newx(RExC_rxi->data->data[n], 1, - struct regnode_charclass_class); + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); StructCopy(data.start_class, - (struct regnode_charclass_class*)RExC_rxi->data->data[n], - struct regnode_charclass_class); + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); + data.start_class = NULL; } - /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ + /* A temporary algorithm prefers floated substr to fixed one to dig + * more info. */ if (longest_fixed_length > longest_float_length) { + r->substrs->check_ix = 0; r->check_end_shift = r->anchored_end_shift; r->check_substr = r->anchored_substr; r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; - if (r->extflags & RXf_ANCH_SINGLE) - r->extflags |= RXf_NOSCAN; + if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) + r->intflags |= PREGf_NOSCAN; } else { + r->substrs->check_ix = 1; r->check_end_shift = r->float_end_shift; r->check_substr = r->float_substr; r->check_utf8 = r->float_utf8; @@ -6352,83 +6975,96 @@ reStudy: if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->extflags |= RXf_INTUIT_TAIL; } + r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) if ( (STRLEN)minlen < longest_float_length ) minlen= longest_float_length; if ( (STRLEN)minlen < longest_fixed_length ) - minlen= longest_fixed_length; + minlen= longest_fixed_length; */ } else { /* Several toplevels. Best we can is to set minlen. */ - I32 fake; - struct regnode_charclass_class ch_class; - I32 last_close = 0; + SSize_t fake; + regnode_ssc ch_class; + SSize_t last_close = 0; DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); scan = ri->program + 1; - cl_init(pRExC_state, &ch_class); + ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; data.last_closep = &last_close; - - minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, - &data, -1, NULL, NULL, - SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS - |(restudied ? SCF_TRIE_DOING_RESTUDY : 0), + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, + &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, + SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied + ? SCF_TRIE_DOING_RESTUDY + : 0), 0); - + CHECK_RESTUDY_GOTO_butfirst(NOOP); r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - if (! TEST_SSC_EOS(data.start_class) - && !cl_is_anything(data.start_class)) - { - const U32 n = add_data(pRExC_state, 1, "f"); - OP(data.start_class) = ANYOF_SYNTHETIC; + if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && ! ssc_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); - Newx(RExC_rxi->data->data[n], 1, - struct regnode_charclass_class); + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); StructCopy(data.start_class, - (struct regnode_charclass_class*)RExC_rxi->data->data[n], - struct regnode_charclass_class); + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); + data.start_class = NULL; } } + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + r->maxlen = REG_INFTY; + } + else { + r->maxlen = RExC_maxlen; + } + /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", - (IV)minlen, (IV)r->minlen); + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); }); r->minlenret = minlen; - if (r->minlen < minlen) + if (r->minlen < minlen) r->minlen = minlen; - - if (RExC_seen & REG_SEEN_GPOS) - r->extflags |= RXf_GPOS_SEEN; - if (RExC_seen & REG_SEEN_LOOKBEHIND) - r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */ + + if (RExC_seen & REG_GPOS_SEEN) + r->intflags |= PREGf_GPOS_SEEN; + if (RExC_seen & REG_LOOKBEHIND_SEEN) + r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the + lookbehind */ if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; - if (RExC_seen & REG_SEEN_CANY) - r->extflags |= RXf_CANY_SEEN; - if (RExC_seen & REG_SEEN_VERBARG) + if (RExC_seen & REG_CANY_SEEN) + r->intflags |= PREGf_CANY_SEEN; + if (RExC_seen & REG_VERBARG_SEEN) { r->intflags |= PREGf_VERBARG_SEEN; r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ } - if (RExC_seen & REG_SEEN_CUTGROUP) + if (RExC_seen & REG_CUTGROUP_SEEN) r->intflags |= PREGf_CUTGROUP_SEEN; if (pm_flags & PMf_USE_RE_EVAL) r->intflags |= PREGf_USE_RE_EVAL; @@ -6437,7 +7073,20 @@ reStudy: else RXp_PAREN_NAMES(r) = NULL; + /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED + * so it can be used in pp.c */ + if (r->intflags & PREGf_ANCH) + r->extflags |= RXf_IS_ANCHORED; + + { + /* this is used to identify "special" patterns that might result + * in Perl NOT calling the regex engine and instead doing the match "itself", + * particularly special cases in split//. By having the regex compiler + * do this pattern matching at a regop level (instead of by inspecting the pattern) + * we avoid weird issues with equivalent patterns resulting in different behavior, + * AND we allow non Perl engines to get the same optimizations by the setting the + * flags appropriately - Yves */ regnode *first = ri->program + 1; U8 fop = OP(first); regnode *next = NEXTOPER(first); @@ -6447,16 +7096,28 @@ reStudy: r->extflags |= RXf_NULL; else if (PL_regkind[fop] == BOL && nop == END) r->extflags |= RXf_START_ONLY; - else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END) + else if (fop == PLUS + && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE + && OP(regnext(first)) == END) r->extflags |= RXf_WHITE; - else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END ) + else if ( r->extflags & RXf_SPLIT + && fop == EXACT + && STR_LEN(first) == 1 + && *(STRING(first)) == ' ' + && OP(regnext(first)) == END ) r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); } + + if (RExC_contains_locale) { + RXp_EXTFLAGS(r) |= RXf_TAINTED; + } + #ifdef DEBUGGING if (RExC_paren_names) { - ri->name_list_idx = add_data( pRExC_state, 1, "a" ); - ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); + ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); + ri->data->data[ri->name_list_idx] + = (void*)SvREFCNT_inc(RExC_paren_name_list); } else #endif ri->name_list_idx = 0; @@ -6471,15 +7132,17 @@ reStudy: /* assume we don't need to swap parens around before we match */ DEBUG_DUMP_r({ + DEBUG_RExC_seen(); PerlIO_printf(Perl_debug_log,"Final program:\n"); regdump(r); }); #ifdef RE_TRACK_PATTERN_OFFSETS DEBUG_OFFSETS_r(if (ri->u.offsets) { - const U32 len = ri->u.offsets[0]; - U32 i; + const STRLEN len = ri->u.offsets[0]; + STRLEN i; GET_RE_DEBUG_FLAGS_DECL; - PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); + PerlIO_printf(Perl_debug_log, + "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", @@ -6539,7 +7202,8 @@ Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, else if (flags & RXapif_NEXTKEY) return reg_named_buff_nextkey(rx, flags); else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", + (int)flags); return NULL; } } @@ -6665,7 +7329,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) { SV *ret; AV *av; - I32 length; + SSize_t length; struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; @@ -6676,11 +7340,12 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) } else if (flags & RXapif_ONE) { ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); av = MUTABLE_AV(SvRV(ret)); - length = av_len(av); + length = av_tindex(av); SvREFCNT_dec_NN(ret); return newSViv(length + 1); } else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", + (int)flags); return NULL; } } @@ -6728,12 +7393,12 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, { struct regexp *const rx = ReANY(r); char *s = NULL; - I32 i = 0; - I32 s1, t1; + SSize_t i = 0; + SSize_t s1, t1; I32 n = paren; PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - + if ( n == RX_BUFF_IDX_CARET_PREMATCH || n == RX_BUFF_IDX_CARET_FULLMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH @@ -6766,14 +7431,14 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, i = rx->offs[0].start; s = rx->subbeg; } - else + else if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) && rx->offs[0].end != -1) { /* $', ${^POSTMATCH} */ s = rx->subbeg - rx->suboffset + rx->offs[0].end; i = rx->sublen + rx->suboffset - rx->offs[0].end; - } + } else if ( 0 <= n && n <= (I32)rx->nparens && (s1 = rx->offs[n].start) != -1 && @@ -6784,12 +7449,12 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, s = rx->subbeg + s1 - rx->suboffset; } else { goto ret_undef; - } + } assert(s >= rx->subbeg); - assert(rx->sublen >= (s - rx->subbeg) + i ); + assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); if (i >= 0) { -#if NO_TAINT_SUPPORT +#ifdef NO_TAINT_SUPPORT sv_setpvn(sv, s, i); #else const int oldtainted = TAINT_get; @@ -6797,7 +7462,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, sv_setpvn(sv, s, i); TAINT_set(oldtainted); #endif - if ( (rx->extflags & RXf_CANY_SEEN) + if ( (rx->intflags & PREGf_CANY_SEEN) ? (RXp_MATCH_UTF8(rx) && (!i || is_utf8_string((U8*)s, i))) : (RXp_MATCH_UTF8(rx)) ) @@ -6822,7 +7487,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, TAINT; SvTAINT(sv); } - } else + } else SvTAINTED_off(sv); } } else { @@ -6957,7 +7622,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) PERL_ARGS_ASSERT_REG_SCAN_NAME; - if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + assert (RExC_parse <= RExC_end); + if (RExC_parse == RExC_end) NOOP; + else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { /* skip IDFIRST by using do...while */ if (UTF) do { @@ -6968,7 +7635,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) RExC_parse++; } while (isWORDCHAR(*RExC_parse)); } else { - RExC_parse++; /* so the <- from the vFAIL is after the offending character */ + RExC_parse++; /* so the <- from the vFAIL is after the offending + character */ vFAIL("Group name must start with a non-digit word character"); } if ( flags ) { @@ -7235,10 +7903,9 @@ Perl__new_invlist(pTHX_ IV initial_size) return new_list; } -#endif -STATIC SV* -S__new_invlist_C_array(pTHX_ const UV* const list) +SV* +Perl__new_invlist_C_array(pTHX_ const UV* const list) { /* Return a pointer to a newly constructed inversion list, initialized to * point to , which has to be in the exact correct inversion list @@ -7288,8 +7955,11 @@ S__new_invlist_C_array(pTHX_ const UV* const list) /* Initialize the iteration pointer. */ invlist_iterfinish(invlist); + SvREADONLY_on(invlist); + return invlist; } +#endif /* ifndef PERL_IN_XSUB_RE */ STATIC void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) @@ -7317,10 +7987,9 @@ S_invlist_trim(pTHX_ SV* const invlist) SvPV_shrink_to_cur((SV *) invlist); } -#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) - STATIC void -S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) +S__append_range_to_invlist(pTHX_ SV* const invlist, + const UV start, const UV end) { /* Subject to change or removal. Append the range from 'start' to 'end' at * the end of the inversion list. The range must be above any existing @@ -7350,8 +8019,8 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) { Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", - array[final_element], start, - ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); + array[final_element], start, + ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); } /* Here, it is a legal append. If the new range begins with the first @@ -7494,7 +8163,8 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) } void -Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch) +Perl__invlist_populate_swatch(pTHX_ SV* const invlist, + const UV start, const UV end, U8* swatch) { /* populates a swatch of a swash the same way swatch_get() does in utf8.c, * but is used when the swash has an inversion list. This makes this much @@ -7587,14 +8257,16 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV } void -Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output) +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** output) { /* Take the union of two inversion lists and point to it. *output * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. The first list, - * , may be NULL, in which case a copy of the second list is returned. - * If is TRUE, the union is taken of the complement - * (inversion) of instead of b itself. + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *output will be made correspondingly + * mortal. The first list, , may be NULL, in which case a copy of the + * second list is returned. If is TRUE, the union is taken + * of the complement (inversion) of instead of b itself. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -7635,9 +8307,13 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b /* If either one is empty, the union is the other one */ if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + bool make_temp = FALSE; /* Should we mortalize the result? */ + if (*output == a) { if (a != NULL) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } } if (*output != b) { @@ -7646,18 +8322,27 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b _invlist_invert(*output); } } /* else *output already = b; */ + + if (make_temp) { + sv_2mortal(*output); + } return; } else if ((len_b = _invlist_len(b)) == 0) { + bool make_temp = FALSE; if (*output == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } /* The complement of an empty list is a list that has everything in it, * so the union with includes everything too */ if (complement_b) { if (a == *output) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } *output = _new_invlist(1); _append_range_to_invlist(*output, 0, UV_MAX); @@ -7666,6 +8351,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b *output = invlist_clone(a); } /* else *output already = a; */ + + if (make_temp) { + sv_2mortal(*output); + } return; } @@ -7805,24 +8494,36 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b } } - /* We may be removing a reference to one of the inputs */ + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ if (a == *output || b == *output) { assert(! invlist_is_iterating(*output)); - SvREFCNT_dec_NN(*output); + if ((SvTEMP(*output))) { + sv_2mortal(u); + } + else { + SvREFCNT_dec_NN(*output); + } } *output = u; + return; } void -Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i) +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** i) { /* Take the intersection of two inversion lists and point to it. *i * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. - * If is TRUE, the result will be the intersection of - * and the complement (or inversion) of instead of directly. + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *i will be made correspondingly mortal. + * The first list, , may be NULL, in which case an empty list is + * returned. If is TRUE, the result will be the + * intersection of and the complement (or inversion) of instead of + * directly. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -7860,6 +8561,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Special case if either one is empty */ len_a = (a == NULL) ? 0 : _invlist_len(a); if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { + bool make_temp = FALSE; if (len_a != 0 && complement_b) { @@ -7869,24 +8571,38 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * simply 'a'. */ if (*i != a) { if (*i == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } *i = invlist_clone(a); } /* else *i is already 'a' */ + + if (make_temp) { + sv_2mortal(*i); + } return; } /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The * intersection must be empty */ if (*i == a) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } else if (*i == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } *i = _new_invlist(0); + if (make_temp) { + sv_2mortal(*i); + } + return; } @@ -7991,7 +8707,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } /* The final length is what we've output so far plus what else is in the - * intersection. At most one of the subexpressions below will be non-zero */ + * intersection. At most one of the subexpressions below will be non-zero + * */ len_r = i_r; if (count >= 2) { len_r += (len_a - i_a) + (len_b - i_b); @@ -8016,13 +8733,21 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* We may be removing a reference to one of the inputs */ + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ if (a == *i || b == *i) { assert(! invlist_is_iterating(*i)); - SvREFCNT_dec_NN(*i); + if (SvTEMP(*i)) { + sv_2mortal(r); + } + else { + SvREFCNT_dec_NN(*i); + } } *i = r; + return; } @@ -8069,6 +8794,35 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) return invlist; } +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, + UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + _append_range_to_invlist(invlist, element0, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; +} + #endif PERL_STATIC_INLINE SV* @@ -8097,43 +8851,6 @@ Perl__invlist_invert(pTHX_ SV* const invlist) *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); } -void -Perl__invlist_invert_prop(pTHX_ SV* const invlist) -{ - /* Complement the input inversion list (which must be a Unicode property, - * all of which don't match above the Unicode maximum code point.) And - * Perl has chosen to not have the inversion match above that either. This - * adds a 0x110000 if the list didn't end with it, and removes it if it did - */ - - UV len; - UV* array; - - PERL_ARGS_ASSERT__INVLIST_INVERT_PROP; - - _invlist_invert(invlist); - - len = _invlist_len(invlist); - - if (len != 0) { /* If empty do nothing */ - array = invlist_array(invlist); - if (array[len - 1] != PERL_UNICODE_MAX + 1) { - /* Add 0x110000. First, grow if necessary */ - len++; - if (invlist_max(invlist) < len) { - invlist_extend(invlist, len); - array = invlist_array(invlist); - } - invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist)); - array[len - 1] = PERL_UNICODE_MAX + 1; - } - else { /* Remove the 0x110000 */ - invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist)); - } - } - - return; -} #endif PERL_STATIC_INLINE SV* @@ -8141,7 +8858,7 @@ S_invlist_clone(pTHX_ SV* const invlist) { /* Return a new inversion list that is a copy of the input one, which is - * unchanged */ + * unchanged. The new list will not be mortal even if the old one was. */ /* Need to allocate extra space to accommodate Perl's addition of a * trailing NUL to SvPV's, since it thinks they are always strings */ @@ -8303,7 +9020,8 @@ Perl__invlist_contents(pTHX_ SV* const invlist) #ifndef PERL_IN_XSUB_RE void -Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist) +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, + const char * const indent, SV* const invlist) { /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by @@ -8422,7 +9140,7 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) /* End of inversion list object */ STATIC void -S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) +S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) { /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' * constructs, and updates RExC_flags with them. On input, RExC_parse @@ -8482,7 +9200,6 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) } cs = REGEX_LOCALE_CHARSET; has_charset_modifier = LOCALE_PAT_MOD; - RExC_contains_locale = 1; break; case UNICODE_PAT_MOD: if (has_charset_modifier) { @@ -8536,7 +9253,8 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); } else if (has_charset_modifier == *(RExC_parse - 1)) { - vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1)); + vFAIL2("Regexp modifier \"%c\" may not appear twice", + *(RExC_parse - 1)); } else { vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); @@ -8544,12 +9262,15 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) /*NOTREACHED*/ neg_modifier: RExC_parse++; - vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1)); + vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", + *(RExC_parse - 1)); /*NOTREACHED*/ case ONCE_PAT_MOD: /* 'o' */ case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { - const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; + const I32 wflagbit = *RExC_parse == 'o' + ? WASTED_O + : WASTED_G; if (! (wastedflags & wflagbit) ) { wastedflags |= wflagbit; /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ @@ -8602,13 +9323,17 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) RExC_flags |= posflags; RExC_flags &= ~negflags; set_regex_charset(&RExC_flags, cs); + if (RExC_flags & RXf_PMf_FOLD) { + RExC_contains_i = 1; + } return; /*NOTREACHED*/ default: fail_modifiers: - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", - RExC_parse-seqstart, seqstart); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); /*NOTREACHED*/ } @@ -8683,7 +9408,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) char *start_arg = NULL; unsigned char op = 0; int argok = 1; - int internal_argval = 0; /* internal_argval is only useful if !argok */ + int internal_argval = 0; /* internal_argval is only useful if + !argok */ if (has_intervening_patws && SIZE_ONLY) { ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated"); @@ -8699,9 +9425,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) verb_len = RExC_parse - start_verb; if ( start_arg ) { RExC_parse++; - while ( *RExC_parse && *RExC_parse != ')' ) + while ( *RExC_parse && *RExC_parse != ')' ) RExC_parse++; - if ( *RExC_parse != ')' ) + if ( *RExC_parse != ')' ) vFAIL("Unterminated verb pattern argument"); if ( RExC_parse == start_arg ) start_arg = NULL; @@ -8709,7 +9435,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if ( *RExC_parse != ')' ) vFAIL("Unterminated verb pattern"); } - + switch ( *start_verb ) { case 'A': /* (*ACCEPT) */ if ( memEQs(start_verb,verb_len,"ACCEPT") ) { @@ -8738,48 +9464,51 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if ( memEQs(start_verb,verb_len,"PRUNE") ) op = PRUNE; break; - case 'S': /* (*SKIP) */ - if ( memEQs(start_verb,verb_len,"SKIP") ) + case 'S': /* (*SKIP) */ + if ( memEQs(start_verb,verb_len,"SKIP") ) op = SKIP; break; case 'T': /* (*THEN) */ /* [19:06] :: is then */ if ( memEQs(start_verb,verb_len,"THEN") ) { op = CUTGROUP; - RExC_seen |= REG_SEEN_CUTGROUP; + RExC_seen |= REG_CUTGROUP_SEEN; } break; } if ( ! op ) { - RExC_parse++; - vFAIL3("Unknown verb pattern '%.*s'", - verb_len, start_verb); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL2utf8f( + "Unknown verb pattern '%"UTF8f"'", + UTF8fARG(UTF, verb_len, start_verb)); } if ( argok ) { if ( start_arg && internal_argval ) { vFAIL3("Verb pattern '%.*s' may not have an argument", - verb_len, start_verb); + verb_len, start_verb); } else if ( argok < 0 && !start_arg ) { vFAIL3("Verb pattern '%.*s' has a mandatory argument", - verb_len, start_verb); + verb_len, start_verb); } else { ret = reganode(pRExC_state, op, internal_argval); if ( ! internal_argval && ! SIZE_ONLY ) { if (start_arg) { - SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); - ARG(ret) = add_data( pRExC_state, 1, "S" ); + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); RExC_rxi->data->data[ARG(ret)]=(void*)sv; ret->flags = 0; } else { - ret->flags = 1; + ret->flags = 1; } - } + } } if (!internal_argval) - RExC_seen |= REG_SEEN_VERBARG; + RExC_seen |= REG_VERBARG_SEEN; } else if ( start_arg ) { vFAIL3("Verb pattern '%.*s' may not have an argument", - verb_len, start_verb); + verb_len, start_verb); } else { ret = reg_node(pRExC_state, op); } @@ -8806,17 +9535,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) goto named_recursion; } else if (paren == '=') { /* (?P=...) named backref */ - /* this pretty much dupes the code for \k in regatom(), if - you change this make sure you change that */ + /* this pretty much dupes the code for \k in + * regatom(), if you change this make sure you change that + * */ char* name_start = RExC_parse; U32 num = 0; SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); if (RExC_parse == name_start || *RExC_parse != ')') + /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.3s... not terminated",parse_start); if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -8841,12 +9572,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL3("Sequence (%.*s...) not recognized", + RExC_parse-seqstart, seqstart); /*NOTREACHED*/ case '<': /* (?<...) */ if (*RExC_parse == '!') paren = ','; - else if (*RExC_parse != '=') + else if (*RExC_parse != '=') named_capture: { /* (?<...>) */ char *name_start; @@ -8855,15 +9588,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '\'': /* (?'...') */ name_start= RExC_parse; svname = reg_scan_name(pRExC_state, - SIZE_ONLY ? /* reverse test from the others */ - REG_RSN_RETURN_NAME : - REG_RSN_RETURN_NULL); - if (RExC_parse == name_start) { - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); - /*NOTREACHED*/ - } - if (*RExC_parse != paren) + SIZE_ONLY /* reverse test from the others */ + ? REG_RSN_RETURN_NAME + : REG_RSN_RETURN_NULL); + if (RExC_parse == name_start || *RExC_parse != paren) vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); if (SIZE_ONLY) { @@ -8903,20 +9631,24 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } if ( count ) { - pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); + pv = (I32*)SvGROW(sv_dat, + SvCUR(sv_dat) + sizeof(I32)+1); SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); pv[count] = RExC_npar; SvIV_set(sv_dat, SvIVX(sv_dat) + 1); } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); - sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); + sv_setpvn(sv_dat, (char *)&(RExC_npar), + sizeof(I32)); SvIOK_on(sv_dat); SvIV_set(sv_dat, 1); } #ifdef DEBUGGING - /* Yes this does cause a memory leak in debugging Perls */ - if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) + /* Yes this does cause a memory leak in debugging Perls + * */ + if (!av_store(RExC_paren_name_list, + RExC_npar, SvREFCNT_inc(svname))) SvREFCNT_dec_NN(svname); #endif @@ -8926,7 +9658,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) paren = 1; goto capturing_parens; } - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; case '=': /* (?=...) */ @@ -8943,7 +9675,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '|': /* (?|...) */ /* branch reset, behave like a (?:...) except that buffers in alternations share the same numbers */ - paren = ':'; + paren = ':'; after_freeze = freeze_paren = RExC_npar; break; case ':': /* (?:...) */ @@ -8970,6 +9702,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); + RExC_seen |= REG_GOSTART_SEEN; *flagp |= POSTPONED; nextchar(pRExC_state); return ret; @@ -8984,6 +9717,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } + if (RExC_parse == RExC_end || *RExC_parse != ')') + vFAIL("Sequence (?&... not terminated"); goto gen_recurse_regop; assert(0); /* NOT REACHED */ case '+': @@ -8997,7 +9732,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { RExC_parse--; /* rewind to let it be handled later */ goto parse_flags; - } + } /*FALLTHROUGH */ case '1': case '2': case '3': case '4': /* (?1) */ case '5': case '6': case '7': case '8': case '9': @@ -9009,7 +9744,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; while (isDIGIT(*RExC_parse)) RExC_parse++; - if (*RExC_parse!=')') + if (*RExC_parse!=')') vFAIL("Expecting close bracket"); gen_recurse_regop: @@ -9043,11 +9778,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ARG2L_SET( ret, RExC_recurse_count++); RExC_emit++; DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); + "Recurse #%"UVuf" to %"IVdf"\n", + (UV)ARG(ret), (IV)ARG2L(ret))); } else { RExC_size++; } - RExC_seen |= REG_SEEN_RECURSE; + RExC_seen |= REG_RECURSE_SEEN; Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ @@ -9061,7 +9797,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) is_logical = 1; if (*RExC_parse != '{') { RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f( + "Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); /*NOTREACHED*/ } *flagp |= POSTPONED; @@ -9090,14 +9829,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { OP *o = cb->block; if (cb->src_regex) { - n = add_data(pRExC_state, 2, "rl"); + n = add_data(pRExC_state, STR_WITH_LEN("rl")); RExC_rxi->data->data[n] = (void*)SvREFCNT_inc((SV*)cb->src_regex); RExC_rxi->data->data[n+1] = (void*)o; } else { - n = add_data(pRExC_state, 1, - (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); + n = add_data(pRExC_state, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); RExC_rxi->data->data[n] = (void*)o; } } @@ -9135,7 +9874,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reg_node(pRExC_state, LOGICAL); if (!SIZE_ONLY) ret->flags = 1; - + tail = reg(pRExC_state, 1, &flag, depth+1); if (flag & RESTART_UTF8) { *flagp = RESTART_UTF8; @@ -9158,7 +9897,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) (ch == '>' ? '<' : ch)); RExC_parse++; if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -9188,15 +9927,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV *sv_dat; RExC_parse++; sv_dat = reg_scan_name(pRExC_state, - SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + SIZE_ONLY + ? REG_RSN_RETURN_NULL + : REG_RSN_RETURN_DATA); parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } - ret = reganode(pRExC_state,INSUBP,parno); + ret = reganode(pRExC_state,INSUBP,parno); goto insert_if_check_paren; } else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { /* (?(1)...) */ char c; + char *tmp; parno = atoi(RExC_parse++); while (isDIGIT(*RExC_parse)) @@ -9204,8 +9946,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, GROUPP, parno); insert_if_check_paren: - if ((c = *nextchar(pRExC_state)) != ')') + if (*(tmp = nextchar(pRExC_state)) != ')') { + /* nextchar also skips comments, so undo its work + * and skip over the the next character. + */ + RExC_parse = tmp; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; vFAIL("Switch condition not recognized"); + } insert_if: REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); br = regbranch(pRExC_state, &flags, 1,depth+1); @@ -9217,14 +9965,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); } else - REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); + REGTAIL(pRExC_state, br, reganode(pRExC_state, + LONGJMP, 0)); c = *nextchar(pRExC_state); if (flags&HASWIDTH) *flagp |= HASWIDTH; if (c == '|') { - if (is_define) + if (is_define) vFAIL("(?(DEFINE)....) does not allow branches"); - lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ + + /* Fake one for optimizer. */ + lastbr = reganode(pRExC_state, IFTHEN, 0); + if (!regbranch(pRExC_state, &flags, 1,depth+1)) { if (flags & RESTART_UTF8) { *flagp = RESTART_UTF8; @@ -9256,7 +10008,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } else { - vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unknown switch condition (?(...))"); } } case '[': /* (?[ ... ]) */ @@ -9285,16 +10038,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) capturing_parens: parno = RExC_npar; RExC_npar++; - + ret = reganode(pRExC_state, OPEN, parno); if (!SIZE_ONLY ){ - if (!RExC_nestroot) + if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE + if (RExC_seen & REG_RECURSE_SEEN && !RExC_open_parens[parno-1]) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Setting open paren #%"IVdf" to %d\n", + "Setting open paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ret))); RExC_open_parens[parno-1]= ret; } @@ -9306,7 +10059,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else /* ! paren */ ret = NULL; - + parse_rest: /* Pick up the branches, linking them together. */ parse_start = RExC_parse; /* MJD */ @@ -9347,7 +10100,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { ender = reganode(pRExC_state, LONGJMP,0); - REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ + + /* Append to the previous. */ + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); } if (SIZE_ONLY) RExC_extralen += 2; /* Account for LONGJMP. */ @@ -9355,7 +10110,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (freeze_paren) { if (RExC_npar > after_freeze) after_freeze = RExC_npar; - RExC_npar = freeze_paren; + RExC_npar = freeze_paren; } br = regbranch(pRExC_state, &flags, 0, depth+1); @@ -9379,14 +10134,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case 1: case 2: ender = reganode(pRExC_state, CLOSE, parno); - if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { + if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Setting close paren #%"IVdf" to %d\n", + "Setting close paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ender))); RExC_close_parens[parno-1]= ender; - if (RExC_nestroot == parno) + if (RExC_nestroot == parno) RExC_nestroot = 0; - } + } Set_Node_Offset(ender,RExC_parse+1); /* MJD */ Set_Node_Length(ender,1); /* MJD */ break; @@ -9411,8 +10166,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("lsbr"); - regprop(RExC_rx, mysv_val1, lastbr); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, lastbr, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(lastbr), @@ -9426,20 +10181,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (have_branch && !SIZE_ONLY) { char is_nothing= 1; if (depth==1) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; /* Hook the tails of the branches to the closing node. */ for (br = ret; br; br = regnext(br)) { const U8 op = PL_regkind[OP(br)]; if (op == BRANCH) { REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); - if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender) + if ( OP(NEXTOPER(br)) != NOTHING + || regnext(NEXTOPER(br)) != ender) is_nothing= 0; } else if (op == BRANCHJ) { REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); /* for now we always disable this optimisation * / - if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender) + if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING + || regnext(NEXTOPER(NEXTOPER(br))) != ender) */ is_nothing= 0; } @@ -9450,8 +10207,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("NADA"); - regprop(RExC_rx, mysv_val1, ret); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, ret, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(ret), @@ -9693,6 +10450,19 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, OPFAIL); return ret; } + else if (min == max + && RExC_parse < RExC_end + && (*RExC_parse == '?' || *RExC_parse == '+')) + { + if (SIZE_ONLY) { + ckWARN2reg(RExC_parse + 1, + "Useless use of greediness modifier '%c'", + *RExC_parse); + } + /* Absorb the modifier, so later code doesn't see nor use + * it */ + nextchar(pRExC_state); + } do_curly: if ((flags&SIMPLE)) { @@ -9734,6 +10504,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ARG1_SET(ret, (U16)min); ARG2_SET(ret, (U16)max); } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; goto nest_check; } @@ -9771,6 +10543,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, STAR, ret, depth+1); ret->flags = 0; RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '*') { min = 0; @@ -9780,6 +10553,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, PLUS, ret, depth+1); ret->flags = 0; RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '+') { min = 1; @@ -9792,10 +10566,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) nest_check: if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ - ckWARN3reg(RExC_parse, - "%.*s matches null string many times", - (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), - origparse); + ckWARN2reg(RExC_parse, + "%"UTF8f" matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); (void)ReREFCNT_inc(RExC_rx_sv); } @@ -9825,11 +10601,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } STATIC bool -S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, - const bool strict /* Apply stricter parsing rules? */ +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, + UV *valuep, I32 *flagp, U32 depth, bool in_char_class, + const bool strict /* Apply stricter parsing rules? */ ) { - + /* This is expected to be called by a parser routine that has recognized '\N' and needs to handle the rest. RExC_parse is expected to point at the first char following the N at the time of the call. On successful return, @@ -9884,7 +10661,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I more than one character */ GET_RE_DEBUG_FLAGS_DECL; - + PERL_ARGS_ASSERT_GROK_BSLASH_N; GET_RE_DEBUG_FLAGS; @@ -9892,7 +10669,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ /* The [^\n] meaning of \N ignores spaces and comments under the /x - * modifier. The other meaning does not */ + * modifier. The other meaning does not, so use a temporary until we find + * out which we are being called with */ p = (RExC_flags & RXf_PMf_EXTENDED) ? regwhite( pRExC_state, RExC_parse ) : RExC_parse; @@ -9902,17 +10680,18 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I if (*p != '{' || regcurly(p, FALSE)) { RExC_parse = p; if (! node_p) { - /* no bare \N in a charclass */ + /* no bare \N allowed in a charclass */ if (in_char_class) { vFAIL("\\N in a character class must be a named character: \\N{...}"); } return FALSE; } + RExC_parse--; /* Need to back off so nextchar() doesn't skip the + current char */ nextchar(pRExC_state); *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; - RExC_parse--; Set_Node_Length(*node_p, 1); /* MJD */ return TRUE; } @@ -9931,8 +10710,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ || ! (endbrace == RExC_parse /* nothing between the {} */ - || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */ - && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below + */ + && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) + */ { if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ vFAIL("\\N{NAME} must be resolved by the lexer"); @@ -10070,7 +10851,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I } FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", (UV) flags); - } + } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); RExC_parse = endbrace; @@ -10138,7 +10919,9 @@ S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) } PERL_STATIC_INLINE void -S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point) +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, + regnode *node, I32* flagp, STRLEN len, UV code_point, + bool downgradable) { /* This knows the details about sizing an EXACTish node, setting flags for * it (by setting <*flagp>, and potentially populating it with a single @@ -10153,48 +10936,111 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 * If is zero, the function assumes that the node is to contain only * the single character given by and calculates what * should be. In pass 1, it sizes the node appropriately. In pass 2, it - * additionally will populate the node's STRING with , if - * is 0. In both cases <*flagp> is appropriately set + * additionally will populate the node's STRING with or its + * fold if folding. + * + * In both cases <*flagp> is appropriately set * * It knows that under FOLD, the Latin Sharp S and UTF characters above * 255, must be folded (the former only when the rules indicate it can - * match 'ss') */ + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ bool len_passed_in = cBOOL(len != 0); U8 character[UTF8_MAXBYTES_CASE+1]; PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + /* Don't bother to check for downgrading in PASS1, as it doesn't make any + * sizing difference, and is extra work that is thrown away */ + if (downgradable && ! PASS2) { + downgradable = FALSE; + } + if (! len_passed_in) { if (UTF) { - if (FOLD && (! LOC || code_point > 255)) { - _to_uni_fold_flags(NATIVE_TO_UNI(code_point), + if (UNI_IS_INVARIANT(code_point)) { + if (LOC || ! FOLD) { /* /l defers folding until runtime */ + *character = (U8) code_point; + } + else { /* Here is /i and not /l (toFOLD() is defined on just + ASCII, which isn't the same thing as INVARIANT on + EBCDIC, but it works there, as the extra invariants + fold to themselves) */ + *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } + } + len = 1; + } + else if (FOLD && (! LOC + || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) + { /* Folding, and ok to do so now */ + UV folded = _to_uni_fold_flags( + code_point, character, &len, - FOLD_FLAGS_FULL | ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } + } + else if (code_point <= MAX_UTF8_TWO_BYTE) { + + /* Not folding this cp, and can output it directly */ + *character = UTF8_TWO_BYTE_HI(code_point); + *(character + 1) = UTF8_TWO_BYTE_LO(code_point); + len = 2; } else { uvchr_to_utf8( character, code_point); len = UTF8SKIP(character); } - } - else if (! FOLD - || code_point != LATIN_SMALL_LETTER_SHARP_S - || ASCII_FOLD_RESTRICTED - || ! AT_LEAST_UNI_SEMANTICS) - { + } /* Else pattern isn't UTF8. */ + else if (! FOLD) { *character = (U8) code_point; len = 1; - } - else { + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { *character = 's'; *(character + 1) = 's'; len = 2; } + else { + *character = LATIN_SMALL_LETTER_SHARP_S; + len = 1; + } } if (SIZE_ONLY) { @@ -10218,8 +11064,29 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 { *flagp |= SIMPLE; } + + /* The OP may not be well defined in PASS1 */ + if (PASS2 && OP(node) == EXACTFL) { + RExC_contains_locale = 1; + } +} + + +/* return atoi(p), unless it's too big to sensibly be a backref, + * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ + +static I32 +S_backref_value(char *p) +{ + char *q = p; + + for (;isDIGIT(*q); q++); /* calculate length of num */ + if (q - p == 0 || q - p > 9) + return I32_MAX; + return atoi(p); } + /* - regatom - the lowest level @@ -10280,7 +11147,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 by the other. Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with - TRYAGAIN. + TRYAGAIN. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be restarted. Otherwise does not return NULL. @@ -10377,7 +11244,8 @@ tryagain: *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", + (UV) flags); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; @@ -10425,7 +11293,7 @@ tryagain: goto finish_meta_pat; case 'G': ret = reg_node(pRExC_state, GPOS); - RExC_seen |= REG_SEEN_GPOS; + RExC_seen |= REG_GPOS_SEEN; *flagp |= SIMPLE; goto finish_meta_pat; case 'K': @@ -10436,7 +11304,7 @@ tryagain: * be necessary here to avoid cases of memory corruption, as * with: C<$_="x" x 80; s/x\K/y/> -- rgs */ - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -10450,7 +11318,7 @@ tryagain: goto finish_meta_pat; case 'C': ret = reg_node(pRExC_state, CANY); - RExC_seen |= REG_SEEN_CANY; + RExC_seen |= REG_CANY_SEEN; *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'X': @@ -10467,30 +11335,38 @@ tryagain: case 'b': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = BOUND + get_regex_charset(RExC_flags); if (op > BOUNDA) { /* /aa is same as /a */ op = BOUNDA; } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); } goto finish_meta_pat; case 'B': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = NBOUND + get_regex_charset(RExC_flags); if (op > NBOUNDA) { /* /aa is same as /a */ op = NBOUNDA; } + else if (op == NBOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); } goto finish_meta_pat; @@ -10534,6 +11410,9 @@ tryagain: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } + else if (op == POSIXL) { + RExC_contains_locale = 1; + } join_posix_op_known: @@ -10549,10 +11428,10 @@ tryagain: *flagp |= HASWIDTH|SIMPLE; /* FALL THROUGH */ - finish_meta_pat: + finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ - break; + break; case 'p': case 'P': { @@ -10582,7 +11461,7 @@ tryagain: nextchar(pRExC_state); } break; - case 'N': + case 'N': /* Handle \N and \N{NAME} with multiple code points here and not * below because it can be multicharacter. join_exact() will join * them up later on. Also this makes sure that things like @@ -10604,10 +11483,11 @@ tryagain: break; case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: - { - char ch= RExC_parse[1]; + { + char ch= RExC_parse[1]; if (ch != '<' && ch != '\'' && ch != '{') { RExC_parse++; + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.2s... not terminated",parse_start); } else { /* this pretty much dupes the code for (?P=...) in reg(), if @@ -10618,10 +11498,11 @@ tryagain: SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; if (RExC_parse == name_start || *RExC_parse != ch) + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.3s... not terminated",parse_start); if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -10648,15 +11529,16 @@ tryagain: } break; } - case 'g': + case 'g': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { I32 num; - bool isg = *RExC_parse == 'g'; - bool isrel = 0; bool hasbrace = 0; - if (isg) { + + if (*RExC_parse == 'g') { + bool isrel = 0; + RExC_parse++; if (*RExC_parse == '{') { RExC_parse++; @@ -10668,37 +11550,52 @@ tryagain: } if (hasbrace && !isDIGIT(*RExC_parse)) { if (isrel) RExC_parse--; - RExC_parse -= 2; + RExC_parse -= 2; goto parse_named_seq; - } } - num = atoi(RExC_parse); - if (isg && num == 0) { - if (*RExC_parse == '0') { + } + + num = S_backref_value(RExC_parse); + if (num == 0) vFAIL("Reference to invalid group 0"); + else if (num == I32_MAX) { + if (isDIGIT(*RExC_parse)) + vFAIL("Reference to nonexistent group"); + else + vFAIL("Unterminated \\g... pattern"); } - else { - vFAIL("Unterminated \\g... pattern"); + + if (isrel) { + num = RExC_npar - num; + if (num < 1) + vFAIL("Reference to nonexistent or unclosed group"); } } - if (isrel) { - num = RExC_npar - num; - if (num < 1) - vFAIL("Reference to nonexistent or unclosed group"); + else { + num = S_backref_value(RExC_parse); + /* bare \NNN might be backref or octal - if it is larger than or equal + * RExC_npar then it is assumed to be and octal escape. + * Note RExC_npar is +1 from the actual number of parens*/ + if (num == I32_MAX || (num > 9 && num >= RExC_npar + && *RExC_parse != '8' && *RExC_parse != '9')) + { + /* Probably a character specified in octal, e.g. \35 */ + goto defchar; + } } - if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9') - /* Probably a character specified in octal, e.g. \35 */ - goto defchar; - else { + + /* at this point RExC_parse definitely points to a backref + * number */ + { #ifdef RE_TRACK_PATTERN_OFFSETS char * const parse_start = RExC_parse - 1; /* MJD */ #endif while (isDIGIT(*RExC_parse)) RExC_parse++; if (hasbrace) { - if (*RExC_parse != '}') + if (*RExC_parse != '}') vFAIL("Unterminated \\g{...} pattern"); RExC_parse++; - } + } if (!SIZE_ONLY) { if (num > (I32)RExC_rx->nparens) vFAIL("Reference to nonexistent group"); @@ -10759,7 +11656,6 @@ tryagain: char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; char *s0; U8 upper_parse = MAX_NODE_STRING_SIZE; - STRLEN foldlen; U8 node_type = compute_EXACTish(pRExC_state); bool next_is_quantifier; char * oldp = NULL; @@ -10767,9 +11663,15 @@ tryagain: /* We can convert EXACTF nodes to EXACTFU if they contain only * characters that match identically regardless of the target * string's UTF8ness. The reason to do this is that EXACTF is not - * trie-able, EXACTFU is. (We don't need to figure this out until - * pass 2) */ - bool maybe_exactfu = node_type == EXACTF && PASS2; + * trie-able, EXACTFU is. + * + * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * contain only above-Latin1 characters (hence must be in UTF8), + * which don't participate in folds with Latin1-range characters, + * as the latter's folds aren't known until runtime. (We don't + * need to figure this out until pass 2) */ + bool maybe_exactfu = PASS2 + && (node_type == EXACTF || node_type == EXACTFL); /* If a folding node contains only code points that don't * participate in folds, it can be changed into an EXACT node, @@ -10786,10 +11688,9 @@ tryagain: reparse: - /* We do the EXACTFish to EXACT node only if folding, and not if in - * locale, as whether a character folds or not isn't known until - * runtime. (And we don't need to figure this out until pass 2) */ - maybe_exact = FOLD && ! LOC && PASS2; + /* We do the EXACTFish to EXACT node only if folding. (And we + * don't need to figure this out until pass 2) */ + maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -10854,7 +11755,8 @@ tryagain: case 's': case 'S': /* space class */ case 'v': case 'V': /* VERTWS */ case 'w': case 'W': /* word class */ - case 'X': /* eXtended Unicode "combining character sequence" */ + case 'X': /* eXtended Unicode "combining + character sequence" */ case 'z': case 'Z': /* End of line/string assertion */ --p; goto loopdone; @@ -10902,7 +11804,7 @@ tryagain: p++; break; case 'a': - ender = ASCII_TO_NATIVE('\007'); + ender = '\a'; p++; break; case 'o': @@ -10965,24 +11867,30 @@ tryagain: } case 'c': p++; - ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); + ender = grok_bslash_c(*p++, SIZE_ONLY); break; case '8': case '9': /* must be a backreference */ --p; goto loopdone; case '1': case '2': case '3':case '4': case '5': case '6': case '7': - /* When we parse backslash escapes there is ambiguity between - * backreferences and octal escapes. Any escape from \1 - \9 is - * a backreference, any multi-digit escape which does not start with - * 0 and which when evaluated as decimal could refer to an already - * parsed capture buffer is a backslash. Anything else is octal. + /* When we parse backslash escapes there is ambiguity + * between backreferences and octal escapes. Any escape + * from \1 - \9 is a backreference, any multi-digit + * escape which does not start with 0 and which when + * evaluated as decimal could refer to an already + * parsed capture buffer is a backslash. Anything else + * is octal. * - * Note this implies that \118 could be interpreted as 118 OR as - * "\11" . "8" depending on whether there were 118 capture buffers - * defined already in the pattern. - */ - if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar ) + * Note this implies that \118 could be interpreted as + * 118 OR as "\11" . "8" depending on whether there + * were 118 capture buffers defined already in the + * pattern. */ + + /* NOTE, RExC_npar is 1 more than the actual number of + * parens we have seen so far, hence the < RExC_npar below. */ + + if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) { /* Not to be treated as an octal constant, go find backref */ --p; @@ -11039,7 +11947,7 @@ tryagain: if (! SIZE_ONLY && RExC_flags & RXf_PMf_EXTENDED && ckWARN_d(WARN_DEPRECATED) - && is_PATWS_non_low(p, UTF)) + && is_PATWS_non_low_safe(p, RExC_end, UTF)) { vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), "Escape literal pattern white space under /x"); @@ -11077,7 +11985,10 @@ tryagain: goto loopdone; } - if (! FOLD) { + if (! FOLD /* The simple case, just append the literal */ + || (LOC /* Also don't fold for tricky chars under /l */ + && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) + { if (UTF) { const STRLEN unilen = reguni(pRExC_state, ender, s); if (unilen > 0) { @@ -11095,14 +12006,27 @@ tryagain: else { REGC((char)ender, s++); } + + /* Can get here if folding only if is one of the /l + * characters whose fold depends on the locale. The + * occurrence of any of these indicate that we can't + * simplify things */ + if (FOLD) { + maybe_exact = FALSE; + maybe_exactfu = FALSE; + } } - else /* FOLD */ + else /* FOLD */ if (! ( UTF /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S))) { + /* Here, are folding and are not UTF-8 encoded; therefore + * the character must be in the range 0-255, and is not /l + * (Not /l because we already handled these under /l in + * is_PROBLEMATIC_LOCALE_FOLD_cp */ if (IS_IN_SOME_FOLD_L1(ender)) { maybe_exact = FALSE; @@ -11114,77 +12038,68 @@ tryagain: || ender == LATIN_SMALL_LETTER_SHARP_S || (len > 0 && isARG2_lower_or_UPPER_ARG1('s', ender) - && isARG2_lower_or_UPPER_ARG1('s', *(s-1))))) + && isARG2_lower_or_UPPER_ARG1('s', + *(s-1))))) { maybe_exactfu = FALSE; } } + + /* Even when folding, we store just the input character, as + * we have an array that finds its fold quickly */ *(s++) = (char) ender; } - else { /* UTF */ - - /* Prime the casefolded buffer. Locale rules, which apply - * only to code points < 256, aren't known until execution, - * so for them, just output the original character using - * utf8. If we start to fold non-UTF patterns, be sure to - * update join_exact() */ - if (LOC && ender < 256) { - if (UNI_IS_INVARIANT(ender)) { - *s = (U8) ender; - foldlen = 1; - } else { - *s = UTF8_TWO_BYTE_HI(ender); - *(s + 1) = UTF8_TWO_BYTE_LO(ender); - foldlen = 2; - } + else { /* FOLD and UTF */ + /* Unlike the non-fold case, we do actually have to + * calculate the results here in pass 1. This is for two + * reasons, the folded length may be longer than the + * unfolded, and we have to calculate how many EXACTish + * nodes it will take; and we may run out of room in a node + * in the middle of a potential multi-char fold, and have + * to back off accordingly. (Hence we can't use REGC for + * the simple case just below.) */ + + UV folded; + if (isASCII(ender)) { + folded = toFOLD(ender); + *(s)++ = (U8) folded; } else { - UV folded = _to_uni_fold_flags( - ender, - (U8 *) s, - &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) - ); - - /* If this node only contains non-folding code points - * so far, see if this new one is also non-folding */ - if (maybe_exact) { - if (folded != ender) { + STRLEN foldlen; + + folded = _to_uni_fold_flags( + ender, + (U8 *) s, + &foldlen, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + s += foldlen; + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len += foldlen - 1; + } + /* If this node only contains non-folding code points so + * far, see if this new one is also non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have to check + * further to see if anything folds to it */ + if (_invlist_contains_cp(PL_utf8_foldable, + ender)) + { maybe_exact = FALSE; } - else { - /* Here the fold is the original; we have - * to check further to see if anything - * folds to it */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", - "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = - _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } - if (_invlist_contains_cp(PL_utf8_foldable, - ender)) - { - maybe_exact = FALSE; - } - } } - ender = folded; } - s += foldlen; - - /* The loop increments each time, as all but this - * path (and one other) through it add a single byte to the - * EXACTish node. But this one has changed len to be the - * correct final value, so subtract one to cancel out the - * increment that follows */ - len += foldlen - 1; + ender = folded; } if (next_is_quantifier) { @@ -11233,9 +12148,8 @@ tryagain: if (! UTF) { - /* These two have no multi-char folds to non-UTF characters - */ - if (ASCII_FOLD_RESTRICTED || LOC) { + /* This has no multi-char folds to non-UTF characters */ + if (ASCII_FOLD_RESTRICTED) { goto loopdone; } @@ -11266,12 +12180,8 @@ tryagain: } } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - - /* No Latin1 characters participate in multi-char - * folds under /l */ - if (LOC - || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI( - *s, *(s+1)))) + if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + *s, *(s+1)))) { break; } @@ -11381,7 +12291,7 @@ tryagain: * code points in the node that participate in folds; * similarly for 'maybe_exactfu' and code points that match * differently depending on UTF8ness of the target string - * */ + * (for /u), or depending on locale for /l */ if (maybe_exact) { OP(ret) = EXACT; } @@ -11389,7 +12299,12 @@ tryagain: OP(ret) = EXACTFU; } } - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); } RExC_parse = p - 1; @@ -11428,7 +12343,7 @@ S_regwhite( RExC_state_t *pRExC_state, char *p ) } } while (p < e); if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; } else break; @@ -11442,7 +12357,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) /* Returns the next non-pattern-white space, non-comment character (the * latter only if 'recognize_comment is true) in the string p, which is * ended by RExC_end. If there is no line break ending a comment, - * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */ + * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */ const char *e = RExC_end; PERL_ARGS_ASSERT_REGPATWS; @@ -11462,7 +12377,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) } } while (p < e); if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; } else break; @@ -11470,6 +12385,72 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) return p; } +STATIC void +S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) +{ + /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It + * sets up the bitmap and any flags, removing those code points from the + * inversion list, setting it to NULL should it become completely empty */ + + PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; + assert(PL_regkind[OP(node)] == ANYOF); + + ANYOF_BITMAP_ZERO(node); + if (*invlist_ptr) { + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through *invlist_ptr */ + invlist_iterinit(*invlist_ptr); + while (invlist_iternext(*invlist_ptr, &start, &end)) { + UV high; + int i; + + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; + } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } + + /* Quit if are above what we should change */ + if (start > 255) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < 255) ? end : 255; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(node, i)) { + ANYOF_BITMAP_SET(node, i); + } + } + } + invlist_iterfinish(*invlist_ptr); + + /* Done with loop; remove any code points that are in the bitmap from + * *invlist_ptr; similarly for code points above latin1 if we have a + * flag to match all of them anyways */ + if (change_invlist) { + _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + } + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + } + + /* If have completely emptied it, remove it completely */ + if (_invlist_len(*invlist_ptr) == 0) { + SvREFCNT_dec_NN(*invlist_ptr); + *invlist_ptr = NULL; + } + } +} + /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. Character classes ([:foo:]) can also be negated ([:^foo:]). Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. @@ -11588,8 +12569,9 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) } if (namedclass == OOB_NAMEDCLASS) - Simple_vFAIL3("POSIX class [:%.*s:] unknown", - t - s - 1, s + 1); + vFAIL2utf8f( + "POSIX class [:%"UTF8f":] unknown", + UTF8fARG(UTF, t - s - 1, s + 1)); /* The #defines are structured so each complement is +1 to * the normal one */ @@ -11677,8 +12659,9 @@ S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) } STATIC regnode * -S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth, - char * const oregcomp_parse) +S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, + I32 *flagp, U32 depth, + char * const oregcomp_parse) { /* Handle the (?[...]) construct to do set operations */ @@ -11714,7 +12697,10 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__REGEX_SETS), "The regex_sets feature is experimental" REPORT_LOCATION, - (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse); + UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), + UTF8fARG(UTF, + RExC_end - RExC_start - (RExC_parse - RExC_precomp), + RExC_precomp + (RExC_parse - RExC_precomp))); while (RExC_parse < RExC_end) { SV* current = NULL; @@ -12168,7 +13154,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ -#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len) +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ + (SvCUR(listsv) != initial_listsv_len) STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, @@ -12218,8 +13205,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ SV* properties = NULL; /* Code points that match \p{} \P{} */ - SV* posixes = NULL; /* Code points that match classes like, [:word:], - extended beyond the Latin1 range */ + SV* posixes = NULL; /* Code points that match classes like [:word:], + extended beyond the Latin1 range. These have to + be kept separate from other code points for much + of this function because their handling is + different under /i, and for most classes under + /d as well */ + SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept + separate for a while from the non-complemented + versions because of complications with /d + matching */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ AV * multi_char_matches = NULL; /* Code points that fold to more than one @@ -12246,11 +13241,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * string is in UTF-8. (Because is under /d) */ SV* depends_list = NULL; - /* inversion list of code points this node matches. For much of the - * function, it includes only those that match regardless of the utf8ness - * of the target string */ + /* Inversion list of code points this node matches regardless of things + * like locale, folding, utf8ness of the target string */ SV* cp_list = NULL; + /* Like cp_list, but code points on this list need to be checked for things + * that fold to/from them under /i */ + SV* cp_foldable_list = NULL; + + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + #ifdef EBCDIC /* In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ @@ -12258,14 +13260,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, #endif bool invert = FALSE; /* Is this class to be complemented */ - /* Is there any thing like \W or [:^digit:] that matches above the legal - * Unicode range? */ - bool runtime_posix_matches_above_Unicode = FALSE; + bool warn_super = ALWAYS_WARN_SUPER; regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; - const I32 orig_size = RExC_size; + const SSize_t orig_size = RExC_size; + bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -12286,9 +13287,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ANYOF_FLAGS(ret) = 0; RExC_emit += ANYOF_SKIP; - if (LOC) { - ANYOF_FLAGS(ret) |= ANYOF_LOCALE; - } listsv = newSVpvs_flags("# comment\n", SVs_TEMP); initial_listsv_len = SvCUR(listsv); SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ @@ -12410,7 +13408,7 @@ parseit: case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { - /* We only pay attention to the first char of + /* We only pay attention to the first char of multichar strings being returned. I kinda wonder if this makes sense as it does change the behaviour from earlier versions, OTOH that behaviour was broken @@ -12431,7 +13429,12 @@ parseit: char *e; /* We will handle any undefined properties ourselves */ - U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF; + U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF + /* And we actually would prefer to get + * the straight inversion list of the + * swash, since we will be accessing it + * anyway, to save a little time */ + |_CORE_SWASH_INIT_ACCEPT_INVLIST; if (RExC_parse >= RExC_end) vFAIL2("Empty \\%c{}", (U8)value); @@ -12454,6 +13457,7 @@ parseit: } if (!SIZE_ONLY) { SV* invlist; + char* formatted; char* name; if (UCHARAT(RExC_parse) == '^') { @@ -12474,14 +13478,14 @@ parseit: * will have its name be <__NAME_i>. The design is * discussed in commit * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ - Newx(name, n + sizeof("_i__\n"), char); - - sprintf(name, "%s%.*s%s\n", - (FOLD) ? "__" : "", - (int)n, - RExC_parse, - (FOLD) ? "_i" : "" - ); + formatted = Perl_form(aTHX_ + "%s%.*s%s\n", + (FOLD) ? "__" : "", + (int)n, + RExC_parse, + (FOLD) ? "_i" : "" + ); + name = savepvn(formatted, strlen(formatted)); /* Look up the property name, and get its swash and * inversion list, if the property is found */ @@ -12506,11 +13510,13 @@ parseit: * otherwise add it to the list for run-time look up */ if (ret_invlist) { RExC_parse = e + 1; - vFAIL3("Property '%.*s' is unknown", (int) n, name); + vFAIL2utf8f( + "Property '%"UTF8f"' is unknown", + UTF8fARG(UTF, n, name)); } - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", (value == 'p' ? '+' : '!'), - name); + UTF8fARG(UTF, n, name)); has_user_defined_property = TRUE; /* We don't know yet, so have to assume that the @@ -12519,7 +13525,7 @@ parseit: * would cause things in to match * inappropriately, except that any \p{}, including * this one forces Unicode semantics, which means there - * is */ + * is no */ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; } else { @@ -12527,9 +13533,23 @@ parseit: /* Here, did get the swash and its inversion list. If * the swash is from a user-defined property, then this * whole character class should be regarded as such */ - has_user_defined_property = - (swash_init_flags - & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY); + if (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) + { + has_user_defined_property = TRUE; + } + else if + /* We warn on matching an above-Unicode code point + * if the match would return true, except don't + * warn for \p{All}, which has exactly one element + * = 0 */ + (_invlist_contains_cp(invlist, 0x110000) + && (! (_invlist_len(invlist) == 1 + && *invlist_array(invlist) == 0))) + { + warn_super = TRUE; + } + /* Invert if asking for the complement */ if (value == 'P') { @@ -12563,7 +13583,7 @@ parseit: case 'f': value = '\f'; break; case 'b': value = '\b'; break; case 'e': value = ASCII_TO_NATIVE('\033');break; - case 'a': value = ASCII_TO_NATIVE('\007');break; + case 'a': value = '\a'; break; case 'o': RExC_parse--; /* function expects to be pointed at the 'o' */ { @@ -12603,7 +13623,7 @@ parseit: goto recode_encoding; break; case 'c': - value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY); + value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': @@ -12675,31 +13695,8 @@ parseit: /* Here, we have the current token in 'value' */ - /* What matches in a locale is not known until runtime. This includes - * what the Posix classes (like \w, [:space:]) match. Room must be - * reserved (one time per class) to store such classes, either if Perl - * is compiled so that locale nodes always should have this space, or - * if there is such class info to be stored. The space will contain a - * bit for each named class that is to be matched against. This isn't - * needed for \p{} and pseudo-classes, as they are not affected by - * locale, and hence are dealt with separately */ - if (LOC - && ! need_class - && (ANYOF_LOCALE == ANYOF_CLASS - || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX))) - { - need_class = 1; - if (SIZE_ONLY) { - RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP; - } - else { - RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP; - ANYOF_CLASS_ZERO(ret); - } - ANYOF_FLAGS(ret) |= ANYOF_CLASS; - } - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + U8 classnum; /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a * literal, as is the character that began the false range, i.e. @@ -12710,16 +13707,19 @@ parseit: ? RExC_parse - rangebegin : 0; if (strict) { - vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin); + vFAIL2utf8f( + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); } else { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ - ckWARN4reg(RExC_parse, - "False [] range \"%*.*s\"", - w, w, rangebegin); + ckWARN2reg(RExC_parse, + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); (void)ReREFCNT_inc(RExC_rx_sv); cp_list = add_cp_to_invlist(cp_list, '-'); - cp_list = add_cp_to_invlist(cp_list, prevvalue); + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, + prevvalue); } } @@ -12727,16 +13727,89 @@ parseit: element_count += 2; /* So counts for three values */ } - if (! SIZE_ONLY) { - U8 classnum = namedclass_to_classnum(namedclass); - if (namedclass >= ANYOF_MAX) { /* If a special class */ + classnum = namedclass_to_classnum(namedclass); + + if (LOC && namedclass < ANYOF_POSIXL_MAX +#ifndef HAS_ISASCII + && classnum != _CC_ASCII +#endif + ) { + /* What the Posix classes (like \w, [:space:]) match in locale + * isn't knowable under locale until actual match time. Room + * must be reserved (one time per outer bracketed class) to + * store such classes. The space will contain a bit for each + * named class that is to be matched against. This isn't + * needed for \p{} and pseudo-classes, as they are not affected + * by locale, and hence are dealt with separately */ + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_POSIXL_ZERO(ret); + } + + /* See if it already matches the complement of this POSIX + * class */ + if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) + && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) + ? -1 + : 1))) + { + posixl_matches_all = TRUE; + break; /* No need to continue. Since it matches both + e.g., \w and \W, it matches everything, and the + bracketed class can be optimized into qr/./s */ + } + + /* Add this class to those that should be checked at runtime */ + ANYOF_POSIXL_SET(ret, namedclass); + + /* The above-Latin1 characters are not subject to locale rules. + * Just add them, in the second pass, to the + * unconditionally-matched list */ + if (! SIZE_ONLY) { + SV* scratch_list = NULL; + + /* Get the list of the above-Latin1 code points this + * matches */ + _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, + PL_XPosix_ptrs[classnum], + + /* Odd numbers are complements, like + * NDIGIT, NASCII, ... */ + namedclass % 2 != 0, + &scratch_list); + /* Checking if 'cp_list' is NULL first saves an extra + * clone. Its reference count will be decremented at the + * next union, etc, or if this is the only instance, at the + * end of the routine */ + if (! cp_list) { + cp_list = scratch_list; + } + else { + _invlist_union(cp_list, scratch_list, &cp_list); + SvREFCNT_dec_NN(scratch_list); + } + continue; /* Go get next character */ + } + } + else if (! SIZE_ONLY) { + + /* Here, not in pass1 (in that pass we skip calculating the + * contents of this class), and is /l, or is a POSIX class for + * which /l doesn't matter (or is a Unicode property, which is + * skipped here). */ + if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ - /* Here, should be \h, \H, \v, or \V. Neither /d nor - * /l make a difference in what these match. There - * would be problems if these characters had folds - * other than themselves, as cp_list is subject to - * folding. */ + /* Here, should be \h, \H, \v, or \V. None of /d, /i + * nor /l make a difference in what these match, + * therefore we just add what they match to cp_list. */ if (classnum != _CC_VERTSPACE) { assert( namedclass == ANYOF_HORIZWS || namedclass == ANYOF_NHORIZWS); @@ -12749,246 +13822,25 @@ parseit: _invlist_union_maybe_complement_2nd( cp_list, PL_XPosix_ptrs[classnum], - cBOOL(namedclass % 2), /* Complement if odd + namedclass % 2 != 0, /* Complement if odd (NHORIZWS, NVERTWS) */ &cp_list); } } - else if (classnum == _CC_ASCII) { -#ifdef HAS_ISASCII - if (LOC) { - ANYOF_CLASS_SET(ret, namedclass); - } - else -#endif /* Not isascii(); just use the hard-coded definition for it */ - _invlist_union_maybe_complement_2nd( - posixes, - PL_ASCII, - cBOOL(namedclass % 2), /* Complement if odd - (NASCII) */ - &posixes); - } - else { /* Garden variety class */ - - /* The ascii range inversion list */ - SV* ascii_source = PL_Posix_ptrs[classnum]; - - /* The full Latin1 range inversion list */ - SV* l1_source = PL_L1Posix_ptrs[classnum]; - - /* This code is structured into two major clauses. The - * first is for classes whose complete definitions may not - * already be known. It not, the Latin1 definition - * (guaranteed to already known) is used plus code is - * generated to load the rest at run-time (only if needed). - * If the complete definition is known, it drops down to - * the second clause, where the complete definition is - * known */ - - if (classnum < _FIRST_NON_SWASH_CC) { - - /* Here, the class has a swash, which may or not - * already be loaded */ - - /* The name of the property to use to match the full - * eXtended Unicode range swash for this character - * class */ - const char *Xname = swash_property_names[classnum]; - - /* If returning the inversion list, we can't defer - * getting this until runtime */ - if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) { - PL_utf8_swash_ptrs[classnum] = - _core_swash_init("utf8", Xname, &PL_sv_undef, - 1, /* binary */ - 0, /* not tr/// */ - NULL, /* No inversion list */ - NULL /* No flags */ - ); - assert(PL_utf8_swash_ptrs[classnum]); - } - if ( ! PL_utf8_swash_ptrs[classnum]) { - if (namedclass % 2 == 0) { /* A non-complemented - class */ - /* If not /a matching, there are code points we - * don't know at compile time. Arrange for the - * unknown matches to be loaded at run-time, if - * needed */ - if (! AT_LEAST_ASCII_RESTRICTED) { - Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n", - Xname); - } - if (LOC) { /* Under locale, set run-time - lookup */ - ANYOF_CLASS_SET(ret, namedclass); - } - else { - /* Add the current class's code points to - * the running total */ - _invlist_union(posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : l1_source, - &posixes); - } - } - else { /* A complemented class */ - if (AT_LEAST_ASCII_RESTRICTED) { - /* Under /a should match everything above - * ASCII, plus the complement of the set's - * ASCII matches */ - _invlist_union_complement_2nd(posixes, - ascii_source, - &posixes); - } - else { - /* Arrange for the unknown matches to be - * loaded at run-time, if needed */ - Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n", - Xname); - runtime_posix_matches_above_Unicode = TRUE; - if (LOC) { - ANYOF_CLASS_SET(ret, namedclass); - } - else { - - /* We want to match everything in - * Latin1, except those things that - * l1_source matches */ - SV* scratch_list = NULL; - _invlist_subtract(PL_Latin1, l1_source, - &scratch_list); - - /* Add the list from this class to the - * running total */ - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, - scratch_list, - &posixes); - SvREFCNT_dec_NN(scratch_list); - } - if (DEPENDS_SEMANTICS) { - ANYOF_FLAGS(ret) - |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - } - } - goto namedclass_done; - } - - /* Here, there is a swash loaded for the class. If no - * inversion list for it yet, get it */ - if (! PL_XPosix_ptrs[classnum]) { - PL_XPosix_ptrs[classnum] - = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]); - } - } - - /* Here there is an inversion list already loaded for the - * entire class */ - - if (namedclass % 2 == 0) { /* A non-complemented class, - like ANYOF_PUNCT */ - if (! LOC) { - /* For non-locale, just add it to any existing list - * */ - _invlist_union(posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : PL_XPosix_ptrs[classnum], - &posixes); - } - else { /* Locale */ - SV* scratch_list = NULL; - - /* For above Latin1 code points, we use the full - * Unicode range */ - _invlist_intersection(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - &scratch_list); - /* And set the output to it, adding instead if - * there already is an output. Checking if - * 'posixes' is NULL first saves an extra clone. - * Its reference count will be decremented at the - * next union, etc, or if this is the only - * instance, at the end of the routine */ - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } - -#ifndef HAS_ISBLANK - if (namedclass != ANYOF_BLANK) { -#endif - /* Set this class in the node for runtime - * matching */ - ANYOF_CLASS_SET(ret, namedclass); -#ifndef HAS_ISBLANK - } - else { - /* No isblank(), use the hard-coded ASCII-range - * blanks, adding them to the running total. */ - - _invlist_union(posixes, ascii_source, &posixes); - } -#endif - } - } - else { /* A complemented class, like ANYOF_NPUNCT */ - if (! LOC) { - _invlist_union_complement_2nd( - posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : PL_XPosix_ptrs[classnum], - &posixes); - /* Under /d, everything in the upper half of the - * Latin1 range matches this complement */ - if (DEPENDS_SEMANTICS) { - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - else { /* Locale */ - SV* scratch_list = NULL; - _invlist_subtract(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - &scratch_list); - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } -#ifndef HAS_ISBLANK - if (namedclass != ANYOF_NBLANK) { -#endif - ANYOF_CLASS_SET(ret, namedclass); -#ifndef HAS_ISBLANK - } - else { - /* Get the list of all code points in Latin1 - * that are not ASCII blanks, and add them to - * the running total */ - _invlist_subtract(PL_Latin1, ascii_source, - &scratch_list); - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } -#endif - } - } + else { /* Garden variety class. If is NASCII, NDIGIT, ... + complement and use nposixes */ + SV** posixes_ptr = namedclass % 2 == 0 + ? &posixes + : &nposixes; + SV** source_ptr = &PL_XPosix_ptrs[classnum]; + _invlist_union_maybe_complement_2nd( + *posixes_ptr, + *source_ptr, + namedclass % 2 != 0, + posixes_ptr); } - namedclass_done: - continue; /* Go get next character */ + continue; /* Go get next character */ } } /* end of namedclass \blah */ @@ -13007,7 +13859,9 @@ parseit: if (range) { if (prevvalue > value) /* b-a */ { const int w = RExC_parse - rangebegin; - Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); + vFAIL2utf8f( + "Invalid [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); range = 0; /* not a valid range */ } } @@ -13088,11 +13942,9 @@ parseit: value, foldbuf, &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) + FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED + ? FOLD_FLAGS_NOMIX_ASCII + : 0) ); /* Here, should be the first character of the @@ -13160,7 +14012,8 @@ parseit: /* Deal with this element of the class */ if (! SIZE_ONLY) { #ifndef EBCDIC - cp_list = _add_range_to_invlist(cp_list, prevvalue, value); + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); #else SV* this_range = _new_invlist(1); _append_range_to_invlist(this_range, prevvalue, value); @@ -13174,13 +14027,18 @@ parseit: * included. literal_endpoint==2 means both ends of the range used * a literal character, not \x{foo} */ if (literal_endpoint == 2 - && (prevvalue >= 'a' && value <= 'z') - || (prevvalue >= 'A' && value <= 'Z')) + && ((prevvalue >= 'a' && value <= 'z') + || (prevvalue >= 'A' && value <= 'Z'))) { - _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA], + _invlist_intersection(this_range, PL_ASCII, + &this_range); + + /* Since this above only contains ascii, the intersection of it + * with anything will still yield only ascii */ + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], &this_range); } - _invlist_union(cp_list, this_range, &cp_list); + _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); literal_endpoint = 0; #endif } @@ -13210,7 +14068,7 @@ parseit: #endif /* Look at the longest folds first */ - for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) { + for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { if (av_exists(multi_char_matches, cp_count)) { AV** this_array_ptr; @@ -13265,15 +14123,29 @@ parseit: return ret; } - /* If the character class contains only a single element, it may be - * optimizable into another node type which is smaller and runs faster. - * Check if this is the case for this class */ - if (element_count == 1 && ! ret_invlist) { + /* Here, we've gone through the entire class and dealt with multi-char + * folds. We are now in a position that we can do some checks to see if we + * can optimize this ANYOF node into a simpler one, even in Pass 1. + * Currently we only do two checks: + * 1) is in the unlikely event that the user has specified both, eg. \w and + * \W under /l, then the class matches everything. (This optimization + * is done only to make the optimizer code run later work.) + * 2) if the character class contains only a single element (including a + * single range), we see if there is an equivalent node for it. + * Other checks are possible */ + if (! ret_invlist /* Can't optimize if returning the constructed + inversion list */ + && (UNLIKELY(posixl_matches_all) || element_count == 1)) + { U8 op = END; U8 arg = 0; - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or - [:digit:] or \p{foo} */ + if (UNLIKELY(posixl_matches_all)) { + op = SANY; + } + else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like + \w or [:digit:] or \p{foo} + */ /* All named classes are mapped into POSIXish nodes, with its FLAG * argument giving which class it is */ @@ -13329,14 +14201,6 @@ parseit: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } -#ifndef HAS_ISBLANK - if (op == POSIXL - && (namedclass == ANYOF_BLANK - || namedclass == ANYOF_NBLANK)) - { - op = POSIXA; - } -#endif join_posix: /* The odd numbered ones are the complements of the @@ -13391,13 +14255,16 @@ parseit: /* To get locale nodes to not use the full ANYOF size would * require moving the code above that writes the portions * of it that aren't in other nodes to after this point. - * e.g. ANYOF_CLASS_SET */ + * e.g. ANYOF_POSIXL_SET */ RExC_size = orig_size; } } else { RExC_emit = (regnode *)orig_emit; if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } if (invert) { op += NPOSIXD - POSIXD; } @@ -13413,13 +14280,17 @@ parseit: *flagp |= HASWIDTH|SIMPLE; } else if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } RExC_parse = (char *) cur_parse; SvREFCNT_dec(posixes); + SvREFCNT_dec(nposixes); SvREFCNT_dec(cp_list); + SvREFCNT_dec(cp_foldable_list); return ret; } } @@ -13430,238 +14301,260 @@ parseit: /* If folding, we calculate all characters that could fold to or from the * ones already on the list */ - if (FOLD && cp_list) { - UV start, end; /* End points of code point ranges */ + if (cp_foldable_list) { + if (FOLD) { + UV start, end; /* End points of code point ranges */ + + SV* fold_intersection = NULL; + SV** use_list; + + /* Our calculated list will be for Unicode rules. For locale + * matching, we have to keep a separate list that is consulted at + * runtime only when the locale indicates Unicode rules. For + * non-locale, we just use to the general list */ + if (LOC) { + use_list = &only_utf8_locale_list; + } + else { + use_list = &cp_list; + } - SV* fold_intersection = NULL; + /* Only the characters in this class that participate in folds need + * be checked. Get the intersection of this class and all the + * possible characters that are foldable. This can quickly narrow + * down a large class */ + _invlist_intersection(PL_utf8_foldable, cp_foldable_list, + &fold_intersection); - /* If the highest code point is within Latin1, we can use the - * compiled-in Alphas list, and not have to go out to disk. This - * yields two false positives, the masculine and feminine ordinal - * indicators, which are weeded out below using the - * IS_IN_SOME_FOLD_L1() macro */ - if (invlist_highest(cp_list) < 256) { - _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list, - &fold_intersection); - } - else { + /* The folds for all the Latin1 characters are hard-coded into this + * program, but we have to go out to disk to get the others. */ + if (invlist_highest(cp_foldable_list) >= 256) { + + /* This is a hash that for a particular fold gives all + * characters that are involved in it */ + if (! PL_utf8_foldclosures) { - /* Here, there are non-Latin1 code points, so we will have to go - * fetch the list of all the characters that participate in folds - */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } - - /* This is a hash that for a particular fold gives all characters - * that are involved in it */ - if (! PL_utf8_foldclosures) { - - /* If we were unable to find any folds, then we likely won't be - * able to find the closures. So just create an empty list. - * Folding will effectively be restricted to the non-Unicode - * rules hard-coded into Perl. (This case happens legitimately - * during compilation of Perl itself before the Unicode tables - * are generated) */ - if (_invlist_len(PL_utf8_foldable) == 0) { - PL_utf8_foldclosures = newHV(); - } - else { /* If the folds haven't been read in, call a fold function * to force that */ if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; + U8 dummy[UTF8_MAXBYTES_CASE+1]; /* This string is just a short named one above \xff */ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); assert(PL_utf8_tofold); /* Verify that worked */ } - PL_utf8_foldclosures = - _swash_inversion_hash(PL_utf8_tofold); + PL_utf8_foldclosures + = _swash_inversion_hash(PL_utf8_tofold); } } - /* Only the characters in this class that participate in folds need - * be checked. Get the intersection of this class and all the - * possible characters that are foldable. This can quickly narrow - * down a large class */ - _invlist_intersection(PL_utf8_foldable, cp_list, - &fold_intersection); - } - - /* Now look at the foldable characters in this class individually */ - invlist_iterinit(fold_intersection); - while (invlist_iternext(fold_intersection, &start, &end)) { - UV j; - - /* Locale folding for Latin1 characters is deferred until runtime */ - if (LOC && start < 256) { - start = 256; - } - - /* Look at every character in the range */ - for (j = start; j <= end; j++) { - - U8 foldbuf[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - SV** listp; - - if (j < 256) { - - /* We have the latin1 folding rules hard-coded here so that - * an innocent-looking character class, like /[ks]/i won't - * have to go out to disk to find the possible matches. - * XXX It would be better to generate these via regen, in - * case a new version of the Unicode standard adds new - * mappings, though that is not really likely, and may be - * caught by the default: case of the switch below. */ - - if (IS_IN_SOME_FOLD_L1(j)) { - - /* ASCII is always matched; non-ASCII is matched only - * under Unicode rules */ - if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) { - cp_list = - add_cp_to_invlist(cp_list, PL_fold_latin1[j]); - } - else { - depends_list = - add_cp_to_invlist(depends_list, PL_fold_latin1[j]); + /* Now look at the foldable characters in this class individually */ + invlist_iterinit(fold_intersection); + while (invlist_iternext(fold_intersection, &start, &end)) { + UV j; + + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + SV** listp; + + if (j < 256) { + + /* We have the latin1 folding rules hard-coded here so + * that an innocent-looking character class, like + * /[ks]/i won't have to go out to disk to find the + * possible matches. XXX It would be better to + * generate these via regen, in case a new version of + * the Unicode standard adds new mappings, though that + * is not really likely, and may be caught by the + * default: case of the switch below. */ + + if (IS_IN_SOME_FOLD_L1(j)) { + + /* ASCII is always matched; non-ASCII is matched + * only under Unicode rules (which could happen + * under /l if the locale is a UTF-8 one */ + if (isASCII(j) || ! DEPENDS_SEMANTICS) { + *use_list = add_cp_to_invlist(*use_list, + PL_fold_latin1[j]); + } + else { + depends_list = + add_cp_to_invlist(depends_list, + PL_fold_latin1[j]); + } } - } - if (HAS_NONLATIN1_FOLD_CLOSURE(j) - && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) - { - /* Certain Latin1 characters have matches outside - * Latin1. To get here, is one of those - * characters. None of these matches is valid for - * ASCII characters under /aa, which is why the 'if' - * just above excludes those. These matches only - * happen when the target string is utf8. The code - * below adds the single fold closures for to the - * inversion list. */ - switch (j) { - case 'k': - case 'K': - cp_list = - add_cp_to_invlist(cp_list, KELVIN_SIGN); - break; - case 's': - case 'S': - cp_list = add_cp_to_invlist(cp_list, + if (HAS_NONLATIN1_FOLD_CLOSURE(j) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + { + /* Certain Latin1 characters have matches outside + * Latin1. To get here, is one of those + * characters. None of these matches is valid for + * ASCII characters under /aa, which is why the 'if' + * just above excludes those. These matches only + * happen when the target string is utf8. The code + * below adds the single fold closures for to the + * inversion list. */ + + switch (j) { + case 'k': + case 'K': + *use_list = + add_cp_to_invlist(*use_list, KELVIN_SIGN); + break; + case 's': + case 'S': + *use_list = add_cp_to_invlist(*use_list, LATIN_SMALL_LETTER_LONG_S); - break; - case MICRO_SIGN: - cp_list = add_cp_to_invlist(cp_list, - GREEK_CAPITAL_LETTER_MU); - cp_list = add_cp_to_invlist(cp_list, - GREEK_SMALL_LETTER_MU); - break; - case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: - case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - cp_list = - add_cp_to_invlist(cp_list, ANGSTROM_SIGN); - break; - case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - cp_list = add_cp_to_invlist(cp_list, + break; + case MICRO_SIGN: + *use_list = add_cp_to_invlist(*use_list, + GREEK_CAPITAL_LETTER_MU); + *use_list = add_cp_to_invlist(*use_list, + GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *use_list = + add_cp_to_invlist(*use_list, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *use_list = add_cp_to_invlist(*use_list, LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); - break; - case LATIN_SMALL_LETTER_SHARP_S: - cp_list = add_cp_to_invlist(cp_list, - LATIN_CAPITAL_LETTER_SHARP_S); - break; - case 'F': case 'f': - case 'I': case 'i': - case 'L': case 'l': - case 'T': case 't': - case 'A': case 'a': - case 'H': case 'h': - case 'J': case 'j': - case 'N': case 'n': - case 'W': case 'w': - case 'Y': case 'y': - /* These all are targets of multi-character - * folds from code points that require UTF8 to - * express, so they can't match unless the - * target string is in UTF-8, so no action here - * is necessary, as regexec.c properly handles - * the general case for UTF-8 matching and - * multi-char folds */ - break; - default: - /* Use deprecated warning to increase the - * chances of this being output */ - ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); - break; + break; + case LATIN_SMALL_LETTER_SHARP_S: + *use_list = add_cp_to_invlist(*use_list, + LATIN_CAPITAL_LETTER_SHARP_S); + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character + * folds from code points that require UTF8 + * to express, so they can't match unless + * the target string is in UTF-8, so no + * action here is necessary, as regexec.c + * properly handles the general case for + * UTF-8 matching and multi-char folds */ + break; + default: + /* Use deprecated warning to increase the + * chances of this being output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); + break; + } } + continue; } - continue; - } - /* Here is an above Latin1 character. We don't have the rules - * hard-coded for it. First, get its fold. This is the simple - * fold, as the multi-character folds have been handled earlier - * and separated out */ - _to_uni_fold_flags(j, foldbuf, &foldlen, - ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); - - /* Single character fold of above Latin1. Add everything in - * its fold closure to the list that this node should match. - * The fold closures data structure is a hash with the keys - * being the UTF-8 of every character that is folded to, like - * 'k', and the values each an array of all code points that - * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. - * Multi-character folds are not included */ - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) foldbuf, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_len(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c = SvUV(*c_p); - - /* /aa doesn't allow folds between ASCII and non-; /l - * doesn't allow them between above and below 256 */ - if ((ASCII_FOLD_RESTRICTED - && (isASCII(c) != isASCII(j))) - || (LOC && c < 256)) { - continue; - } + /* Here is an above Latin1 character. We don't have the + * rules hard-coded for it. First, get its fold. This is + * the simple fold, as the multi-character folds have been + * handled earlier and separated out */ + _to_uni_fold_flags(j, foldbuf, &foldlen, + (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0); + + /* Single character fold of above Latin1. Add everything in + * its fold closure to the list that this node should match. + * The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c = SvUV(*c_p); - /* Folds involving non-ascii Latin1 characters - * under /d are added to a separate list */ - if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) - { - cp_list = add_cp_to_invlist(cp_list, c); - } - else { - depends_list = add_cp_to_invlist(depends_list, c); + /* /aa doesn't allow folds between ASCII and non- */ + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j)))) + { + continue; + } + + /* Folds under /l which cross the 255/256 boundary + * are added to a separate list. (These are valid + * only when the locale is UTF-8.) */ + if (c < 256 && LOC) { + *use_list = add_cp_to_invlist(*use_list, c); + continue; + } + + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + /* Similarly folds involving non-ascii Latin1 + * characters under /d are added to their list */ + depends_list = add_cp_to_invlist(depends_list, + c); + } } } } } - } - SvREFCNT_dec_NN(fold_intersection); + SvREFCNT_dec_NN(fold_intersection); + } + + /* Now that we have finished adding all the folds, there is no reason + * to keep the foldable list separate */ + _invlist_union(cp_list, cp_foldable_list, &cp_list); + SvREFCNT_dec_NN(cp_foldable_list); } /* And combine the result (if any) with any inversion list from posix * classes. The lists are kept separate up to now because we don't want to * fold the classes (folding of those is automatically handled by the swash * fetching code) */ - if (posixes) { + if (posixes || nposixes) { + if (posixes && AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, nothing above ASCII matches these */ + _invlist_intersection(posixes, + PL_XPosix_ptrs[_CC_ASCII], + &posixes); + } + if (nposixes) { + if (DEPENDS_SEMANTICS) { + /* Under /d, everything in the upper half of the Latin1 range + * matches these complements */ + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; + } + else if (AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, everything above ASCII matches these + * complements */ + _invlist_union_complement_2nd(nposixes, + PL_XPosix_ptrs[_CC_ASCII], + &nposixes); + } + if (posixes) { + _invlist_union(posixes, nposixes, &posixes); + SvREFCNT_dec_NN(nposixes); + } + else { + posixes = nposixes; + } + } if (! DEPENDS_SEMANTICS) { if (cp_list) { _invlist_union(cp_list, posixes, &cp_list); @@ -13675,10 +14568,8 @@ parseit: /* Under /d, we put into a separate list the Latin1 things that * match only when the target string is utf8 */ SV* nonascii_but_latin1_properties = NULL; - _invlist_intersection(posixes, PL_Latin1, + _invlist_intersection(posixes, PL_UpperLatin1, &nonascii_but_latin1_properties); - _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII, - &nonascii_but_latin1_properties); _invlist_subtract(posixes, nonascii_but_latin1_properties, &posixes); if (cp_list) { @@ -13712,7 +14603,6 @@ parseit: * , because having a Unicode property forces Unicode * semantics */ if (properties) { - bool warn_super = ! has_user_defined_property; if (cp_list) { /* If it matters to the final outcome, see if a non-property @@ -13723,14 +14613,8 @@ parseit: * are using above-Unicode code points indicates they should know * the issues involved */ if (warn_super) { - bool non_prop_matches_above_Unicode = - runtime_posix_matches_above_Unicode - | (invlist_highest(cp_list) > PERL_UNICODE_MAX); - if (invert) { - non_prop_matches_above_Unicode = - ! non_prop_matches_above_Unicode; - } - warn_super = ! non_prop_matches_above_Unicode; + warn_super = ! (invert + ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); } _invlist_union(properties, cp_list, &cp_list); @@ -13741,7 +14625,7 @@ parseit: } if (warn_super) { - OP(ret) = ANYOF_WARN_SUPER; + ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; } } @@ -13754,12 +14638,32 @@ parseit: * shouldn't. Therefore we can't invert folded locale now, as it won't be * folded until runtime */ + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching). We know to set the flag if we have a non-NULL list for UTF-8 + * locales, or the class matches at least one 0-255 range code point */ + if (LOC && FOLD) { + if (only_utf8_locale_list) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + else if (cp_list) { /* Look to see if there a 0-255 code point is in + the list */ + UV start, end; + invlist_iterinit(cp_list); + if (invlist_iternext(cp_list, &start, &end) && start < 256) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + invlist_iterfinish(cp_list); + } + } + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known * at compile time. Besides not inverting folded locale now, we can't * invert if there are things such as \w, which aren't known until runtime * */ if (invert - && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS))) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) && ! depends_list && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { @@ -13789,15 +14693,6 @@ parseit: return orig_emit; } - /* If we didn't do folding, it's because some information isn't available - * until runtime; set the run-time fold flag for these. (We don't have to - * worry about properties folding, as that is taken care of by the swash - * fetching) */ - if (FOLD && LOC) - { - ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; - } - /* Some character classes are equivalent to other nodes. Such nodes take * up less room and generally fewer operations to execute than ANYOF nodes. * Above, we checked for and optimized into some such equivalents for @@ -13816,8 +14711,13 @@ parseit: if (cp_list && ! invert && ! depends_list - && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS) - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + + /* We don't optimize if we are supposed to make sure all non-Unicode + * code points raise a warning, as only ANYOF nodes have this check. + * */ + && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) { UV start, end; U8 op = END; /* The optimzation node-type */ @@ -13841,7 +14741,7 @@ parseit: && (start < 256 || UTF)) { /* Here, the list contains a single code point. Can optimize - * into an EXACT node */ + * into an EXACTish node */ value = start; @@ -13871,12 +14771,6 @@ parseit: } } else { - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } if (_invlist_contains_cp(PL_utf8_foldable, value)) { op = EXACT; } @@ -13916,7 +14810,9 @@ parseit: RExC_parse = (char *)cur_parse; if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } SvREFCNT_dec_NN(cp_list); @@ -13929,53 +14825,8 @@ parseit: * for things that belong in the bitmap, put them there, and delete from * . While we are at it, see if everything above 255 is in the * list, and if so, set a flag to speed up execution */ - ANYOF_BITMAP_ZERO(ret); - if (cp_list) { - - /* This gets set if we actually need to modify things */ - bool change_invlist = FALSE; - - UV start, end; - - /* Start looking through */ - invlist_iterinit(cp_list); - while (invlist_iternext(cp_list, &start, &end)) { - UV high; - int i; - if (end == UV_MAX && start <= 256) { - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; - } - - /* Quit if are above what we should change */ - if (start > 255) { - break; - } - - change_invlist = TRUE; - - /* Set all the bits in the range, up to the max that we are doing */ - high = (end < 255) ? end : 255; - for (i = start; i <= (int) high; i++) { - if (! ANYOF_BITMAP_TEST(ret, i)) { - ANYOF_BITMAP_SET(ret, i); - } - } - } - invlist_iterfinish(cp_list); - - /* Done with loop; remove any code points that are in the bitmap from - * */ - if (change_invlist) { - _invlist_subtract(cp_list, PL_Latin1, &cp_list); - } - - /* If have completely emptied it, remove it completely */ - if (_invlist_len(cp_list) == 0) { - SvREFCNT_dec_NN(cp_list); - cp_list = NULL; - } - } + populate_ANYOF_from_invlist(ret, &cp_list); if (invert) { ANYOF_FLAGS(ret) |= ANYOF_INVERT; @@ -13992,6 +14843,7 @@ parseit: else { cp_list = depends_list; } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; } /* If there is a swash and more than one element, we can't use the swash in @@ -14001,56 +14853,104 @@ parseit: swash = NULL; } - if (! cp_list - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - { - ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); + set_ANYOF_arg(pRExC_state, ret, cp_list, + (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv : NULL, + only_utf8_locale_list, + swash, has_user_defined_property); + + *flagp |= HASWIDTH|SIMPLE; + + if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + + return ret; +} + +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + +STATIC void +S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, + regnode* const node, + SV* const cp_list, + SV* const runtime_defns, + SV* const only_utf8_locale_list, + SV* const swash, + const bool has_user_defined_property) +{ + /* Sets the arg field of an ANYOF-type node 'node', using information about + * the node passed-in. If there is nothing outside the node's bitmap, the + * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to + * the count returned by add_data(), having allocated and stored an array, + * av, that that count references, as follows: + * av[0] stores the character class description in its textual form. + * This is used later (regexec.c:Perl_regclass_swash()) to + * initialize the appropriate swash, and is also useful for dumping + * the regnode. This is set to &PL_sv_undef if the textual + * description is not needed at run-time (as happens if the other + * elements completely define the class) + * av[1] if &PL_sv_undef, is a placeholder to later contain the swash + * computed from av[0]. But if no further computation need be done, + * the swash is stored here now (and av[0] is &PL_sv_undef). + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead + * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. + * (Otherwise everything needed is already in av[0] and av[1]) + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ + + UV n; + + PERL_ARGS_ASSERT_SET_ANYOF_ARG; + + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); + ARG_SET(node, ANYOF_NONBITMAP_EMPTY); } else { - /* av[0] stores the character class description in its textual form: - * used later (regexec.c:Perl_regclass_swash()) to initialize the - * appropriate swash, and is also useful for dumping the regnode. - * av[1] if NULL, is a placeholder to later contain the swash computed - * from av[0]. But if no further computation need be done, the - * swash is stored there now. - * av[2] stores the cp_list inversion list for use in addition or - * instead of av[0]; used only if av[1] is NULL - * av[3] is set if any component of the class is from a user-defined - * property; used only if av[1] is NULL */ AV * const av = newAV(); SV *rv; - av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - ? SvREFCNT_inc(listsv) : &PL_sv_undef); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + av_store(av, 0, (runtime_defns) + ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); if (swash) { av_store(av, 1, swash); SvREFCNT_dec_NN(cp_list); } else { - av_store(av, 1, NULL); + av_store(av, 1, &PL_sv_undef); if (cp_list) { - av_store(av, 2, cp_list); - av_store(av, 3, newSVuv(has_user_defined_property)); + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); } } + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + rv = newRV_noinc(MUTABLE_SV(av)); - n = add_data(pRExC_state, 1, "s"); + n = add_data(pRExC_state, STR_WITH_LEN("s")); RExC_rxi->data->data[n] = (void*)rv; - ARG_SET(ret, n); + ARG_SET(node, n); } - - *flagp |= HASWIDTH|SIMPLE; - return ret; } -#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* reg_skipcomment() Absorbs an /x style # comments from the input stream. Returns true if there is more text remaining in the stream. - Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment + Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment terminates the pattern without including a newline. Note its the callers responsibility to ensure that we are @@ -14073,7 +14973,7 @@ S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) if (!ended) { /* we ran off the end of the pattern without ending the comment, so we have to add an \n when wrapping */ - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; return 0; } else return 1; @@ -14154,14 +15054,15 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) FILL_ADVANCE_NODE(ptr, op); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", - "reg_node", __LINE__, + MJD_OFFSET_DEBUG( + ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", + "reg_node", __LINE__, PL_reg_name[op], - (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(RExC_emit - RExC_emit_start), (UV)(RExC_parse - RExC_start), - (UV)RExC_offsets[0])); + (UV)RExC_offsets[0])); Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); } #endif @@ -14185,16 +15086,16 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); RExC_size += 2; - /* + /* We can't do this: - - assert(2==regarglen[op]+1); + + assert(2==regarglen[op]+1); Anything larger than this has to allocate the extra amount. If we changed this to be: - + RExC_size += (1 + regarglen[op]); - + then it wouldn't matter. Its not clear what side effect might come from that so its not done so far. -- dmq @@ -14210,18 +15111,19 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) FILL_ADVANCE_NODE_ARG(ptr, op, arg); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reganode", __LINE__, PL_reg_name[op], - (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(RExC_emit - RExC_emit_start), (UV)(RExC_parse - RExC_start), - (UV)RExC_offsets[0])); + (UV)RExC_offsets[0])); Set_Cur_Node_Offset; } -#endif +#endif RExC_emit = ptr; return(ret); } @@ -14229,7 +15131,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) /* - reguni - emit (if appropriate) a Unicode character */ -STATIC STRLEN +PERL_STATIC_INLINE STRLEN S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) { dVAR; @@ -14290,30 +15192,32 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) StructCopy(--src, --dst, regnode); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD 20010112 */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", "reg_insert", __LINE__, PL_reg_name[op], - (UV)(dst - RExC_emit_start) > RExC_offsets[0] + (UV)(dst - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(src - RExC_emit_start), (UV)(dst - RExC_emit_start), - (UV)RExC_offsets[0])); + (UV)RExC_offsets[0])); Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); } #endif } - + place = opnd; /* Op node, where operand used to be. */ #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reginsert", __LINE__, PL_reg_name[op], - (UV)(place - RExC_emit_start) > RExC_offsets[0] + (UV)(place - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(place - RExC_emit_start), (UV)(RExC_parse - RExC_start), @@ -14321,7 +15225,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) Set_Node_Offset(place, RExC_parse); Set_Node_Length(place, 1); } -#endif +#endif src = NEXTOPER(place); FILL_ADVANCE_NODE(place, op); Zero(src, offset, regnode); @@ -14333,7 +15237,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) */ /* TODO: All three parms should be const */ STATIC void -S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) { dVAR; regnode *scan; @@ -14354,7 +15259,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tail" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), @@ -14380,7 +15285,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de - Look for optimizable sequences at the same time. - currently only looks for EXACT chains. -This is experimental code. The idea is to use this routine to perform +This is experimental code. The idea is to use this routine to perform in place optimizations on branches and groups as they are constructed, with the long term intention of removing optimization from study_chunk so that it is purely analytical. @@ -14392,7 +15297,8 @@ to control which is which. /* TODO: All four parms should be const */ STATIC U8 -S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) { dVAR; regnode *scan; @@ -14415,8 +15321,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN if (PL_regkind[OP(scan)] == EXACT) { - bool has_exactf_sharp_s; /* Unexamined in this routine */ - if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1)) + bool unfolded_multi_char; /* Unexamined in this routine */ + if (join_exact(pRExC_state, scan, &min, + &unfolded_multi_char, 1, val, depth+1)) return EXACT; } #endif @@ -14424,10 +15331,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, switch (OP(scan)) { case EXACT: case EXACTF: + case EXACTFA_NO_TRIE: case EXACTFA: case EXACTFU: case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); @@ -14442,7 +15349,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), @@ -14455,8 +15362,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, DEBUG_PARSE_r({ SV * const mysv_val=sv_newmortal(); DEBUG_PARSE_MSG(""); - regprop(RExC_rx, mysv_val, val); - PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n", + regprop(RExC_rx, mysv_val, val, NULL); + PerlIO_printf(Perl_debug_log, + "~ attach to %s (%"IVdf") offset to %"IVdf"\n", SvPV_nolen_const(mysv_val), (IV)REG_NODE_NUM(val), (IV)(val - scan) @@ -14484,7 +15392,9 @@ S_regdump_intflags(pTHX_ const char *lead, const U32 flags) int bit; int set=0; - for (bit=0; bit<32; bit++) { + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bitanchored_substr) { - RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), RE_SV_DUMPLEN(r->anchored_substr), 30); PerlIO_printf(Perl_debug_log, "anchored %s%s at %"IVdf" ", s, RE_SV_TAIL(r->anchored_substr), (IV)r->anchored_offset); } else if (r->anchored_utf8) { - RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), RE_SV_DUMPLEN(r->anchored_utf8), 30); PerlIO_printf(Perl_debug_log, "anchored utf8 %s%s at %"IVdf" ", s, RE_SV_TAIL(r->anchored_utf8), (IV)r->anchored_offset); - } + } if (r->float_substr) { - RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), RE_SV_DUMPLEN(r->float_substr), 30); PerlIO_printf(Perl_debug_log, "floating %s%s at %"IVdf"..%"UVuf" ", s, RE_SV_TAIL(r->float_substr), (IV)r->float_min_offset, (UV)r->float_max_offset); } else if (r->float_utf8) { - RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), RE_SV_DUMPLEN(r->float_utf8), 30); PerlIO_printf(Perl_debug_log, "floating utf8 %s%s at %"IVdf"..%"UVuf" ", @@ -14598,7 +15510,7 @@ Perl_regdump(pTHX_ const regexp *r) (r->check_substr == r->float_substr && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored")); - if (r->extflags & RXf_NOSCAN) + if (r->intflags & PREGf_NOSCAN) PerlIO_printf(Perl_debug_log, " noscan"); if (r->extflags & RXf_CHECK_ALL) PerlIO_printf(Perl_debug_log, " isall"); @@ -14606,22 +15518,22 @@ Perl_regdump(pTHX_ const regexp *r) PerlIO_printf(Perl_debug_log, ") "); if (ri->regstclass) { - regprop(r, sv, ri->regstclass); + regprop(r, sv, ri->regstclass, NULL); PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); } - if (r->extflags & RXf_ANCH) { + if (r->intflags & PREGf_ANCH) { PerlIO_printf(Perl_debug_log, "anchored"); - if (r->extflags & RXf_ANCH_BOL) + if (r->intflags & PREGf_ANCH_BOL) PerlIO_printf(Perl_debug_log, "(BOL)"); - if (r->extflags & RXf_ANCH_MBOL) + if (r->intflags & PREGf_ANCH_MBOL) PerlIO_printf(Perl_debug_log, "(MBOL)"); - if (r->extflags & RXf_ANCH_SBOL) + if (r->intflags & PREGf_ANCH_SBOL) PerlIO_printf(Perl_debug_log, "(SBOL)"); - if (r->extflags & RXf_ANCH_GPOS) + if (r->intflags & PREGf_ANCH_GPOS) PerlIO_printf(Perl_debug_log, "(GPOS)"); PerlIO_putc(Perl_debug_log, ' '); } - if (r->extflags & RXf_GPOS_SEEN) + if (r->intflags & PREGf_GPOS_SEEN) PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); if (r->intflags & PREGf_SKIP) PerlIO_printf(Perl_debug_log, "plus "); @@ -14643,21 +15555,11 @@ Perl_regdump(pTHX_ const regexp *r) } /* -- regprop - printable representation of opcode +- regprop - printable representation of opcode, with run time support */ -#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ -STMT_START { \ - if (do_sep) { \ - Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ - if (flags & ANYOF_INVERT) \ - /*make sure the invert info is in each */ \ - sv_catpvs(sv, "^"); \ - do_sep = 0; \ - } \ -} STMT_END void -Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) { #ifdef DEBUGGING dVAR; @@ -14673,10 +15575,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) || _CC_VERTSPACE != 16 #error Need to adjust order of anyofs[] #endif - "[\\w]", - "[\\W]", - "[\\d]", - "[\\D]", + "\\w", + "\\W", + "\\d", + "\\D", "[:alpha:]", "[:^alpha:]", "[:lower:]", @@ -14693,8 +15595,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^graph:]", "[:cased:]", "[:^cased:]", - "[\\s]", - "[\\S]", + "\\s", + "\\S", "[:blank:]", "[:^blank:]", "[:xdigit:]", @@ -14705,12 +15607,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^cntrl:]", "[:ascii:]", "[:^ascii:]", - "[\\v]", - "[\\V]" + "\\v", + "\\V" }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; - + PERL_ARGS_ASSERT_REGPROP; sv_setpvs(sv, ""); @@ -14718,16 +15620,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ /* It would be nice to FAIL() here, but this may be called from regexec.c, and it would be hard to supply pRExC_state. */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(o), (int)REGNODE_MAX); sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[OP(o)]; if (k == EXACT) { sv_catpvs(sv, " "); - /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) - * is a crude hack but it may be the best for now since - * we have no flag "this EXACTish node was UTF-8" + /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) + * is a crude hack but it may be the best for now since + * we have no flag "this EXACTish node was UTF-8" * --jhi */ pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], PERL_PV_ESCAPE_UNI_DETECT | @@ -14746,19 +15649,19 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) NULL; const reg_trie_data * const trie = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; - + Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); DEBUG_TRIE_COMPILE_r( - Perl_sv_catpvf(aTHX_ sv, - "", - (UV)trie->startstate, - (IV)trie->statecount-1, /* -1 because of the unused 0 element */ - (UV)trie->wordcount, - (UV)trie->minlen, - (UV)trie->maxlen, - (UV)TRIE_CHARCOUNT(trie), - (UV)trie->uniquecharcount - ) + Perl_sv_catpvf(aTHX_ sv, + "", + (UV)trie->startstate, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ); ); if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { sv_catpvs(sv, "["); @@ -14766,8 +15669,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) ? ANYOF_BITMAP(o) : TRIE_BITMAP(trie)); sv_catpvs(sv, "]"); - } - + } + } else if (k == CURLY) { if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ @@ -14775,7 +15678,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); - else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { + else if (k == REF || k == OPEN || k == CLOSE + || k == GROUPP || OP(o)==ACCEPT) + { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { if ( k != REF || (OP(o) < NREF)) { @@ -14783,7 +15688,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) SV **name= av_fetch(list, ARG(o), 0 ); if (name) Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); - } + } else { AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); @@ -14798,21 +15703,37 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); } } - } - } else if (k == GOSUB) - Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */ + } + if ( k == REF && reginfo) { + U32 n = ARG(o); /* which paren pair */ + I32 ln = prog->offs[n].start; + if (prog->lastparen < n || ln == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } + } else if (k == GOSUB) + /* Paren and offset */ + Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); else if (k == VERB) { - if (!o->flags) - Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + if (!o->flags) + Perl_sv_catpvf(aTHX_ sv, ":%"SVf, SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) - Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + /* 2: embedded, otherwise 1 */ + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; - if (flags & ANYOF_LOCALE) + if (flags & ANYOF_LOCALE_FLAGS) sv_catpvs(sv, "{loc}"); if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); @@ -14822,97 +15743,136 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* output what the standard cp 0-255 bitmap matches */ do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); - - EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - /* output any special charclass tests (used entirely under use locale) */ - if (ANYOF_CLASS_TEST_ANY_SET(o)) { + + /* output any special charclass tests (used entirely under use + * locale) * */ + if (ANYOF_POSIXL_TEST_ANY_SET(o)) { int i; - for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) { - if (ANYOF_CLASS_TEST(o,i)) { + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(o,i)) { sv_catpv(sv, anyofs[i]); do_sep = 1; } } } - - EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - - if (flags & ANYOF_NON_UTF8_LATIN1_ALL) { - sv_catpvs(sv, "{non-utf8-latin1-all}"); - } - - /* output information about the unicode matching */ - if (flags & ANYOF_UNICODE_ALL) - sv_catpvs(sv, "{unicode_all}"); - else if (ANYOF_NONBITMAP(o)) { - SV *lv; /* Set if there is something outside the bit map. */ - SV * sw; - bool byte_output = FALSE; /* If something in the bitmap has been - output */ - if (flags & ANYOF_NONBITMAP_NON_UTF8) { - sv_catpvs(sv, "{outside bitmap}"); + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) + { + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + if (flags & ANYOF_INVERT) + /*make sure the invert info is in each */ + sv_catpvs(sv, "^"); } - else { - sv_catpvs(sv, "{utf8}"); + + if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { + sv_catpvs(sv, "{non-utf8-latin1-all}"); } - /* Get the stuff that wasn't in the bitmap */ - sw = regclass_swash(prog, o, FALSE, &lv, NULL); - if (lv && lv != &PL_sv_undef) { - char *s = savesvpv(lv); - char * const origs = s; + /* output information about the unicode matching */ + if (flags & ANYOF_ABOVE_LATIN1_ALL) + sv_catpvs(sv, "{unicode_all}"); + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { + SV *lv; /* Set if there is something outside the bit map. */ + bool byte_output = FALSE; /* If something in the bitmap has + been output */ + SV *only_utf8_locale; + + /* Get the stuff that wasn't in the bitmap */ + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); + if (lv && lv != &PL_sv_undef) { + char *s = savesvpv(lv); + char * const origs = s; + + while (*s && *s != '\n') + s++; - while (*s && *s != '\n') - s++; + if (*s == '\n') { + const char * const t = ++s; - if (*s == '\n') { - const char * const t = ++s; + if (flags & ANYOF_NONBITMAP_NON_UTF8) { + sv_catpvs(sv, "{outside bitmap}"); + } + else { + sv_catpvs(sv, "{utf8}"); + } - if (byte_output) { - sv_catpvs(sv, " "); - } + if (byte_output) { + sv_catpvs(sv, " "); + } - while (*s) { - if (*s == '\n') { + while (*s) { + if (*s == '\n') { - /* Truncate very long output */ - if (s - origs > 256) { - Perl_sv_catpvf(aTHX_ sv, - "%.*s...", - (int) (s - origs - 1), - t); - goto out_dump; + /* Truncate very long output */ + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } + *s = ' '; } - *s = ' '; - } - else if (*s == '\t') { - *s = '-'; + else if (*s == '\t') { + *s = '-'; + } + s++; } - s++; + if (s[-1] == ' ') + s[-1] = 0; + + sv_catpv(sv, t); } - if (s[-1] == ' ') - s[-1] = 0; - sv_catpv(sv, t); - } + out_dump: - out_dump: + Safefree(origs); + SvREFCNT_dec_NN(lv); + } - Safefree(origs); - SvREFCNT_dec_NN(lv); - } + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } + } + invlist_iterfinish(only_utf8_locale); + } + } } Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; - if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { - Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + if (index < C_ARRAY_LENGTH(anyofs)) { + if (*anyofs[index] != '[') { + sv_catpv(sv, "["); + } + sv_catpv(sv, anyofs[index]); + if (*anyofs[index] != '[') { + sv_catpv(sv, "]"); + } } else { - sv_catpv(sv, anyofs[index]); + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); } } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) @@ -14922,9 +15882,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) PERL_UNUSED_ARG(sv); PERL_UNUSED_ARG(o); PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); #endif /* DEBUGGING */ } + + SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ @@ -14954,17 +15917,17 @@ Perl_re_intuit_string(pTHX_ REGEXP * const r) return prog->check_substr ? prog->check_substr : prog->check_utf8; } -/* - pregfree() - - handles refcounting and freeing the perl core regexp structure. When - it is necessary to actually free the structure the first thing it +/* + pregfree() + + handles refcounting and freeing the perl core regexp structure. When + it is necessary to actually free the structure the first thing it does is call the 'free' method of the regexp_engine associated to - the regexp, allowing the handling of the void *pprivate; member - first. (This routine is not overridable by extensions, which is why + the regexp, allowing the handling of the void *pprivate; member + first. (This routine is not overridable by extensions, which is why the extensions free is called first.) - - See regdupe and regdupe_internal if you change anything here. + + See regdupe and regdupe_internal if you change anything here. */ #ifndef PERL_IN_XSUB_RE void @@ -14988,7 +15951,7 @@ Perl_pregfree2(pTHX_ REGEXP *rx) CALLREGFREE_PVT(rx); /* free the private data */ SvREFCNT_dec(RXp_PAREN_NAMES(r)); Safefree(r->xpv_len_u.xpvlenu_pv); - } + } if (r->substrs) { SvREFCNT_dec(r->anchored_substr); SvREFCNT_dec(r->anchored_utf8); @@ -15006,22 +15969,22 @@ Perl_pregfree2(pTHX_ REGEXP *rx) } /* reg_temp_copy() - + This is a hacky workaround to the structural issue of match results being stored in the regexp structure which is in turn stored in PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern could be PL_curpm in multiple contexts, and could require multiple result sets being associated with the pattern simultaneously, such as when doing a recursive match with (??{$qr}) - - The solution is to make a lightweight copy of the regexp structure + + The solution is to make a lightweight copy of the regexp structure when a qr// is returned from the code executed by (??{$qr}) this lightweight copy doesn't actually own any of its data except for - the starp/end and the actual regexp structure itself. - -*/ - - + the starp/end and the actual regexp structure itself. + +*/ + + REGEXP * Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) { @@ -15054,7 +16017,7 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) sv_force_normal(sv) is called. */ SvFAKE_on(ret_x); ret = ReANY(ret_x); - + SvFLAGS(ret_x) |= SvUTF8(rx); /* We share the same string buffer as the original regexp, on which we hold a reference count, incremented when mother_re is set below. @@ -15085,23 +16048,23 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) #endif ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); SvREFCNT_inc_void(ret->qr_anoncv); - + return ret_x; } #endif -/* regfree_internal() +/* regfree_internal() - Free the private data in a regexp. This is overloadable by - extensions. Perl takes care of the regexp structure in pregfree(), - this covers the *pprivate pointer which technically perl doesn't - know about, however of course we have to handle the - regexp_internal structure when no extension is in use. - - Note this is called before freeing anything in the regexp - structure. + Free the private data in a regexp. This is overloadable by + extensions. Perl takes care of the regexp structure in pregfree(), + this covers the *pprivate pointer which technically perl doesn't + know about, however of course we have to handle the + regexp_internal structure when no extension is in use. + + Note this is called before freeing anything in the regexp + structure. */ - + void Perl_regfree_internal(pTHX_ REGEXP * const rx) { @@ -15119,7 +16082,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RX_UTF8(rx), dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); - PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", + PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", PL_colors[4],PL_colors[5],s); } }); @@ -15153,7 +16116,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) case 'l': case 'L': break; - case 'T': + case 'T': { /* Aho Corasick add-on structure for a trie node. Used in stclass optimization only */ U32 refcount; @@ -15193,7 +16156,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) } break; default: - Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]); + Perl_croak(aTHX_ "panic: regfree data code '%c'", + ri->data->what[n]); } } Safefree(ri->data->what); @@ -15207,9 +16171,9 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t)) #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) -/* - re_dup - duplicate a regexp. - +/* + re_dup - duplicate a regexp. + This routine is expected to clone a given regexp structure. It is only compiled under USE_ITHREADS. @@ -15218,7 +16182,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) stored in the *pprivate pointer. This allows extensions to handle any duplication it needs to do. - See pregfree() and regfree_internal() if you change anything here. + See pregfree() and regfree_internal() if you change anything here. */ #if defined(USE_ITHREADS) #ifndef PERL_IN_XSUB_RE @@ -15229,7 +16193,7 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) I32 npar; const struct regexp *r = ReANY(sstr); struct regexp *ret = ReANY(dstr); - + PERL_ARGS_ASSERT_RE_DUP_GUTS; npar = r->nparens+1; @@ -15301,15 +16265,15 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) /* regdupe_internal() - + This is the internal complement to regdupe() which is used to copy the structure pointed to by the *pprivate pointer in the regexp. This is the core version of the extension overridable cloning hook. The regexp structure being duplicated will be copied by perl prior - to this and will be provided as the regexp *r argument, however + to this and will be provided as the regexp *r argument, however with the /old/ structures pprivate pointer value. Thus this routine may override any copying normally done by perl. - + It returns a pointer to the new regexp_internal structure. */ @@ -15323,10 +16287,11 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) RXi_GET_DECL(r,ri); PERL_ARGS_ASSERT_REGDUPE_INTERNAL; - + len = ProgLen(ri); - - Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); + + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), + char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); reti->num_code_blocks = ri->num_code_blocks; @@ -15368,9 +16333,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) break; case 'f': /* This is cheating. */ - Newx(d->data[i], 1, struct regnode_charclass_class); - StructCopy(ri->data->data[i], d->data[i], - struct regnode_charclass_class); + Newx(d->data[i], 1, regnode_ssc); + StructCopy(ri->data->data[i], d->data[i], regnode_ssc); reti->regstclass = (regnode*)d->data[i]; break; case 'T': @@ -15390,7 +16354,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) d->data[i] = ri->data->data[i]; break; default: - Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]); + Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", + ri->data->what[i]); } } @@ -15430,7 +16395,8 @@ Perl_regnext(pTHX_ regnode *p) return(NULL); if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX); + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(p), (int)REGNODE_MAX); } offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); @@ -15442,7 +16408,7 @@ Perl_regnext(pTHX_ regnode *p) #endif STATIC void -S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) +S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) { va_list args; STRLEN l1 = strlen(pat1); @@ -15461,20 +16427,15 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) Copy(pat2, buf + l1, l2 , char); buf[l1 + l2] = '\n'; buf[l1 + l2 + 1] = '\0'; -#ifdef I_STDARG - /* ANSI variant takes additional second argument */ va_start(args, pat2); -#else - va_start(args); -#endif msv = vmess(buf, &args); va_end(args); message = SvPV_const(msv,l1); if (l1 > 512) l1 = 512; Copy(message, buf, l1 , char); - buf[l1-1] = '\0'; /* Overwrite \n */ - Perl_croak(aTHX_ "%s", buf); + /* l1-1 to avoid \n */ + Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); } /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ @@ -15492,7 +16453,8 @@ Perl_save_re_context(pTHX) U32 i; for (i = 1; i <= RX_NPARENS(rx); i++) { char digits[TYPE_CHARS(long)]; - const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); + const STRLEN len = my_snprintf(digits, sizeof(digits), + "%lu", (long)i); GV *const *const gvp = (GV**)hv_fetch(PL_defstash, digits, len, 0); @@ -15514,19 +16476,6 @@ S_put_byte(pTHX_ SV *sv, int c) { PERL_ARGS_ASSERT_PUT_BYTE; - /* Our definition of isPRINT() ignores locales, so only bytes that are - not part of UTF-8 are considered printable. I assume that the same - holds for UTF-EBCDIC. - Also, code point 255 is not printable in either (it's E0 in EBCDIC, - which Wikipedia says: - - EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all - ones (binary 1111 1111, hexadecimal FF). It is similar, but not - identical, to the ASCII delete (DEL) or rubout control character. ... - it is typically mapped to hexadecimal code 9F, in order to provide a - unique character mapping in both directions) - - So the old condition can be simplified to !isPRINT(c) */ if (!isPRINT(c)) { switch (c) { case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; @@ -15548,6 +16497,48 @@ S_put_byte(pTHX_ SV *sv, int c) } } +STATIC void +S_put_range(pTHX_ SV *sv, UV start, UV end) +{ + + /* Appends to 'sv' a displayable version of the range of code points from + * 'start' to 'end' */ + + assert(start <= end); + + PERL_ARGS_ASSERT_PUT_RANGE; + + if (end - start < 3) { /* Individual chars in short ranges */ + for (; start <= end; start++) + put_byte(sv, start); + } + else if ( end > 255 + || ! isALPHANUMERIC(start) + || ! isALPHANUMERIC(end) + || isDIGIT(start) != isDIGIT(end) + || isUPPER(start) != isUPPER(end) + || isLOWER(start) != isLOWER(end) + + /* This final test should get optimized out except on EBCDIC + * platforms, where it causes ranges that cross discontinuities + * like i/j to be shown as hex instead of the misleading, + * e.g. H-K (since that range includes more than H, I, J, K). + * */ + || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start)) + { + Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", + start, + (end < 256) ? end : 255); + } + else { /* Here, the ends of the range are both digits, or both uppercase, + or both lowercase; and there's no discontinuity in the range + (which could happen on EBCDIC platforms) */ + put_byte(sv, start); + sv_catpvs(sv, "-"); + put_byte(sv, end); + } +} + STATIC bool S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) { @@ -15556,50 +16547,27 @@ S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) * output anything */ int i; - int rangestart = -1; bool has_output_anything = FALSE; PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; - for (i = 0; i <= 256; i++) { + for (i = 0; i < 256; i++) { if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - int j = i - 1; - if (i <= rangestart + 3) { /* Individual chars in short ranges */ - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - } - else if ( j > 255 - || ! isALPHANUMERIC(rangestart) - || ! isALPHANUMERIC(j) - || isDIGIT(rangestart) != isDIGIT(j) - || isUPPER(rangestart) != isUPPER(j) - || isLOWER(rangestart) != isLOWER(j) - - /* This final test should get optimized out except - * on EBCDIC platforms, where it causes ranges that - * cross discontinuities like i/j to be shown as hex - * instead of the misleading, e.g. H-K (since that - * range includes more than H, I, J, K). */ - || (j - rangestart) - != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart)) - { - Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}", - rangestart, - (j < 256) ? j : 255); - } - else { /* Here, the ends of the range are both digits, or both - uppercase, or both lowercase; and there's no - discontinuity in the range (which could happen on EBCDIC - platforms) */ - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, j); - } - rangestart = -1; + + /* The character at index i should be output. Find the next + * character that should NOT be output */ + int j; + for (j = i + 1; j <= 256; j++) { + if (! BITMAP_TEST((U8 *) bitmap, j)) { + break; + } + } + + /* Everything between them is a single range that should be output + * */ + put_range(sv, i, j - 1); has_output_anything = TRUE; + i = j; } } @@ -15607,23 +16575,26 @@ S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) } #define CLEAR_OPTSTART \ - if (optstart) STMT_START { \ - DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ - optstart=NULL; \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ + " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ } STMT_END -#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); +#define DUMPUNTIL(b,e) \ + CLEAR_OPTSTART; \ + node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); STATIC const regnode * S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, - const regnode *last, const regnode *plast, + const regnode *last, const regnode *plast, SV* sv, I32 indent, U32 depth) { dVAR; U8 op = PSEUDO; /* Arbitrary non-END op. */ const regnode *next; const regnode *optstart= NULL; - + RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; @@ -15633,8 +16604,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, last ? last-start : 0,plast ? plast-start : 0); #endif - - if (plast && plast < last) + + if (plast && plast < last) last= plast; while (PL_regkind[op] != END && (!last || node < last)) { @@ -15654,20 +16625,21 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else CLEAR_OPTSTART; - regprop(r, sv, node); + regprop(r, sv, node, NULL); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); - - if (OP(node) != OPTIMIZED) { + + if (OP(node) != OPTIMIZED) { if (next == NULL) /* Next ptr. */ PerlIO_printf(Perl_debug_log, " (0)"); - else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) + else if (PL_regkind[(U8)op] == BRANCH + && PL_regkind[OP(next)] != BRANCH ) PerlIO_printf(Perl_debug_log, " (FAIL)"); - else + else PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); - (void)PerlIO_putc(Perl_debug_log, '\n'); + (void)PerlIO_putc(Perl_debug_log, '\n'); } - + after_print: if (PL_regkind[(U8)op] == BRANCHJ) { assert(next); @@ -15694,7 +16666,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const reg_trie_data * const trie = (reg_trie_data*)ri->data->data[optrie]; #ifdef DEBUGGING - AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); + AV *const trie_words + = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); #endif const regnode *nextbranch= NULL; I32 word_idx; @@ -15704,21 +16677,25 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, PerlIO_printf(Perl_debug_log, "%*s%s ", (int)(2*(indent+3)), "", - elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, - PL_colors[0], PL_colors[1], - (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELLIPSES | - PERL_PV_PRETTY_LTGT + elem_ptr + ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), + SvCUR(*elem_ptr), 60, + PL_colors[0], PL_colors[1], + (SvUTF8(*elem_ptr) + ? PERL_PV_ESCAPE_UNI + : 0) + | PERL_PV_PRETTY_ELLIPSES + | PERL_PV_PRETTY_LTGT ) - : "???" + : "???" ); if (trie->jump) { U16 dist= trie->jump[word_idx+1]; PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", - (UV)((dist ? this_trie + dist : next) - start)); + (UV)((dist ? this_trie + dist : next) - start)); if (dist) { if (!nextbranch) - nextbranch= this_trie + trie->jump[0]; + nextbranch= this_trie + trie->jump[0]; DUMPUNTIL(this_trie + dist, nextbranch); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) @@ -15745,8 +16722,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else if (PL_regkind[(U8)op] == ANYOF) { /* arglen 1 + class block */ - node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS) - ? ANYOF_CLASS_SKIP : ANYOF_SKIP); + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); node = NEXTOPER(node); } else if (PL_regkind[(U8)op] == EXACT) { @@ -15762,7 +16740,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, indent++; } CLEAR_OPTSTART; -#ifdef DEBUG_DUMPUNTIL +#ifdef DEBUG_DUMPUNTIL PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); #endif return node; diff --git a/src/5019003/orig/regexec.c b/src/5020000/orig/regexec.c similarity index 80% rename from src/5019003/orig/regexec.c rename to src/5020000/orig/regexec.c index 5f142a0..362390b 100644 --- a/src/5019003/orig/regexec.c +++ b/src/5020000/orig/regexec.c @@ -37,16 +37,6 @@ #include "re_top.h" #endif -/* At least one required character in the target string is expressible only in - * UTF-8. */ -static const char* const non_utf8_target_but_utf8_required - = "Can't match, because target string needs to be in UTF-8\n"; - -#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ - goto target; \ -} STMT_END - /* * pregcomp and pregexec -- regsub and regerror are not used in perl * @@ -93,16 +83,28 @@ static const char* const non_utf8_target_but_utf8_required #include "inline_invlist.c" #include "unicode_constants.h" +#ifdef DEBUGGING +/* At least one required character in the target string is expressible only in + * UTF-8. */ +static const char* const non_utf8_target_but_utf8_required + = "Can't match, because target string needs to be in UTF-8\n"; +#endif + +#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ + goto target; \ +} STMT_END + #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #ifndef STATIC #define STATIC static #endif -/* Valid for non-utf8 strings: avoids the reginclass +/* Valid only for non-utf8 strings: avoids the reginclass * call if there are no complications: i.e., if everything matchable is * straight forward in the bitmap */ -#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \ : ANYOF_BITMAP_TEST(p,*(c))) /* @@ -117,6 +119,7 @@ static const char* const non_utf8_target_but_utf8_required ? reghop3((U8*)pos, off, \ (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ : (U8*)(pos + off)) + #define HOPBACKc(pos, off) \ (char*)(reginfo->is_utf8_target \ ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \ @@ -127,6 +130,24 @@ static const char* const non_utf8_target_but_utf8_required #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) +/* lim must be +ve. Returns NULL on overshoot */ +#define HOPMAYBE3(pos,off,lim) \ + (reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ + : ((U8*)pos + off <= lim) \ + ? (U8*)pos + off \ + : NULL) + +/* like HOP3, but limits the result to <= lim even for the non-utf8 case. + * off must be >=0; args should be vars rather than expressions */ +#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ + ? reghop3((U8*)(pos), off, (U8*)(lim)) \ + : (U8*)((pos + off) > lim ? lim : (pos + off))) + +#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ + ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ + : (U8*)(pos + off)) +#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ #define NEXTCHR_IS_EOS (nextchr < 0) @@ -139,11 +160,11 @@ static const char* const non_utf8_target_but_utf8_required SET_nextchr -#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \ +#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \ if (!swash_ptr) { \ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \ swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \ - 1, 0, NULL, &flags); \ + 1, 0, invlist, &flags); \ assert(swash_ptr); \ } \ } STMT_END @@ -152,28 +173,33 @@ static const char* const non_utf8_target_but_utf8_required #ifdef DEBUGGING # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ property_name, \ + invlist, \ utf8_char_in_property) \ - LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \ assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); #else # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ property_name, \ + invlist, \ utf8_char_in_property) \ - LOAD_UTF8_CHARCLASS(swash_ptr, property_name) + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) #endif #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ PL_utf8_swash_ptrs[_CC_WORDCHAR], \ - swash_property_names[_CC_WORDCHAR], \ - GREEK_SMALL_LETTER_IOTA_UTF8) + "", \ + PL_XPosix_ptrs[_CC_WORDCHAR], \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ STMT_START { \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ "_X_regular_begin", \ - GREEK_SMALL_LETTER_IOTA_UTF8); \ + NULL, \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ "_X_extend", \ + NULL, \ COMBINING_GRAVE_ACCENT_UTF8); \ } STMT_END @@ -190,14 +216,14 @@ static const char* const non_utf8_target_but_utf8_required * although it may be done at run time beause of the REF possibility - more * investigation required. -- demerphq */ -#define JUMPABLE(rn) ( \ - OP(rn) == OPEN || \ +#define JUMPABLE(rn) ( \ + OP(rn) == OPEN || \ (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ - OP(rn) == EVAL || \ - OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ - OP(rn) == PLUS || OP(rn) == MINMOD || \ - OP(rn) == KEEPS || \ - (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ + OP(rn) == EVAL || \ + OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ + OP(rn) == PLUS || OP(rn) == MINMOD || \ + OP(rn) == KEEPS || \ + (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ ) #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) @@ -207,13 +233,13 @@ static const char* const non_utf8_target_but_utf8_required /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -223,7 +249,7 @@ static const char* const non_utf8_target_but_utf8_required Search for mandatory following text node; for lookahead, the text must follow but for lookbehind (rn->flags != 0) we skip to the next step. */ -#define FIND_NEXT_IMPT(rn) STMT_START { \ +#define FIND_NEXT_IMPT(rn) STMT_START { \ while (JUMPABLE(rn)) { \ const OPCODE type = OP(rn); \ if (type == SUSPEND || PL_regkind[type] == CURLY) \ @@ -274,8 +300,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) PERL_ARGS_ASSERT_REGCPPUSH; if (paren_elems_to_push < 0) - Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0", - paren_elems_to_push); + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i", + paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS); if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf @@ -296,8 +322,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) ); for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ - SSPUSHINT(rex->offs[p].end); - SSPUSHINT(rex->offs[p].start); + SSPUSHIV(rex->offs[p].end); + SSPUSHIV(rex->offs[p].start); SSPUSHINT(rex->offs[p].start_tmp); DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", @@ -369,10 +395,10 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) ); paren = *maxopenparen_p; for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { - I32 tmps; + SSize_t tmps; rex->offs[paren].start_tmp = SSPOPINT; - rex->offs[paren].start = SSPOPINT; - tmps = SSPOPINT; + rex->offs[paren].start = SSPOPIV; + tmps = SSPOPIV; if (paren <= rex->lastparen) rex->offs[paren].end = tmps; DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, @@ -484,7 +510,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) } else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { return isFOO_lc(classnum, - TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1))); + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); } if (classnum < _FIRST_NON_SWASH_CC) { @@ -492,8 +518,11 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) /* Initialize the swash unless done already */ if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", - swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags); + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) @@ -526,7 +555,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) */ I32 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, - char *strbeg, I32 minend, SV *screamer, U32 nosave) + char *strbeg, SSize_t minend, SV *screamer, U32 nosave) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -543,64 +572,70 @@ Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, } #endif -/* - * Need to implement the following flags for reg_anch: - * - * USE_INTUIT_NOML - Useful to call re_intuit_start() first - * USE_INTUIT_ML - * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer - * INTUIT_AUTORITATIVE_ML - * INTUIT_ONCE_NOML - Intuit can match in one location only. - * INTUIT_ONCE_ML - * - * Another flag for this function: SECOND_TIME (so that float substrs - * with giant delta may be not rechecked). - */ - -/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend. - Otherwise, only SvCUR(sv) is used to get strbeg. */ -/* XXXX Some places assume that there is a fixed substring. - An update may be needed if optimizer marks as "INTUITable" - RExen without fixed substrings. Similarly, it is assumed that - lengths of all the strings are no more than minlen, thus they - cannot come from lookahead. - (Or minlen should take into account lookahead.) - NOTE: Some of this comment is not correct. minlen does now take account - of lookahead/behind. Further research is required. -- demerphq -*/ - -/* A failure to find a constant substring means that there is no need to make - an expensive call to REx engine, thus we celebrate a failure. Similarly, - finding a substring too deep into the string means that fewer calls to - regtry() should be needed. - - REx compiler's optimizer found 4 possible hints: - a) Anchored substring; - b) Fixed substring; - c) Whether we are anchored (beginning-of-line or \G); - d) First node (of those at offset 0) which may distinguish positions; - We use a)b)d) and multiline-part of c), and try to find a position in the - string which does not contradict any of them. - */ - -/* Most of decisions we do here should have been done at compile time. - The nodes of the REx which we used for the search should have been - deleted from the finite automaton. */ - -/* args: - * rx: the regex to match against - * sv: the SV being matched: only used for utf8 flag; the string - * itself is accessed via the pointers below. Note that on - * something like an overloaded SV, SvPOK(sv) may be false - * and the string pointers may point to something unrelated to - * the SV itself. - * strbeg: real beginning of string - * strpos: the point in the string at which to begin matching - * strend: pointer to the byte following the last char of the string - * flags currently unused; set to 0 - * data: currently unused; set to NULL +/* re_intuit_start(): + * + * Based on some optimiser hints, try to find the earliest position in the + * string where the regex could match. + * + * rx: the regex to match against + * sv: the SV being matched: only used for utf8 flag; the string + * itself is accessed via the pointers below. Note that on + * something like an overloaded SV, SvPOK(sv) may be false + * and the string pointers may point to something unrelated to + * the SV itself. + * strbeg: real beginning of string + * strpos: the point in the string at which to begin matching + * strend: pointer to the byte following the last char of the string + * flags currently unused; set to 0 + * data: currently unused; set to NULL + * + * The basic idea of re_intuit_start() is to use some known information + * about the pattern, namely: + * + * a) the longest known anchored substring (i.e. one that's at a + * constant offset from the beginning of the pattern; but not + * necessarily at a fixed offset from the beginning of the + * string); + * b) the longest floating substring (i.e. one that's not at a constant + * offset from the beginning of the pattern); + * c) Whether the pattern is anchored to the string; either + * an absolute anchor: /^../, or anchored to \n: /^.../m, + * or anchored to pos(): /\G/; + * d) A start class: a real or synthetic character class which + * represents which characters are legal at the start of the pattern; + * + * to either quickly reject the match, or to find the earliest position + * within the string at which the pattern might match, thus avoiding + * running the full NFA engine at those earlier locations, only to + * eventually fail and retry further along. + * + * Returns NULL if the pattern can't match, or returns the address within + * the string which is the earliest place the match could occur. + * + * The longest of the anchored and floating substrings is called 'check' + * and is checked first. The other is called 'other' and is checked + * second. The 'other' substring may not be present. For example, + * + * /(abc|xyz)ABC\d{0,3}DEFG/ + * + * will have + * + * check substr (float) = "DEFG", offset 6..9 chars + * other substr (anchored) = "ABC", offset 3..3 chars + * stclass = [ax] + * + * Be aware that during the course of this function, sometimes 'anchored' + * refers to a substring being anchored relative to the start of the + * pattern, and sometimes to the pattern itself being anchored relative to + * the string. For example: + * + * /\dabc/: "abc" is anchored to the pattern; + * /^\dabc/: "abc" is anchored to the pattern and the string; + * /\d+abc/: "abc" is anchored to neither the pattern nor the string; + * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, + * but the pattern is anchored to the string. */ char * @@ -615,34 +650,60 @@ Perl_re_intuit_start(pTHX_ { dVAR; struct regexp *const prog = ReANY(rx); - I32 start_shift = 0; + SSize_t start_shift = prog->check_offset_min; /* Should be nonnegative! */ - I32 end_shift = 0; - char *s; + SSize_t end_shift = 0; + /* current lowest pos in string where the regex can start matching */ + char *rx_origin = strpos; SV *check; - char *t; const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ - I32 ml_anch; - char *other_last = NULL; /* other substr checked before this */ + U8 other_ix = 1 - prog->substrs->check_ix; + bool ml_anch = 0; + char *other_last = strpos;/* latest pos 'other' substr already checked to */ char *check_at = NULL; /* check substr found at this pos */ - char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/ const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; RXi_GET_DECL(prog,progi); regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ regmatch_info *const reginfo = ®info_buf; -#ifdef DEBUGGING - const char * const i_strpos = strpos; -#endif GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_INTUIT_START; PERL_UNUSED_ARG(flags); PERL_UNUSED_ARG(data); - /* CHR_DIST() would be more correct here but it makes things slow. */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: trying to determine minimum start position...\n")); + + /* for now, assume that all substr offsets are positive. If at some point + * in the future someone wants to do clever things with look-behind and + * -ve offsets, they'll need to fix up any code in this function + * which uses these offsets. See the thread beginning + * <20140113145929.GF27210@iabyn.com> + */ + assert(prog->substrs->data[0].min_offset >= 0); + assert(prog->substrs->data[0].max_offset >= 0); + assert(prog->substrs->data[1].min_offset >= 0); + assert(prog->substrs->data[1].max_offset >= 0); + assert(prog->substrs->data[2].min_offset >= 0); + assert(prog->substrs->data[2].max_offset >= 0); + + /* for now, assume that if both present, that the floating substring + * doesn't start before the anchored substring. + * If you break this assumption (e.g. doing better optimisations + * with lookahead/behind), then you'll need to audit the code in this + * function carefully first + */ + assert( + ! ( (prog->anchored_utf8 || prog->anchored_substr) + && (prog->float_utf8 || prog->float_substr)) + || (prog->float_min_offset >= prog->anchored_offset)); + + /* byte rather than char calculation for efficiency. It fails + * to quickly reject some cases that can't match, but will reject + * them later after doing full char arithmetic */ if (prog->minlen > strend - strpos) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "String too short... [re_intuit_start]\n")); + " String too short...\n")); goto fail; } @@ -667,448 +728,511 @@ Perl_re_intuit_start(pTHX_ } check = prog->check_substr; } - if ((prog->extflags & RXf_ANCH) /* Match at beg-of-str or after \n */ - && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */ - { - ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE) - || ( (prog->extflags & RXf_ANCH_BOL) - && !multiline ) ); /* Check after \n? */ - - if (!ml_anch) { - if ( !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ - && (strpos != strbeg)) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); - goto fail; - } - if (prog->check_offset_min == prog->check_offset_max - && !(prog->extflags & RXf_CANY_SEEN) - && ! multiline) /* /m can cause \n's to match that aren't - accounted for in the string max length. - See [perl #115242] */ - { - /* Substring at constant offset from beg-of-str... */ - I32 slen; - - s = HOP3c(strpos, prog->check_offset_min, strend); - - if (SvTAIL(check)) { - slen = SvCUR(check); /* >= 1 */ - if ( strend - s > slen || strend - s < slen - 1 - || (strend - s == slen && strend[-1] != '\n')) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); - goto fail_finish; - } - /* Now should match s[0..slen-2] */ - slen--; - if (slen && (*SvPVX_const(check) != *s - || (slen > 1 - && memNE(SvPVX_const(check), s, slen)))) { - report_neq: - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); - goto fail_finish; - } + /* dump the various substring data */ + DEBUG_OPTIMISE_MORE_r({ + int i; + for (i=0; i<=2; i++) { + SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr + : prog->substrs->data[i].substr); + if (!sv) + continue; + + PerlIO_printf(Perl_debug_log, + " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf + " useful=%"IVdf" utf8=%d [%s]\n", + i, + (IV)prog->substrs->data[i].min_offset, + (IV)prog->substrs->data[i].max_offset, + (IV)prog->substrs->data[i].end_shift, + BmUSEFUL(sv), + utf8_target ? 1 : 0, + SvPEEK(sv)); + } + }); + + if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ + + /* ml_anch: check after \n? + * + * A note about IMPLICIT: on an un-anchored pattern beginning + * with /.*.../, these flags will have been added by the + * compiler: + * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL + * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL + */ + ml_anch = (prog->intflags & PREGf_ANCH_MBOL) + && !(prog->intflags & PREGf_IMPLICIT); + + if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { + /* we are only allowed to match at BOS or \G */ + + /* trivially reject if there's a BOS anchor and we're not at BOS. + * + * Note that we don't try to do a similar quick reject for + * \G, since generally the caller will have calculated strpos + * based on pos() and gofs, so the string is already correctly + * anchored by definition; and handling the exceptions would + * be too fiddly (e.g. REXEC_IGNOREPOS). + */ + if ( strpos != strbeg + && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Not at start...\n")); + goto fail; } - else if (*SvPVX_const(check) != *s - || ((slen = SvCUR(check)) > 1 - && memNE(SvPVX_const(check), s, slen))) - goto report_neq; - check_at = s; - goto success_at_start; - } - } - /* Match is anchored, but substr is not anchored wrt beg-of-str. */ - s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->check_end_shift; - - if (!ml_anch) { - const I32 end = prog->check_offset_max + CHR_SVLEN(check) - - (SvTAIL(check) != 0); - const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; - if (end_shift < eshift) - end_shift = eshift; + /* in the presence of an anchor, the anchored (relative to the + * start of the regex) substr must also be anchored relative + * to strpos. So quickly reject if substr isn't found there. + * This works for \G too, because the caller will already have + * subtracted gofs from pos, and gofs is the offset from the + * \G to the start of the regex. For example, in /.abc\Gdef/, + * where substr="abcdef", pos()=3, gofs=4, offset_min=1: + * caller will have set strpos=pos()-4; we look for the substr + * at position pos()-4+1, which lines up with the "a" */ + + if (prog->check_offset_min == prog->check_offset_max + && !(prog->intflags & PREGf_CANY_SEEN)) + { + /* Substring at constant offset from beg-of-str... */ + SSize_t slen = SvCUR(check); + char *s = HOP3c(strpos, prog->check_offset_min, strend); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Looking for check substr at fixed offset %"IVdf"...\n", + (IV)prog->check_offset_min)); + + if (SvTAIL(check)) { + /* In this case, the regex is anchored at the end too. + * Unless it's a multiline match, the lengths must match + * exactly, give or take a \n. NB: slen >= 1 since + * the last char of check is \n */ + if (!multiline + && ( strend - s > slen + || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n'))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too long...\n")); + goto fail_finish; + } + /* Now should match s[0..slen-2] */ + slen--; + } + if (slen && (*SvPVX_const(check) != *s + || (slen > 1 && memNE(SvPVX_const(check), s, slen)))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String not equal...\n")); + goto fail_finish; + } + + check_at = s; + goto success_at_start; + } } } - else { /* Can match at random position */ - ml_anch = 0; - s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->check_end_shift; - - /* end shift should be non negative here */ - } -#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */ + end_shift = prog->check_end_shift; + +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", (IV)end_shift, RX_PRECOMP(prog)); #endif restart: - /* Find a possible match in the region s..strend by looking for - the "check" substring in the region corrected by start/end_shift. */ + /* This is the (re)entry point of the main loop in this function. + * The goal of this loop is to: + * 1) find the "check" substring in the region rx_origin..strend + * (adjusted by start_shift / end_shift). If not found, reject + * immediately. + * 2) If it exists, look for the "other" substr too if defined; for + * example, if the check substr maps to the anchored substr, then + * check the floating substr, and vice-versa. If not found, go + * back to (1) with rx_origin suitably incremented. + * 3) If we find an rx_origin position that doesn't contradict + * either of the substrings, then check the possible additional + * constraints on rx_origin of /^.../m or a known start class. + * If these fail, then depending on which constraints fail, jump + * back to here, or to various other re-entry points further along + * that skip some of the first steps. + * 4) If we pass all those tests, update the BmUSEFUL() count on the + * substring. If the start position was determined to be at the + * beginning of the string - so, not rejected, but not optimised, + * since we have to run regmatch from position 0 - decrement the + * BmUSEFUL() count. Otherwise increment it. + */ + + + /* first, look for the 'check' substring */ + { - I32 srch_start_shift = start_shift; - I32 srch_end_shift = end_shift; U8* start_point; U8* end_point; - if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { - srch_end_shift -= ((strbeg - s) - srch_start_shift); - srch_start_shift = strbeg - s; - } - DEBUG_OPTIMISE_MORE_r({ - PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n", - (IV)prog->check_offset_min, - (IV)srch_start_shift, - (IV)srch_end_shift, - (IV)prog->check_end_shift); - }); + + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, + " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf + " Start shift: %"IVdf" End shift %"IVdf + " Real end Shift: %"IVdf"\n", + (IV)(rx_origin - strpos), + (IV)prog->check_offset_min, + (IV)start_shift, + (IV)end_shift, + (IV)prog->check_end_shift); + }); - if (prog->extflags & RXf_CANY_SEEN) { - start_point= (U8*)(s + srch_start_shift); - end_point= (U8*)(strend - srch_end_shift); + if (prog->intflags & PREGf_CANY_SEEN) { + start_point= (U8*)(rx_origin + start_shift); + end_point= (U8*)(strend - end_shift); + if (start_point > end_point) + goto fail_finish; } else { - start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend); - end_point= HOP3(strend, -srch_end_shift, strbeg); + end_point = HOP3(strend, -end_shift, strbeg); + start_point = HOPMAYBE3(rx_origin, start_shift, end_point); + if (!start_point) + goto fail_finish; } + + + /* If the regex is absolutely anchored to either the start of the + * string (BOL,SBOL) or to pos() (ANCH_GPOS), then + * check_offset_max represents an upper bound on the string where + * the substr could start. For the ANCH_GPOS case, we assume that + * the caller of intuit will have already set strpos to + * pos()-gofs, so in this case strpos + offset_max will still be + * an upper bound on the substr. + */ + if (!ml_anch + && prog->intflags & PREGf_ANCH + && prog->check_offset_max != SSize_t_MAX) + { + SSize_t len = SvCUR(check) - !!SvTAIL(check); + const char * const anchor = + (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); + + /* do a bytes rather than chars comparison. It's conservative; + * so it skips doing the HOP if the result can't possibly end + * up earlier than the old value of end_point. + */ + if ((char*)end_point - anchor > prog->check_offset_max) { + end_point = HOP3lim((U8*)anchor, + prog->check_offset_max, + end_point -len) + + len; + } + } + DEBUG_OPTIMISE_MORE_r({ - PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n", + PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n", (int)(end_point - start_point), (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), start_point); }); - s = fbm_instr( start_point, end_point, + check_at = fbm_instr( start_point, end_point, check, multiline ? FBMrf_MULTILINE : 0); + + /* Update the count-of-usability, remove useless subpatterns, + unshift s. */ + + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(check), RE_SV_DUMPLEN(check), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s", + (check_at ? "Found" : "Did not find"), + (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) + ? "anchored" : "floating"), + quoted, + RE_SV_TAIL(check), + (check_at ? " at offset " : "...\n") ); + }); + + if (!check_at) + goto fail_finish; + /* Finish the diagnostic message */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) ); + + /* set rx_origin to the minimum position where the regex could start + * matching, given the constraint of the just-matched check substring. + * But don't set it lower than previously. + */ + + if (check_at - rx_origin > prog->check_offset_max) + rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); } - /* Update the count-of-usability, remove useless subpatterns, - unshift s. */ - - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(check), RE_SV_DUMPLEN(check), 30); - PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s", - (s ? "Found" : "Did not find"), - (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) - ? "anchored" : "floating"), - quoted, - RE_SV_TAIL(check), - (s ? " at offset " : "...\n") ); - }); - if (!s) - goto fail_finish; - /* Finish the diagnostic message */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); - /* XXX dmq: first branch is for positive lookbehind... - Our check string is offset from the beginning of the pattern. - So we need to do any stclass tests offset forward from that - point. I think. :-( - */ - - - - check_at=s; - - - /* Got a candidate. Check MBOL anchoring, and the *other* substr. - Start with the other substr. - XXXX no SCREAM optimization yet - and a very coarse implementation - XXXX /ttx+/ results in anchored="ttx", floating="x". floating will - *always* match. Probably should be marked during compile... - Probably it is right to do no SCREAM here... - */ + /* now look for the 'other' substring if defined */ - if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8) - : (prog->float_substr && prog->anchored_substr)) + if (utf8_target ? prog->substrs->data[other_ix].utf8_substr + : prog->substrs->data[other_ix].substr) { /* Take into account the "other" substring. */ - /* XXXX May be hopelessly wrong for UTF... */ - if (!other_last) - other_last = strpos; - if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) { - do_other_anchored: - { - char * const last = HOP3c(s, -start_shift, strbeg); - char *last1, *last2; - char * const saved_s = s; - SV* must; - - t = s - prog->check_offset_max; - if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!utf8_target - || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos)) - && t > strpos))) - NOOP; - else - t = strpos; - t = HOP3c(t, prog->anchored_offset, strend); - if (t < other_last) /* These positions already checked */ - t = other_last; - last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); - if (last < last1) - last1 = last; - /* XXXX It is not documented what units *_offsets are in. - We assume bytes, but this is clearly wrong. - Meaning this code needs to be carefully reviewed for errors. - dmq. - */ - - /* On end-of-str: see comment below. */ - must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr; - if (must == &PL_sv_undef) { - s = (char*)NULL; - DEBUG_r(must = prog->anchored_utf8); /* for debug */ - } - else - s = fbm_instr( - (unsigned char*)t, - HOP3(HOP3(last1, prog->anchored_offset, strend) - + SvCUR(must), -(SvTAIL(must)!=0), strbeg), - must, - multiline ? FBMrf_MULTILINE : 0 - ); - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(must), RE_SV_DUMPLEN(must), 30); - PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s", - (s ? "Found" : "Contradicts"), - quoted, RE_SV_TAIL(must)); - }); - - - if (!s) { - if (last1 >= last2) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", giving up...\n")); - goto fail_finish; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", trying floating at offset %ld...\n", - (long)(HOP3c(saved_s, 1, strend) - i_strpos))); - other_last = HOP3c(last1, prog->anchored_offset+1, strend); - s = HOP3c(last, 1, strend); - goto restart; - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", - (long)(s - i_strpos))); - t = HOP3c(s, -prog->anchored_offset, strbeg); - other_last = HOP3c(s, 1, strend); - s = saved_s; - if (t == strpos) - goto try_at_start; - goto try_at_offset; - } - } - } - else { /* Take into account the floating substring. */ - char *last, *last1; - char * const saved_s = s; - SV* must; - - t = HOP3c(s, -start_shift, strbeg); - last1 = last = - HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg); - if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset) - last = HOP3c(t, prog->float_max_offset, strend); - s = HOP3c(t, prog->float_min_offset, strend); - if (s < other_last) - s = other_last; - /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ - must = utf8_target ? prog->float_utf8 : prog->float_substr; - /* fbm_instr() takes into account exact value of end-of-str - if the check is SvTAIL(ed). Since false positives are OK, - and end-of-str is not later than strend we are OK. */ - if (must == &PL_sv_undef) { - s = (char*)NULL; - DEBUG_r(must = prog->float_utf8); /* for debug message */ - } - else - s = fbm_instr((unsigned char*)s, - (unsigned char*)last + SvCUR(must) - - (SvTAIL(must)!=0), - must, multiline ? FBMrf_MULTILINE : 0); - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(must), RE_SV_DUMPLEN(must), 30); - PerlIO_printf(Perl_debug_log, "%s floating substr %s%s", - (s ? "Found" : "Contradicts"), - quoted, RE_SV_TAIL(must)); - }); - if (!s) { - if (last1 == last) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", giving up...\n")); - goto fail_finish; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", trying anchored starting at offset %ld...\n", - (long)(saved_s + 1 - i_strpos))); - other_last = last; - s = HOP3c(t, 1, strend); - goto restart; - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", - (long)(s - i_strpos))); - other_last = s; /* Fix this later. --Hugo */ - s = saved_s; - if (t == strpos) - goto try_at_start; - goto try_at_offset; - } - } + char *last, *last1; + char *s; + SV* must; + struct reg_substr_datum *other; + + do_other_substr: + other = &prog->substrs->data[other_ix]; + + /* if "other" is anchored: + * we've previously found a floating substr starting at check_at. + * This means that the regex origin must lie somewhere + * between min (rx_origin): HOP3(check_at, -check_offset_max) + * and max: HOP3(check_at, -check_offset_min) + * (except that min will be >= strpos) + * So the fixed substr must lie somewhere between + * HOP3(min, anchored_offset) + * HOP3(max, anchored_offset) + SvCUR(substr) + */ + + /* if "other" is floating + * Calculate last1, the absolute latest point where the + * floating substr could start in the string, ignoring any + * constraints from the earlier fixed match. It is calculated + * as follows: + * + * strend - prog->minlen (in chars) is the absolute latest + * position within the string where the origin of the regex + * could appear. The latest start point for the floating + * substr is float_min_offset(*) on from the start of the + * regex. last1 simply combines thee two offsets. + * + * (*) You might think the latest start point should be + * float_max_offset from the regex origin, and technically + * you'd be correct. However, consider + * /a\d{2,4}bcd\w/ + * Here, float min, max are 3,5 and minlen is 7. + * This can match either + * /a\d\dbcd\w/ + * /a\d\d\dbcd\w/ + * /a\d\d\d\dbcd\w/ + * In the first case, the regex matches minlen chars; in the + * second, minlen+1, in the third, minlen+2. + * In the first case, the floating offset is 3 (which equals + * float_min), in the second, 4, and in the third, 5 (which + * equals float_max). In all cases, the floating string bcd + * can never start more than 4 chars from the end of the + * string, which equals minlen - float_min. As the substring + * starts to match more than float_min from the start of the + * regex, it makes the regex match more than minlen chars, + * and the two cancel each other out. So we can always use + * float_min - minlen, rather than float_max - minlen for the + * latest position in the string. + * + * Note that -minlen + float_min_offset is equivalent (AFAIKT) + * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift + */ + + assert(prog->minlen >= other->min_offset); + last1 = HOP3c(strend, + other->min_offset - prog->minlen, strbeg); + + if (other_ix) {/* i.e. if (other-is-float) */ + /* last is the latest point where the floating substr could + * start, *given* any constraints from the earlier fixed + * match. This constraint is that the floating string starts + * <= float_max_offset chars from the regex origin (rx_origin). + * If this value is less than last1, use it instead. + */ + assert(rx_origin <= last1); + last = + /* this condition handles the offset==infinity case, and + * is a short-cut otherwise. Although it's comparing a + * byte offset to a char length, it does so in a safe way, + * since 1 char always occupies 1 or more bytes, + * so if a string range is (last1 - rx_origin) bytes, + * it will be less than or equal to (last1 - rx_origin) + * chars; meaning it errs towards doing the accurate HOP3 + * rather than just using last1 as a short-cut */ + (last1 - rx_origin) < other->max_offset + ? last1 + : (char*)HOP3lim(rx_origin, other->max_offset, last1); + } + else { + assert(strpos + start_shift <= check_at); + last = HOP4c(check_at, other->min_offset - start_shift, + strbeg, strend); + } + + s = HOP3c(rx_origin, other->min_offset, strend); + if (s < other_last) /* These positions already checked */ + s = other_last; + + must = utf8_target ? other->utf8_substr : other->substr; + assert(SvPOK(must)); + s = fbm_instr( + (unsigned char*)s, + (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0), + must, + multiline ? FBMrf_MULTILINE : 0 + ); + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s", + s ? "Found" : "Contradicts", + other_ix ? "floating" : "anchored", + quoted, RE_SV_TAIL(must)); + }); + + + if (!s) { + /* last1 is latest possible substr location. If we didn't + * find it before there, we never will */ + if (last >= last1) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + + /* try to find the check substr again at a later + * position. Maybe next time we'll find the "other" substr + * in range too */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", trying %s at offset %ld...\n", + (other_ix ? "floating" : "anchored"), + (long)(HOP3c(check_at, 1, strend) - strpos))); + + other_last = HOP3c(last, 1, strend) /* highest failure */; + rx_origin = + other_ix /* i.e. if other-is-float */ + ? HOP3c(rx_origin, 1, strend) + : HOP4c(last, 1 - other->min_offset, strbeg, strend); + goto restart; + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + + if (other_ix) { /* if (other-is-float) */ + /* other_last is set to s, not s+1, since its possible for + * a floating substr to fail first time, then succeed + * second time at the same floating position; e.g.: + * "-AB--AABZ" =~ /\wAB\d*Z/ + * The first time round, anchored and float match at + * "-(AB)--AAB(Z)" then fail on the initial \w character + * class. Second time round, they match at "-AB--A(AB)(Z)". + */ + other_last = s; + } + else { + rx_origin = HOP3c(s, -other->min_offset, strbeg); + other_last = HOP3c(s, 1, strend); + } + } + } + else { + DEBUG_OPTIMISE_MORE_r( + PerlIO_printf(Perl_debug_log, + " Check-only match: offset min:%"IVdf" max:%"IVdf + " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf + " strend-strpos:%"IVdf"\n", + (IV)prog->check_offset_min, + (IV)prog->check_offset_max, + (IV)(check_at-strpos), + (IV)(rx_origin-strpos), + (IV)(rx_origin-check_at), + (IV)(strend-strpos) + ) + ); } - - t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos); - - DEBUG_OPTIMISE_MORE_r( - PerlIO_printf(Perl_debug_log, - "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n", - (IV)prog->check_offset_min, - (IV)prog->check_offset_max, - (IV)(s-strpos), - (IV)(t-strpos), - (IV)(t-s), - (IV)(strend-strpos) - ) - ); + postprocess_substr_matches: - if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!utf8_target - || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos))) - && t > strpos))) - { - /* Fixed substring is found far enough so that the match - cannot start at strpos. */ - try_at_offset: - if (ml_anch && t[-1] != '\n') { - /* Eventually fbm_*() should handle this, but often - anchored_offset is not 0, so this check will not be wasted. */ - /* XXXX In the code below we prefer to look for "^" even in - presence of anchored substrings. And we search even - beyond the found float position. These pessimizations - are historical artefacts only. */ - find_anchor: - while (t < strend - prog->minlen) { - if (*t == '\n') { - if (t < check_at - prog->check_offset_min) { - if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { - /* Since we moved from the found position, - we definitely contradict the found anchored - substr. Due to the above check we do not - contradict "check" substr. - Thus we can arrive here only if check substr - is float. Redo checking for "other"=="fixed". - */ - strpos = t + 1; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); - goto do_other_anchored; - } - /* We don't contradict the found floating substring. */ - /* XXXX Why not check for STCLASS? */ - s = t + 1; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(s - i_strpos))); - goto set_useful; - } - /* Position contradicts check-string */ - /* XXXX probably better to look for check-string - than for "\n", so one should lower the limit for t? */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos))); - other_last = strpos = s = t + 1; - goto restart; - } - t++; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", - PL_colors[0], PL_colors[1])); - goto fail_finish; - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n", - PL_colors[0], PL_colors[1])); - } - s = t; - set_useful: - ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + /* handle the extra constraint of /^.../m if present */ + + if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { + char *s; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for /^/m anchor")); + + /* we have failed the constraint of a \n before rx_origin. + * Find the next \n, if any, even if it's beyond the current + * anchored and/or floating substrings. Whether we should be + * scanning ahead for the next \n or the next substr is debatable. + * On the one hand you'd expect rare substrings to appear less + * often than \n's. On the other hand, searching for \n means + * we're effectively flipping been check_substr and "\n" on each + * iteration as the current "rarest" string candidate, which + * means for example that we'll quickly reject the whole string if + * hasn't got a \n, rather than trying every substr position + * first + */ + + s = HOP3c(strend, - prog->minlen, strpos); + if (s <= rx_origin || + ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Did not find /%s^%s/m...\n", + PL_colors[0], PL_colors[1])); + goto fail_finish; + } + + /* earliest possible origin is 1 char after the \n. + * (since *rx_origin == '\n', it's safe to ++ here rather than + * HOP(rx_origin, 1)) */ + rx_origin++; + + if (prog->substrs->check_ix == 0 /* check is anchored */ + || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) + { + /* Position contradicts check-string; either because + * check was anchored (and thus has no wiggle room), + * or check was float and rx_origin is above the float range */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + goto restart; + } + + /* if we get here, the check substr must have been float, + * is in range, and we may or may not have had an anchored + * "other" substr which still contradicts */ + assert(prog->substrs->check_ix); /* check is float */ + + if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { + /* whoops, the anchored "other" substr exists, so we still + * contradict. On the other hand, the float "check" substr + * didn't contradict, so just retry the anchored "other" + * substr */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos), + (long)(rx_origin - strpos + prog->anchored_offset))); + goto do_other_substr; + } + + /* success: we don't contradict the found floating substring + * (and there's no anchored substr). */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); } else { - /* The found string does not prohibit matching at strpos, - - no optimization of calling REx engine can be performed, - unless it was an MBOL and we are not after MBOL, - or a future STCLASS check will fail this. */ - try_at_start: - /* Even in this situation we may use MBOL flag if strpos is offset - wrt the start of the string. */ - if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n' - /* May be due to an implicit anchor of m{.*foo} */ - && !(prog->intflags & PREGf_IMPLICIT)) - { - t = strpos; - goto find_anchor; - } - DEBUG_EXECUTE_r( if (ml_anch) - PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n", - (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]); - ); - success_at_start: - if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */ - && (utf8_target ? ( - prog->check_utf8 /* Could be deleted already */ - && --BmUSEFUL(prog->check_utf8) < 0 - && (prog->check_utf8 == prog->float_utf8) - ) : ( - prog->check_substr /* Could be deleted already */ - && --BmUSEFUL(prog->check_substr) < 0 - && (prog->check_substr == prog->float_substr) - ))) - { - /* If flags & SOMETHING - do not do it many times on the same match */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); - /* XXX Does the destruction order has to change with utf8_target? */ - SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); - SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); - prog->check_substr = prog->check_utf8 = NULL; /* disable */ - prog->float_substr = prog->float_utf8 = NULL; /* clear */ - check = NULL; /* abort */ - s = strpos; - /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag - see http://bugs.activestate.com/show_bug.cgi?id=87173 */ - if (prog->intflags & PREGf_IMPLICIT) - prog->extflags &= ~RXf_ANCH_MBOL; - /* XXXX This is a remnant of the old implementation. It - looks wasteful, since now INTUIT can use many - other heuristics. */ - prog->extflags &= ~RXf_USE_INTUIT; - /* XXXX What other flags might need to be cleared in this branch? */ - } - else - s = strpos; + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " (multiline anchor test skipped)\n")); } - /* Last resort... */ - /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ - /* trie stclasses are too expensive to use here, we are better off to - leave it to regmatch itself */ + success_at_start: + + + /* if we have a starting character class, then test that extra constraint. + * (trie stclasses are too expensive to use here, we are better off to + * leave it to regmatch itself) */ + if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { - /* minlen == 0 is possible if regstclass is \b or \B, - and the fixed substr is ''$. - Since minlen is already taken into account, s+1 is before strend; - accidentally, minlen >= 1 guaranties no false positives at s + 1 - even for \b or \B. But (minlen? 1 : 0) below assumes that - regstclass does not come from lookahead... */ - /* If regstclass takes bytelength more than 1: If charlength==1, OK. - This leaves EXACTF-ish only, which are dealt with in find_byclass(). */ const U8* const str = (U8*)STRING(progi->regstclass); + /* XXX this value could be pre-computed */ const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT ? (reginfo->is_utf8_pat @@ -1116,106 +1240,190 @@ Perl_re_intuit_start(pTHX_ : STR_LEN(progi->regstclass)) : 1); char * endpos; + char *s; + /* latest pos that a matching float substr constrains rx start to */ + char *rx_max_float = NULL; + + /* if the current rx_origin is anchored, either by satisfying an + * anchored substring constraint, or a /^.../m constraint, then we + * can reject the current origin if the start class isn't found + * at the current position. If we have a float-only match, then + * rx_origin is constrained to a range; so look for the start class + * in that range. if neither, then look for the start class in the + * whole rest of the string */ + + /* XXX DAPM it's not clear what the minlen test is for, and why + * it's not used in the floating case. Nothing in the test suite + * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. + * Here are some old comments, which may or may not be correct: + * + * minlen == 0 is possible if regstclass is \b or \B, + * and the fixed substr is ''$. + * Since minlen is already taken into account, rx_origin+1 is + * before strend; accidentally, minlen >= 1 guaranties no false + * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : + * 0) below assumes that regstclass does not come from lookahead... + * If regstclass takes bytelength more than 1: If charlength==1, OK. + * This leaves EXACTF-ish only, which are dealt with in + * find_byclass(). + */ + if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) - endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend); - else if (prog->float_substr || prog->float_utf8) - endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend); + endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend); + else if (prog->float_substr || prog->float_utf8) { + rx_max_float = HOP3c(check_at, -start_shift, strbeg); + endpos= HOP3c(rx_max_float, cl_l, strend); + } else endpos= strend; - if (checked_upto < s) - checked_upto = s; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for class: start_shift: %"IVdf" check_at: %"IVdf + " rx_origin: %"IVdf" endpos: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), + (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); - t = s; - s = find_byclass(prog, progi->regstclass, checked_upto, endpos, + s = find_byclass(prog, progi->regstclass, rx_origin, endpos, reginfo); - if (s) { - checked_upto = s; - } else { -#ifdef DEBUGGING - const char *what = NULL; -#endif + if (!s) { if (endpos == strend) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Could not match STCLASS...\n") ); + " Could not match STCLASS...\n") ); goto fail; } DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "This position contradicts STCLASS...\n") ); - if ((prog->extflags & RXf_ANCH) && !ml_anch) + " This position contradicts STCLASS...\n") ); + if ((prog->intflags & PREGf_ANCH) && !ml_anch + && !(prog->intflags & PREGf_IMPLICIT)) goto fail; - checked_upto = HOPBACKc(endpos, start_shift); - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + /* Contradict one of substrings */ if (prog->anchored_substr || prog->anchored_utf8) { - if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) { - DEBUG_EXECUTE_r( what = "anchored" ); - hop_and_restart: - s = HOP3c(t, 1, strend); - if (s + start_shift + end_shift > strend) { - /* XXXX Should be taken into account earlier? */ - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Could not match STCLASS...\n") ); - goto fail; - } - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for %s substr starting at offset %ld...\n", - what, (long)(s + start_shift - i_strpos)) ); - goto restart; - } - /* Have both, check_string is floating */ - if (t + start_shift >= check_at) /* Contradicts floating=check */ - goto retry_floating_check; - /* Recheck anchored substring, but not floating... */ - s = check_at; - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for anchored substr starting at offset %ld...\n", - (long)(other_last - i_strpos)) ); - goto do_other_anchored; - } - /* Another way we could have checked stclass at the - current position only: */ - if (ml_anch) { - s = t = t + 1; - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for /%s^%s/m starting at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(t - i_strpos)) ); - goto try_at_offset; - } - if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ - goto fail; - /* Check is floating substring. */ - retry_floating_check: - t = check_at - start_shift; - DEBUG_EXECUTE_r( what = "floating" ); - goto hop_and_restart; + if (prog->substrs->check_ix == 1) { /* check is float */ + /* Have both, check_string is floating */ + assert(rx_origin + start_shift <= check_at); + if (rx_origin + start_shift != check_at) { + /* not at latest position float substr could match: + * Recheck anchored substring, but not floating. + * The condition above is in bytes rather than + * chars for efficiency. It's conservative, in + * that it errs on the side of doing 'goto + * do_other_substr', where a more accurate + * char-based calculation will be done */ + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for anchored substr starting at offset %ld...\n", + (long)(other_last - strpos)) ); + goto do_other_substr; + } + } + } + else { + /* float-only */ + + if (ml_anch) { + /* In the presence of ml_anch, we might be able to + * find another \n without breaking the current float + * constraint. */ + + /* strictly speaking this should be HOP3c(..., 1, ...), + * but since we goto a block of code that's going to + * search for the next \n if any, its safe here */ + rx_origin++; + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for /%s^%s/m starting at offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos)) ); + goto postprocess_substr_matches; + } + + /* strictly speaking this can never be true; but might + * be if we ever allow intuit without substrings */ + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + goto fail; + + rx_origin = rx_max_float; + } + + /* at this point, any matching substrings have been + * contradicted. Start again... */ + + rx_origin = HOP3c(rx_origin, 1, strend); + + /* uses bytes rather than char calculations for efficiency. + * It's conservative: it errs on the side of doing 'goto restart', + * where there is code that does a proper char-based test */ + if (rx_origin + start_shift + end_shift > strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); + goto fail; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for %s substr starting at offset %ld...\n", + (prog->substrs->check_ix ? "floating" : "anchored"), + (long)(rx_origin + start_shift - strpos)) ); + goto restart; } - if (t != s) { + + /* Success !!! */ + + if (rx_origin != s) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "By STCLASS: moving %ld --> %ld\n", - (long)(t - i_strpos), (long)(s - i_strpos)) + " By STCLASS: moving %ld --> %ld\n", + (long)(rx_origin - strpos), (long)(s - strpos)) ); } else { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "Does not contradict STCLASS...\n"); + " Does not contradict STCLASS...\n"); ); } } - giveup: - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n", - PL_colors[4], (check ? "Guessed" : "Giving up"), - PL_colors[5], (long)(s - i_strpos)) ); - return s; + + /* Decide whether using the substrings helped */ + + if (rx_origin != strpos) { + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n")); + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + } + else { + /* The found rx_origin position does not prohibit matching at + * strpos, so calling intuit didn't gain us anything. Decrement + * the BmUSEFUL() count on the check substring, and if we reach + * zero, free it. */ + if (!(prog->intflags & PREGf_NAUGHTY) + && (utf8_target ? ( + prog->check_utf8 /* Could be deleted already */ + && --BmUSEFUL(prog->check_utf8) < 0 + && (prog->check_utf8 == prog->float_utf8) + ) : ( + prog->check_substr /* Could be deleted already */ + && --BmUSEFUL(prog->check_substr) < 0 + && (prog->check_substr == prog->float_substr) + ))) + { + /* If flags & SOMETHING - do not do it many times on the same match */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n")); + /* XXX Does the destruction order has to change with utf8_target? */ + SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); + SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); + prog->check_substr = prog->check_utf8 = NULL; /* disable */ + prog->float_substr = prog->float_utf8 = NULL; /* clear */ + check = NULL; /* abort */ + /* XXXX This is a remnant of the old implementation. It + looks wasteful, since now INTUIT can use many + other heuristics. */ + prog->extflags &= ~RXf_USE_INTUIT; + } + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) ); + + return rx_origin; fail_finish: /* Substring not found */ if (prog->check_substr || prog->check_utf8) /* could be removed already */ @@ -1226,46 +1434,57 @@ Perl_re_intuit_start(pTHX_ return NULL; } + #define DECL_TRIE_TYPE(scan) \ - const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ trie_type = ((scan->flags == EXACT) \ ? (utf8_target ? trie_utf8 : trie_plain) \ - : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + : (scan->flags == EXACTFA) \ + ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ -STMT_START { \ +STMT_START { \ STRLEN skiplen; \ + U8 flags = FOLD_FLAGS_FULL; \ switch (trie_type) { \ + case trie_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALL THROUGH */ \ case trie_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ - uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \ + uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ len = UTF8SKIP(uc); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ + case trie_latin_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALL THROUGH */ \ case trie_latin_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ case trie_utf8: \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ break; \ case trie_plain: \ uvc = (UV)*uc; \ @@ -1336,28 +1555,28 @@ REXEC_FBC_SCAN( \ tmp = 1; \ ) -#define REXEC_FBC_TRYIT \ +#define REXEC_FBC_TRYIT \ if ((reginfo->intuit || regtry(reginfo, &s))) \ goto got_it #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ - if (utf8_target) { \ + if (utf8_target) { \ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ } \ else { \ REXEC_FBC_CLASS_SCAN(CoNd); \ } -#define DUMP_EXEC_POS(li,s,doutf8) \ +#define DUMP_EXEC_POS(li,s,doutf8) \ dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ startpos, doutf8) -#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ +#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ tmp = TEST_NON_UTF8(tmp); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1366,18 +1585,19 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ ); \ -#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ +#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ if (s == reginfo->strbeg) { \ tmp = '\n'; \ } \ else { \ U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \ + tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ + 0, UTF8_ALLOW_DEFAULT); \ } \ tmp = TeSt1_UtF8; \ - LOAD_UTF8_CHARCLASS_ALNUM(); \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! (TeSt2_UtF8)) { \ + if (tmp == ! (TeSt2_UtF8)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1412,9 +1632,9 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ * one, and compare it with the wordness of this one. If they differ, we have * a boundary. At the beginning of the string, pretend that the previous * character was a new-line */ -#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ +#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ if (utf8_target) { \ - UTF8_CODE \ + UTF8_CODE \ } \ else { /* Not utf8 */ \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ @@ -1429,7 +1649,7 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ ); \ } \ - if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ + if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ goto got_it; /* We know what class REx starts with. Try to find this position... */ @@ -1468,11 +1688,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* We know what class it must start with. */ switch (OP(c)) { case ANYOF: - case ANYOF_SYNTHETIC: - case ANYOF_WARN_SUPER: if (utf8_target) { REXEC_FBC_UTF8_CLASS_SCAN( - reginclass(prog, c, (U8*)s, utf8_target)); + reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); } else { REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); @@ -1487,6 +1705,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: if (is_utf8_pat || utf8_target) { utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; @@ -1496,10 +1717,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, folder = foldEQ_latin1; /* /a, except the sharp s one which */ goto do_exactf_non_utf8; /* isn't dealt with by these */ - case EXACTF: + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); if (utf8_target) { - - /* regcomp.c already folded this if pattern is in UTF-8 */ utf8_fold_flags = 0; goto do_exactf_utf8; } @@ -1508,8 +1728,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto do_exactf_non_utf8; case EXACTFL: - if (is_utf8_pat || utf8_target) { - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { + utf8_fold_flags = FOLDEQ_LOCALE; goto do_exactf_utf8; } fold_array = PL_fold_locale; @@ -1522,7 +1742,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } goto do_exactf_utf8; - case EXACTFU_TRICKYFOLD: case EXACTFU: if (is_utf8_pat || utf8_target) { utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -1555,7 +1774,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * characters, and there are only 2 availabe, we know without * trying that it will fail; so don't start a match past the * required minimum number from the far end */ - e = HOP3c(strend, -((I32)ln), s); + e = HOP3c(strend, -((SSize_t)ln), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1601,7 +1820,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * only 2 are left, it's guaranteed to fail, so don't start a * match that would require us to go beyond the end of the string */ - e = HOP3c(strend, -((I32)lnc), s); + e = HOP3c(strend, -((SSize_t)lnc), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1627,15 +1846,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; } case BOUNDL: - RXp_MATCH_TAINTED_on(prog); FBC_BOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case NBOUNDL: - RXp_MATCH_TAINTED_on(prog); FBC_NBOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case BOUND: @@ -1682,7 +1899,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* FALLTHROUGH */ case POSIXL: - RXp_MATCH_TAINTED_on(prog); REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); break; @@ -1746,7 +1962,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, classnum))) || (UTF8_IS_DOWNGRADEABLE_START(*s) && to_complement ^ cBOOL( - _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)), + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), classnum)))) { if (tmp && (reginfo->intuit || regtry(reginfo, &s))) @@ -1804,8 +2021,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = - _core_swash_init("utf8", swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } /* This is a copy of the loop above for swash classes, though using the @@ -2097,9 +2316,9 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, } else #endif { - I32 min = 0; - I32 max = strend - strbeg; - I32 sublen; + SSize_t min = 0; + SSize_t max = strend - strbeg; + SSize_t sublen; if ( (flags & REXEC_COPY_SKIP_POST) && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ @@ -2179,7 +2398,8 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; * from going quadratic */ if (SvPOKp(sv) && SvPVX(sv) == strbeg) - sv_pos_b2u(sv, &(prog->subcoffset)); + prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, + SV_GMAGIC|SV_CONST_RETURN); else prog->subcoffset = utf8_length((U8*)strbeg, (U8*)(strbeg+prog->suboffset)); @@ -2202,7 +2422,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, */ I32 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, - char *strbeg, I32 minend, SV *sv, void *data, U32 flags) + char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -2219,9 +2439,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, char *s; regnode *c; char *startpos; - I32 minlen; /* must match at least this many chars */ - I32 dontbother = 0; /* how many characters not to try at end */ - I32 end_shift = 0; /* Same for the end. */ /* CC */ + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ const bool utf8_target = cBOOL(DO_UTF8(sv)); I32 multiline; RXi_GET_DECL(prog,progi); @@ -2247,7 +2466,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, startpos = stringarg; - if (prog->extflags & RXf_GPOS_SEEN) { + if (prog->intflags & PREGf_GPOS_SEEN) { MAGIC *mg; /* set reginfo->ganch, the position where \G can match */ @@ -2256,15 +2475,16 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, (flags & REXEC_IGNOREPOS) ? stringarg /* use start pos rather than pos() */ : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) - ? strbeg + mg->mg_len /* Defined pos() */ + /* Defined pos(): */ + ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) : strbeg; /* pos() not defined; use start of string */ DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg)); + "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg))); /* in the presence of \G, we may need to start looking earlier in * the string than the suggested start point of stringarg: - * if gofs->prog is set, then that's a known, fixed minimum + * if prog->gofs is set, then that's a known, fixed minimum * offset, such as * /..\G/: gofs = 2 * /ab|c\G/: gofs = 1 @@ -2272,7 +2492,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, * to the start of the string, e.g. /w+\G/ */ - if (prog->extflags & RXf_ANCH_GPOS) { + if (prog->intflags & PREGf_ANCH_GPOS) { startpos = reginfo->ganch - prog->gofs; if (startpos < ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) @@ -2288,7 +2508,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, else startpos -= prog->gofs; } - else if (prog->extflags & RXf_GPOS_FLOAT) + else if (prog->intflags & PREGf_GPOS_FLOAT) startpos = strbeg; } @@ -2328,7 +2548,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, && (s < stringarg)) { /* this should only be possible under \G */ - assert(prog->extflags & RXf_GPOS_SEEN); + assert(prog->intflags & PREGf_GPOS_SEEN); DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); goto phooey; @@ -2378,15 +2598,16 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* see how far we have to get to not match where we matched before */ reginfo->till = stringarg + minend; - if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) { + if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after S_cleanup_regmatch_info_aux has executed (registered by SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies magic belonging to this SV. Not newSVsv, either, as it does not COW. */ + assert(!IS_PADGV(sv)); reginfo->sv = newSV(0); - sv_setsv(reginfo->sv, sv); + SvSetSV_nosteal(reginfo->sv, sv); SAVEFREESV(reginfo->sv); } @@ -2463,11 +2684,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* Simplest case: anchored match need be tried only once. */ /* [unless only anchor is BOL and multiline is set] */ - if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) { + if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { if (s == startpos && regtry(reginfo, &s)) goto got_it; - else if (multiline || (prog->intflags & PREGf_IMPLICIT) - || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */ + else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */ { char *end; @@ -2541,8 +2761,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } /* end search for newline */ } /* end anchored/multiline check string search */ goto phooey; - } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) + } else if (prog->intflags & PREGf_ANCH_GPOS) { + /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ + assert(prog->intflags & PREGf_GPOS_SEEN); /* For anchored \G, the only position it can match from is * (ganch-gofs); we already set startpos to this above; if intuit * moved us on from there, we can't possibly succeed */ @@ -2603,8 +2825,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, || ((prog->float_substr != NULL || prog->float_utf8 != NULL) && prog->float_max_offset < strend - s)) { SV *must; - I32 back_max; - I32 back_min; + SSize_t back_max; + SSize_t back_min; char *last; char *last1; /* Last position checked before */ #ifdef DEBUGGING @@ -2649,7 +2871,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, last = strend; } else { last = HOP3c(strend, /* Cannot start after this */ - -(I32)(CHR_SVLEN(must) + -(SSize_t)(CHR_SVLEN(must) - (SvTAIL(must) != 0) + back_min), strbeg); } if (s > reginfo->strbeg) @@ -2659,10 +2881,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* XXXX check_substr already used to find "s", can optimize if check_substr==must. */ - dontbother = end_shift; + dontbother = 0; strend = HOPc(strend, -dontbother); while ( (s <= last) && - (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), + (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), (unsigned char*)strend, must, multiline ? FBMrf_MULTILINE : 0)) ) { DEBUG_EXECUTE_r( did_match = 1 ); @@ -2715,7 +2937,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, c); + regprop(prog, prop, c, reginfo); { RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), s,strend-s,60); @@ -2858,7 +3080,7 @@ got_it: && (prog->offs[0].start < stringarg - strbeg)) { /* this should only be possible under \G */ - assert(prog->extflags & RXf_GPOS_SEEN); + assert(prog->intflags & PREGf_GPOS_SEEN); DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); goto phooey; @@ -2920,7 +3142,7 @@ phooey: /* Set which rex is pointed to by PL_reg_curpm, handling ref counting. * Do inc before dec, in case old and new rex are the same */ -#define SET_reg_curpm(Re2) \ +#define SET_reg_curpm(Re2) \ if (reginfo->info_aux_eval) { \ (void)ReREFCNT_inc(Re2); \ ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ @@ -2938,7 +3160,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) CHECKPOINT lastcp; REGEXP *const rx = reginfo->prog; regexp *const prog = ReANY(rx); - I32 result; + SSize_t result; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -3185,11 +3407,11 @@ regmatch(), slabs allocated since entry are freed. #define DEBUG_STATE_pp(pp) \ DEBUG_STATE_r({ \ - DUMP_EXEC_POS(locinput, scan, utf8_target); \ + DUMP_EXEC_POS(locinput, scan, utf8_target); \ PerlIO_printf(Perl_debug_log, \ " %*s"pp" %s%s%s%s%s\n", \ depth*2, "", \ - PL_reg_name[st->resume_state], \ + PL_reg_name[st->resume_state], \ ((st==yes_state||st==mark_state) ? "[" : ""), \ ((st==yes_state) ? "Y" : ""), \ ((st==mark_state) ? "M" : ""), \ @@ -3393,6 +3615,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, dVAR; U8 *pat = (U8*)STRING(text_node); + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; if (OP(text_node) == EXACT) { @@ -3412,136 +3635,193 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, c2 = c1 = valid_utf8_to_uvchr(pat, NULL); } } - else /* an EXACTFish node */ - if ((is_utf8_pat - && is_MULTI_CHAR_FOLD_utf8_safe(pat, - pat + STR_LEN(text_node))) - || (!is_utf8_pat - && is_MULTI_CHAR_FOLD_latin1_safe(pat, - pat + STR_LEN(text_node)))) - { - /* Multi-character folds require more context to sort out. Also - * PL_utf8_foldclosures used below doesn't handle them, so have to be - * handled outside this routine */ - use_chrtest_void = TRUE; - } - else { /* an EXACTFish node which doesn't begin with a multi-char fold */ - c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; - if (c1 > 256) { - /* Load the folds hash, if not already done */ - SV** listp; - if (! PL_utf8_foldclosures) { - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - - /* Force loading this by folding an above-Latin1 char */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); - assert(PL_utf8_tofold); /* Verify that worked */ + else { /* an EXACTFish node */ + U8 *pat_end = pat + STR_LEN(text_node); + + /* An EXACTFL node has at least some characters unfolded, because what + * they match is not known until now. So, now is the time to fold + * the first few of them, as many as are needed to determine 'c1' and + * 'c2' later in the routine. If the pattern isn't UTF-8, we only need + * to fold if in a UTF-8 locale, and then only the Sharp S; everything + * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we + * need to fold as many characters as a single character can fold to, + * so that later we can check if the first ones are such a multi-char + * fold. But, in such a pattern only locale-problematic characters + * aren't folded, so we can skip this completely if the first character + * in the node isn't one of the tricky ones */ + if (OP(text_node) == EXACTFL) { + + if (! is_utf8_pat) { + if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) + { + folded[0] = folded[1] = 's'; + pat = folded; + pat_end = folded + 2; } - PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); } + else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { + U8 *s = pat; + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD_LC(*s); + s++; + } + else { + STRLEN len; + _to_utf8_fold_flags(s, + d, + &len, + FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); + d += len; + s += UTF8SKIP(s); + } + } - /* The fold closures data structure is a hash with the keys being - * the UTF-8 of every character that is folded to, like 'k', and - * the values each an array of all code points that fold to its - * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are - * not included */ - if ((! (listp = hv_fetch(PL_utf8_foldclosures, - (char *) pat, - UTF8SKIP(pat), - FALSE)))) - { - /* Not found in the hash, therefore there are no folds - * containing it, so there is only a single character that - * could match */ - c2 = c1; + pat = folded; + pat_end = d; } - else { /* Does participate in folds */ - AV* list = (AV*) *listp; - if (av_len(list) != 1) { + } - /* If there aren't exactly two folds to this, it is outside - * the scope of this function */ - use_chrtest_void = TRUE; - } - else { /* There are two. Get them */ - SV** c_p = av_fetch(list, 0, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) + || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) + { + /* Multi-character folds require more context to sort out. Also + * PL_utf8_foldclosures used below doesn't handle them, so have to + * be handled outside this routine */ + use_chrtest_void = TRUE; + } + else { /* an EXACTFish node which doesn't begin with a multi-char fold */ + c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; + if (c1 > 256) { + /* Load the folds hash, if not already done */ + SV** listp; + if (! PL_utf8_foldclosures) { + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* Force loading this by folding an above-Latin1 char */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ } - c1 = SvUV(*c_p); + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); + } + + /* The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((! (listp = hv_fetch(PL_utf8_foldclosures, + (char *) pat, + UTF8SKIP(pat), + FALSE)))) + { + /* Not found in the hash, therefore there are no folds + * containing it, so there is only a single character that + * could match */ + c2 = c1; + } + else { /* Does participate in folds */ + AV* list = (AV*) *listp; + if (av_tindex(list) != 1) { - c_p = av_fetch(list, 1, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + /* If there aren't exactly two folds to this, it is + * outside the scope of this function */ + use_chrtest_void = TRUE; } - c2 = SvUV(*c_p); - - /* Folds that cross the 255/256 boundary are forbidden if - * EXACTFL, or EXACTFA and one is ASCIII. Since the - * pattern character is above 256, and its only other match - * is below 256, the only legal match will be to itself. - * We have thrown away the original, so have to compute - * which is the one above 255 */ - if ((c1 < 256) != (c2 < 256)) { - if (OP(text_node) == EXACTFL - || (OP(text_node) == EXACTFA - && (isASCII(c1) || isASCII(c2)))) - { - if (c1 < 256) { - c1 = c2; - } - else { - c2 = c1; + else { /* There are two. Get them */ + SV** c_p = av_fetch(list, 0, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c1 = SvUV(*c_p); + + c_p = av_fetch(list, 1, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c2 = SvUV(*c_p); + + /* Folds that cross the 255/256 boundary are forbidden + * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and + * one is ASCIII. Since the pattern character is above + * 256, and its only other match is below 256, the only + * legal match will be to itself. We have thrown away + * the original, so have to compute which is the one + * above 255 */ + if ((c1 < 256) != (c2 < 256)) { + if ((OP(text_node) == EXACTFL + && ! IN_UTF8_CTYPE_LOCALE) + || ((OP(text_node) == EXACTFA + || OP(text_node) == EXACTFA_NO_TRIE) + && (isASCII(c1) || isASCII(c2)))) + { + if (c1 < 256) { + c1 = c2; + } + else { + c2 = c1; + } } } } } } - } - else /* Here, c1 is < 255 */ - if (utf8_target - && HAS_NONLATIN1_FOLD_CLOSURE(c1) - && OP(text_node) != EXACTFL - && (OP(text_node) != EXACTFA || ! isASCII(c1))) - { - /* Here, there could be something above Latin1 in the target which - * folds to this character in the pattern. All such cases except - * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters - * involved in their folds, so are outside the scope of this - * function */ - if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { - c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; - } - else { - use_chrtest_void = TRUE; + else /* Here, c1 is < 255 */ + if (utf8_target + && HAS_NONLATIN1_FOLD_CLOSURE(c1) + && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) + && ((OP(text_node) != EXACTFA + && OP(text_node) != EXACTFA_NO_TRIE) + || ! isASCII(c1))) + { + /* Here, there could be something above Latin1 in the target + * which folds to this character in the pattern. All such + * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more + * than two characters involved in their folds, so are outside + * the scope of this function */ + if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + } + else { + use_chrtest_void = TRUE; + } } - } - else { /* Here nothing above Latin1 can fold to the pattern character */ - switch (OP(text_node)) { + else { /* Here nothing above Latin1 can fold to the pattern + character */ + switch (OP(text_node)) { - case EXACTFL: /* /l rules */ - c2 = PL_fold_locale[c1]; - break; + case EXACTFL: /* /l rules */ + c2 = PL_fold_locale[c1]; + break; - case EXACTF: - if (! utf8_target) { /* /d rules */ - c2 = PL_fold[c1]; + case EXACTF: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + if (! utf8_target) { /* /d rules */ + c2 = PL_fold[c1]; + break; + } + /* FALLTHROUGH */ + /* /u rules for all these. This happens to work for + * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA_NO_TRIE: /* This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: + c2 = PL_fold_latin1[c1]; break; - } - /* FALLTHROUGH */ - /* /u rules for all these. This happens to work for - * EXACTFA as nothing in Latin1 folds to ASCII */ - case EXACTFA: - case EXACTFU_TRICKYFOLD: - case EXACTFU_SS: - case EXACTFU: - c2 = PL_fold_latin1[c1]; - break; - default: - Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); - assert(0); /* NOTREACHED */ + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); + assert(0); /* NOTREACHED */ + } } } } @@ -3583,7 +3863,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } /* returns -1 on failure, $+[0] on success */ -STATIC I32 +STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { #if PERL_VERSION < 9 && !defined(PERL_CORE) @@ -3601,7 +3881,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) regnode *scan; regnode *next; U32 n = 0; /* general value; init to avoid compiler warning */ - I32 ln = 0; /* len or last; init to avoid compiler warning */ + SSize_t ln = 0; /* len or last; init to avoid compiler warning */ char *locinput = startpos; char *pushinput; /* where to continue after a PUSH */ I32 nextchr; /* is always set to UCHARAT(locinput) */ @@ -3629,7 +3909,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) during a successful match */ U32 lastopen = 0; /* last open we saw */ bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; - SV* const oreplsv = GvSV(PL_replgv); + SV* const oreplsv = GvSVn(PL_replgv); /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop * iteration, and are not preserved or restored by state pushes/pops @@ -3659,6 +3939,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) GET_RE_DEBUG_FLAGS_DECL; #endif + /* protect against undef(*^R) */ + SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ multicall_oldcatch = 0; multicall_cv = NULL; @@ -3684,7 +3967,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SV * const prop = sv_newmortal(); regnode *rnext=regnext(scan); DUMP_EXEC_POS( locinput, scan, utf8_target ); - regprop(rex, prop, scan); + regprop(rex, prop, scan, reginfo); PerlIO_printf(Perl_debug_log, "%3"IVdf":%*s%s(%"IVdf")\n", @@ -3706,7 +3989,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); switch (state_num) { - case BOL: /* /^../ */ + case BOL: /* /^../ */ + case SBOL: /* /^../s */ if (locinput == reginfo->strbeg) break; sayNO; @@ -3719,11 +4003,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } sayNO; - case SBOL: /* /^../s */ - if (locinput == reginfo->strbeg) - break; - sayNO; - case GPOS: /* \G */ if (locinput == reginfo->ganch) break; @@ -3741,16 +4020,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO_SILENT; assert(0); /*NOTREACHED*/ - case EOL: /* /..$/ */ - goto seol; - case MEOL: /* /..$/m */ if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; break; + case EOL: /* /..$/ */ + /* FALL THROUGH */ case SEOL: /* /..$/s */ - seol: if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; if (reginfo->strend - locinput > 1) @@ -4048,7 +4325,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (chars) { if (utf8_target) { - uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, uniflags); uc += len; } @@ -4061,7 +4338,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (foldlen) { if (!--chars) break; - uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, uniflags); uscan += len; foldlen -= len; @@ -4152,7 +4429,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) l++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { sayNO; } l += 2; @@ -4175,7 +4453,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) { + if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { sayNO; } s += 2; @@ -4205,27 +4484,31 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const char * s; U32 fold_utf8_flags; - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; - fold_utf8_flags = FOLDEQ_UTF8_LOCALE; + fold_utf8_flags = FOLDEQ_LOCALE; goto do_exactf; case EXACTFU_SS: /* /\x{df}/iu */ - case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */ case EXACTFU: /* /abc/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; goto do_exactf; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: /* /abc/iaa */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; - case EXACTF: /* /abc/i */ + case EXACTF: /* /abc/i This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); folder = foldEQ; fold_array = PL_fold; fold_utf8_flags = 0; @@ -4234,7 +4517,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) { + if (utf8_target + || is_utf8_pat + || state_num == EXACTFU_SS + || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) + { /* Either target or the pattern are utf8, or has the issue where * the fold lengths may differ. */ const char * const l = locinput; @@ -4269,8 +4556,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * have to set the FLAGS fields of these */ case BOUNDL: /* /\b/l */ case NBOUNDL: /* /\B/l */ - RX_MATCH_TAINTED_on(reginfo->prog); - /* FALL THROUGH */ case BOUND: /* /\b/ */ case BOUNDU: /* /\b/u */ case BOUNDA: /* /\b/a */ @@ -4288,7 +4573,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 * const r = reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); - ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); + ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, + 0, uniflags); } if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { ln = isWORDCHAR_uni(ln); @@ -4301,7 +4587,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } else { - ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln)); + ln = isWORDCHAR_LC_uvchr(ln); n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); } } @@ -4350,11 +4636,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; case ANYOF: /* /[abc]/ */ - case ANYOF_WARN_SUPER: if (NEXTCHR_IS_EOS) sayNO; if (utf8_target) { - if (!reginclass(rex, scan, (U8*)locinput, utf8_target)) + if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + utf8_target)) sayNO; locinput += UTF8SKIP(locinput); } @@ -4376,10 +4662,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (NEXTCHR_IS_EOS) sayNO; - /* The locale hasn't influenced the outcome before this, so defer - * tainting until now */ - RX_MATCH_TAINTED_on(reginfo->prog); - /* Use isFOO_lc() for characters within Latin1. (Note that * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else * wouldn't be invariant) */ @@ -4390,7 +4672,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), - (U8) TWO_BYTE_UTF8_TO_UNI(nextchr, + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)))))) { sayNO; @@ -4471,9 +4753,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr, + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)), - FLAGS(scan))))) + FLAGS(scan))))) { sayNO; } @@ -4489,8 +4771,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", - swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } if (! (to_complement ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], @@ -4752,11 +5035,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 *fold_array; UV utf8_fold_flags; - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; type = REFFL; - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + utf8_fold_flags = FOLDEQ_LOCALE; goto do_nref; case NREFFA: /* /\g{name}/iaa */ @@ -4797,10 +5079,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto do_nref_ref_common; case REFFL: /* /\1/il */ - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + utf8_fold_flags = FOLDEQ_LOCALE; goto do_ref; case REFFA: /* /\1/iaa */ @@ -4840,8 +5121,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = reginfo->strbeg + ln; if (type != REF /* REF can do byte comparison */ - && (utf8_target || type == REFFU)) - { /* XXX handle REFFL better */ + && (utf8_target || type == REFFU || type == REFFL)) + { char * limit = reginfo->strend; /* This call case insensitively compares the entire buffer @@ -4915,7 +5196,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) startpoint = rei->program+1; ST.close_paren = 0; } + + /* Save all the positions seen so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + + /* and then jump to the code we share with EVAL */ goto eval_recurse_doit; + assert(0); /* NOTREACHED */ case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ @@ -5027,8 +5315,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) rex->offs[0].end = locinput - reginfo->strbeg; if (reginfo->info_aux_eval->pos_magic) - reginfo->info_aux_eval->pos_magic->mg_len - = locinput - reginfo->strbeg; + MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, + reginfo->sv, reginfo->strbeg, + locinput - reginfo->strbeg); if (sv_yes_mark) { SV *sv_mrk = get_sv("REGMARK", 1); @@ -5066,20 +5355,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) else { /* /(??{}) */ /* if its overloaded, let the regex compiler handle * it; otherwise extract regex, or stringify */ + if (SvGMAGICAL(ret)) + ret = sv_mortalcopy(ret); if (!SvAMAGIC(ret)) { SV *sv = ret; if (SvROK(sv)) sv = SvRV(sv); if (SvTYPE(sv) == SVt_REGEXP) re_sv = (REGEXP*) sv; - else if (SvSMAGICAL(sv)) { - MAGIC *mg = mg_find(sv, PERL_MAGIC_qr); + else if (SvSMAGICAL(ret)) { + MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); if (mg) re_sv = (REGEXP *) mg->mg_obj; } - /* force any magic, undef warnings here */ - if (!re_sv) { + /* force any undef warnings here */ + if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { ret = sv_mortalcopy(ret); (void) SvPV_force_nolen(ret); } @@ -5133,17 +5424,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) pm_flags); if (!(SvFLAGS(ret) - & (SVs_TEMP | SVs_PADTMP | SVf_READONLY - | SVs_GMG))) { + & (SVs_TEMP | SVs_GMG | SVf_ROK)) + && (!SvPADTMP(ret) || SvREADONLY(ret))) { /* This isn't a first class regexp. Instead, it's caching a regexp onto an existing, Perl visible scalar. */ sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); } - /* safe to do now that any $1 etc has been - * interpolated into the new pattern string and - * compiled */ - S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); } SAVEFREESV(re_sv); re = ReANY(re_sv); @@ -5153,6 +5440,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) re->sublen = rex->sublen; re->suboffset = rex->suboffset; re->subcoffset = rex->subcoffset; + re->lastparen = 0; + re->lastcloseparen = 0; rei = RXi_GET(re); DEBUG_EXECUTE_r( debug_start_match(re_sv, utf8_target, locinput, @@ -5160,18 +5449,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) ); startpoint = rei->program + 1; ST.close_paren = 0; /* only used for GOSUB */ + /* Save all the seen positions so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + /* and set maxopenparen to 0, since we are starting a "fresh" match */ + maxopenparen = 0; + /* run the pattern returned from (??{...}) */ - eval_recurse_doit: /* Share code with GOSUB below this line */ - /* run the pattern returned from (??{...}) */ - - /* Save *all* the positions. */ - ST.cp = regcppush(rex, 0, maxopenparen); - REGCP_SET(ST.lastcp); - - re->lastparen = 0; - re->lastcloseparen = 0; - - maxopenparen = 0; + eval_recurse_doit: /* Share code with GOSUB below this line + * At this point we expect the stack context to be + * set up correctly */ /* invalidate the S-L poscache. We're now executing a * different set of WHILEM ops (and their associated @@ -5183,6 +5470,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * pattern again */ reginfo->poscache_maxiter = 0; + /* the new regexp might have a different is_utf8_pat than we do */ is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); ST.prev_rex = rex_sv; @@ -5207,7 +5495,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SET_reg_curpm(rex_sv); rex = ReANY(rex_sv); rexi = RXi_GET(rex); - regcpblow(ST.cp); + { + /* preserve $^R across LEAVE's. See Bug 121070. */ + SV *save_sv= GvSV(PL_replgv); + SvREFCNT_inc(save_sv); + regcpblow(ST.cp); /* LEAVE in disguise */ + sv_setsv(GvSV(PL_replgv), save_sv); + SvREFCNT_dec(save_sv); + } cur_eval = ST.prev_eval; cur_curlyx = ST.prev_curlyx; @@ -5557,10 +5852,10 @@ NULL if (reginfo->poscache_iter-- == 0) { /* initialise cache */ - const I32 size = (reginfo->poscache_maxiter + 7)/8; + const SSize_t size = (reginfo->poscache_maxiter + 7)/8; regmatch_info_aux *const aux = reginfo->info_aux; if (aux->poscache) { - if ((I32)reginfo->poscache_size < size) { + if ((SSize_t)reginfo->poscache_size < size) { Renew(aux->poscache, size, char); reginfo->poscache_size = size; } @@ -5578,7 +5873,7 @@ NULL if (reginfo->poscache_iter < 0) { /* have we already failed at this position? */ - I32 offset, mask; + SSize_t offset, mask; reginfo->poscache_iter = -1; /* stop eventual underflow */ offset = (scan->flags & 0xf) - 1 @@ -5903,7 +6198,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n", + "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", (int)(REPORT_CODE_OFF+(depth*2)),"", valid_utf8_to_uvchr((U8 *) locinput, NULL), valid_utf8_to_uvchr(ST.c1_utf8, NULL), @@ -5917,7 +6212,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n", + "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", (int)(REPORT_CODE_OFF+(depth*2)),"", (int) nextchr, ST.c1, ST.c2) ); @@ -6656,6 +6951,10 @@ yes: * When popping the save stack, all these locals would be undone; * bypass this by setting the outermost saved $^R to the latest * value */ + /* I dont know if this is needed or works properly now. + * see code related to PL_replgv elsewhere in this file. + * Yves + */ if (oreplsv != GvSV(PL_replgv)) sv_setsv(oreplsv, GvSV(PL_replgv)); } @@ -6858,7 +7157,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple loop */ - c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); while (scan < loceol && UCHARAT(scan) == c) { scan++; } @@ -6885,21 +7184,23 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: - utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; case EXACTFL: - RXp_MATCH_TAINTED_on(prog); - utf8_flags = FOLDEQ_UTF8_LOCALE; + utf8_flags = FOLDEQ_LOCALE; goto do_exactf; - case EXACTF: - utf8_flags = 0; - goto do_exactf; + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + utf8_flags = 0; + goto do_exactf; case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -6963,11 +7264,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, break; } case ANYOF: - case ANYOF_WARN_SUPER: if (utf8_target) { while (hardcount < max && scan < loceol - && reginclass(prog, p, (U8*)scan, utf8_target)) + && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; @@ -6985,7 +7285,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* FALLTHROUGH */ case POSIXL: - RXp_MATCH_TAINTED_on(prog); if (! utf8_target) { while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan))) @@ -7085,8 +7384,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan, - *(scan + 1)), + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), classnum)))) { break; @@ -7168,8 +7467,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = _core_swash_init( - "utf8", swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + "utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } while (hardcount < max && scan < loceol @@ -7237,7 +7538,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, GET_RE_DEBUG_FLAGS_DECL; DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, p); + regprop(prog, prop, p, reginfo); PerlIO_printf(Perl_debug_log, "%*s %s can match %"IVdf" times out of %"IVdf"...\n", REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); @@ -7263,14 +7564,18 @@ Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, *altsvp = NULL; } - return newSVsv(core_regclass_swash(prog, node, doinit, listsvp)); + return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL)); } -#endif -STATIC SV * -S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp) +SV * +Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, + const regnode* node, + bool doinit, + SV** listsvp, + SV** only_utf8_locale_ptr) { - /* Returns the swash for the input 'node' in the regex 'prog'. + /* For internal core use only. + * Returns the swash for the input 'node' in the regex 'prog'. * If is 'true', will attempt to create the swash if not already * done. * If is non-null, will return the printable contents of the @@ -7288,9 +7593,10 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit RXi_GET_DECL(prog,progi); const struct reg_data * const data = prog ? progi->data : NULL; - PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH; + PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; - assert(ANYOF_NONBITMAP(node)); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); if (data && data->count) { const U32 n = ARG(node); @@ -7303,25 +7609,38 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit si = *ary; /* ary[0] = the string to initialize the swash with */ - /* Elements 2 and 3 are either both present or both absent. [2] is - * any inversion list generated at compile time; [3] indicates if + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if * that inversion list has any user-defined properties in it. */ - if (av_len(av) >= 2) { - invlist = ary[2]; - if (SvUV(ary[3])) { - swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + if (av_tindex(av) >= 2) { + if (only_utf8_locale_ptr + && ary[2] + && ary[2] != &PL_sv_undef) + { + *only_utf8_locale_ptr = ary[2]; + } + else { + *only_utf8_locale_ptr = NULL; + } + + if (av_tindex(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } + } + else { + invlist = NULL; } - } - else { - invlist = NULL; } /* Element [1] is reserved for the set-up swash. If already there, * return it; if not, create it and store it there */ - if (SvROK(ary[1])) { + if (ary[1] && SvROK(ary[1])) { sw = ary[1]; } - else if (si && doinit) { + else if (doinit && ((si && si != &PL_sv_undef) + || (invlist && invlist != &PL_sv_undef))) { sw = _core_swash_init("utf8", /* the utf8 package */ "", /* nameless */ @@ -7360,12 +7679,14 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit return sw; } +#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ /* - reginclass - determine if a character falls into a character class n is the ANYOF regnode p is the target string + p_end points to one byte beyond the end of the target string utf8_target tells whether p is in UTF-8. Returns true if matched; false otherwise. @@ -7377,7 +7698,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit */ STATIC bool -S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target) +S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) { dVAR; const char flags = ANYOF_FLAGS(n); @@ -7390,7 +7711,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_IS_INVARIANT() works even if not in UTF-8 */ if (! UTF8_IS_INVARIANT(c) && utf8_target) { STRLEN c_len = 0; - c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len, + c = utf8n_to_uvchr(p, p_end - p, &c_len, (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for @@ -7403,21 +7724,19 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const if (c < 256) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; - else if (flags & ANYOF_NON_UTF8_LATIN1_ALL + else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL && ! utf8_target && ! isASCII(c)) { match = TRUE; } - else if (flags & ANYOF_LOCALE) { - RXp_MATCH_TAINTED_on(prog); - - if ((flags & ANYOF_LOC_FOLD) - && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) - { - match = TRUE; - } - else if (ANYOF_CLASS_TEST_ANY_SET(n)) { + else if (flags & ANYOF_LOCALE_FLAGS) { + if (flags & ANYOF_LOC_FOLD) { + if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { + match = TRUE; + } + } + if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { /* The data structure is arranged so bits 0, 2, 4, ... are set * if the class includes the Posix character class given by @@ -7451,8 +7770,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const int count = 0; int to_complement = 0; + while (count < ANYOF_MAX) { - if (ANYOF_CLASS_TEST(n, count) + if (ANYOF_POSIXL_TEST(n, count) && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) { match = TRUE; @@ -7465,27 +7785,22 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } } + /* If the bitmap didn't (or couldn't) match, and something outside the - * bitmap could match, try that. Locale nodes specify completely the - * behavior of code points in the bit map (otherwise, a utf8 target would - * cause them to be treated as Unicode and not locale), except in - * the very unlikely event when this node is a synthetic start class, which - * could be a combination of locale and non-locale nodes. So allow locale - * to match for the synthetic start class, which will give a false - * positive that will be resolved when the match is done again as not part - * of the synthetic start class */ + * bitmap could match, try that. */ if (!match) { - if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { + if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { match = TRUE; /* Everything above 255 matches */ } - else if (ANYOF_NONBITMAP(n) - && ((flags & ANYOF_NONBITMAP_NON_UTF8) - || (utf8_target - && (c >=256 - || (! (flags & ANYOF_LOCALE)) - || OP(n) == ANYOF_SYNTHETIC)))) - { - SV * const sw = core_regclass_swash(prog, n, TRUE, 0); + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && (flags & ANYOF_UTF8)) + || ((flags & ANYOF_LOC_FOLD) + && IN_UTF8_CTYPE_LOCALE + && ARG(n) != ANYOF_NONBITMAP_EMPTY)) + { + SV* only_utf8_locale = NULL; + SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale); if (sw) { U8 * utf8_p; if (utf8_target) { @@ -7502,23 +7817,32 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const /* If we allocated a string above, free it */ if (! utf8_target) Safefree(utf8_p); } + if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { + match = _invlist_contains_cp(only_utf8_locale, c); + } } if (UNICODE_IS_SUPER(c) - && OP(n) == ANYOF_WARN_SUPER + && (flags & ANYOF_WARN_SUPER) && ckWARN_d(WARN_NON_UNICODE)) { Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), - "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c); + "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c); } } +#if ANYOF_INVERT != 1 + /* Depending on compiler optimization cBOOL takes time, so if don't have to + * use it, don't */ +# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, +#endif + /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ - return cBOOL(flags & ANYOF_INVERT) ^ match; + return (flags & ANYOF_INVERT) ^ match; } STATIC U8 * -S_reghop3(U8 *s, I32 off, const U8* lim) +S_reghop3(U8 *s, SSize_t off, const U8* lim) { /* return the position 'off' UTF-8 characters away from 's', forward if * 'off' >= 0, backwards if negative. But don't go outside of position @@ -7547,13 +7871,8 @@ S_reghop3(U8 *s, I32 off, const U8* lim) return s; } -#ifdef XXX_dmq -/* there are a bunch of places where we use two reghop3's that should - be replaced with this routine. but since thats not done yet - we ifdef it out - dmq -*/ STATIC U8 * -S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) +S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) { dVAR; @@ -7577,10 +7896,12 @@ S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) } return s; } -#endif + +/* like reghop3, but returns NULL on overrun, rather than returning last + * char pos */ STATIC U8 * -S_reghopmaybe3(U8* s, I32 off, const U8* lim) +S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) { dVAR; @@ -7648,6 +7969,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) } eval_state->pos_magic = mg; eval_state->pos = mg->mg_len; + eval_state->pos_flags = mg->mg_flags; } else eval_state->pos_magic = NULL; @@ -7664,7 +7986,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) /* this regexp is also owned by the new PL_reg_curpm, which will try to free it. */ av_push(PL_regex_padav, repointer); - PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav); + PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); PL_regex_pad = AvARRAY(PL_regex_padav); } #endif @@ -7722,7 +8044,12 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg) RXp_MATCH_COPIED_on(rex); } if (eval_state->pos_magic) + { eval_state->pos_magic->mg_len = eval_state->pos; + eval_state->pos_magic->mg_flags = + (eval_state->pos_magic->mg_flags & ~MGf_BYTES) + | (eval_state->pos_flags & MGf_BYTES); + } PL_curpm = eval_state->curpm; } diff --git a/src/5019003/regcomp.c b/src/5020000/regcomp.c similarity index 76% rename from src/5019003/regcomp.c rename to src/5020000/regcomp.c index 55189fe..2e6d5e2 100644 --- a/src/5019003/regcomp.c +++ b/src/5020000/regcomp.c @@ -81,7 +81,7 @@ #define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" -extern const struct regexp_engine my_reg_engine; +EXTERN_C const struct regexp_engine my_reg_engine; #else # include "regcomp.h" #endif @@ -91,51 +91,46 @@ extern const struct regexp_engine my_reg_engine; #include "inline_invlist.c" #include "unicode_constants.h" -#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) -#ifdef op -#undef op -#endif /* op */ - -#ifdef MSDOS -# if defined(BUGGY_MSC6) - /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ -# pragma optimize("a",off) - /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ -# pragma optimize("w",on ) -# endif /* BUGGY_MSC6 */ -#endif /* MSDOS */ - #ifndef STATIC #define STATIC static #endif -typedef struct RExC_state_t { +struct RExC_state_t { U32 flags; /* RXf_* are we folding, multilining? */ U32 pm_flags; /* PMf_* stuff from the calling PMOP */ char *precomp; /* uncompiled string. */ REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ - regexp_internal *rxi; /* internal data for regexp object pprivate field */ + regexp_internal *rxi; /* internal data for regexp object + pprivate field */ char *start; /* Start of input for compile */ char *end; /* End of input for compile */ char *parse; /* Input-scan pointer. */ - I32 whilem_seen; /* number of WHILEM in this expr */ + SSize_t whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ - regnode *emit_bound; /* First regnode outside of the allocated space */ + regnode *emit_bound; /* First regnode outside of the + allocated space */ regnode *emit; /* Code-emit pointer; if = &emit_dummy, implies compiling, so don't emit */ - regnode emit_dummy; /* placeholder for emit to point to */ + regnode_ssc emit_dummy; /* placeholder for emit to point to; + large enough for the largest + non-EXACTish node, so can use it as + scratch in pass1 */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ U32 seen; - I32 size; /* Code size. */ - I32 npar; /* Capture buffer count, (OPEN). */ - I32 cpar; /* Capture buffer count, (CLOSE). */ - I32 nestroot; /* root parens we are in - used by accept */ + SSize_t size; /* Code size. */ + I32 npar; /* Capture buffer count, (OPEN) plus + one. ("par" 0 is the whole + pattern)*/ + I32 nestroot; /* root parens we are in - used by + accept */ I32 extralen; I32 seen_zerolen; regnode **open_parens; /* pointers to open parens */ @@ -152,15 +147,20 @@ typedef struct RExC_state_t { regnode **recurse; /* Recurse regops */ I32 recurse_count; /* Number of recurse regops */ + U8 *study_chunk_recursed; /* bitmap of which parens we have moved + through */ + U32 study_chunk_recursed_bytes; /* bytes in bitmap */ I32 in_lookbehind; I32 contains_locale; + I32 contains_i; I32 override_recoding; I32 in_multi_char_class; struct reg_code_block *code_blocks; /* positions of literal (?{}) within pattern */ int num_code_blocks; /* size of code_blocks[] */ int code_index; /* next code_blocks[] slot */ -#if ADD_TO_REGEXEC + SSize_t maxlen; /* mininum possible number of chars in string to match */ +#ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif @@ -173,7 +173,7 @@ typedef struct RExC_state_t { #define RExC_lastnum (pRExC_state->lastnum) #define RExC_paren_name_list (pRExC_state->paren_name_list) #endif -} RExC_state_t; +}; #define RExC_flags (pRExC_state->flags) #define RExC_pm_flags (pRExC_state->pm_flags) @@ -186,7 +186,8 @@ typedef struct RExC_state_t { #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) #ifdef RE_TRACK_PATTERN_OFFSETS -#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the + others */ #endif #define RExC_emit (pRExC_state->emit) #define RExC_emit_dummy (pRExC_state->emit_dummy) @@ -196,6 +197,7 @@ typedef struct RExC_state_t { #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) #define RExC_npar (pRExC_state->npar) #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) @@ -209,8 +211,12 @@ typedef struct RExC_state_t { #define RExC_paren_names (pRExC_state->paren_names) #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) +#define RExC_study_chunk_recursed_bytes \ + (pRExC_state->study_chunk_recursed_bytes) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) #define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_contains_i (pRExC_state->contains_i) #define RExC_override_recoding (pRExC_state->override_recoding) #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) @@ -219,9 +225,6 @@ typedef struct RExC_state_t { #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s, FALSE))) -#ifdef SPSTART -#undef SPSTART /* dratted cpp namespace... */ -#endif /* * Flags to be passed up and down. */ @@ -267,6 +270,11 @@ typedef struct RExC_state_t { #define namedclass_to_classnum(class) ((int) ((class) / 2)) #define classnum_to_namedclass(classnum) ((classnum) * 2) +#define _invlist_union_complement_2nd(a, b, output) \ + _invlist_union_maybe_complement_2nd(a, b, TRUE, output) +#define _invlist_intersection_complement_2nd(a, b, output) \ + _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) + /* About scan_data_t. During optimisation we recurse through the regexp program performing @@ -301,7 +309,7 @@ typedef struct RExC_state_t { - max_offset Only used for floating strings. This is the rightmost point that - the string can appear at. If set to I32 max it indicates that the + the string can appear at. If set to SSize_t_MAX it indicates that the string can occur infinitely far to the right. - minlenp @@ -345,26 +353,26 @@ typedef struct RExC_state_t { typedef struct scan_data_t { /*I32 len_min; unused */ /*I32 len_delta; unused */ - I32 pos_min; - I32 pos_delta; + SSize_t pos_min; + SSize_t pos_delta; SV *last_found; - I32 last_end; /* min value, <0 unless valid. */ - I32 last_start_min; - I32 last_start_max; + SSize_t last_end; /* min value, <0 unless valid. */ + SSize_t last_start_min; + SSize_t last_start_max; SV **longest; /* Either &l_fixed, or &l_float. */ SV *longest_fixed; /* longest fixed string found in pattern */ - I32 offset_fixed; /* offset where it starts */ - I32 *minlen_fixed; /* pointer to the minlen relevant to the string */ + SSize_t offset_fixed; /* offset where it starts */ + SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ I32 lookbehind_fixed; /* is the position of the string modfied by LB */ SV *longest_float; /* longest floating string found in pattern */ - I32 offset_float_min; /* earliest point in string it can appear */ - I32 offset_float_max; /* latest point in string it can appear */ - I32 *minlen_float; /* pointer to the minlen relevant to the string */ - I32 lookbehind_float; /* is the position of the string modified by LB */ + SSize_t offset_float_min; /* earliest point in string it can appear */ + SSize_t offset_float_max; /* latest point in string it can appear */ + SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ + SSize_t lookbehind_float; /* is the pos of the string modified by LB */ I32 flags; I32 whilem_c; - I32 *last_closep; - struct regnode_charclass_class *start_class; + SSize_t *last_closep; + regnode_ssc *start_class; } scan_data_t; /* The below is perhaps overboard, but this allows us to save a test at the @@ -398,13 +406,8 @@ static const scan_data_t zero_scan_data = #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) -#ifdef NO_UNARY_PLUS -# define SF_FIX_SHIFT_EOL (0+2) -# define SF_FL_SHIFT_EOL (0+4) -#else -# define SF_FIX_SHIFT_EOL (+2) -# define SF_FL_SHIFT_EOL (+4) -#endif +#define SF_FIX_SHIFT_EOL (+2) +#define SF_FL_SHIFT_EOL (+4) #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) @@ -429,15 +432,25 @@ static const scan_data_t zero_scan_data = /* The enums for all these are ordered so things work out correctly */ #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) -#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ + == REGEX_DEPENDS_CHARSET) #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) -#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) -#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) -#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) -#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ + >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) +/* For programs that want to be strictly Unicode compatible by dying if any + * attempt is made to match a non-Unicode code point against a Unicode + * property. */ +#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) + #define OOB_NAMEDCLASS -1 /* There is no code point that is out-of-bounds, so this is problematic. But @@ -460,7 +473,12 @@ static const scan_data_t zero_scan_data = #define MARKER1 "<-- HERE" /* marker as it appears in the description */ #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ -#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" +#define REPORT_LOCATION " in regex; marked by " MARKER1 \ + " in m/%"UTF8f MARKER2 "%"UTF8f"/" + +#define REPORT_LOCATION_ARGS(offset) \ + UTF8fARG(UTF, offset, RExC_precomp), \ + UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) /* * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given @@ -482,12 +500,12 @@ static const scan_data_t zero_scan_data = } STMT_END #define FAIL(msg) _FAIL( \ - Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ - msg, (int)len, RExC_precomp, ellipses)) + Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ + msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) #define FAIL2(msg,arg) _FAIL( \ - Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ - arg, (int)len, RExC_precomp, ellipses)) + Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ + arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) /* * Simple_vFAIL -- like FAIL, but marks the current location in the scan @@ -495,7 +513,7 @@ static const scan_data_t zero_scan_data = #define Simple_vFAIL(m) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -512,8 +530,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL2(m,a1) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -531,8 +549,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL3(m, a1, a2) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -549,8 +567,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vFAIL4(m,a1,a2,a3) STMT_START { \ @@ -559,80 +577,90 @@ static const scan_data_t zero_scan_data = Simple_vFAIL4(m, a1, a2, a3); \ } STMT_END +/* A specialized version of vFAIL2 that works with UTF8f */ +#define vFAIL2utf8f(m, a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + /* m is not necessarily a "literal string", in this macro */ #define reg_warn_non_literal_string(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNreg(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN_dep(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNregdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN2reg_d(loc,m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ m REPORT_LOCATION, \ - a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN2reg(loc, m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN3(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN3reg(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ } STMT_END @@ -667,7 +695,8 @@ static const scan_data_t zero_scan_data = MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ __LINE__, (int)(node), (int)(byte))); \ if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", \ + (int)(node)); \ } else { \ RExC_offsets[2*(node)-1] = (byte); \ } \ @@ -683,7 +712,8 @@ static const scan_data_t zero_scan_data = MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ __LINE__, (int)(node), (int)(len))); \ if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \ + Perl_croak(aTHX_ "value of node is %d in Length macro", \ + (int)(node)); \ } else { \ RExC_offsets[2*(node)] = (len); \ } \ @@ -709,6 +739,49 @@ static const scan_data_t zero_scan_data = #define EXPERIMENTAL_INPLACESCAN #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ +#define DEBUG_RExC_seen() \ + DEBUG_OPTIMISE_MORE_r({ \ + PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ + \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ + \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ + \ + if (RExC_seen & REG_GPOS_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ + \ + if (RExC_seen & REG_CANY_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ + \ + if (RExC_seen & REG_RECURSE_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ + \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ + \ + if (RExC_seen & REG_VERBARG_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ + \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ + \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ + \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ + \ + if (RExC_seen & REG_GOSTART_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ + PerlIO_printf(Perl_debug_log,"\n"); \ + }); + #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ PerlIO_printf(Perl_debug_log, \ @@ -748,7 +821,8 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ floating substrings if needed. */ STATIC void -S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf) +S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, + SSize_t *minlenp, int is_inf) { const STRLEN l = CHR_SVLEN(data->last_found); const STRLEN old_l = CHR_SVLEN(*data->longest); @@ -772,9 +846,12 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min data->offset_float_min = l ? data->last_start_min : data->pos_min; data->offset_float_max = (l ? data->last_start_max - : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta)); - if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX) - data->offset_float_max = I32_MAX; + : (data->pos_delta == SSize_t_MAX + ? SSize_t_MAX + : data->pos_min + data->pos_delta)); + if (is_inf + || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) + data->offset_float_max = SSize_t_MAX; if (data->flags & SF_BEFORE_EOL) data->flags |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); @@ -798,299 +875,592 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min DEBUG_STUDYDATA("commit: ",data,0); } -/* These macros set, clear and test whether the synthetic start class ('ssc', - * given by the parameter) matches an empty string (EOS). This uses the - * 'next_off' field in the node, to save a bit in the flags field. The ssc - * stands alone, so there is never a next_off, so this field is otherwise - * unused. The EOS information is used only for compilation, but theoretically - * it could be passed on to the execution code. This could be used to store - * more than one bit of information, but only this one is currently used. */ -#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END -#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END -#define TEST_SSC_EOS(node) cBOOL((node)->next_off) - -/* Can match anything (initialization) */ +/* An SSC is just a regnode_charclass_posix with an extra field: the inversion + * list that describes which code points it matches */ + +STATIC void +S_ssc_anything(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to match an empty string or any code point */ + + PERL_ARGS_ASSERT_SSC_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ + _append_range_to_invlist(ssc->invlist, 0, UV_MAX); + ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ +} + +STATIC int +S_ssc_is_anything(pTHX_ const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' can match the empty string and any code + * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys + * us anything: if the function returns TRUE, 'ssc' hasn't been restricted + * in any way, so there's no point in using it */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { + return FALSE; + } + + /* See if the list consists solely of the range 0 - Infinity */ + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (ret) { + return TRUE; + } + + /* If e.g., both \w and \W are set, matches everything */ + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { + return TRUE; + } + } + } + + return FALSE; +} + STATIC void -S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) { - PERL_ARGS_ASSERT_CL_ANYTHING; + /* Initializes the SSC 'ssc'. This includes setting it to match an empty + * string, any code point, or any posix class under locale */ + + PERL_ARGS_ASSERT_SSC_INIT; - ANYOF_BITMAP_SETALL(cl); - cl->flags = ANYOF_UNICODE_ALL; - SET_SSC_EOS(cl); + Zero(ssc, 1, regnode_ssc); + set_ANYOF_SYNTHETIC(ssc); + ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); + ssc_anything(ssc); /* If any portion of the regex is to operate under locale rules, * initialization includes it. The reason this isn't done for all regexes * is that the optimizer was written under the assumption that locale was * all-or-nothing. Given the complexity and lack of documentation in the - * optimizer, and that there are inadequate test cases for locale, so many + * optimizer, and that there are inadequate test cases for locale, many * parts of it may not work properly, it is safest to avoid locale unless * necessary. */ if (RExC_contains_locale) { - ANYOF_CLASS_SETALL(cl); /* /l uses class */ - cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD; + ANYOF_POSIXL_SETALL(ssc); } else { - ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */ + ANYOF_POSIXL_ZERO(ssc); } } -/* Can match anything (initialization) */ STATIC int -S_cl_is_anything(const struct regnode_charclass_class *cl) +S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, + const regnode_ssc *ssc) { - int value; + /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only + * to the list of code points matched, and locale posix classes; hence does + * not check its flags) */ - PERL_ARGS_ASSERT_CL_IS_ANYTHING; + UV start, end; + bool ret; - for (value = 0; value < ANYOF_MAX; value += 2) - if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) - return 1; - if (!(cl->flags & ANYOF_UNICODE_ALL)) - return 0; - if (!ANYOF_BITMAP_TESTALLSET((const void*)cl)) - return 0; - return 1; + PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (! ret) { + return FALSE; + } + + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { + return FALSE; + } + + return TRUE; } -/* Can match anything (initialization) */ -STATIC void -S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +STATIC SV* +S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, + const regnode_charclass* const node) { - PERL_ARGS_ASSERT_CL_INIT; + /* Returns a mortal inversion list defining which code points are matched + * by 'node', which is of type ANYOF. Handles complementing the result if + * appropriate. If some code points aren't knowable at this time, the + * returned list must, and will, contain every code point that is a + * possibility. */ - Zero(cl, 1, struct regnode_charclass_class); - cl->type = ANYOF; - cl_anything(pRExC_state, cl); - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); + SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; + unsigned int i; + const U32 n = ARG(node); + bool new_node_has_latin1 = FALSE; + + PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; + + /* Look at the data structure created by S_set_ANYOF_arg() */ + if (n != ANYOF_NONBITMAP_EMPTY) { + SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + assert(RExC_rxi->data->what[n] == 's'); + + if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ + invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]))); + } + else if (ary[0] && ary[0] != &PL_sv_undef) { + + /* Here, no compile-time swash, and there are things that won't be + * known until runtime -- we have to assume it could be anything */ + return _add_range_to_invlist(invlist, 0, UV_MAX); + } + else if (ary[3] && ary[3] != &PL_sv_undef) { + + /* Here no compile-time swash, and no run-time only data. Use the + * node's inversion list */ + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; + } + } + + /* An ANYOF node contains a bitmap for the first 256 code points, and an + * inversion list for the others, but if there are code points that should + * match only conditionally on the target string being UTF-8, those are + * placed in the inversion list, and not the bitmap. Since there are + * circumstances under which they could match, they are included in the + * SSC. But if the ANYOF node is to be inverted, we have to exclude them + * here, so that when we invert below, the end result actually does include + * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here + * before we add the unconditionally matched code points */ + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_intersection_complement_2nd(invlist, + PL_UpperLatin1, + &invlist); + } + + /* Add in the points from the bit map */ + for (i = 0; i < 256; i++) { + if (ANYOF_BITMAP_TEST(node, i)) { + invlist = add_cp_to_invlist(invlist, i); + new_node_has_latin1 = TRUE; + } + } + + /* If this can match all upper Latin1 code points, have to add them + * as well */ + if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + _invlist_union(invlist, PL_UpperLatin1, &invlist); + } + + /* Similarly for these */ + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + } + + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_invert(invlist); + } + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + + /* Under /li, any 0-255 could fold to any other 0-255, depending on the + * locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { + _invlist_union_maybe_complement_2nd(invlist, + only_utf8_locale_invlist, + ANYOF_FLAGS(node) & ANYOF_INVERT, + &invlist); + } + + return invlist; } /* These two functions currently do the exact same thing */ -#define cl_init_zero cl_init +#define ssc_init_zero ssc_init + +#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) +#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) + +/* 'AND' a given class with another one. Can create false positives. 'ssc' + * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ -/* 'AND' a given class with another one. Can create false positives. 'cl' - * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if - * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void -S_cl_and(struct regnode_charclass_class *cl, - const struct regnode_charclass_class *and_with) +S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *and_with) { - PERL_ARGS_ASSERT_CL_AND; + /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either + * another SSC or a regular ANYOF class. Can create false positives. */ + + SV* anded_cp_list; + U8 anded_flags; + + PERL_ARGS_ASSERT_SSC_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(and_with)) { + anded_cp_list = ((regnode_ssc *)and_with)->invlist; + anded_flags = ANYOF_FLAGS(and_with); + + /* XXX This is a kludge around what appears to be deficiencies in the + * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, + * there are paths through the optimizer where it doesn't get weeded + * out when it should. And if we don't make some extra provision for + * it like the code just below, it doesn't get added when it should. + * This solution is to add it only when AND'ing, which is here, and + * only when what is being AND'ed is the pristine, original node + * matching anything. Thus it is like adding it to ssc_anything() but + * only when the result is to be AND'ed. Probably the same solution + * could be adopted for the same problem we have with /l matching, + * which is solved differently in S_ssc_init(), and that would lead to + * fewer false positives than that solution has. But if this solution + * creates bugs, the consequences are only that a warning isn't raised + * that should be; while the consequences for having /l bugs is + * incorrect matches */ + if (ssc_is_anything((regnode_ssc *)and_with)) { + anded_flags |= ANYOF_WARN_SUPER; + } + } + else { + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } - assert(PL_regkind[and_with->type] == ANYOF); + ANYOF_FLAGS(ssc) &= anded_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'and-with'; P2, its posix classes. + * 'and_with' may be inverted. When not inverted, we have the situation of + * computing: + * (C1 | P1) & (C2 | P2) + * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) + * <= ((C1 & C2) | P1 | P2) + * Alternatively, the last few steps could be: + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) + * <= (C1 | C2 | (P1 & P2)) + * We favor the second approach if either P1 or P2 is non-empty. This is + * because these components are a barrier to doing optimizations, as what + * they match cannot be known until the moment of matching as they are + * dependent on the current locale, 'AND"ing them likely will reduce or + * eliminate them. + * But we can do better if we know that C1,P1 are in their initial state (a + * frequent occurrence), each matching everything: + * () & (C2 | P2) = C2 | P2 + * Similarly, if C2,P2 are in their initial state (again a frequent + * occurrence), the result is a no-op + * (C1 | P1) & () = C1 | P1 + * + * Inverted, we have + * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) + * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) + * <= (C1 & ~C2) | (P1 & ~P2) + * */ - /* I (khw) am not sure all these restrictions are necessary XXX */ - if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) - && !(ANYOF_CLASS_TEST_ANY_SET(cl)) - && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(and_with->flags & ANYOF_LOC_FOLD) - && !(cl->flags & ANYOF_LOC_FOLD)) { - int i; + if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(and_with)) + { + unsigned int i; - if (and_with->flags & ANYOF_INVERT) - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] &= ~and_with->bitmap[i]; - else - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] &= and_with->bitmap[i]; - } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ - - if (and_with->flags & ANYOF_INVERT) { - - /* Here, the and'ed node is inverted. Get the AND of the flags that - * aren't affected by the inversion. Those that are affected are - * handled individually below */ - U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS; - cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS); - cl->flags |= affected_flags; - - /* We currently don't know how to deal with things that aren't in the - * bitmap, but we know that the intersection is no greater than what - * is already in cl, so let there be false positives that get sorted - * out after the synthetic start class succeeds, and the node is - * matched for real. */ - - /* The inversion of these two flags indicate that the resulting - * intersection doesn't have them */ - if (and_with->flags & ANYOF_UNICODE_ALL) { - cl->flags &= ~ANYOF_UNICODE_ALL; - } - if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) { - cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; - } - } - else { /* and'd node is not inverted */ - U8 outside_bitmap_but_not_utf8; /* Temp variable */ - - if (! ANYOF_NONBITMAP(and_with)) { - - /* Here 'and_with' doesn't match anything outside the bitmap - * (except possibly ANYOF_UNICODE_ALL), which means the - * intersection can't either, except for ANYOF_UNICODE_ALL, in - * which case we don't know what the intersection is, but it's no - * greater than what cl already has, so can just leave it alone, - * with possible false positives */ - if (! (and_with->flags & ANYOF_UNICODE_ALL)) { - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); - cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8; - } - } - else if (! ANYOF_NONBITMAP(cl)) { - - /* Here, 'and_with' does match something outside the bitmap, and cl - * doesn't have a list of things to match outside the bitmap. If - * cl can match all code points above 255, the intersection will - * be those above-255 code points that 'and_with' matches. If cl - * can't match all Unicode code points, it means that it can't - * match anything outside the bitmap (since the 'if' that got us - * into this block tested for that), so we leave the bitmap empty. - */ - if (cl->flags & ANYOF_UNICODE_ALL) { - ARG_SET(cl, ARG(and_with)); + ssc_intersection(ssc, + anded_cp_list, + FALSE /* Has already been inverted */ + ); + + /* If either P1 or P2 is empty, the intersection will be also; can skip + * the loop */ + if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { + ANYOF_POSIXL_ZERO(ssc); + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + + /* Note that the Posix class component P from 'and_with' actually + * looks like: + * P = Pa | Pb | ... | Pn + * where each component is one posix class, such as in [\w\s]. + * Thus + * ~P = ~(Pa | Pb | ... | Pn) + * = ~Pa & ~Pb & ... & ~Pn + * <= ~Pa | ~Pb | ... | ~Pn + * The last is something we can easily calculate, but unfortunately + * is likely to have many false positives. We could do better + * in some (but certainly not all) instances if two classes in + * P have known relationships. For example + * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: + * So + * :lower: & :print: = :lower: + * And similarly for classes that must be disjoint. For example, + * since \s and \w can have no elements in common based on rules in + * the POSIX standard, + * \w & ^\S = nothing + * Unfortunately, some vendor locales do not meet the Posix + * standard, in particular almost everything by Microsoft. + * The loop below just changes e.g., \w into \W and vice versa */ + + regnode_charclass_posixl temp; + int add = 1; /* To calculate the index of the complement */ + + ANYOF_POSIXL_ZERO(&temp); + for (i = 0; i < ANYOF_MAX; i++) { + assert(i % 2 != 0 + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); + + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { + ANYOF_POSIXL_SET(&temp, i + add); + } + add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ + } + ANYOF_POSIXL_AND(&temp, ssc); - /* and_with's ARG may match things that don't require UTF8. - * And now cl's will too, in spite of this being an 'and'. See - * the comments below about the kludge */ - cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8; + } /* else ssc already has no posixes */ + } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC + in its initial state */ + else if (! is_ANYOF_SYNTHETIC(and_with) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) + { + /* But if 'ssc' is in its initial state, the result is just 'and_with'; + * copy it over 'ssc' */ + if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { + if (is_ANYOF_SYNTHETIC(and_with)) { + StructCopy(and_with, ssc, regnode_ssc); + } + else { + ssc->invlist = anded_cp_list; + ANYOF_POSIXL_ZERO(ssc); + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); + } } } - else { - /* Here, both 'and_with' and cl match something outside the - * bitmap. Currently we do not do the intersection, so just match - * whatever cl had at the beginning. */ - } - - - /* Take the intersection of the two sets of flags. However, the - * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a - * kludge around the fact that this flag is not treated like the others - * which are initialized in cl_anything(). The way the optimizer works - * is that the synthetic start class (SSC) is initialized to match - * anything, and then the first time a real node is encountered, its - * values are AND'd with the SSC's with the result being the values of - * the real node. However, there are paths through the optimizer where - * the AND never gets called, so those initialized bits are set - * inappropriately, which is not usually a big deal, as they just cause - * false positives in the SSC, which will just mean a probably - * imperceptible slow down in execution. However this bit has a - * higher false positive consequence in that it can cause utf8.pm, - * utf8_heavy.pl ... to be loaded when not necessary, which is a much - * bigger slowdown and also causes significant extra memory to be used. - * In order to prevent this, the code now takes a different tack. The - * bit isn't set unless some part of the regular expression needs it, - * but once set it won't get cleared. This means that these extra - * modules won't get loaded unless there was some path through the - * pattern that would have required them anyway, and so any false - * positives that occur by not ANDing them out when they could be - * aren't as severe as they would be if we treated this bit like all - * the others */ - outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags) - & ANYOF_NONBITMAP_NON_UTF8; - cl->flags &= and_with->flags; - cl->flags |= outside_bitmap_but_not_utf8; + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + { + /* One or the other of P1, P2 is non-empty. */ + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } + ssc_union(ssc, anded_cp_list, FALSE); + } + else { /* P1 = P2 = empty */ + ssc_intersection(ssc, anded_cp_list, FALSE); + } } } -/* 'OR' a given class with another one. Can create false positives. 'cl' - * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if - * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void -S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) +S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *or_with) { - PERL_ARGS_ASSERT_CL_OR; - - if (or_with->flags & ANYOF_INVERT) { - - /* Here, the or'd node is to be inverted. This means we take the - * complement of everything not in the bitmap, but currently we don't - * know what that is, so give up and match anything */ - if (ANYOF_NONBITMAP(or_with)) { - cl_anything(pRExC_state, cl); - } - /* We do not use - * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) - * <= (B1 | !B2) | (CL1 | !CL2) - * which is wasteful if CL2 is small, but we ignore CL2: - * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 - * XXXX Can we handle case-fold? Unclear: - * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = - * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) - */ - else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(or_with->flags & ANYOF_LOC_FOLD) - && !(cl->flags & ANYOF_LOC_FOLD) ) { - int i; + /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either + * another SSC or a regular ANYOF class. Can create false positives if + * 'or_with' is to be inverted. */ - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] |= ~or_with->bitmap[i]; - } /* XXXX: logic is complicated otherwise */ - else { - cl_anything(pRExC_state, cl); - } + SV* ored_cp_list; + U8 ored_flags; - /* And, we can just take the union of the flags that aren't affected - * by the inversion */ - cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS; - - /* For the remaining flags: - ANYOF_UNICODE_ALL and inverted means to not match anything above - 255, which means that the union with cl should just be - what cl has in it, so can ignore this flag - ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord - is 127-255 to match them, but then invert that, so the - union with cl should just be what cl has in it, so can - ignore this flag - */ - } else { /* 'or_with' is not inverted */ - /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ - if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && (!(or_with->flags & ANYOF_LOC_FOLD) - || (cl->flags & ANYOF_LOC_FOLD)) ) { - int i; + PERL_ARGS_ASSERT_SSC_OR; - /* OR char bitmap and class bitmap separately */ - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] |= or_with->bitmap[i]; - if (or_with->flags & ANYOF_CLASS) { - ANYOF_CLASS_OR(or_with, cl); - } - } - else { /* XXXX: logic is complicated, leave it along for a moment. */ - cl_anything(pRExC_state, cl); - } + assert(is_ANYOF_SYNTHETIC(ssc)); - if (ANYOF_NONBITMAP(or_with)) { + /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(or_with)) { + ored_cp_list = ((regnode_ssc*) or_with)->invlist; + ored_flags = ANYOF_FLAGS(or_with); + } + else { + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); + ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + } - /* Use the added node's outside-the-bit-map match if there isn't a - * conflict. If there is a conflict (both nodes match something - * outside the bitmap, but what they match outside is not the same - * pointer, and hence not easily compared until XXX we extend - * inversion lists this far), give up and allow the start class to - * match everything outside the bitmap. If that stuff is all above - * 255, can just set UNICODE_ALL, otherwise caould be anything. */ - if (! ANYOF_NONBITMAP(cl)) { - ARG_SET(cl, ARG(or_with)); - } - else if (ARG(cl) != ARG(or_with)) { + ANYOF_FLAGS(ssc) |= ored_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'or-with'; P2, its posix classes. + * 'or_with' may be inverted. When not inverted, we have the simple + * situation of computing: + * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) + * If P1|P2 yields a situation with both a class and its complement are + * set, like having both \w and \W, this matches all code points, and we + * can delete these from the P component of the ssc going forward. XXX We + * might be able to delete all the P components, but I (khw) am not certain + * about this, and it is better to be safe. + * + * Inverted, we have + * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) + * <= (C1 | P1) | ~C2 + * <= (C1 | ~C2) | P1 + * (which results in actually simpler code than the non-inverted case) + * */ - if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) { - cl_anything(pRExC_state, cl); - } - else { - cl->flags |= ANYOF_UNICODE_ALL; + if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(or_with)) + { + /* We ignore P2, leaving P1 going forward */ + } /* else Not inverted */ + else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + unsigned int i; + for (i = 0; i < ANYOF_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) + { + ssc_match_all_cp(ssc); + ANYOF_POSIXL_CLEAR(ssc, i); + ANYOF_POSIXL_CLEAR(ssc, i+1); } } } + } + + ssc_union(ssc, + ored_cp_list, + FALSE /* Already has been inverted */ + ); +} + +PERL_STATIC_INLINE void +S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_UNION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_union_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_intersection(pTHX_ regnode_ssc *ssc, + SV* const invlist, + const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_INTERSECTION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_intersection_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) +{ + PERL_ARGS_ASSERT_SSC_ADD_RANGE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); +} - /* Take the union */ - cl->flags |= or_with->flags; +PERL_STATIC_INLINE void +S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) +{ + /* AND just the single code point 'cp' into the SSC 'ssc' */ + + SV* cp_list = _new_invlist(2); + + PERL_ARGS_ASSERT_SSC_CP_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + cp_list = add_cp_to_invlist(cp_list, cp); + ssc_intersection(ssc, cp_list, + FALSE /* Not inverted */ + ); + SvREFCNT_dec_NN(cp_list); +} + +PERL_STATIC_INLINE void +S_ssc_clear_locale(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to not match any locale things */ + + PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ANYOF_POSIXL_ZERO(ssc); + ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; +} + +STATIC void +S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* The inversion list in the SSC is marked mortal; now we need a more + * permanent copy, which is stored the same way that is done in a regular + * ANYOF node, with the first 256 code points in a bit map */ + + SV* invlist = invlist_clone(ssc->invlist); + + PERL_ARGS_ASSERT_SSC_FINALIZE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* The code in this file assumes that all but these flags aren't relevant + * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the + * time we reach here */ + assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + + populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); + + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); + + /* Make sure is clone-safe */ + ssc->invlist = NULL; + + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; } + + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); } #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) -#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 ) +#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ + ? (TRIE_LIST_CUR( idx ) - 1) \ + : 0 ) #ifdef DEBUGGING @@ -1153,10 +1523,12 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, for( state = 1 ; state < trie->statecount ; state++ ) { const U32 base = trie->states[ state ].trans.base; - PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); + PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", + (int)depth * 2 + 2,"", (UV)state); if ( trie->states[ state ].wordnum ) { - PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum ); + PerlIO_printf( Perl_debug_log, " W%4X", + trie->states[ state ].wordnum ); } else { PerlIO_printf( Perl_debug_log, "%6s", "" ); } @@ -1168,19 +1540,23 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, while( ( base + ofs < trie->uniquecharcount ) || ( base + ofs - trie->uniquecharcount < trie->lasttrans - && trie->trans[ base + ofs - trie->uniquecharcount ].check != state)) + && trie->trans[ base + ofs - trie->uniquecharcount ].check + != state)) ofs++; PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { - if ( ( base + ofs >= trie->uniquecharcount ) && - ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && - trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + if ( ( base + ofs >= trie->uniquecharcount ) + && ( base + ofs - trie->uniquecharcount + < trie->lasttrans ) + && trie->trans[ base + ofs + - trie->uniquecharcount ].check == state ) { PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, - (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next ); + (UV)trie->trans[ base + ofs + - trie->uniquecharcount ].next ); } else { PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); } @@ -1191,7 +1567,8 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, } PerlIO_printf( Perl_debug_log, "\n" ); } - PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, ""); + PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", + (int)depth*2, ""); for (word=1; word <= trie->wordcount; word++) { PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", (int)word, (int)(trie->wordinfo[word].prev), @@ -1235,14 +1612,16 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, ); } for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { - SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); + SV ** const tmp = av_fetch( revcharmap, + TRIE_LIST_ITEM(state,charid).forid, 0); if ( tmp ) { PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), + colwidth, PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + | PERL_PV_ESCAPE_FIRSTCHAR ) , TRIE_LIST_ITEM(state,charid).forid, (UV)TRIE_LIST_ITEM(state,charid).newstate @@ -1318,9 +1697,11 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); } if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { - PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); + PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", + (UV)trie->trans[ state ].check ); } else { - PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, + PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", + (UV)trie->trans[ state ].check, trie->states[ TRIE_NODENUM( state ) ].wordnum ); } } @@ -1439,7 +1820,7 @@ and would end up looking like: 8: EXACT (10) 10: END(0) - d = uvuni_to_utf8_flags(d, uv, 0); + d = uvchr_to_utf8_flags(d, uv, 0); is the recommended Unicode-aware way of saying @@ -1451,7 +1832,7 @@ is the recommended Unicode-aware way of saying if (UTF) { \ SV *zlopp = newSV(7); /* XXX: optimize me */ \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \ + unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ SvPOK_on(zlopp); \ SvUTF8_on(zlopp); \ @@ -1462,31 +1843,28 @@ is the recommended Unicode-aware way of saying } \ } STMT_END -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - /* if it is UTF then it is either already folded, or does not need folding */ \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \ - } \ - else if (folder == PL_fold_latin1) { \ - /* if we use this folder we have to obey unicode rules on latin-1 data */ \ - if ( foldlen > 0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - scan += len; \ - len = 0; \ - } else { \ - len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ - skiplen = UNISKIP(uvc); \ - foldlen -= skiplen; \ - scan = foldbuf + skiplen; \ - } \ - } else { \ - /* raw data, will be folded later if needed */ \ - uvc = (U32)*uc; \ - len = 1; \ - } \ +/* This gets the next character from the input, folding it if not already + * folded. */ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need \ + * folding */ \ + uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* This folder implies Unicode rules, which in the range expressible \ + * by not UTF is the lower case, with the two exceptions, one of \ + * which should have been taken care of before calling this */ \ + assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ + uvc = toLOWER_L1(*uc); \ + if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ + len = 1; \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ } STMT_END @@ -1529,7 +1907,8 @@ is the recommended Unicode-aware way of saying \ if ( noper_next < tail ) { \ if (!trie->jump) \ - trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ trie->jump[curword] = (U16)(noper_next - convert); \ if (!jumper) \ jumper = noper_next; \ @@ -1564,7 +1943,9 @@ is the recommended Unicode-aware way of saying #define MADE_EXACT_TRIE 4 STATIC I32 -S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) +S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + regnode *first, regnode *last, regnode *tail, + U32 word_count, U32 flags, U32 depth) { dVAR; /* first pass, loop through and scan words */ @@ -1572,7 +1953,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs HV *widecharmap = NULL; AV *revcharmap = newAV(); regnode *cur; - const U32 uniflags = UTF8_ALLOW_DEFAULT; STRLEN len = 0; UV uvc = 0; U16 curword = 0; @@ -1585,13 +1965,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 * folder = NULL; #ifdef DEBUGGING - const U32 data_slot = add_data( pRExC_state, 4, "tuuu" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu")); AV *trie_words = NULL; /* along with revcharmap, this only used during construction but both are * useful during debugging so we store them in the struct when debugging. */ #else - const U32 data_slot = add_data( pRExC_state, 2, "tu" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); STRLEN trie_charcount=0; #endif SV *re_trie_maxbuff; @@ -1606,10 +1986,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs case EXACT: break; case EXACTFA: case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; - case EXACTFL: folder = PL_fold_locale; break; default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); } @@ -1633,12 +2011,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); } DEBUG_TRIE_COMPILE_r({ - PerlIO_printf( Perl_debug_log, - "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", - (int)depth * 2 + 2, "", - REG_NODE_NUM(startbranch),REG_NODE_NUM(first), - REG_NODE_NUM(last), REG_NODE_NUM(tail), - (int)depth); + PerlIO_printf( Perl_debug_log, + "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + (int)depth * 2 + 2, "", + REG_NODE_NUM(startbranch),REG_NODE_NUM(first), + REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); }); /* Find the node we are going to overwrite */ @@ -1658,9 +2035,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs have unique chars. We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store Unicode characters. We use the - native representation of the character value as the key and IV's for the - coded index. + (trie->charmap) and we use a an HV* to store Unicode characters. We use + the native representation of the character value as the key and IV's for + the coded index. *TODO* If we keep track of how many times each character is used we can remap the columns so that the table compression later on is more @@ -1677,13 +2054,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regnode *noper = NEXTOPER( cur ); const U8 *uc = (U8*)STRING( noper ); const U8 *e = uc + STR_LEN( noper ); - STRLEN foldlen = 0; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - STRLEN skiplen = 0; - const U8 *scan = (U8*)NULL; + int foldlen = 0; U32 wordlen = 0; /* required init */ - STRLEN chars = 0; - bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ + STRLEN minchars = 0; + STRLEN maxchars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the + bitmap?*/ if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -1703,13 +2079,77 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regardless of encoding */ if (OP( noper ) == EXACTFU_SS) { /* false positives are ok, so just set this */ - TRIE_BITMAP_SET(trie,0xDF); + TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); } } - for ( ; uc < e ; uc += len ) { + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; - chars++; + + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; + } + else { + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because for + * non-UTF, the latin1_safe macro is smart enough to account + * for all the unfolded characters, and because for UTF, the + * string will already have been folded earlier in the + * compilation process */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); + } + } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } + } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ if ( uvc < 256 ) { if ( folder ) { U8 folded= folder[ (U8) uvc ]; @@ -1733,13 +2173,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !UTF ) { /* store first byte of utf8 representation of variant codepoints */ - if (! UNI_IS_INVARIANT(uvc)) { + if (! UVCHR_IS_INVARIANT(uvc)) { TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); } } set_bit = 0; /* We've done our bit :-) */ } } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + SV** svpp; if ( !widecharmap ) widecharmap = newHV(); @@ -1754,30 +2202,22 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs TRIE_STORE_REVCHAR(uvc); } } - } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ if( cur == first ) { - trie->minlen = chars; - trie->maxlen = chars; - } else if (chars < trie->minlen) { - trie->minlen = chars; - } else if (chars > trie->maxlen) { - trie->maxlen = chars; - } - if (OP( noper ) == EXACTFU_SS) { - /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/ - if (trie->minlen > 1) - trie->minlen= 1; + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; } - if (OP( noper ) == EXACTFU_TRICKYFOLD) { - /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" - * - We assume that any such sequence might match a 2 byte string */ - if (trie->minlen > 2 ) - trie->minlen= 2; - } - } /* end first pass */ DEBUG_TRIE_COMPILE_r( - PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + PerlIO_printf( Perl_debug_log, + "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", (int)depth * 2 + 2,"", ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, @@ -1809,7 +2249,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); prev_states[1] = 0; - if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) { + if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) + > SvIV(re_trie_maxbuff) ) + { /* Second Pass -- Array Of Lists Representation @@ -1841,11 +2283,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ U16 charid = 0; /* sanity init */ - U8 *scan = (U8*)NULL; /* sanity init */ - STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - STRLEN skiplen = 0; if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -1864,14 +2302,18 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( uvc < 256 ) { charid = trie->charmap[ uvc ]; } else { - SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + SV** const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); if ( !svpp ) { charid = 0; } else { charid=(U16)SvIV( *svpp ); } } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ if ( charid ) { U16 check; @@ -1881,8 +2323,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !trie->states[ state ].trans.list ) { TRIE_LIST_NEW( state ); } - for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { - if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { + for ( check = 1; + check <= TRIE_LIST_USED( state ); + check++ ) + { + if ( TRIE_LIST_ITEM( state, check ).forid + == charid ) + { newstate = TRIE_LIST_ITEM( state, check ).newstate; break; } @@ -1952,7 +2399,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->trans, transcount * sizeof(reg_trie_trans) ); - Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans ); + Zero( trie->trans + (transcount / 2), + transcount / 2, + reg_trie_trans ); } base = trie->uniquecharcount + tp - minid; if ( maxid == minid ) { @@ -1960,22 +2409,27 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs for ( ; zp < tp ; zp++ ) { if ( ! trie->trans[ zp ].next ) { base = trie->uniquecharcount + zp - minid; - trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ zp ].next = TRIE_LIST_ITEM( state, + 1).newstate; trie->trans[ zp ].check = state; set = 1; break; } } if ( !set ) { - trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ tp ].next = TRIE_LIST_ITEM( state, + 1).newstate; trie->trans[ tp ].check = state; tp++; zp = tp; } } else { for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { - const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid; - trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate; + const U32 tid = base + - trie->uniquecharcount + + TRIE_LIST_ITEM( state, idx ).forid; + trie->trans[ tid ].next = TRIE_LIST_ITEM( state, + idx ).newstate; trie->trans[ tid ].check = state; } tp += ( maxid - minid + 1 ); @@ -1995,26 +2449,26 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* Second Pass -- Flat Table Representation. - we dont use the 0 slot of either trans[] or states[] so we add 1 to each. - We know that we will need Charcount+1 trans at most to store the data - (one row per char at worst case) So we preallocate both structures - assuming worst case. + we dont use the 0 slot of either trans[] or states[] so we add 1 to + each. We know that we will need Charcount+1 trans at most to store + the data (one row per char at worst case) So we preallocate both + structures assuming worst case. We then construct the trie using only the .next slots of the entry structs. - We use the .check field of the first entry of the node temporarily to - make compression both faster and easier by keeping track of how many non - zero fields are in the node. + We use the .check field of the first entry of the node temporarily + to make compression both faster and easier by keeping track of how + many non zero fields are in the node. Since trans are numbered from 1 any 0 pointer in the table is a FAIL transition. - There are two terms at use here: state as a TRIE_NODEIDX() which is a - number representing the first entry of the node, and state as a - TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and - TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there - are 2 entrys per node. eg: + There are two terms at use here: state as a TRIE_NODEIDX() which is + a number representing the first entry of the node, and state as a + TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) + and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) + if there are 2 entrys per node. eg: A B A B 1. 2 4 1. 3 7 @@ -2022,9 +2476,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs 3. 0 0 5. 0 0 4. 0 0 7. 0 0 - The table is internally in the right hand, idx form. However as we also - have to deal with the states array which is indexed by nodenum we have to - use TRIE_NODENUM() to convert. + The table is internally in the right hand, idx form. However as we + also have to deal with the states array which is indexed by nodenum + we have to use TRIE_NODENUM() to convert. */ DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, @@ -2051,12 +2505,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U16 charid = 0; /* sanity init */ U32 accept_state = 0; /* sanity init */ - U8 *scan = (U8*)NULL; /* sanity init */ - STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ - STRLEN skiplen = 0; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -2075,7 +2525,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( uvc < 256 ) { charid = trie->charmap[ uvc ]; } else { - SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + SV* const * const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); charid = svpp ? (U16)SvIV(*svpp) : 0; } if ( charid ) { @@ -2091,7 +2544,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } else { Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ } } accept_state = TRIE_NODENUM( state ); @@ -2178,7 +2632,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U32 used = trie->trans[ stateidx ].check; trie->trans[ stateidx ].check = 0; - for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) { + for ( charid = 0; + used && charid < trie->uniquecharcount; + charid++ ) + { if ( flag || trie->trans[ stateidx + charid ].next ) { if ( trie->trans[ stateidx + charid ].next ) { if (o_used == 1) { @@ -2187,8 +2644,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs break; } } - trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ; - trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->states[ state ].trans.base + = zp + + trie->uniquecharcount + - charid ; + trie->trans[ zp ].next + = SAFE_TRIE_NODENUM( trie->trans[ stateidx + + charid ].next ); trie->trans[ zp ].check = state; if ( ++zp > pos ) pos = zp; break; @@ -2197,9 +2659,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( !flag ) { flag = 1; - trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ; + trie->states[ state ].trans.base + = pos + trie->uniquecharcount - charid ; } - trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->trans[ pos ].next + = SAFE_TRIE_NODENUM( + trie->trans[ stateidx + charid ].next ); trie->trans[ pos ].check = state; pos++; } @@ -2210,19 +2675,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->states, laststate * sizeof(reg_trie_state) ); DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf( Perl_debug_log, - "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", - (int)depth * 2 + 2,"", - (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), - (IV)next_alloc, - (IV)pos, - ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); + PerlIO_printf( Perl_debug_log, + "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + (int)depth * 2 + 2,"", + (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + + 1 ), + (IV)next_alloc, + (IV)pos, + ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); ); } /* end table compress */ } DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", + PerlIO_printf(Perl_debug_log, + "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", (int)depth * 2 + 2, "", (UV)trie->statecount, (UV)trie->lasttrans) @@ -2273,7 +2740,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs }); } DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", + PerlIO_printf(Perl_debug_log, + "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", (int)depth * 2 + 2, "", (UV)mjd_offset, (UV)mjd_nodelen) ); @@ -2517,22 +2985,27 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STATIC void S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) { -/* The Trie is constructed and compressed now so we can build a fail array if it's needed +/* The Trie is constructed and compressed now so we can build a fail array if + * it's needed - This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the - "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and + 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, + Ullman 1985/88 ISBN 0-201-10088-6 - We find the fail state for each state in the trie, this state is the longest proper - suffix of the current state's 'word' that is also a proper prefix of another word in our - trie. State 1 represents the word '' and is thus the default fail state. This allows - the DFA not to have to restart after its tried and failed a word at a given point, it - simply continues as though it had been matching the other word in the first place. + We find the fail state for each state in the trie, this state is the longest + proper suffix of the current state's 'word' that is also a proper prefix of + another word in our trie. State 1 represents the word '' and is thus the + default fail state. This allows the DFA not to have to restart after its + tried and failed a word at a given point, it simply continues as though it + had been matching the other word in the first place. Consider 'abcdgu'=~/abcdefg|cdgu/ - When we get to 'd' we are still matching the first word, we would encounter 'g' which would - fail, which would bring us to the state representing 'd' in the second word where we would - try 'g' and succeed, proceeding to match 'cdgu'. + When we get to 'd' we are still matching the first word, we would encounter + 'g' which would fail, which would bring us to the state representing 'd' in + the second word where we would try 'g' and succeed, proceeding to match + 'cdgu'. */ /* add a fail transition */ const U32 trie_offset = ARG(source); @@ -2547,7 +3020,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode U32 base = trie->states[ 1 ].trans.base; U32 *fail; reg_ac_data *aho; - const U32 data_slot = add_data( pRExC_state, 1, "T" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; @@ -2620,26 +3093,15 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode PerlIO_printf(Perl_debug_log, "\n"); }); Safefree(q); - /*RExC_seen |= REG_SEEN_TRIEDFA;*/ + /*RExC_seen |= REG_TRIEDFA_SEEN;*/ } -/* - * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2. - * These need to be revisited when a newer toolchain becomes available. - */ -#if defined(__sparc64__) && defined(__GNUC__) -# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) -# undef SPARC64_GCC_WORKAROUND -# define SPARC64_GCC_WORKAROUND 1 -# endif -#endif - #define DEBUG_PEEP(str,scan,depth) \ DEBUG_OPTIMISE_r({if (scan){ \ SV * const mysv=sv_newmortal(); \ regnode *Next = regnext(scan); \ - regprop(RExC_rx, mysv, scan); \ + regprop(RExC_rx, mysv, scan, NULL); \ PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \ (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ Next ? (REG_NODE_NUM(Next)) : 0 ); \ @@ -2658,49 +3120,58 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * * If a node is to match under /i (folded), the number of characters it matches * can be different than its character length if it contains a multi-character - * fold. *min_subtract is set to the total delta of the input nodes. + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. * - * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF - * and contains LATIN SMALL LETTER SHARP S + * And *unfolded_multi_char is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when whether the fold is valid or + * not won't be known until runtime; namely for EXACTF nodes that contain LATIN + * SMALL LETTER SHARP S, as only if the target string being matched against + * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose + * folding rules depend on the locale in force at runtime. (Multi-char folds + * whose components are all above the Latin1 range are not run-time locale + * dependent, and have already been folded by the time this function is + * called.) * * This is as good a place as any to discuss the design of handling these * multi-character fold sequences. It's been wrong in Perl for a very long * time. There are three code points in Unicode whose multi-character folds * were long ago discovered to mess things up. The previous designs for * dealing with these involved assigning a special node for them. This - * approach doesn't work, as evidenced by this example: + * approach doesn't always work, as evidenced by this example: * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches - * Both these fold to "sss", but if the pattern is parsed to create a node that + * Both sides fold to "sss", but if the pattern is parsed to create a node that * would match just the \xDF, it won't be able to handle the case where a * successful match would have to cross the node's boundary. The new approach * that hopefully generally solves the problem generates an EXACTFU_SS node - * that is "sss". + * that is "sss" in this case. * * It turns out that there are problems with all multi-character folds, and not - * just these three. Now the code is general, for all such cases, but the - * three still have some special handling. The approach taken is: + * just these three. Now the code is general, for all such cases. The + * approach taken is: * 1) This routine examines each EXACTFish node that could contain multi- - * character fold sequences. It returns in *min_subtract how much to - * subtract from the the actual length of the string to get a real minimum - * match length; it is 0 if there are no multi-char folds. This delta is - * used by the caller to adjust the min length of the match, and the delta - * between min and max, so that the optimizer doesn't reject these - * possibilities based on size constraints. - * 2) Certain of these sequences require special handling by the trie code, - * so, if found, this code changes the joined node type to special ops: - * EXACTFU_TRICKYFOLD and EXACTFU_SS. - * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS + * character folded sequences. Since a single character can fold into + * such a sequence, the minimum match length for this node is less than + * the number of characters in the node. This routine returns in + * *min_subtract how many characters to subtract from the the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. + * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS * is used for an EXACTFU node that contains at least one "ss" sequence in * it. For non-UTF-8 patterns and strings, this is the only case where * there is a possible fold length change. That means that a regular * EXACTFU node without UTF-8 involvement doesn't have to concern itself * with length changes, and so can be processed faster. regexec.c takes * advantage of this. Generally, an EXACTFish node that is in UTF-8 is - * pre-folded by regcomp.c. This saves effort in regex matching. - * However, the pre-folding isn't done for non-UTF8 patterns because the - * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things - * down by forcing the pattern into UTF8 unless necessary. Also what - * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold + * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't + * known until runtime). This saves effort in regex matching. However, + * the pre-folding isn't done for non-UTF8 patterns because the fold of + * the MICRO SIGN requires UTF-8, and we don't want to slow things down by + * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, + * again, EXACTFL) nodes fold to isn't known until runtime. The fold * possibilities for the non-UTF8 patterns are quite simple, except for * the sharp s. All the ones that don't involve a UTF-8 target string are * members of a fold-pair, and arrays are set up for all of them so that @@ -2708,45 +3179,63 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * this file makes sure that in EXACTFU nodes, the sharp s gets folded to * 'ss', even if the pattern isn't UTF-8. This avoids the issues * described in the next item. - * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the - * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a - * UTF-8 pattern.) An assumption that the optimizer part of regexec.c - * (probably unwittingly, in Perl_regexec_flags()) makes is that a - * character in the pattern corresponds to at most a single character in - * the target string. (And I do mean character, and not byte here, unlike - * other parts of the documentation that have never been updated to - * account for multibyte Unicode.) sharp s in EXACTF nodes can match the - * two character string 'ss'; in EXACTFA nodes it can match - * "\x{17F}\x{17F}". These violate the assumption, and they are the only - * instances where it is violated. I'm reluctant to try to change the - * assumption, as the code involved is impenetrable to me (khw), so - * instead the code here punts. This routine examines (when the pattern - * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a - * boolean indicating whether or not the node contains a sharp s. When it - * is true, the caller sets a flag that later causes the optimizer in this - * file to not set values for the floating and fixed string lengths, and - * thus avoids the optimizer code in regexec.c that makes the invalid + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes + * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL + * nodes, violate the assumption, and they are the only instances where it + * is violated. I'm reluctant to try to change the assumption, as the + * code involved is impenetrable to me (khw), so instead the code here + * punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid * assumption. Thus, there is no optimization based on string lengths for - * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s. - * (The reason the assumption is wrong only in these two cases is that all - * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all - * other folds to their expanded versions. We can't prefold sharp s to - * 'ss' in EXACTF nodes because we don't know at compile time if it - * actually matches 'ss' or not. It will match iff the target string is - * in UTF-8, unlike the EXACTFU nodes, where it always matches; and - * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8 - * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem; - * but in a non-UTF8 pattern, folding it to that above-Latin1 string would - * require the pattern to be forced into UTF-8, the overhead of which we - * want to avoid.) - */ - -#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFA where it never does. In an EXACTFA node in + * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) + * + * Similarly, the code that generates tries doesn't currently handle + * not-already-folded multi-char folds, and it looks like a pain to change + * that. Therefore, trie generation of EXACTFA nodes with the sharp s + * doesn't work. Instead, such an EXACTFA is turned into a new regnode, + * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people + * using /iaa matching will be doing so almost entirely with ASCII + * strings, so this should rarely be encountered in practice */ + +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) { +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, + UV *min_subtract, bool *unfolded_multi_char, + U32 flags,regnode *val, U32 depth) +{ /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; @@ -2792,8 +3281,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b const unsigned int oldl = STR_LEN(scan); regnode * const nnext = regnext(n); - /* XXX I (khw) kind of doubt that this works on platforms where - * U8_MAX is above 255 because of lots of other assumptions */ + /* XXX I (khw) kind of doubt that this works on platforms (should + * Perl ever run on one) where U8_MAX is above 255 because of lots + * of other assumptions */ /* Don't join if the sum can't fit into a single node */ if (oldl + STR_LEN(n) > U8_MAX) break; @@ -2828,7 +3318,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b } *min_subtract = 0; - *has_exactf_sharp_s = FALSE; + *unfolded_multi_char = FALSE; /* Here, all the adjacent mergeable EXACTish nodes have been merged. We * can now analyze for sequences of problematic code points. (Prior to @@ -2836,15 +3326,68 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ if (OP(scan) != EXACT) { - const U8 * const s0 = (U8*) STRING(scan); - const U8 * s = s0; - const U8 * const s_end = s0 + STR_LEN(scan); + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ /* One pass is made over the node's string looking for all the - * possibilities. to avoid some tests in the loop, there are two main + * possibilities. To avoid some tests in the loop, there are two main * cases, for UTF-8 patterns (which can't have EXACTF nodes) and * non-UTF-8 */ if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *unfolded_multi_char = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ /* Examine the string for a multi-character fold sequence. UTF-8 * patterns have all characters pre-folded by the time this code is @@ -2852,60 +3395,32 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b while (s < s_end - 1) /* Can stop 1 before the end, as minimum length sequence we are looking for is 2 */ { - int count = 0; + int count = 0; /* How many characters in a multi-char fold */ int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); if (! len) { /* Not a multi-char fold: get next char */ s += UTF8SKIP(s); continue; } - /* Nodes with 'ss' require special handling, except for EXACTFL - * and EXACTFA for which there is no multi-char fold to this */ + /* Nodes with 'ss' require special handling, except for + * EXACTFA-ish for which there is no multi-char fold to this */ if (len == 2 && *s == 's' && *(s+1) == 's' - && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + && OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE) { count = 2; - OP(scan) = EXACTFU_SS; - s += 2; - } - else if (len == 6 /* len is the same in both ASCII and EBCDIC - for these */ - && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8 - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8, - 6) - || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8 - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8, - 6))) - { - count = 3; - - /* These two folds require special handling by trie's, so - * change the node type to indicate this. If EXACTFA and - * EXACTFL were ever to be handled by trie's, this would - * have to be changed. If this node has already been - * changed to EXACTFU_SS in this loop, leave it as is. (I - * (khw) think it doesn't matter in regexec.c for UTF - * patterns, but no need to change it */ - if (OP(scan) == EXACTFU) { - OP(scan) = EXACTFU_TRICKYFOLD; + if (OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; } - s += 6; + s += 2; } else { /* Here is a generic multi-char fold. */ - const U8* multi_end = s + len; - - /* Count how many characters in it. In the case of /l and - * /aa, no folds which contain ASCII code points are - * allowed, so check for those, and skip if found. (In - * EXACTFL, no folds are allowed to any Latin1 code point, - * not just ASCII. But there aren't any of these - * currently, nor ever likely, so don't take the time to - * test for them. The code that generates the - * is_MULTI_foo() macros croaks should one actually get put - * into Unicode .) */ - if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + U8* multi_end = s + len; + + /* Count how many characters in it. In the case of /aa, no + * folds which contain ASCII code points are allowed, so + * check for those, and skip if found. */ + if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { count = utf8_length(s, multi_end); s = multi_end; } @@ -2925,44 +3440,61 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b /* The delta is how long the sequence is minus 1 (1 is how long * the character that folds to the sequence is) */ - *min_subtract += count - 1; + total_count_delta += count - 1; next_iteration: ; } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); } else if (OP(scan) == EXACTFA) { /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char * fold to the ASCII range (and there are no existing ones in the * upper latin1 range). But, as outlined in the comments preceding - * this function, we need to flag any occurrences of the sharp s */ + * this function, we need to flag any occurrences of the sharp s. + * This character forbids trie formation (because of added + * complexity) */ while (s < s_end) { if (*s == LATIN_SMALL_LETTER_SHARP_S) { - *has_exactf_sharp_s = TRUE; + OP(scan) = EXACTFA_NO_TRIE; + *unfolded_multi_char = TRUE; break; } s++; continue; } } - else if (OP(scan) != EXACTFL) { + else { - /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the - * multi-char folds that are all Latin1. (This code knows that - * there are no current multi-char folds possible with EXACTFL, - * relying on fold_grind.t to catch any errors if the very unlikely - * event happens that some get added in future Unicode versions.) - * As explained in the comments preceding this function, we look - * also for the sharp s in EXACTF nodes; it can be in the final - * position. Otherwise we can stop looking 1 byte earlier because - * have to find at least two characters for a multi-fold */ - const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; + /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; while (s < upper) { int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); if (! len) { /* Not a multi-char fold. */ - if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) { - *has_exactf_sharp_s = TRUE; + *unfolded_multi_char = TRUE; } s++; continue; @@ -2977,8 +3509,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * changed so that a sharp s in the string can match this * ss in the pattern, but they remain EXACTF nodes, as they * won't match this unless the target string is is UTF-8, - * which we don't know until runtime */ - if (OP(scan) != EXACTF) { + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { OP(scan) = EXACTFU_SS; } } @@ -3012,7 +3545,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b #define INIT_AND_WITHP \ assert(!and_withp); \ - Newx(and_withp,1,struct regnode_charclass_class); \ + Newx(and_withp,1, regnode_ssc); \ SAVEFREEPV(and_withp) /* this is a chain of data about sub patterns we are processing that @@ -3023,20 +3556,19 @@ typedef struct scan_frame { regnode *last; /* last node to process in this frame */ regnode *next; /* next node to process when last is reached */ struct scan_frame *prev; /*previous frame*/ + U32 prev_recursed_depth; I32 stop; /* what stopparen do we use */ } scan_frame; -#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) - -STATIC I32 +STATIC SSize_t S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, - I32 *minlenp, I32 *deltap, + SSize_t *minlenp, SSize_t *deltap, regnode *last, scan_data_t *data, I32 stopparen, - U8* recursed, - struct regnode_charclass_class *and_withp, + U32 recursed_depth, + regnode_ssc *and_withp, U32 flags, U32 depth) /* scanp: Start here (read-write). */ /* deltap: Write maxlen-minlen here. */ @@ -3047,17 +3579,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { dVAR; - I32 min = 0; /* There must be at least this number of characters to match */ + /* There must be at least this number of characters to match */ + SSize_t min = 0; I32 pars = 0, code; regnode *scan = *scanp, *next; - I32 delta = 0; + SSize_t delta = 0; int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); int is_inf_internal = 0; /* The studied chunk is infinite */ I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; scan_data_t data_fake; SV *re_trie_maxbuff = NULL; regnode *first_non_open = scan; - I32 stopmin = I32_MAX; + SSize_t stopmin = SSize_t_MAX; scan_frame *frame = NULL; GET_RE_DEBUG_FLAGS_DECL; @@ -3066,7 +3599,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #ifdef DEBUGGING StructCopy(&zero_scan_data, &data_fake, scan_data_t); #endif - if ( depth == 0 ) { while (first_non_open && OP(first_non_open) == OPEN) first_non_open=regnext(first_non_open); @@ -3078,15 +3610,40 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, UV min_subtract = 0; /* How mmany chars to subtract from the minimum node length to get a real minimum (because the folded version may be shorter) */ - bool has_exactf_sharp_s = FALSE; + bool unfolded_multi_char = FALSE; /* Peephole optimizer: */ - DEBUG_STUDYDATA("Peep:", data,depth); - DEBUG_PEEP("Peep",scan,depth); + DEBUG_OPTIMISE_MORE_r( + { + PerlIO_printf(Perl_debug_log, + "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ", + ((int) depth*2), "", (long)stopparen, + (unsigned long)depth, (unsigned long)recursed_depth); + if (recursed_depth) { + U32 i; + U32 j; + for ( j = 0 ; j < recursed_depth ; j++ ) { + PerlIO_printf(Perl_debug_log,"["); + for ( i = 0 ; i < (U32)RExC_npar ; i++ ) + PerlIO_printf(Perl_debug_log,"%d", + PAREN_TEST(RExC_study_chunk_recursed + + (j * RExC_study_chunk_recursed_bytes), i) + ? 1 : 0 + ); + PerlIO_printf(Perl_debug_log,"]"); + } + } + PerlIO_printf(Perl_debug_log,"\n"); + } + ); + DEBUG_STUDYDATA("Peep:", data, depth); + DEBUG_PEEP("Peep", scan, depth); - /* Its not clear to khw or hv why this is done here, and not in the - * clauses that deal with EXACT nodes. khw's guess is that it's - * because of a previous design */ - JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); + + /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ + * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled + * by a different invocation of reg() -- Yves + */ + JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ @@ -3119,24 +3676,29 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, || OP(scan) == IFTHEN) { next = regnext(scan); code = OP(scan); - /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ + /* demq: the op(next)==code check is to see if we have + * "branch-branch" AFAICT */ if (OP(next) == code || code == IFTHEN) { - /* NOTE - There is similar code to this block below for handling - TRIE nodes on a re-study. If you change stuff here check there - too. */ - I32 max1 = 0, min1 = I32_MAX, num = 0; - struct regnode_charclass_class accum; + /* NOTE - There is similar code to this block below for + * handling TRIE nodes on a re-study. If you change stuff here + * check there too. */ + SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; + regnode_ssc accum; regnode * const startbranch=scan; - if (flags & SCF_DO_SUBSTR) - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ - if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); while (OP(scan) == code) { - I32 deltanext, minnext, f = 0, fake; - struct regnode_charclass_class this_class; + SSize_t deltanext, minnext, fake; + I32 f = 0; + regnode_ssc this_class; num++; data_fake.flags = 0; @@ -3153,7 +3715,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (code != BRANCH) scan = NEXTOPER(scan); if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } @@ -3161,14 +3723,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, f |= SCF_WHILEM_VISITED_POS; /* we suppose the run is continuous, last=next...*/ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - next, &data_fake, - stopparen, recursed, NULL, f,depth+1); + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f,depth+1); if (min1 > minnext) min1 = minnext; - if (deltanext == I32_MAX) { + if (deltanext == SSize_t_MAX) { is_inf = is_inf_internal = 1; - max1 = I32_MAX; + max1 = SSize_t_MAX; } else if (max1 < minnext + deltanext) max1 = minnext + deltanext; scan = next; @@ -3187,63 +3749,64 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); } if (code == IFTHEN && num < 2) /* Empty ELSE branch */ min1 = 0; if (flags & SCF_DO_SUBSTR) { data->pos_min += min1; - if (data->pos_delta >= I32_MAX - (max1 - min1)) - data->pos_delta = I32_MAX; + if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) + data->pos_delta = SSize_t_MAX; else data->pos_delta += max1 - min1; if (max1 != min1 || is_inf) data->longest = &(data->longest_float); } min += min1; - if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0) - delta = I32_MAX; + if (delta == SSize_t_MAX + || SSize_t_MAX - delta - (max1 - min1) < 0) + delta = SSize_t_MAX; else delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); if (min1) { - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - cl_and(data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, - struct regnode_charclass_class); + StructCopy(&accum, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); } } - if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) { + if (PERL_ENABLE_TRIE_OPTIMISATION && + OP( startbranch ) == BRANCH ) + { /* demq. - Assuming this was/is a branch we are dealing with: 'scan' now - points at the item that follows the branch sequence, whatever - it is. We now start at the beginning of the sequence and look - for subsequences of + Assuming this was/is a branch we are dealing with: 'scan' + now points at the item that follows the branch sequence, + whatever it is. We now start at the beginning of the + sequence and look for subsequences of BRANCH->EXACT=>x1 BRANCH->EXACT=>x2 tail - which would be constructed from a pattern like /A|LIST|OF|WORDS/ + which would be constructed from a pattern like + /A|LIST|OF|WORDS/ If we can find such a subsequence we need to turn the first element into a trie and then add the subsequent branch exact @@ -3251,7 +3814,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, We have two cases - 1. patterns where the whole set of branches can be converted. + 1. patterns where the whole set of branches can be + converted. 2. patterns where only a subset can be converted. @@ -3288,7 +3852,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, U32 count=0; #ifdef DEBUGGING - SV * const mysv = sv_newmortal(); /* for dumping */ + SV * const mysv = sv_newmortal(); /* for dumping */ #endif /* var tail is used because there may be a TAIL regop in the way. Ie, the exacts will point to the @@ -3305,11 +3869,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, tail ); + regprop(RExC_rx, mysv, tail, NULL); PerlIO_printf( Perl_debug_log, "%*s%s%s\n", - (int)depth * 2 + 2, "", - "Looking for TRIE'able sequences. Tail node is: ", - SvPV_nolen_const( mysv ) + (int)depth * 2 + 2, "", + "Looking for TRIE'able sequences. Tail node is: ", + SvPV_nolen_const( mysv ) ); }); @@ -3317,35 +3881,46 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, Step through the branches cur represents each branch, - noper is the first thing to be matched as part of that branch + noper is the first thing to be matched as part + of that branch noper_next is the regnext() of that node. - We normally handle a case like this /FOO[xyz]|BAR[pqr]/ - via a "jump trie" but we also support building with NOJUMPTRIE, - which restricts the trie logic to structures like /FOO|BAR/. - - If noper is a trieable nodetype then the branch is a possible optimization - target. If we are building under NOJUMPTRIE then we require that noper_next - is the same as scan (our current position in the regex program). - - Once we have two or more consecutive such branches we can create a - trie of the EXACT's contents and stitch it in place into the program. - - If the sequence represents all of the branches in the alternation we - replace the entire thing with a single TRIE node. - - Otherwise when it is a subsequence we need to stitch it in place and - replace only the relevant branches. This means the first branch has - to remain as it is used by the alternation logic, and its next pointer, - and needs to be repointed at the item on the branch chain following - the last branch we have optimized away. - - This could be either a BRANCH, in which case the subsequence is internal, - or it could be the item following the branch sequence in which case the - subsequence is at the end (which does not necessarily mean the first node - is the start of the alternation). - - TRIE_TYPE(X) is a define which maps the optype to a trietype. + We normally handle a case like this + /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also + support building with NOJUMPTRIE, which restricts + the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is + a possible optimization target. If we are building + under NOJUMPTRIE then we require that noper_next is + the same as scan (our current position in the regex + program). + + Once we have two or more consecutive such branches + we can create a trie of the EXACT's contents and + stitch it in place into the program. + + If the sequence represents all of the branches in + the alternation we replace the entire thing with a + single TRIE node. + + Otherwise when it is a subsequence we need to + stitch it in place and replace only the relevant + branches. This means the first branch has to remain + as it is used by the alternation logic, and its + next pointer, and needs to be repointed at the item + on the branch chain following the last branch we + have optimized away. + + This could be either a BRANCH, in which case the + subsequence is internal, or it could be the item + following the branch sequence in which case the + subsequence is at the end (which does not + necessarily mean the first node is the start of the + alternation). + + TRIE_TYPE(X) is a define which maps the optype to a + trietype. optype | trietype ----------------+----------- @@ -3353,14 +3928,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, EXACT | EXACT EXACTFU | EXACTFU EXACTFU_SS | EXACTFU - EXACTFU_TRICKYFOLD | EXACTFU - EXACTFA | 0 + EXACTFA | EXACTFA */ #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ ( EXACT == (X) ) ? EXACT : \ - ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ + ( EXACTFA == (X) ) ? EXACTFA : \ 0 ) /* dont use tail as the end marker for this traverse */ @@ -3375,16 +3950,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); - regprop(RExC_rx, mysv, noper); + regprop(RExC_rx, mysv, noper, NULL); PerlIO_printf( Perl_debug_log, " -> %s", SvPV_nolen_const(mysv)); if ( noper_next ) { - regprop(RExC_rx, mysv, noper_next ); + regprop(RExC_rx, mysv, noper_next, NULL); PerlIO_printf( Perl_debug_log,"\t=> %s\t", SvPV_nolen_const(mysv)); } @@ -3394,8 +3969,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ); }); - /* Is noper a trieable nodetype that can be merged with the - * current trie (if there is one)? */ + /* Is noper a trieable nodetype that can be merged + * with the current trie (if there is one)? */ if ( noper_trietype && ( @@ -3408,10 +3983,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif && count < U16_MAX) { - /* Handle mergable triable node - * Either we are the first node in a new trieable sequence, - * in which case we do some bookkeeping, otherwise we update - * the end pointer. */ + /* Handle mergable triable node Either we are + * the first node in a new trieable sequence, + * in which case we do some bookkeeping, + * otherwise we update the end pointer. */ if ( !first ) { first = cur; if ( noper_trietype == NOTHING ) { @@ -3424,8 +3999,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( noper_next_trietype ) { trietype = noper_next_trietype; } else if (noper_next_type) { - /* a NOTHING regop is 1 regop wide. We need at least two - * for a trie so we can't merge this in */ + /* a NOTHING regop is 1 regop wide. + * We need at least two for a trie + * so we can't merge this in */ first = NULL; } } else { @@ -3441,31 +4017,39 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle mergable triable node */ else { /* handle unmergable node - - * noper may either be a triable node which can not be tried - * together with the current trie, or a non triable node */ + * noper may either be a triable node which can + * not be tried together with the current trie, + * or a non triable node */ if ( last ) { - /* If last is set and trietype is not NOTHING then we have found - * at least two triable branch sequences in a row of a similar - * trietype so we can turn them into a trie. If/when we - * allow NOTHING to start a trie sequence this condition will be - * required, and it isn't expensive so we leave it in for now. */ + /* If last is set and trietype is not + * NOTHING then we have found at least two + * triable branch sequences in a row of a + * similar trietype so we can turn them + * into a trie. If/when we allow NOTHING to + * start a trie sequence this condition + * will be required, and it isn't expensive + * so we leave it in for now. */ if ( trietype && trietype != NOTHING ) make_trie( pRExC_state, - startbranch, first, cur, tail, count, - trietype, depth+1 ); - last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ + startbranch, first, cur, tail, + count, trietype, depth+1 ); + last = NULL; /* note: we clear/update + first, trietype etc below, + so we dont do it here */ } if ( noper_trietype #ifdef NOJUMPTRIE && noper_next == tail #endif ){ - /* noper is triable, so we can start a new trie sequence */ + /* noper is triable, so we can start a new + * trie sequence */ count = 1; first = cur; trietype = noper_trietype; } else if (first) { - /* if we already saw a first but the current node is not triable then we have + /* if we already saw a first but the + * current node is not triable then we have * to reset the first information. */ count = 0; first = NULL; @@ -3474,18 +4058,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle unmergable node */ } /* loop over branches */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, - "%*s- %s (%d) \n", (int)depth * 2 + 2, + "%*s- %s (%d) \n", + (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); if ( last && trietype ) { if ( trietype != NOTHING ) { - /* the last branch of the sequence was part of a trie, - * so we have to construct it here outside of the loop - */ - made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); + /* the last branch of the sequence was part of + * a trie, so we have to construct it here + * outside of the loop */ + made= make_trie( pRExC_state, startbranch, + first, scan, tail, count, + trietype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && startbranch == first) @@ -3495,20 +4082,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( startbranch == first && scan == tail ) { - RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; } } #endif } else { - /* at this point we know whatever we have is a NOTHING sequence/branch - * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING + /* at this point we know whatever we have is a + * NOTHING sequence/branch AND if 'startbranch' + * is 'first' then we can turn the whole thing + * into a NOTHING */ if ( startbranch == first ) { regnode *opt; - /* the entire thing is a NOTHING sequence, something like this: - * (?:|) So we can turn it into a plain NOTHING op. */ + /* the entire thing is a NOTHING sequence, + * something like this: (?:|) So we can + * turn it into a plain NOTHING op. */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d) \n", (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); @@ -3536,9 +4126,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 paren; regnode *start; regnode *end; + U32 my_recursed_depth= recursed_depth; if (OP(scan) != SUSPEND) { - /* set the pointer */ + /* set the pointer */ if (OP(scan) == GOSUB) { paren = ARG(scan); RExC_recurse[ARG2L(scan)] = scan; @@ -3549,21 +4140,33 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, start = RExC_rxi->program + 1; end = RExC_opend; } - if (!recursed) { - Newxz(recursed, (((RExC_npar)>>3) +1), U8); - SAVEFREEPV(recursed); - } - if (!PAREN_TEST(recursed,paren+1)) { - PAREN_SET(recursed,paren+1); + if (!recursed_depth + || + !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) + ) { + if (!recursed_depth) { + Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); + } else { + Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed_bytes, U8); + } + /* we havent recursed into this paren yet, so recurse into it */ + DEBUG_STUDYDATA("set:", data,depth); + PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); + my_recursed_depth= recursed_depth + 1; Newx(newframe,1,scan_frame); } else { + DEBUG_STUDYDATA("inf:", data,depth); + /* some form of infinite recursion, assume infinite length + * */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } } else { @@ -3580,17 +4183,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, newframe->last = last; newframe->stop = stopparen; newframe->prev = frame; + newframe->prev_recursed_depth = recursed_depth; + + DEBUG_STUDYDATA("frame-new:",data,depth); + DEBUG_PEEP("fnew", scan, depth); frame = newframe; scan = start; stopparen = paren; last = end; + depth = depth + 1; + recursed_depth= my_recursed_depth; continue; } } else if (OP(scan) == EXACT) { - I32 l = STR_LEN(scan); + SSize_t l = STR_LEN(scan); UV uc; if (UTF) { const U8 * const s = (U8*)STRING(scan); @@ -3606,7 +4215,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data->last_end == -1) { /* Update the start info. */ data->last_start_min = data->pos_min; data->last_start_max = is_inf - ? I32_MAX : data->pos_min + data->pos_delta; + ? SSize_t_MAX : data->pos_min + data->pos_delta; } sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); if (UTF) @@ -3617,83 +4226,47 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) mg->mg_len += utf8_length((U8*)STRING(scan), - (U8*)STRING(scan)+STR_LEN(scan)); + (U8*)STRING(scan)+STR_LEN(scan)); } data->last_end = data->pos_min + l; data->pos_min += l; /* As in the first entry. */ data->flags &= ~SF_BEFORE_EOL; } + + /* ANDing the code point leaves at most it, and not in locale, and + * can't match null string */ if (flags & SCF_DO_STCLASS_AND) { - /* Check whether it is compatible with what we know already! */ - int compat = 1; - - - /* If compatible, we or it in below. It is compatible if is - * in the bitmp and either 1) its bit or its fold is set, or 2) - * it's for a locale. Even if there isn't unicode semantics - * here, at runtime there may be because of matching against a - * utf8 string, so accept a possible false positive for - * latin1-range folds */ - if (uc >= 0x100 || - (!(data->start_class->flags & ANYOF_LOCALE) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && (!(data->start_class->flags & ANYOF_LOC_FOLD) - || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) - ) - { - compat = 0; - } - ANYOF_CLASS_ZERO(data->start_class); - ANYOF_BITMAP_ZERO(data->start_class); - if (compat) - ANYOF_BITMAP_SET(data->start_class, uc); - else if (uc >= 0x100) { - int i; - - /* Some Unicode code points fold to the Latin1 range; as - * XXX temporary code, instead of figuring out if this is - * one, just assume it is and set all the start class bits - * that could be some such above 255 code point's fold - * which will generate fals positives. As the code - * elsewhere that does compute the fold settles down, it - * can be extracted out and re-used here */ - for (i = 0; i < 256; i++){ - if (HAS_NONLATIN1_FOLD_CLOSURE(i)) { - ANYOF_BITMAP_SET(data->start_class, i); - } - } - } - CLEAR_SSC_EOS(data->start_class); - if (uc < 0x100) - data->start_class->flags &= ~ANYOF_UNICODE_ALL; + ssc_cp_and(data->start_class, uc); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ssc_clear_locale(data->start_class); } else if (flags & SCF_DO_STCLASS_OR) { - /* false positive possible if the class is case-folded */ - if (uc < 0x100) - ANYOF_BITMAP_SET(data->start_class, uc); - else - data->start_class->flags |= ANYOF_UNICODE_ALL; - CLEAR_SSC_EOS(data->start_class); - cl_and(data->start_class, and_withp); + ssc_add_cp(data->start_class, uc); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; } else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ - I32 l = STR_LEN(scan); + SSize_t l = STR_LEN(scan); UV uc = *((U8*)STRING(scan)); + SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 + separate code points */ /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) { assert(data); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } if (UTF) { const U8 * const s = (U8 *)STRING(scan); uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } - if (has_exactf_sharp_s) { - RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + if (unfolded_multi_char) { + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; } min += l - min_subtract; assert (min >= 0); @@ -3708,99 +4281,95 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - if (flags & SCF_DO_STCLASS_AND) { - /* Check whether it is compatible with what we know already! */ - int compat = 1; - if (uc >= 0x100 || - (!(data->start_class->flags & ANYOF_LOCALE) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) - { - compat = 0; - } - ANYOF_CLASS_ZERO(data->start_class); - ANYOF_BITMAP_ZERO(data->start_class); - if (compat) { - ANYOF_BITMAP_SET(data->start_class, uc); - CLEAR_SSC_EOS(data->start_class); - if (OP(scan) == EXACTFL) { - /* XXX This set is probably no longer necessary, and - * probably wrong as LOCALE now is on in the initial - * state */ - data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD; - } - else { - - /* Also set the other member of the fold pair. In case - * that unicode semantics is called for at runtime, use - * the full latin1 fold. (Can't do this for locale, - * because not known until runtime) */ - ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); - - /* All other (EXACTFL handled above) folds except under - * /iaa that include s, S, and sharp_s also may include - * the others */ - if (OP(scan) != EXACTFA) { - if (uc == 's' || uc == 'S') { - ANYOF_BITMAP_SET(data->start_class, + if (OP(scan) == EXACTFL) { + + /* We don't know what the folds are; it could be anything. XXX + * Actually, we only support UTF-8 encoding for code points + * above Latin1, so we could know what those folds are. */ + EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, + 0, + UV_MAX); + } + else { /* Non-locale EXACTFish */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (flags & SCF_DO_STCLASS_AND) { + ssc_clear_locale(data->start_class); + } + if (uc < 256) { /* We know what the Latin1 folds are ... */ + if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we + know if anything folds + with this */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, + PL_fold_latin1[uc]); + if (OP(scan) != EXACTFA) { /* The folds below aren't + legal under /iaa */ + if (isARG2_lower_or_UPPER_ARG1('s', uc)) { + EXACTF_invlist + = add_cp_to_invlist(EXACTF_invlist, LATIN_SMALL_LETTER_SHARP_S); } else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - ANYOF_BITMAP_SET(data->start_class, 's'); - ANYOF_BITMAP_SET(data->start_class, 'S'); + EXACTF_invlist + = add_cp_to_invlist(EXACTF_invlist, 's'); + EXACTF_invlist + = add_cp_to_invlist(EXACTF_invlist, 'S'); } } + + /* We also know if there are above-Latin1 code points + * that fold to this (none legal for ASCII and /iaa) */ + if ((! isASCII(uc) || OP(scan) != EXACTFA) + && HAS_NONLATIN1_FOLD_CLOSURE(uc)) + { + /* XXX We could know exactly what does fold to this + * if the reverse folds are loaded, as currently in + * S_regclass() */ + _invlist_union(EXACTF_invlist, + PL_AboveLatin1, + &EXACTF_invlist); + } } } - else if (uc >= 0x100) { - int i; - for (i = 0; i < 256; i++){ - if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) { - ANYOF_BITMAP_SET(data->start_class, i); - } + else { /* Non-locale, above Latin1. XXX We don't currently + know what participates in folds with this, so have + to assume anything could */ + + /* XXX We could know exactly what does fold to this if the + * reverse folds are loaded, as currently in S_regclass(). + * But we do know that under /iaa nothing in the ASCII + * range can participate */ + if (OP(scan) == EXACTFA) { + _invlist_union_complement_2nd(EXACTF_invlist, + PL_XPosix_ptrs[_CC_ASCII], + &EXACTF_invlist); + } + else { + EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, + 0, UV_MAX); } } } + if (flags & SCF_DO_STCLASS_AND) { + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); + } else if (flags & SCF_DO_STCLASS_OR) { - if (data->start_class->flags & ANYOF_LOC_FOLD) { - /* false positive possible if the class is case-folded. - Assume that the locale settings are the same... */ - if (uc < 0x100) { - ANYOF_BITMAP_SET(data->start_class, uc); - if (OP(scan) != EXACTFL) { - - /* And set the other member of the fold pair, but - * can't do that in locale because not known until - * run-time */ - ANYOF_BITMAP_SET(data->start_class, - PL_fold_latin1[uc]); - - /* All folds except under /iaa that include s, S, - * and sharp_s also may include the others */ - if (OP(scan) != EXACTFA) { - if (uc == 's' || uc == 'S') { - ANYOF_BITMAP_SET(data->start_class, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - ANYOF_BITMAP_SET(data->start_class, 's'); - ANYOF_BITMAP_SET(data->start_class, 'S'); - } - } - } - } - CLEAR_SSC_EOS(data->start_class); - } - cl_and(data->start_class, and_withp); + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); } else if (REGNODE_VARIES(OP(scan))) { - I32 mincount, maxcount, minnext, deltanext, fl = 0; - I32 f = flags, pos_before = 0; + SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; + I32 fl = 0, f = flags; regnode * const oscan = scan; - struct regnode_charclass_class this_class; - struct regnode_charclass_class *oclass = NULL; + regnode_ssc this_class; + regnode_ssc *oclass = NULL; I32 next_is_eval = 0; switch (PL_regkind[OP(scan)]) { @@ -3830,12 +4399,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan = NEXTOPER(scan); goto do_curly; } - is_inf = is_inf_internal = 1; - scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */ + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ data->longest = &(data->longest_float); } + is_inf = is_inf_internal = 1; + scan = regnext(scan); goto optimize_curly_tail; case CURLY: if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) @@ -3856,7 +4426,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, next_is_eval = (OP(scan) == EVAL); do_curly: if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */ + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ pos_before = data->pos_min; } if (data) { @@ -3866,7 +4438,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->flags |= SF_IS_INF; } if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); oclass = data->start_class; data->start_class = &this_class; f |= SCF_DO_STCLASS_AND; @@ -3886,35 +4458,35 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* This will finish on WHILEM, setting scan, or on NULL: */ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - last, data, stopparen, recursed, NULL, - (mincount == 0 - ? (f & ~SCF_DO_SUBSTR) : f),depth+1); + last, data, stopparen, recursed_depth, NULL, + (mincount == 0 + ? (f & ~SCF_DO_SUBSTR) + : f) + ,depth+1); if (flags & SCF_DO_STCLASS) data->start_class = oclass; if (mincount == 0 || minnext == 0) { if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); } else if (flags & SCF_DO_STCLASS_AND) { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&this_class, data->start_class, - struct regnode_charclass_class); + StructCopy(&this_class, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); - cl_and(data->start_class, and_withp); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); } else if (flags & SCF_DO_STCLASS_AND) - cl_and(data->start_class, &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); flags &= ~SCF_DO_STCLASS; } if (!scan) /* It was not CURLYX, but CURLY. */ @@ -3924,7 +4496,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && (next_is_eval || !(mincount == 0 && maxcount == 1)) && (minnext == 0) && (deltanext == 0) && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) - && maxcount <= REG_INFTY/3) /* Complement check for big count */ + && maxcount <= REG_INFTY/3) /* Complement check for big + count */ { /* Fatal warnings may leak the regexp without this: */ SAVEFREESV(RExC_rx_sv); @@ -3934,14 +4507,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } min += minnext * mincount; - is_inf_internal |= deltanext == I32_MAX - || (maxcount == REG_INFTY && minnext + deltanext > 0); + is_inf_internal |= deltanext == SSize_t_MAX + || (maxcount == REG_INFTY && minnext + deltanext > 0); is_inf |= is_inf_internal; - if (is_inf) - delta = I32_MAX; - else - delta += (minnext + deltanext) * maxcount - minnext * mincount; - + if (is_inf) { + delta = SSize_t_MAX; + } else { + delta += (minnext + deltanext) * maxcount + - minnext * mincount; + } /* Try powerful optimization CURLYX => CURLYN. */ if ( OP(oscan) == CURLYX && data && data->flags & SF_IN_PAR @@ -3992,7 +4566,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && !(data->flags & SF_HAS_EVAL) && !deltanext /* atom is fixed width */ && minnext != 0 /* CURLYM can't handle zero width */ - && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */ + + /* Nor characters whose fold at run-time may be + * multi-character */ + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) ) { /* XXXX How to optimize if data == 0? */ /* Optimize to a simpler form. */ @@ -4039,7 +4616,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif /* Optimize again: */ study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, - NULL, stopparen, recursed, NULL, 0,depth+1); + NULL, stopparen, recursed_depth, NULL, 0,depth+1); } else oscan->flags = 0; @@ -4064,43 +4641,32 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, pars++; if (flags & SCF_DO_SUBSTR) { SV *last_str = NULL; + STRLEN last_chrs = 0; int counted = mincount != 0; - if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ -#if defined(SPARC64_GCC_WORKAROUND) - I32 b = 0; - STRLEN l = 0; - const char *s = NULL; - I32 old = 0; - - if (pos_before >= data->last_start_min) - b = pos_before; - else - b = data->last_start_min; - - l = 0; - s = SvPV_const(data->last_found, l); - old = b - data->last_start_min; - -#else - I32 b = pos_before >= data->last_start_min + if (data->last_end > 0 && mincount != 0) { /* Ends with a + string. */ + SSize_t b = pos_before >= data->last_start_min ? pos_before : data->last_start_min; STRLEN l; const char * const s = SvPV_const(data->last_found, l); - I32 old = b - data->last_start_min; -#endif + SSize_t old = b - data->last_start_min; if (UTF) old = utf8_hop((U8*)s, old) - (U8*)s; l -= old; /* Get the added string: */ last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { + SvGROW(last_str, (mincount * l) + 1); repeatcpy(SvPVX(last_str) + l, - SvPVX_const(last_str), l, mincount - 1); + SvPVX_const(last_str), l, + mincount - 1); SvCUR_set(last_str, SvCUR(last_str) * mincount); /* Add additional parts. */ SvCUR_set(data->last_found, @@ -4112,34 +4678,41 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str) - l; + mg->mg_len += last_chrs * (mincount-1); } + last_chrs *= mincount; data->last_end += l * (mincount - 1); } } else { /* start offset must point into the last copy */ data->last_start_min += minnext * (mincount - 1); - data->last_start_max += is_inf ? I32_MAX + data->last_start_max += is_inf ? SSize_t_MAX : (maxcount - 1) * (minnext + data->pos_delta); } } /* It is counted once already... */ data->pos_min += minnext * (mincount - counted); #if 0 -PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n", - counted, deltanext, I32_MAX, minnext, maxcount, mincount); -if (deltanext != I32_MAX) -PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta); +PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf + " SSize_t_MAX=%"UVdf" minnext=%"UVdf + " maxcount=%"UVdf" mincount=%"UVdf"\n", + (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, + (UV)mincount); +if (deltanext != SSize_t_MAX) +PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", + (UV)(-counted * deltanext + (minnext + deltanext) * maxcount + - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); #endif - if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta) - data->pos_delta = I32_MAX; + if (deltanext == SSize_t_MAX + || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) + data->pos_delta = SSize_t_MAX; else data->pos_delta += - counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount; if (mincount != maxcount) { /* Cannot extend fixed substrings found inside the group. */ - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); if (mincount && last_str) { SV * const sv = data->last_found; MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? @@ -4149,12 +4722,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext mg->mg_len = -1; sv_setsv(sv, last_str); data->last_end = data->pos_min; - data->last_start_min = - data->pos_min - CHR_SVLEN(last_str); + data->last_start_min = data->pos_min - last_chrs; data->last_start_max = is_inf - ? I32_MAX - : data->pos_min + data->pos_delta - - CHR_SVLEN(last_str); + ? SSize_t_MAX + : data->pos_min + data->pos_delta - last_chrs; } data->longest = &(data->longest_float); } @@ -4169,164 +4740,212 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext NEXT_OFF(oscan) += NEXT_OFF(next); } continue; - default: /* REF, and CLUMP only? */ + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", + OP(scan)); +#endif + case REF: + case CLUMP: if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; - if (flags & SCF_DO_STCLASS_OR) - cl_anything(pRExC_state, data->start_class); + if (flags & SCF_DO_STCLASS_OR) { + if (OP(scan) == CLUMP) { + /* Actually is any start char, but very few code points + * aren't start characters */ + ssc_match_all_cp(data->start_class); + } + else { + ssc_anything(data->start_class); + } + } flags &= ~SCF_DO_STCLASS; break; } } else if (OP(scan) == LNBREAK) { if (flags & SCF_DO_STCLASS) { - int value = 0; - CLEAR_SSC_EOS(data->start_class); /* No match on empty */ if (flags & SCF_DO_STCLASS_AND) { - for (value = 0; value < 256; value++) - if (!is_VERTWS_cp(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + ssc_intersection(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } - else { - for (value = 0; value < 256; value++) - if (is_VERTWS_cp(value)) - ANYOF_BITMAP_SET(data->start_class, value); + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], + FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg for + * 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } - if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; } min++; delta++; /* Because of the 2 char string cr-lf */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += 1; data->pos_delta += 1; data->longest = &(data->longest_float); } } else if (REGNODE_SIMPLE(OP(scan))) { - int value = 0; if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min++; } min++; if (flags & SCF_DO_STCLASS) { - int loop_max = 256; - CLEAR_SSC_EOS(data->start_class); /* No match on empty */ + bool invert = 0; + SV* my_invlist = sv_2mortal(_new_invlist(0)); + U8 namedclass; + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; /* Some of the logic below assumes that switching locale on will only add false positives. */ - switch (PL_regkind[OP(scan)]) { - U8 classnum; + switch (OP(scan)) { - case SANY: default: #ifdef DEBUGGING - Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); + Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", + OP(scan)); #endif - do_default: + case CANY: + case SANY: if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_match_all_cp(data->start_class); break; + case REG_ANY: - if (OP(scan) == SANY) - goto do_default; - if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ - value = (ANYOF_BITMAP_TEST(data->start_class,'\n') - || ANYOF_CLASS_TEST_ANY_SET(data->start_class)); - cl_anything(pRExC_state, data->start_class); - } - if (flags & SCF_DO_STCLASS_AND || !value) - ANYOF_BITMAP_CLEAR(data->start_class,'\n'); + { + SV* REG_ANY_invlist = _new_invlist(2); + REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, + '\n'); + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert, hence all but \n + */ + ); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert */ + ); + ssc_clear_locale(data->start_class); + } + SvREFCNT_dec_NN(REG_ANY_invlist); + } break; + case ANYOF: if (flags & SCF_DO_STCLASS_AND) - cl_and(data->start_class, - (struct regnode_charclass_class*)scan); + ssc_and(pRExC_state, data->start_class, + (regnode_charclass *) scan); else - cl_or(pRExC_state, data->start_class, - (struct regnode_charclass_class*)scan); + ssc_or(pRExC_state, data->start_class, + (regnode_charclass *) scan); break; - case POSIXA: - loop_max = 128; + + case NPOSIXL: + invert = 1; /* FALL THROUGH */ + case POSIXL: - case POSIXD: - case POSIXU: - classnum = FLAGS(scan); + namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1); - for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); - } - } + bool was_there = cBOOL( + ANYOF_POSIXL_TEST(data->start_class, + namedclass)); + ANYOF_POSIXL_ZERO(data->start_class); + if (was_there) { /* Do an AND */ + ANYOF_POSIXL_SET(data->start_class, namedclass); } + /* No individual code points can now match */ + data->start_class->invlist + = sv_2mortal(_new_invlist(0)); } else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum)); - } - else { - - /* Even if under locale, set the bits for non-locale - * in case it isn't a true locale-node. This will - * create false positives if it truly is locale */ - for (value = 0; value < loop_max; value++) { - if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); - } + int complement = namedclass + ((invert) ? -1 : 1); + + assert(flags & SCF_DO_STCLASS_OR); + + /* If the complement of this class was already there, + * the result is that they match all code points, + * (\d + \D == everything). Remove the classes from + * future consideration. Locale is not relevant in + * this case */ + if (ANYOF_POSIXL_TEST(data->start_class, complement)) { + ssc_match_all_cp(data->start_class); + ANYOF_POSIXL_CLEAR(data->start_class, namedclass); + ANYOF_POSIXL_CLEAR(data->start_class, complement); } + else { /* The usual case; just add this class to the + existing set */ + ANYOF_POSIXL_SET(data->start_class, namedclass); } } break; - case NPOSIXA: - loop_max = 128; + + case NPOSIXA: /* For these, we always know the exact set of + what's matched */ + invert = 1; /* FALL THROUGH */ - case NPOSIXL: - case NPOSIXU: + case POSIXA: + if (FLAGS(scan) == _CC_ASCII) { + my_invlist = PL_XPosix_ptrs[_CC_ASCII]; + } + else { + _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)], + PL_XPosix_ptrs[_CC_ASCII], + &my_invlist); + } + goto join_posix; + case NPOSIXD: - classnum = FLAGS(scan); + case NPOSIXU: + invert = 1; + /* FALL THROUGH */ + case POSIXD: + case POSIXU: + my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); + + /* NPOSIXD matches all upper Latin1 code points unless the + * target string being matched is UTF-8, which is + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); + } + + join_posix: + if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum)); - for (value = 0; value < loop_max; value++) { - if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); - } - } - } + ssc_intersection(data->start_class, my_invlist, invert); + ssc_clear_locale(data->start_class); } else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1); - } - else { - - /* Even if under locale, set the bits for non-locale in - * case it isn't a true locale-node. This will create - * false positives if it truly is locale */ - for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); - } - } - if (PL_regkind[OP(scan)] == NPOSIXD) { - data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } + assert(flags & SCF_DO_STCLASS_OR); + ssc_union(data->start_class, my_invlist, invert); } - break; } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } @@ -4334,7 +4953,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->flags |= (OP(scan) == MEOL ? SF_BEFORE_MEOL : SF_BEFORE_SEOL); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } else if ( PL_regkind[OP(scan)] == BRANCHJ @@ -4353,11 +4972,12 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext DEBUG_STUDYDATA("OPFAIL",data,depth); /*DEBUG_PARSE_MSG("opfail");*/ - regprop(RExC_rx, mysv_val, upto); - PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", - SvPV_nolen_const(mysv_val), - (IV)REG_NODE_NUM(upto), - (IV)(upto - scan) + regprop(RExC_rx, mysv_val, upto, NULL); + PerlIO_printf(Perl_debug_log, + "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(upto), + (IV)(upto - scan) ); }); OP(scan) = OPFAIL; @@ -4374,9 +4994,9 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext In this case we can't do fixed string optimisation. */ - I32 deltanext, minnext, fake = 0; + SSize_t deltanext, minnext, fake = 0; regnode *nscan; - struct regnode_charclass_class intrnl; + regnode_ssc intrnl; int f = 0; data_fake.flags = 0; @@ -4389,7 +5009,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.pos_delta = delta; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + ssc_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4398,13 +5018,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, - last, &data_fake, stopparen, recursed, NULL, f, depth+1); + last, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1); if (scan->flags) { if (deltanext) { FAIL("Variable length lookbehind not implemented"); } else if (minnext > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); } scan->flags = (U8)minnext; } @@ -4423,14 +5045,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext * *** HACK *** for now just treat as "no information". * See [perl #56690]. */ - cl_init(pRExC_state, data->start_class); + ssc_init(pRExC_state, data->start_class); } else { /* AND before and after: combine and continue */ - const int was = TEST_SSC_EOS(data->start_class); - - cl_and(data->start_class, &intrnl); - if (was) - SET_SSC_EOS(data->start_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } } } @@ -4443,9 +5061,9 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext length of the pattern, something we won't know about until after the recurse. */ - I32 deltanext, fake = 0; + SSize_t deltanext, fake = 0; regnode *nscan; - struct regnode_charclass_class intrnl; + regnode_ssc intrnl; int f = 0; /* We use SAVEFREEPV so that when the full compile is finished perl will clean up the allocated @@ -4453,8 +5071,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext have to worry about freeing them when we know they wont be used, which would be a pain. */ - I32 *minnextp; - Newx( minnextp, 1, I32 ); + SSize_t *minnextp; + Newx( minnextp, 1, SSize_t ); SAVEFREEPV(minnextp); if (data) { @@ -4462,7 +5080,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext if ((flags & SCF_DO_SUBSTR) && data->last_found) { f |= SCF_DO_SUBSTR; if (scan->flags) - SCAN_COMMIT(pRExC_state, &data_fake,minlenp); + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); data_fake.last_found=newSVsv(data->last_found); } } @@ -4474,7 +5092,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.flags |= SF_IS_INF; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + ssc_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4483,14 +5101,17 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, - last, &data_fake, stopparen, recursed, NULL, f,depth+1); + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, + &deltanext, last, &data_fake, + stopparen, recursed_depth, NULL, + f,depth+1); if (scan->flags) { if (deltanext) { FAIL("Variable length lookbehind not implemented"); } else if (*minnextp > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); } scan->flags = (U8)*minnextp; } @@ -4498,11 +5119,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext *minnextp += min; if (f & SCF_DO_STCLASS_AND) { - const int was = TEST_SSC_EOS(data.start_class); - - cl_and(data->start_class, &intrnl); - if (was) - SET_SSC_EOS(data->start_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -4513,7 +5130,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { if (RExC_rx->minlen<*minnextp) RExC_rx->minlen=*minnextp; - SCAN_COMMIT(pRExC_state, &data_fake, minnextp); + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); SvREFCNT_dec_NN(data_fake.last_found); if ( data_fake.minlen_fixed != minlenp ) @@ -4557,7 +5174,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext } else if ( PL_regkind[OP(scan)] == ENDLIKE ) { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); flags &= ~SCF_DO_SUBSTR; } if (data && OP(scan)==ACCEPT) { @@ -4569,24 +5186,24 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } else if (OP(scan) == GPOS) { - if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) && + if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && !(delta || is_inf || (data && data->pos_delta))) { - if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR)) - RExC_rx->extflags |= RXf_ANCH_GPOS; - if (RExC_rx->gofs < (U32)min) + if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->intflags |= PREGf_ANCH_GPOS; + if (RExC_rx->gofs < (STRLEN)min) RExC_rx->gofs = min; } else { - RExC_rx->extflags |= RXf_GPOS_FLOAT; + RExC_rx->intflags |= PREGf_GPOS_FLOAT; RExC_rx->gofs = 0; } } @@ -4599,13 +5216,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext regnode *trie_node= scan; regnode *tail= regnext(scan); reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; - I32 max1 = 0, min1 = I32_MAX; - struct regnode_charclass_class accum; + SSize_t max1 = 0, min1 = SSize_t_MAX; + regnode_ssc accum; - if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ - SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); + ssc_init_zero(pRExC_state, &accum); if (!trie->jump) { min1= trie->minlen; @@ -4616,8 +5235,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext for ( word=1 ; word <= trie->wordcount ; word++) { - I32 deltanext=0, minnext=0, f = 0, fake; - struct regnode_charclass_class this_class; + SSize_t deltanext=0, minnext=0, f = 0, fake; + regnode_ssc this_class; data_fake.flags = 0; if (data) { @@ -4628,7 +5247,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.last_closep = &fake; data_fake.pos_delta = delta; if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } @@ -4640,22 +5259,21 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext nextbranch = trie_node + trie->jump[0]; scan= trie_node + trie->jump[word]; /* We go from the jump point to the branch that follows - it. Note this means we need the vestigal unused branches - even though they arent otherwise used. - */ + it. Note this means we need the vestigal unused + branches even though they arent otherwise used. */ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, (regnode *)nextbranch, &data_fake, - stopparen, recursed, NULL, f,depth+1); + stopparen, recursed_depth, NULL, f,depth+1); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) nextbranch= regnext((regnode*)nextbranch); - if (min1 > (I32)(minnext + trie->minlen)) + if (min1 > (SSize_t)(minnext + trie->minlen)) min1 = minnext + trie->minlen; - if (deltanext == I32_MAX) { + if (deltanext == SSize_t_MAX) { is_inf = is_inf_internal = 1; - max1 = I32_MAX; - } else if (max1 < (I32)(minnext + deltanext + trie->maxlen)) + max1 = SSize_t_MAX; + } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) max1 = minnext + deltanext + trie->maxlen; if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -4673,7 +5291,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); } } if (flags & SCF_DO_SUBSTR) { @@ -4685,28 +5303,25 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); if (min1) { - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - cl_and(data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, - struct regnode_charclass_class); + StructCopy(&accum, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); } } scan= tail; @@ -4721,14 +5336,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext delta += (trie->maxlen - trie->minlen); flags &= ~SCF_DO_STCLASS; /* xxx */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += trie->minlen; data->pos_delta += (trie->maxlen - trie->minlen); if (trie->maxlen != trie->minlen) data->longest = &(data->longest_float); } if (trie->jump) /* no more substrings -- for now /grr*/ - flags &= ~SCF_DO_SUBSTR; + flags &= ~SCF_DO_SUBSTR; } #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ @@ -4736,10 +5352,24 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext /* Else: zero-length, ignore. */ scan = regnext(scan); } + /* If we are exiting a recursion we can unset its recursed bit + * and allow ourselves to enter it again - no danger of an + * infinite loop there. + if (stopparen > -1 && recursed) { + DEBUG_STUDYDATA("unset:", data,depth); + PAREN_UNSET( recursed, stopparen); + } + */ if (frame) { + DEBUG_STUDYDATA("frame-end:",data,depth); + DEBUG_PEEP("fend", scan, depth); + /* restore previous context */ last = frame->last; scan = frame->next; stopparen = frame->stop; + recursed_depth = frame->prev_recursed_depth; + depth = depth - 1; + frame = frame->prev; goto fake_study_recurse; } @@ -4749,9 +5379,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext DEBUG_STUDYDATA("pre-fin:",data,depth); *scanp = scan; - *deltap = is_inf_internal ? I32_MAX : delta; + *deltap = is_inf_internal ? SSize_t_MAX : delta; + if (flags & SCF_DO_SUBSTR && is_inf) - data->pos_delta = I32_MAX - data->pos_min; + data->pos_delta = SSize_t_MAX - data->pos_min; if (is_par > (I32)U8_MAX) is_par = 0; if (is_par && pars==1 && data) { @@ -4763,17 +5394,25 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->flags &= ~SF_IN_PAR; } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); if (flags & SCF_TRIE_RESTUDY) data->flags |= SCF_TRIE_RESTUDY; DEBUG_STUDYDATA("post-fin:",data,depth); - return min < stopmin ? min : stopmin; + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { + RExC_maxlen = final_minlen + delta; + } + return final_minlen; + } + /* not-reached */ } STATIC U32 -S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) +S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) { U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; @@ -4856,7 +5495,7 @@ Perl_current_re_engine(pTHX) HV * const table = GvHV(PL_hintgv); SV **ptr; - if (!table) + if (!table || !(PL_hints & HINT_LOCALIZE_HH)) return &reh_regexp_engine; ptr = hv_fetchs(table, "regcomp", FALSE); if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) @@ -4937,12 +5576,11 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, Newx(dst, *plen_p * 2 + 1, U8); while (s < *plen_p) { - const UV uv = NATIVE_TO_ASCII(src[s]); - if (UNI_IS_INVARIANT(uv)) - dst[d] = (U8)UTF_TO_NATIVE(uv); + if (NATIVE_BYTE_IS_INVARIANT(src[s])) + dst[d] = src[s]; else { - dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv); - dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv); + dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); + dst[d] = UTF8_EIGHT_BIT_LO(src[s]); } if (n < num_code_blocks) { if (!do_end && pRExC_state->code_blocks[n].start == s) { @@ -5006,6 +5644,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, STRLEN orig_patlen = 0; bool code = 0; SV *msv = use_delim ? delim : *svp; + if (!msv) msv = &PL_sv_undef; /* if we've got a delimiter, we go round the loop twice for each * svp slot (except the last), using the delimiter the second @@ -5024,7 +5663,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, * The code in this block is based on S_pushav() */ AV *const av = (AV*)msv; - const I32 maxarg = AvFILL(av) + 1; + const SSize_t maxarg = AvFILL(av) + 1; SV **array; if (oplist) { @@ -5034,11 +5673,11 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, } if (SvRMAGICAL(av)) { - U32 i; + SSize_t i; Newx(array, maxarg, SV*); SAVEFREEPV(array); - for (i=0; i < (U32)maxarg; i++) { + for (i=0; i < maxarg; i++) { SV ** const svp = av_fetch(av, i, FALSE); array[i] = svp ? *svp : &PL_sv_undef; } @@ -5328,7 +5967,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, { Safefree(pRExC_state->code_blocks); /* use croak_sv ? */ - Perl_croak_nocontext("%s", SvPV_nolen_const(errsv)); + Perl_croak_nocontext("%"SVf, SVfARG(errsv)); } } assert(SvROK(qr_ref)); @@ -5419,20 +6058,24 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, STATIC bool -S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol) +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, + SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, + SSize_t lookbehind, SSize_t offset, SSize_t *minlen, + STRLEN longest_length, bool eol, bool meol) { /* This is the common code for setting up the floating and fixed length * string data extracted from Perl_re_op_compile() below. Returns a boolean * as to whether succeeded or not */ - I32 t,ml; + I32 t; + SSize_t ml; if (! (longest_length || (eol /* Can't have SEOL and MULTI */ && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) ) - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - || (RExC_seen & REG_SEEN_EXACTF_SHARP_S)) + /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ + || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) { return FALSE; } @@ -5450,7 +6093,7 @@ S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, S follow this item. We calculate it ahead of time as once the lookbehind offset is added in we lose the ability to correctly calculate it.*/ - ml = minlen ? *(minlen) : (I32)longest_length; + ml = minlen ? *(minlen) : (SSize_t)longest_length; *rx_end_shift = ml - offset - longest_length + (SvTAIL(sv_longest) != 0) + lookbehind; @@ -5519,7 +6162,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, char *exp; regnode *scan; I32 flags; - I32 minlen = 0; + SSize_t minlen = 0; U32 rx_flags; SV *pat; SV *code_blocksv = NULL; @@ -5553,61 +6196,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, * having to test them each time otherwise */ if (! PL_AboveLatin1) { PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); - PL_ASCII = _new_invlist_C_array(ASCII_invlist); PL_Latin1 = _new_invlist_C_array(Latin1_invlist); - - PL_L1Posix_ptrs[_CC_ALPHANUMERIC] - = _new_invlist_C_array(L1PosixAlnum_invlist); - PL_Posix_ptrs[_CC_ALPHANUMERIC] - = _new_invlist_C_array(PosixAlnum_invlist); - - PL_L1Posix_ptrs[_CC_ALPHA] - = _new_invlist_C_array(L1PosixAlpha_invlist); - PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist); - - PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist); - PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist); - - /* Cased is the same as Alpha in the ASCII range */ - PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist); - PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist); - - PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist); - PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist); - - PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); - PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); - - PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist); - PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist); - - PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist); - PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist); - - PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist); - PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist); - - PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist); - PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist); - - PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist); - PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist); - PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist); - PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist); - - PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist); - PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist); - - PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist); - - PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist); - PL_L1Posix_ptrs[_CC_WORDCHAR] - = _new_invlist_C_array(L1PosixWord_invlist); - - PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist); - PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist); - - PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist); + PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); + PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); + PL_HasMultiCharFold = + _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); } #endif @@ -5723,6 +6316,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); RExC_uni_semantics = 0; RExC_contains_locale = 0; + RExC_contains_i = 0; pRExC_state->runtime_code_qr = NULL; DEBUG_COMPILE_r({ @@ -5744,11 +6338,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); /* return old regex if pattern hasn't changed */ - /* XXX: note in the below we have to check the flags as well as the pattern. + /* XXX: note in the below we have to check the flags as well as the + * pattern. * - * Things get a touch tricky as we have to compare the utf8 flag independently - * from the compile flags. - */ + * Things get a touch tricky as we have to compare the utf8 flag + * independently from the compile flags. */ if ( old_re && !recompile @@ -5765,10 +6359,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, rx_flags = orig_rx_flags; - if (initial_charset == REGEX_LOCALE_CHARSET) { - RExC_contains_locale = 1; + if (rx_flags & PMf_FOLD) { + RExC_contains_i = 1; } - else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ @@ -5796,6 +6390,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_sawback = 0; RExC_seen = 0; + RExC_maxlen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; @@ -5810,7 +6405,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_npar = 1; RExC_nestroot = 0; RExC_size = 0L; - RExC_emit = &RExC_emit_dummy; + RExC_emit = (regnode *) &RExC_emit_dummy; RExC_whilem_seen = 0; RExC_open_parens = NULL; RExC_close_parens = NULL; @@ -5820,6 +6415,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_paren_name_list = NULL; #endif RExC_recurse = NULL; + RExC_study_chunk_recursed = NULL; + RExC_study_chunk_recursed_bytes= 0; RExC_recurse_count = 0; pRExC_state->code_index = 0; @@ -5898,7 +6495,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, FAIL("Regexp out of space"); #ifdef DEBUGGING /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ - Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char); + Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char); #else /* bulk initialize base fields with 0. */ Zero(ri, sizeof(regexp_internal), char); @@ -5925,14 +6523,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); - bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET); + bool has_charset = (get_regex_charset(r->extflags) + != REGEX_DEPENDS_CHARSET); /* The caret is output if there are any defaults: if not all the STD * flags are set, or if no character set specifier is needed */ bool has_default = (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) || ! has_charset); - bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) + == REG_RUN_ON_COMMENT_SEEN); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ @@ -5994,12 +6594,23 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ - if (RExC_seen & REG_SEEN_RECURSE) { + /* setup various meta data about recursion, this all requires + * RExC_npar to be correctly set, and a bit later on we clear it */ + if (RExC_seen & REG_RECURSE_SEEN) { Newxz(RExC_open_parens, RExC_npar,regnode *); SAVEFREEPV(RExC_open_parens); Newxz(RExC_close_parens,RExC_npar,regnode *); SAVEFREEPV(RExC_close_parens); } + if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { + /* Note, RExC_npar is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + + ((RExC_npar & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } /* Useful during FAIL. */ #ifdef RE_TRACK_PATTERN_OFFSETS @@ -6043,6 +6654,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, reStudy: r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; Zero(r->substrs, 1, struct reg_substr_data); + if (RExC_study_chunk_recursed) + Zero(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); #ifdef TRIE_STUDY_OPT if (!restudied) { @@ -6053,10 +6667,10 @@ reStudy: DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); RExC_state = copyRExC_state; - if (seen & REG_TOP_LEVEL_BRANCHES) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; else - RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; StructCopy(&zero_scan_data, &data, scan_data_t); } #else @@ -6076,12 +6690,13 @@ reStudy: /* testing for BRANCH here tells us whether there is "must appear" data in the pattern. If there is then we can use it for optimisations */ - if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */ - I32 fake; + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. + */ + SSize_t fake; STRLEN longest_float_length, longest_fixed_length; - struct regnode_charclass_class ch_class; /* pointed to by data */ + regnode_ssc ch_class; /* pointed to by data */ int stclass_flag; - I32 last_close = 0; /* pointed to by data */ + SSize_t last_close = 0; /* pointed to by data */ regnode *first= scan; regnode *first_next= regnext(first); /* @@ -6161,35 +6776,35 @@ reStudy: PL_regkind[OP(first)] == NBOUND) ri->regstclass = first; else if (PL_regkind[OP(first)] == BOL) { - r->extflags |= (OP(first) == MBOL - ? RXf_ANCH_MBOL + r->intflags |= (OP(first) == MBOL + ? PREGf_ANCH_MBOL : (OP(first) == SBOL - ? RXf_ANCH_SBOL - : RXf_ANCH_BOL)); + ? PREGf_ANCH_SBOL + : PREGf_ANCH_BOL)); first = NEXTOPER(first); goto again; } else if (OP(first) == GPOS) { - r->extflags |= RXf_ANCH_GPOS; + r->intflags |= PREGf_ANCH_GPOS; first = NEXTOPER(first); goto again; } else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks) + !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) { /* turn .* into ^.* with an implied $*=1 */ const int type = (OP(NEXTOPER(first)) == REG_ANY) - ? RXf_ANCH_MBOL - : RXf_ANCH_SBOL; - r->extflags |= type; - r->intflags |= PREGf_IMPLICIT; + ? PREGf_ANCH_MBOL + : PREGf_ANCH_SBOL; + r->intflags |= (type | PREGf_IMPLICIT); first = NEXTOPER(first); goto again; } - if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback) + if (sawplus && !sawminmod && !sawlookahead + && (!sawopen || !RExC_sawback) && !pRExC_state->num_code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -6231,15 +6846,17 @@ reStudy: SAVEFREESV(data.last_found); first = scan; if (!ri->regstclass) { - cl_init(pRExC_state, &ch_class); + ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; stclass_flag = SCF_DO_STCLASS_AND; } else /* XXXX Check for BOUND? */ stclass_flag = 0; data.last_closep = &last_close; - minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ - &data, -1, NULL, NULL, + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, + scan + RExC_size, /* Up to end */ + &data, -1, 0, NULL, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), 0); @@ -6251,9 +6868,11 @@ reStudy: if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen - && !(RExC_seen & REG_SEEN_VERBARG) - && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) + && !(RExC_seen & REG_VERBARG_SEEN) + && !(RExC_seen & REG_GPOS_SEEN) + ){ r->extflags |= RXf_CHECK_ALL; + } scan_commit(pRExC_state, &data,&minlen,0); longest_float_length = CHR_SVLEN(data.longest_float); @@ -6275,7 +6894,7 @@ reStudy: { r->float_min_offset = data.offset_float_min - data.lookbehind_float; r->float_max_offset = data.offset_float_max; - if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ + if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ r->float_max_offset -= data.lookbehind_float; SvREFCNT_inc_simple_void_NN(data.longest_float); } @@ -6313,36 +6932,40 @@ reStudy: if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag - && ! TEST_SSC_EOS(data.start_class) - && !cl_is_anything(data.start_class)) + && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && !ssc_is_anything(data.start_class)) { - const U32 n = add_data(pRExC_state, 1, "f"); - OP(data.start_class) = ANYOF_SYNTHETIC; + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); - Newx(RExC_rxi->data->data[n], 1, - struct regnode_charclass_class); + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); StructCopy(data.start_class, - (struct regnode_charclass_class*)RExC_rxi->data->data[n], - struct regnode_charclass_class); + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); + data.start_class = NULL; } - /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ + /* A temporary algorithm prefers floated substr to fixed one to dig + * more info. */ if (longest_fixed_length > longest_float_length) { + r->substrs->check_ix = 0; r->check_end_shift = r->anchored_end_shift; r->check_substr = r->anchored_substr; r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; - if (r->extflags & RXf_ANCH_SINGLE) - r->extflags |= RXf_NOSCAN; + if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) + r->intflags |= PREGf_NOSCAN; } else { + r->substrs->check_ix = 1; r->check_end_shift = r->float_end_shift; r->check_substr = r->float_substr; r->check_utf8 = r->float_utf8; @@ -6354,6 +6977,8 @@ reStudy: if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->extflags |= RXf_INTUIT_TAIL; } + r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) if ( (STRLEN)minlen < longest_float_length ) minlen= longest_float_length; @@ -6363,22 +6988,23 @@ reStudy: } else { /* Several toplevels. Best we can is to set minlen. */ - I32 fake; - struct regnode_charclass_class ch_class; - I32 last_close = 0; + SSize_t fake; + regnode_ssc ch_class; + SSize_t last_close = 0; DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); scan = ri->program + 1; - cl_init(pRExC_state, &ch_class); + ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; data.last_closep = &last_close; - - minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, - &data, -1, NULL, NULL, - SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS - |(restudied ? SCF_TRIE_DOING_RESTUDY : 0), + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, + &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, + SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied + ? SCF_TRIE_DOING_RESTUDY + : 0), 0); CHECK_RESTUDY_GOTO_butfirst(NOOP); @@ -6386,51 +7012,61 @@ reStudy: r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - if (! TEST_SSC_EOS(data.start_class) - && !cl_is_anything(data.start_class)) + if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && ! ssc_is_anything(data.start_class)) { - const U32 n = add_data(pRExC_state, 1, "f"); - OP(data.start_class) = ANYOF_SYNTHETIC; + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); - Newx(RExC_rxi->data->data[n], 1, - struct regnode_charclass_class); + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); StructCopy(data.start_class, - (struct regnode_charclass_class*)RExC_rxi->data->data[n], - struct regnode_charclass_class); + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); + data.start_class = NULL; } } + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + r->maxlen = REG_INFTY; + } + else { + r->maxlen = RExC_maxlen; + } + /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", - (IV)minlen, (IV)r->minlen); + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); }); r->minlenret = minlen; if (r->minlen < minlen) r->minlen = minlen; - if (RExC_seen & REG_SEEN_GPOS) - r->extflags |= RXf_GPOS_SEEN; - if (RExC_seen & REG_SEEN_LOOKBEHIND) - r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */ + if (RExC_seen & REG_GPOS_SEEN) + r->intflags |= PREGf_GPOS_SEEN; + if (RExC_seen & REG_LOOKBEHIND_SEEN) + r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the + lookbehind */ if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; - if (RExC_seen & REG_SEEN_CANY) - r->extflags |= RXf_CANY_SEEN; - if (RExC_seen & REG_SEEN_VERBARG) + if (RExC_seen & REG_CANY_SEEN) + r->intflags |= PREGf_CANY_SEEN; + if (RExC_seen & REG_VERBARG_SEEN) { r->intflags |= PREGf_VERBARG_SEEN; r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ } - if (RExC_seen & REG_SEEN_CUTGROUP) + if (RExC_seen & REG_CUTGROUP_SEEN) r->intflags |= PREGf_CUTGROUP_SEEN; if (pm_flags & PMf_USE_RE_EVAL) r->intflags |= PREGf_USE_RE_EVAL; @@ -6439,7 +7075,20 @@ reStudy: else RXp_PAREN_NAMES(r) = NULL; + /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED + * so it can be used in pp.c */ + if (r->intflags & PREGf_ANCH) + r->extflags |= RXf_IS_ANCHORED; + + { + /* this is used to identify "special" patterns that might result + * in Perl NOT calling the regex engine and instead doing the match "itself", + * particularly special cases in split//. By having the regex compiler + * do this pattern matching at a regop level (instead of by inspecting the pattern) + * we avoid weird issues with equivalent patterns resulting in different behavior, + * AND we allow non Perl engines to get the same optimizations by the setting the + * flags appropriately - Yves */ regnode *first = ri->program + 1; U8 fop = OP(first); regnode *next = NEXTOPER(first); @@ -6449,16 +7098,28 @@ reStudy: r->extflags |= RXf_NULL; else if (PL_regkind[fop] == BOL && nop == END) r->extflags |= RXf_START_ONLY; - else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END) + else if (fop == PLUS + && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE + && OP(regnext(first)) == END) r->extflags |= RXf_WHITE; - else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END ) + else if ( r->extflags & RXf_SPLIT + && fop == EXACT + && STR_LEN(first) == 1 + && *(STRING(first)) == ' ' + && OP(regnext(first)) == END ) r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); } + + if (RExC_contains_locale) { + RXp_EXTFLAGS(r) |= RXf_TAINTED; + } + #ifdef DEBUGGING if (RExC_paren_names) { - ri->name_list_idx = add_data( pRExC_state, 1, "a" ); - ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); + ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); + ri->data->data[ri->name_list_idx] + = (void*)SvREFCNT_inc(RExC_paren_name_list); } else #endif ri->name_list_idx = 0; @@ -6473,15 +7134,17 @@ reStudy: /* assume we don't need to swap parens around before we match */ DEBUG_DUMP_r({ + DEBUG_RExC_seen(); PerlIO_printf(Perl_debug_log,"Final program:\n"); regdump(r); }); #ifdef RE_TRACK_PATTERN_OFFSETS DEBUG_OFFSETS_r(if (ri->u.offsets) { - const U32 len = ri->u.offsets[0]; - U32 i; + const STRLEN len = ri->u.offsets[0]; + STRLEN i; GET_RE_DEBUG_FLAGS_DECL; - PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); + PerlIO_printf(Perl_debug_log, + "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", @@ -6541,7 +7204,8 @@ Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, else if (flags & RXapif_NEXTKEY) return reg_named_buff_nextkey(rx, flags); else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", + (int)flags); return NULL; } } @@ -6667,7 +7331,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) { SV *ret; AV *av; - I32 length; + SSize_t length; struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; @@ -6678,11 +7342,12 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) } else if (flags & RXapif_ONE) { ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); av = MUTABLE_AV(SvRV(ret)); - length = av_len(av); + length = av_tindex(av); SvREFCNT_dec_NN(ret); return newSViv(length + 1); } else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", + (int)flags); return NULL; } } @@ -6730,8 +7395,8 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, { struct regexp *const rx = ReANY(r); char *s = NULL; - I32 i = 0; - I32 s1, t1; + SSize_t i = 0; + SSize_t s1, t1; I32 n = paren; PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; @@ -6789,9 +7454,9 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, } assert(s >= rx->subbeg); - assert(rx->sublen >= (s - rx->subbeg) + i ); + assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); if (i >= 0) { -#if NO_TAINT_SUPPORT +#ifdef NO_TAINT_SUPPORT sv_setpvn(sv, s, i); #else const int oldtainted = TAINT_get; @@ -6799,7 +7464,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, sv_setpvn(sv, s, i); TAINT_set(oldtainted); #endif - if ( (rx->extflags & RXf_CANY_SEEN) + if ( (rx->intflags & PREGf_CANY_SEEN) ? (RXp_MATCH_UTF8(rx) && (!i || is_utf8_string((U8*)s, i))) : (RXp_MATCH_UTF8(rx)) ) @@ -6959,7 +7624,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) PERL_ARGS_ASSERT_REG_SCAN_NAME; - if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + assert (RExC_parse <= RExC_end); + if (RExC_parse == RExC_end) NOOP; + else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { /* skip IDFIRST by using do...while */ if (UTF) do { @@ -6970,7 +7637,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) RExC_parse++; } while (isWORDCHAR(*RExC_parse)); } else { - RExC_parse++; /* so the <- from the vFAIL is after the offending character */ + RExC_parse++; /* so the <- from the vFAIL is after the offending + character */ vFAIL("Group name must start with a non-digit word character"); } if ( flags ) { @@ -7237,10 +7905,9 @@ Perl__new_invlist(pTHX_ IV initial_size) return new_list; } -#endif -STATIC SV* -S__new_invlist_C_array(pTHX_ const UV* const list) +SV* +Perl__new_invlist_C_array(pTHX_ const UV* const list) { /* Return a pointer to a newly constructed inversion list, initialized to * point to , which has to be in the exact correct inversion list @@ -7290,8 +7957,11 @@ S__new_invlist_C_array(pTHX_ const UV* const list) /* Initialize the iteration pointer. */ invlist_iterfinish(invlist); + SvREADONLY_on(invlist); + return invlist; } +#endif /* ifndef PERL_IN_XSUB_RE */ STATIC void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) @@ -7319,10 +7989,9 @@ S_invlist_trim(pTHX_ SV* const invlist) SvPV_shrink_to_cur((SV *) invlist); } -#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) - STATIC void -S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) +S__append_range_to_invlist(pTHX_ SV* const invlist, + const UV start, const UV end) { /* Subject to change or removal. Append the range from 'start' to 'end' at * the end of the inversion list. The range must be above any existing @@ -7496,7 +8165,8 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) } void -Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch) +Perl__invlist_populate_swatch(pTHX_ SV* const invlist, + const UV start, const UV end, U8* swatch) { /* populates a swatch of a swash the same way swatch_get() does in utf8.c, * but is used when the swash has an inversion list. This makes this much @@ -7589,14 +8259,16 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV } void -Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output) +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** output) { /* Take the union of two inversion lists and point to it. *output * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. The first list, - * , may be NULL, in which case a copy of the second list is returned. - * If is TRUE, the union is taken of the complement - * (inversion) of instead of b itself. + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *output will be made correspondingly + * mortal. The first list, , may be NULL, in which case a copy of the + * second list is returned. If is TRUE, the union is taken + * of the complement (inversion) of instead of b itself. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -7637,9 +8309,13 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b /* If either one is empty, the union is the other one */ if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + bool make_temp = FALSE; /* Should we mortalize the result? */ + if (*output == a) { if (a != NULL) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } } if (*output != b) { @@ -7648,18 +8324,27 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b _invlist_invert(*output); } } /* else *output already = b; */ + + if (make_temp) { + sv_2mortal(*output); + } return; } else if ((len_b = _invlist_len(b)) == 0) { + bool make_temp = FALSE; if (*output == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } /* The complement of an empty list is a list that has everything in it, * so the union with includes everything too */ if (complement_b) { if (a == *output) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } *output = _new_invlist(1); _append_range_to_invlist(*output, 0, UV_MAX); @@ -7668,6 +8353,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b *output = invlist_clone(a); } /* else *output already = a; */ + + if (make_temp) { + sv_2mortal(*output); + } return; } @@ -7807,24 +8496,36 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b } } - /* We may be removing a reference to one of the inputs */ + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ if (a == *output || b == *output) { assert(! invlist_is_iterating(*output)); - SvREFCNT_dec_NN(*output); + if ((SvTEMP(*output))) { + sv_2mortal(u); + } + else { + SvREFCNT_dec_NN(*output); + } } *output = u; + return; } void -Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i) +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** i) { /* Take the intersection of two inversion lists and point to it. *i * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. - * If is TRUE, the result will be the intersection of - * and the complement (or inversion) of instead of directly. + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *i will be made correspondingly mortal. + * The first list, , may be NULL, in which case an empty list is + * returned. If is TRUE, the result will be the + * intersection of and the complement (or inversion) of instead of + * directly. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -7862,6 +8563,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Special case if either one is empty */ len_a = (a == NULL) ? 0 : _invlist_len(a); if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { + bool make_temp = FALSE; if (len_a != 0 && complement_b) { @@ -7871,24 +8573,38 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * simply 'a'. */ if (*i != a) { if (*i == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } *i = invlist_clone(a); } /* else *i is already 'a' */ + + if (make_temp) { + sv_2mortal(*i); + } return; } /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The * intersection must be empty */ if (*i == a) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } else if (*i == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } *i = _new_invlist(0); + if (make_temp) { + sv_2mortal(*i); + } + return; } @@ -7993,7 +8709,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } /* The final length is what we've output so far plus what else is in the - * intersection. At most one of the subexpressions below will be non-zero */ + * intersection. At most one of the subexpressions below will be non-zero + * */ len_r = i_r; if (count >= 2) { len_r += (len_a - i_a) + (len_b - i_b); @@ -8018,13 +8735,21 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* We may be removing a reference to one of the inputs */ + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ if (a == *i || b == *i) { assert(! invlist_is_iterating(*i)); - SvREFCNT_dec_NN(*i); + if (SvTEMP(*i)) { + sv_2mortal(r); + } + else { + SvREFCNT_dec_NN(*i); + } } *i = r; + return; } @@ -8071,6 +8796,35 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) return invlist; } +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, + UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + _append_range_to_invlist(invlist, element0, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; +} + #endif PERL_STATIC_INLINE SV* @@ -8099,43 +8853,6 @@ Perl__invlist_invert(pTHX_ SV* const invlist) *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); } -void -Perl__invlist_invert_prop(pTHX_ SV* const invlist) -{ - /* Complement the input inversion list (which must be a Unicode property, - * all of which don't match above the Unicode maximum code point.) And - * Perl has chosen to not have the inversion match above that either. This - * adds a 0x110000 if the list didn't end with it, and removes it if it did - */ - - UV len; - UV* array; - - PERL_ARGS_ASSERT__INVLIST_INVERT_PROP; - - _invlist_invert(invlist); - - len = _invlist_len(invlist); - - if (len != 0) { /* If empty do nothing */ - array = invlist_array(invlist); - if (array[len - 1] != PERL_UNICODE_MAX + 1) { - /* Add 0x110000. First, grow if necessary */ - len++; - if (invlist_max(invlist) < len) { - invlist_extend(invlist, len); - array = invlist_array(invlist); - } - invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist)); - array[len - 1] = PERL_UNICODE_MAX + 1; - } - else { /* Remove the 0x110000 */ - invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist)); - } - } - - return; -} #endif PERL_STATIC_INLINE SV* @@ -8143,7 +8860,7 @@ S_invlist_clone(pTHX_ SV* const invlist) { /* Return a new inversion list that is a copy of the input one, which is - * unchanged */ + * unchanged. The new list will not be mortal even if the old one was. */ /* Need to allocate extra space to accommodate Perl's addition of a * trailing NUL to SvPV's, since it thinks they are always strings */ @@ -8305,7 +9022,8 @@ Perl__invlist_contents(pTHX_ SV* const invlist) #ifndef PERL_IN_XSUB_RE void -Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist) +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, + const char * const indent, SV* const invlist) { /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by @@ -8424,7 +9142,7 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) /* End of inversion list object */ STATIC void -S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) +S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) { /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' * constructs, and updates RExC_flags with them. On input, RExC_parse @@ -8484,7 +9202,6 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) } cs = REGEX_LOCALE_CHARSET; has_charset_modifier = LOCALE_PAT_MOD; - RExC_contains_locale = 1; break; case UNICODE_PAT_MOD: if (has_charset_modifier) { @@ -8538,7 +9255,8 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); } else if (has_charset_modifier == *(RExC_parse - 1)) { - vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1)); + vFAIL2("Regexp modifier \"%c\" may not appear twice", + *(RExC_parse - 1)); } else { vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); @@ -8546,12 +9264,15 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) /*NOTREACHED*/ neg_modifier: RExC_parse++; - vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1)); + vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", + *(RExC_parse - 1)); /*NOTREACHED*/ case ONCE_PAT_MOD: /* 'o' */ case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { - const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; + const I32 wflagbit = *RExC_parse == 'o' + ? WASTED_O + : WASTED_G; if (! (wastedflags & wflagbit) ) { wastedflags |= wflagbit; /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ @@ -8604,13 +9325,17 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) RExC_flags |= posflags; RExC_flags &= ~negflags; set_regex_charset(&RExC_flags, cs); + if (RExC_flags & RXf_PMf_FOLD) { + RExC_contains_i = 1; + } return; /*NOTREACHED*/ default: fail_modifiers: - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", - RExC_parse-seqstart, seqstart); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); /*NOTREACHED*/ } @@ -8685,7 +9410,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) char *start_arg = NULL; unsigned char op = 0; int argok = 1; - int internal_argval = 0; /* internal_argval is only useful if !argok */ + int internal_argval = 0; /* internal_argval is only useful if + !argok */ if (has_intervening_patws && SIZE_ONLY) { ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated"); @@ -8748,14 +9474,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* [19:06] :: is then */ if ( memEQs(start_verb,verb_len,"THEN") ) { op = CUTGROUP; - RExC_seen |= REG_SEEN_CUTGROUP; + RExC_seen |= REG_CUTGROUP_SEEN; } break; } if ( ! op ) { - RExC_parse++; - vFAIL3("Unknown verb pattern '%.*s'", - verb_len, start_verb); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL2utf8f( + "Unknown verb pattern '%"UTF8f"'", + UTF8fARG(UTF, verb_len, start_verb)); } if ( argok ) { if ( start_arg && internal_argval ) { @@ -8768,8 +9495,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, op, internal_argval); if ( ! internal_argval && ! SIZE_ONLY ) { if (start_arg) { - SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); - ARG(ret) = add_data( pRExC_state, 1, "S" ); + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); RExC_rxi->data->data[ARG(ret)]=(void*)sv; ret->flags = 0; } else { @@ -8778,7 +9507,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } if (!internal_argval) - RExC_seen |= REG_SEEN_VERBARG; + RExC_seen |= REG_VERBARG_SEEN; } else if ( start_arg ) { vFAIL3("Verb pattern '%.*s' may not have an argument", verb_len, start_verb); @@ -8808,17 +9537,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) goto named_recursion; } else if (paren == '=') { /* (?P=...) named backref */ - /* this pretty much dupes the code for \k in regatom(), if - you change this make sure you change that */ + /* this pretty much dupes the code for \k in + * regatom(), if you change this make sure you change that + * */ char* name_start = RExC_parse; U32 num = 0; SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); if (RExC_parse == name_start || *RExC_parse != ')') + /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.3s... not terminated",parse_start); if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -8843,7 +9574,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL3("Sequence (%.*s...) not recognized", + RExC_parse-seqstart, seqstart); /*NOTREACHED*/ case '<': /* (?<...) */ if (*RExC_parse == '!') @@ -8857,15 +9590,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '\'': /* (?'...') */ name_start= RExC_parse; svname = reg_scan_name(pRExC_state, - SIZE_ONLY ? /* reverse test from the others */ - REG_RSN_RETURN_NAME : - REG_RSN_RETURN_NULL); - if (RExC_parse == name_start) { - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); - /*NOTREACHED*/ - } - if (*RExC_parse != paren) + SIZE_ONLY /* reverse test from the others */ + ? REG_RSN_RETURN_NAME + : REG_RSN_RETURN_NULL); + if (RExC_parse == name_start || *RExC_parse != paren) vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); if (SIZE_ONLY) { @@ -8905,20 +9633,24 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } if ( count ) { - pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); + pv = (I32*)SvGROW(sv_dat, + SvCUR(sv_dat) + sizeof(I32)+1); SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); pv[count] = RExC_npar; SvIV_set(sv_dat, SvIVX(sv_dat) + 1); } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); - sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); + sv_setpvn(sv_dat, (char *)&(RExC_npar), + sizeof(I32)); SvIOK_on(sv_dat); SvIV_set(sv_dat, 1); } #ifdef DEBUGGING - /* Yes this does cause a memory leak in debugging Perls */ - if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) + /* Yes this does cause a memory leak in debugging Perls + * */ + if (!av_store(RExC_paren_name_list, + RExC_npar, SvREFCNT_inc(svname))) SvREFCNT_dec_NN(svname); #endif @@ -8928,7 +9660,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) paren = 1; goto capturing_parens; } - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; case '=': /* (?=...) */ @@ -8972,6 +9704,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); + RExC_seen |= REG_GOSTART_SEEN; *flagp |= POSTPONED; nextchar(pRExC_state); return ret; @@ -8986,6 +9719,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } + if (RExC_parse == RExC_end || *RExC_parse != ')') + vFAIL("Sequence (?&... not terminated"); goto gen_recurse_regop; assert(0); /* NOT REACHED */ case '+': @@ -9045,11 +9780,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ARG2L_SET( ret, RExC_recurse_count++); RExC_emit++; DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); + "Recurse #%"UVuf" to %"IVdf"\n", + (UV)ARG(ret), (IV)ARG2L(ret))); } else { RExC_size++; } - RExC_seen |= REG_SEEN_RECURSE; + RExC_seen |= REG_RECURSE_SEEN; Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ @@ -9063,7 +9799,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) is_logical = 1; if (*RExC_parse != '{') { RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f( + "Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); /*NOTREACHED*/ } *flagp |= POSTPONED; @@ -9092,14 +9831,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { OP *o = cb->block; if (cb->src_regex) { - n = add_data(pRExC_state, 2, "rl"); + n = add_data(pRExC_state, STR_WITH_LEN("rl")); RExC_rxi->data->data[n] = (void*)SvREFCNT_inc((SV*)cb->src_regex); RExC_rxi->data->data[n+1] = (void*)o; } else { - n = add_data(pRExC_state, 1, - (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); + n = add_data(pRExC_state, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); RExC_rxi->data->data[n] = (void*)o; } } @@ -9160,7 +9899,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) (ch == '>' ? '<' : ch)); RExC_parse++; if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -9190,7 +9929,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV *sv_dat; RExC_parse++; sv_dat = reg_scan_name(pRExC_state, - SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + SIZE_ONLY + ? REG_RSN_RETURN_NULL + : REG_RSN_RETURN_DATA); parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } ret = reganode(pRExC_state,INSUBP,parno); @@ -9199,6 +9940,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { /* (?(1)...) */ char c; + char *tmp; parno = atoi(RExC_parse++); while (isDIGIT(*RExC_parse)) @@ -9206,8 +9948,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, GROUPP, parno); insert_if_check_paren: - if ((c = *nextchar(pRExC_state)) != ')') + if (*(tmp = nextchar(pRExC_state)) != ')') { + /* nextchar also skips comments, so undo its work + * and skip over the the next character. + */ + RExC_parse = tmp; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; vFAIL("Switch condition not recognized"); + } insert_if: REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); br = regbranch(pRExC_state, &flags, 1,depth+1); @@ -9219,14 +9967,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); } else - REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); + REGTAIL(pRExC_state, br, reganode(pRExC_state, + LONGJMP, 0)); c = *nextchar(pRExC_state); if (flags&HASWIDTH) *flagp |= HASWIDTH; if (c == '|') { if (is_define) vFAIL("(?(DEFINE)....) does not allow branches"); - lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ + + /* Fake one for optimizer. */ + lastbr = reganode(pRExC_state, IFTHEN, 0); + if (!regbranch(pRExC_state, &flags, 1,depth+1)) { if (flags & RESTART_UTF8) { *flagp = RESTART_UTF8; @@ -9258,7 +10010,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } else { - vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unknown switch condition (?(...))"); } } case '[': /* (?[ ... ]) */ @@ -9292,7 +10045,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY ){ if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE + if (RExC_seen & REG_RECURSE_SEEN && !RExC_open_parens[parno-1]) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, @@ -9349,7 +10102,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { ender = reganode(pRExC_state, LONGJMP,0); - REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ + + /* Append to the previous. */ + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); } if (SIZE_ONLY) RExC_extralen += 2; /* Account for LONGJMP. */ @@ -9381,7 +10136,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case 1: case 2: ender = reganode(pRExC_state, CLOSE, parno); - if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { + if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting close paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ender))); @@ -9413,8 +10168,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("lsbr"); - regprop(RExC_rx, mysv_val1, lastbr); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, lastbr, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(lastbr), @@ -9428,20 +10183,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (have_branch && !SIZE_ONLY) { char is_nothing= 1; if (depth==1) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; /* Hook the tails of the branches to the closing node. */ for (br = ret; br; br = regnext(br)) { const U8 op = PL_regkind[OP(br)]; if (op == BRANCH) { REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); - if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender) + if ( OP(NEXTOPER(br)) != NOTHING + || regnext(NEXTOPER(br)) != ender) is_nothing= 0; } else if (op == BRANCHJ) { REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); /* for now we always disable this optimisation * / - if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender) + if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING + || regnext(NEXTOPER(NEXTOPER(br))) != ender) */ is_nothing= 0; } @@ -9452,8 +10209,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("NADA"); - regprop(RExC_rx, mysv_val1, ret); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, ret, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(ret), @@ -9695,6 +10452,19 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, OPFAIL); return ret; } + else if (min == max + && RExC_parse < RExC_end + && (*RExC_parse == '?' || *RExC_parse == '+')) + { + if (SIZE_ONLY) { + ckWARN2reg(RExC_parse + 1, + "Useless use of greediness modifier '%c'", + *RExC_parse); + } + /* Absorb the modifier, so later code doesn't see nor use + * it */ + nextchar(pRExC_state); + } do_curly: if ((flags&SIMPLE)) { @@ -9736,6 +10506,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ARG1_SET(ret, (U16)min); ARG2_SET(ret, (U16)max); } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; goto nest_check; } @@ -9773,6 +10545,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, STAR, ret, depth+1); ret->flags = 0; RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '*') { min = 0; @@ -9782,6 +10555,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, PLUS, ret, depth+1); ret->flags = 0; RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '+') { min = 1; @@ -9794,10 +10568,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) nest_check: if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ - ckWARN3reg(RExC_parse, - "%.*s matches null string many times", - (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), - origparse); + ckWARN2reg(RExC_parse, + "%"UTF8f" matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); (void)ReREFCNT_inc(RExC_rx_sv); } @@ -9827,8 +10603,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } STATIC bool -S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, - const bool strict /* Apply stricter parsing rules? */ +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, + UV *valuep, I32 *flagp, U32 depth, bool in_char_class, + const bool strict /* Apply stricter parsing rules? */ ) { @@ -9894,7 +10671,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ /* The [^\n] meaning of \N ignores spaces and comments under the /x - * modifier. The other meaning does not */ + * modifier. The other meaning does not, so use a temporary until we find + * out which we are being called with */ p = (RExC_flags & RXf_PMf_EXTENDED) ? regwhite( pRExC_state, RExC_parse ) : RExC_parse; @@ -9904,17 +10682,18 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I if (*p != '{' || regcurly(p, FALSE)) { RExC_parse = p; if (! node_p) { - /* no bare \N in a charclass */ + /* no bare \N allowed in a charclass */ if (in_char_class) { vFAIL("\\N in a character class must be a named character: \\N{...}"); } return FALSE; } + RExC_parse--; /* Need to back off so nextchar() doesn't skip the + current char */ nextchar(pRExC_state); *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; - RExC_parse--; Set_Node_Length(*node_p, 1); /* MJD */ return TRUE; } @@ -9933,8 +10712,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ || ! (endbrace == RExC_parse /* nothing between the {} */ - || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */ - && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below + */ + && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) + */ { if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ vFAIL("\\N{NAME} must be resolved by the lexer"); @@ -10140,7 +10921,9 @@ S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) } PERL_STATIC_INLINE void -S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point) +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, + regnode *node, I32* flagp, STRLEN len, UV code_point, + bool downgradable) { /* This knows the details about sizing an EXACTish node, setting flags for * it (by setting <*flagp>, and potentially populating it with a single @@ -10155,48 +10938,111 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 * If is zero, the function assumes that the node is to contain only * the single character given by and calculates what * should be. In pass 1, it sizes the node appropriately. In pass 2, it - * additionally will populate the node's STRING with , if - * is 0. In both cases <*flagp> is appropriately set + * additionally will populate the node's STRING with or its + * fold if folding. + * + * In both cases <*flagp> is appropriately set * * It knows that under FOLD, the Latin Sharp S and UTF characters above * 255, must be folded (the former only when the rules indicate it can - * match 'ss') */ + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ bool len_passed_in = cBOOL(len != 0); U8 character[UTF8_MAXBYTES_CASE+1]; PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + /* Don't bother to check for downgrading in PASS1, as it doesn't make any + * sizing difference, and is extra work that is thrown away */ + if (downgradable && ! PASS2) { + downgradable = FALSE; + } + if (! len_passed_in) { if (UTF) { - if (FOLD && (! LOC || code_point > 255)) { - _to_uni_fold_flags(NATIVE_TO_UNI(code_point), + if (UNI_IS_INVARIANT(code_point)) { + if (LOC || ! FOLD) { /* /l defers folding until runtime */ + *character = (U8) code_point; + } + else { /* Here is /i and not /l (toFOLD() is defined on just + ASCII, which isn't the same thing as INVARIANT on + EBCDIC, but it works there, as the extra invariants + fold to themselves) */ + *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } + } + len = 1; + } + else if (FOLD && (! LOC + || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) + { /* Folding, and ok to do so now */ + UV folded = _to_uni_fold_flags( + code_point, character, &len, - FOLD_FLAGS_FULL | ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) ? FOLD_FLAGS_NOMIX_ASCII : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } + } + else if (code_point <= MAX_UTF8_TWO_BYTE) { + + /* Not folding this cp, and can output it directly */ + *character = UTF8_TWO_BYTE_HI(code_point); + *(character + 1) = UTF8_TWO_BYTE_LO(code_point); + len = 2; } else { uvchr_to_utf8( character, code_point); len = UTF8SKIP(character); } - } - else if (! FOLD - || code_point != LATIN_SMALL_LETTER_SHARP_S - || ASCII_FOLD_RESTRICTED - || ! AT_LEAST_UNI_SEMANTICS) - { + } /* Else pattern isn't UTF8. */ + else if (! FOLD) { *character = (U8) code_point; len = 1; - } - else { + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { *character = 's'; *(character + 1) = 's'; len = 2; } + else { + *character = LATIN_SMALL_LETTER_SHARP_S; + len = 1; + } } if (SIZE_ONLY) { @@ -10220,8 +11066,29 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 { *flagp |= SIMPLE; } + + /* The OP may not be well defined in PASS1 */ + if (PASS2 && OP(node) == EXACTFL) { + RExC_contains_locale = 1; + } +} + + +/* return atoi(p), unless it's too big to sensibly be a backref, + * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ + +static I32 +S_backref_value(char *p) +{ + char *q = p; + + for (;isDIGIT(*q); q++); /* calculate length of num */ + if (q - p == 0 || q - p > 9) + return I32_MAX; + return atoi(p); } + /* - regatom - the lowest level @@ -10379,7 +11246,8 @@ tryagain: *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", + (UV) flags); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; @@ -10427,7 +11295,7 @@ tryagain: goto finish_meta_pat; case 'G': ret = reg_node(pRExC_state, GPOS); - RExC_seen |= REG_SEEN_GPOS; + RExC_seen |= REG_GPOS_SEEN; *flagp |= SIMPLE; goto finish_meta_pat; case 'K': @@ -10438,7 +11306,7 @@ tryagain: * be necessary here to avoid cases of memory corruption, as * with: C<$_="x" x 80; s/x\K/y/> -- rgs */ - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -10452,7 +11320,7 @@ tryagain: goto finish_meta_pat; case 'C': ret = reg_node(pRExC_state, CANY); - RExC_seen |= REG_SEEN_CANY; + RExC_seen |= REG_CANY_SEEN; *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'X': @@ -10469,30 +11337,38 @@ tryagain: case 'b': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = BOUND + get_regex_charset(RExC_flags); if (op > BOUNDA) { /* /aa is same as /a */ op = BOUNDA; } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); } goto finish_meta_pat; case 'B': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = NBOUND + get_regex_charset(RExC_flags); if (op > NBOUNDA) { /* /aa is same as /a */ op = NBOUNDA; } + else if (op == NBOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); } goto finish_meta_pat; @@ -10536,6 +11412,9 @@ tryagain: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } + else if (op == POSIXL) { + RExC_contains_locale = 1; + } join_posix_op_known: @@ -10610,6 +11489,7 @@ tryagain: char ch= RExC_parse[1]; if (ch != '<' && ch != '\'' && ch != '{') { RExC_parse++; + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.2s... not terminated",parse_start); } else { /* this pretty much dupes the code for (?P=...) in reg(), if @@ -10620,10 +11500,11 @@ tryagain: SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; if (RExC_parse == name_start || *RExC_parse != ch) + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.3s... not terminated",parse_start); if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -10655,10 +11536,11 @@ tryagain: case '5': case '6': case '7': case '8': case '9': { I32 num; - bool isg = *RExC_parse == 'g'; - bool isrel = 0; bool hasbrace = 0; - if (isg) { + + if (*RExC_parse == 'g') { + bool isrel = 0; + RExC_parse++; if (*RExC_parse == '{') { RExC_parse++; @@ -10672,25 +11554,40 @@ tryagain: if (isrel) RExC_parse--; RExC_parse -= 2; goto parse_named_seq; - } } - num = atoi(RExC_parse); - if (isg && num == 0) { - if (*RExC_parse == '0') { + } + + num = S_backref_value(RExC_parse); + if (num == 0) vFAIL("Reference to invalid group 0"); + else if (num == I32_MAX) { + if (isDIGIT(*RExC_parse)) + vFAIL("Reference to nonexistent group"); + else + vFAIL("Unterminated \\g... pattern"); } - else { - vFAIL("Unterminated \\g... pattern"); + + if (isrel) { + num = RExC_npar - num; + if (num < 1) + vFAIL("Reference to nonexistent or unclosed group"); } } - if (isrel) { - num = RExC_npar - num; - if (num < 1) - vFAIL("Reference to nonexistent or unclosed group"); - } - if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9') - /* Probably a character specified in octal, e.g. \35 */ - goto defchar; else { + num = S_backref_value(RExC_parse); + /* bare \NNN might be backref or octal - if it is larger than or equal + * RExC_npar then it is assumed to be and octal escape. + * Note RExC_npar is +1 from the actual number of parens*/ + if (num == I32_MAX || (num > 9 && num >= RExC_npar + && *RExC_parse != '8' && *RExC_parse != '9')) + { + /* Probably a character specified in octal, e.g. \35 */ + goto defchar; + } + } + + /* at this point RExC_parse definitely points to a backref + * number */ + { #ifdef RE_TRACK_PATTERN_OFFSETS char * const parse_start = RExC_parse - 1; /* MJD */ #endif @@ -10761,7 +11658,6 @@ tryagain: char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; char *s0; U8 upper_parse = MAX_NODE_STRING_SIZE; - STRLEN foldlen; U8 node_type = compute_EXACTish(pRExC_state); bool next_is_quantifier; char * oldp = NULL; @@ -10769,9 +11665,15 @@ tryagain: /* We can convert EXACTF nodes to EXACTFU if they contain only * characters that match identically regardless of the target * string's UTF8ness. The reason to do this is that EXACTF is not - * trie-able, EXACTFU is. (We don't need to figure this out until - * pass 2) */ - bool maybe_exactfu = node_type == EXACTF && PASS2; + * trie-able, EXACTFU is. + * + * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * contain only above-Latin1 characters (hence must be in UTF8), + * which don't participate in folds with Latin1-range characters, + * as the latter's folds aren't known until runtime. (We don't + * need to figure this out until pass 2) */ + bool maybe_exactfu = PASS2 + && (node_type == EXACTF || node_type == EXACTFL); /* If a folding node contains only code points that don't * participate in folds, it can be changed into an EXACT node, @@ -10788,10 +11690,9 @@ tryagain: reparse: - /* We do the EXACTFish to EXACT node only if folding, and not if in - * locale, as whether a character folds or not isn't known until - * runtime. (And we don't need to figure this out until pass 2) */ - maybe_exact = FOLD && ! LOC && PASS2; + /* We do the EXACTFish to EXACT node only if folding. (And we + * don't need to figure this out until pass 2) */ + maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -10856,7 +11757,8 @@ tryagain: case 's': case 'S': /* space class */ case 'v': case 'V': /* VERTWS */ case 'w': case 'W': /* word class */ - case 'X': /* eXtended Unicode "combining character sequence" */ + case 'X': /* eXtended Unicode "combining + character sequence" */ case 'z': case 'Z': /* End of line/string assertion */ --p; goto loopdone; @@ -10904,7 +11806,7 @@ tryagain: p++; break; case 'a': - ender = ASCII_TO_NATIVE('\007'); + ender = '\a'; p++; break; case 'o': @@ -10967,24 +11869,30 @@ tryagain: } case 'c': p++; - ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); + ender = grok_bslash_c(*p++, SIZE_ONLY); break; case '8': case '9': /* must be a backreference */ --p; goto loopdone; case '1': case '2': case '3':case '4': case '5': case '6': case '7': - /* When we parse backslash escapes there is ambiguity between - * backreferences and octal escapes. Any escape from \1 - \9 is - * a backreference, any multi-digit escape which does not start with - * 0 and which when evaluated as decimal could refer to an already - * parsed capture buffer is a backslash. Anything else is octal. + /* When we parse backslash escapes there is ambiguity + * between backreferences and octal escapes. Any escape + * from \1 - \9 is a backreference, any multi-digit + * escape which does not start with 0 and which when + * evaluated as decimal could refer to an already + * parsed capture buffer is a backslash. Anything else + * is octal. * - * Note this implies that \118 could be interpreted as 118 OR as - * "\11" . "8" depending on whether there were 118 capture buffers - * defined already in the pattern. - */ - if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar ) + * Note this implies that \118 could be interpreted as + * 118 OR as "\11" . "8" depending on whether there + * were 118 capture buffers defined already in the + * pattern. */ + + /* NOTE, RExC_npar is 1 more than the actual number of + * parens we have seen so far, hence the < RExC_npar below. */ + + if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) { /* Not to be treated as an octal constant, go find backref */ --p; @@ -11041,7 +11949,7 @@ tryagain: if (! SIZE_ONLY && RExC_flags & RXf_PMf_EXTENDED && ckWARN_d(WARN_DEPRECATED) - && is_PATWS_non_low(p, UTF)) + && is_PATWS_non_low_safe(p, RExC_end, UTF)) { vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), "Escape literal pattern white space under /x"); @@ -11079,7 +11987,10 @@ tryagain: goto loopdone; } - if (! FOLD) { + if (! FOLD /* The simple case, just append the literal */ + || (LOC /* Also don't fold for tricky chars under /l */ + && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) + { if (UTF) { const STRLEN unilen = reguni(pRExC_state, ender, s); if (unilen > 0) { @@ -11097,14 +12008,27 @@ tryagain: else { REGC((char)ender, s++); } + + /* Can get here if folding only if is one of the /l + * characters whose fold depends on the locale. The + * occurrence of any of these indicate that we can't + * simplify things */ + if (FOLD) { + maybe_exact = FALSE; + maybe_exactfu = FALSE; + } } - else /* FOLD */ + else /* FOLD */ if (! ( UTF /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S))) { + /* Here, are folding and are not UTF-8 encoded; therefore + * the character must be in the range 0-255, and is not /l + * (Not /l because we already handled these under /l in + * is_PROBLEMATIC_LOCALE_FOLD_cp */ if (IS_IN_SOME_FOLD_L1(ender)) { maybe_exact = FALSE; @@ -11116,77 +12040,68 @@ tryagain: || ender == LATIN_SMALL_LETTER_SHARP_S || (len > 0 && isARG2_lower_or_UPPER_ARG1('s', ender) - && isARG2_lower_or_UPPER_ARG1('s', *(s-1))))) + && isARG2_lower_or_UPPER_ARG1('s', + *(s-1))))) { maybe_exactfu = FALSE; } } + + /* Even when folding, we store just the input character, as + * we have an array that finds its fold quickly */ *(s++) = (char) ender; } - else { /* UTF */ - - /* Prime the casefolded buffer. Locale rules, which apply - * only to code points < 256, aren't known until execution, - * so for them, just output the original character using - * utf8. If we start to fold non-UTF patterns, be sure to - * update join_exact() */ - if (LOC && ender < 256) { - if (UNI_IS_INVARIANT(ender)) { - *s = (U8) ender; - foldlen = 1; - } else { - *s = UTF8_TWO_BYTE_HI(ender); - *(s + 1) = UTF8_TWO_BYTE_LO(ender); - foldlen = 2; - } + else { /* FOLD and UTF */ + /* Unlike the non-fold case, we do actually have to + * calculate the results here in pass 1. This is for two + * reasons, the folded length may be longer than the + * unfolded, and we have to calculate how many EXACTish + * nodes it will take; and we may run out of room in a node + * in the middle of a potential multi-char fold, and have + * to back off accordingly. (Hence we can't use REGC for + * the simple case just below.) */ + + UV folded; + if (isASCII(ender)) { + folded = toFOLD(ender); + *(s)++ = (U8) folded; } else { - UV folded = _to_uni_fold_flags( + STRLEN foldlen; + + folded = _to_uni_fold_flags( ender, (U8 *) s, &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) - ); + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + s += foldlen; - /* If this node only contains non-folding code points - * so far, see if this new one is also non-folding */ - if (maybe_exact) { - if (folded != ender) { - maybe_exact = FALSE; - } - else { - /* Here the fold is the original; we have - * to check further to see if anything - * folds to it */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", - "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = - _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } - if (_invlist_contains_cp(PL_utf8_foldable, + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len += foldlen - 1; + } + /* If this node only contains non-folding code points so + * far, see if this new one is also non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have to check + * further to see if anything folds to it */ + if (_invlist_contains_cp(PL_utf8_foldable, ender)) - { - maybe_exact = FALSE; - } + { + maybe_exact = FALSE; } } - ender = folded; } - s += foldlen; - - /* The loop increments each time, as all but this - * path (and one other) through it add a single byte to the - * EXACTish node. But this one has changed len to be the - * correct final value, so subtract one to cancel out the - * increment that follows */ - len += foldlen - 1; + ender = folded; } if (next_is_quantifier) { @@ -11235,9 +12150,8 @@ tryagain: if (! UTF) { - /* These two have no multi-char folds to non-UTF characters - */ - if (ASCII_FOLD_RESTRICTED || LOC) { + /* This has no multi-char folds to non-UTF characters */ + if (ASCII_FOLD_RESTRICTED) { goto loopdone; } @@ -11268,11 +12182,7 @@ tryagain: } } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - - /* No Latin1 characters participate in multi-char - * folds under /l */ - if (LOC - || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI( + if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( *s, *(s+1)))) { break; @@ -11383,7 +12293,7 @@ tryagain: * code points in the node that participate in folds; * similarly for 'maybe_exactfu' and code points that match * differently depending on UTF8ness of the target string - * */ + * (for /u), or depending on locale for /l */ if (maybe_exact) { OP(ret) = EXACT; } @@ -11391,7 +12301,12 @@ tryagain: OP(ret) = EXACTFU; } } - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); } RExC_parse = p - 1; @@ -11430,7 +12345,7 @@ S_regwhite( RExC_state_t *pRExC_state, char *p ) } } while (p < e); if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; } else break; @@ -11444,7 +12359,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) /* Returns the next non-pattern-white space, non-comment character (the * latter only if 'recognize_comment is true) in the string p, which is * ended by RExC_end. If there is no line break ending a comment, - * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */ + * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */ const char *e = RExC_end; PERL_ARGS_ASSERT_REGPATWS; @@ -11464,7 +12379,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) } } while (p < e); if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; } else break; @@ -11472,6 +12387,72 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) return p; } +STATIC void +S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) +{ + /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It + * sets up the bitmap and any flags, removing those code points from the + * inversion list, setting it to NULL should it become completely empty */ + + PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; + assert(PL_regkind[OP(node)] == ANYOF); + + ANYOF_BITMAP_ZERO(node); + if (*invlist_ptr) { + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through *invlist_ptr */ + invlist_iterinit(*invlist_ptr); + while (invlist_iternext(*invlist_ptr, &start, &end)) { + UV high; + int i; + + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; + } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } + + /* Quit if are above what we should change */ + if (start > 255) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < 255) ? end : 255; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(node, i)) { + ANYOF_BITMAP_SET(node, i); + } + } + } + invlist_iterfinish(*invlist_ptr); + + /* Done with loop; remove any code points that are in the bitmap from + * *invlist_ptr; similarly for code points above latin1 if we have a + * flag to match all of them anyways */ + if (change_invlist) { + _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + } + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + } + + /* If have completely emptied it, remove it completely */ + if (_invlist_len(*invlist_ptr) == 0) { + SvREFCNT_dec_NN(*invlist_ptr); + *invlist_ptr = NULL; + } + } +} + /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. Character classes ([:foo:]) can also be negated ([:^foo:]). Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. @@ -11590,8 +12571,9 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) } if (namedclass == OOB_NAMEDCLASS) - Simple_vFAIL3("POSIX class [:%.*s:] unknown", - t - s - 1, s + 1); + vFAIL2utf8f( + "POSIX class [:%"UTF8f":] unknown", + UTF8fARG(UTF, t - s - 1, s + 1)); /* The #defines are structured so each complement is +1 to * the normal one */ @@ -11679,8 +12661,9 @@ S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) } STATIC regnode * -S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth, - char * const oregcomp_parse) +S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, + I32 *flagp, U32 depth, + char * const oregcomp_parse) { /* Handle the (?[...]) construct to do set operations */ @@ -11716,7 +12699,10 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__REGEX_SETS), "The regex_sets feature is experimental" REPORT_LOCATION, - (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse); + UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), + UTF8fARG(UTF, + RExC_end - RExC_start - (RExC_parse - RExC_precomp), + RExC_precomp + (RExC_parse - RExC_precomp))); while (RExC_parse < RExC_end) { SV* current = NULL; @@ -12170,7 +13156,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ -#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len) +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ + (SvCUR(listsv) != initial_listsv_len) STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, @@ -12220,8 +13207,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ SV* properties = NULL; /* Code points that match \p{} \P{} */ - SV* posixes = NULL; /* Code points that match classes like, [:word:], - extended beyond the Latin1 range */ + SV* posixes = NULL; /* Code points that match classes like [:word:], + extended beyond the Latin1 range. These have to + be kept separate from other code points for much + of this function because their handling is + different under /i, and for most classes under + /d as well */ + SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept + separate for a while from the non-complemented + versions because of complications with /d + matching */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ AV * multi_char_matches = NULL; /* Code points that fold to more than one @@ -12248,11 +13243,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * string is in UTF-8. (Because is under /d) */ SV* depends_list = NULL; - /* inversion list of code points this node matches. For much of the - * function, it includes only those that match regardless of the utf8ness - * of the target string */ + /* Inversion list of code points this node matches regardless of things + * like locale, folding, utf8ness of the target string */ SV* cp_list = NULL; + /* Like cp_list, but code points on this list need to be checked for things + * that fold to/from them under /i */ + SV* cp_foldable_list = NULL; + + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + #ifdef EBCDIC /* In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ @@ -12260,14 +13262,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, #endif bool invert = FALSE; /* Is this class to be complemented */ - /* Is there any thing like \W or [:^digit:] that matches above the legal - * Unicode range? */ - bool runtime_posix_matches_above_Unicode = FALSE; + bool warn_super = ALWAYS_WARN_SUPER; regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; - const I32 orig_size = RExC_size; + const SSize_t orig_size = RExC_size; + bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -12288,9 +13289,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ANYOF_FLAGS(ret) = 0; RExC_emit += ANYOF_SKIP; - if (LOC) { - ANYOF_FLAGS(ret) |= ANYOF_LOCALE; - } listsv = newSVpvs_flags("# comment\n", SVs_TEMP); initial_listsv_len = SvCUR(listsv); SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ @@ -12433,7 +13431,12 @@ parseit: char *e; /* We will handle any undefined properties ourselves */ - U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF; + U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF + /* And we actually would prefer to get + * the straight inversion list of the + * swash, since we will be accessing it + * anyway, to save a little time */ + |_CORE_SWASH_INIT_ACCEPT_INVLIST; if (RExC_parse >= RExC_end) vFAIL2("Empty \\%c{}", (U8)value); @@ -12456,6 +13459,7 @@ parseit: } if (!SIZE_ONLY) { SV* invlist; + char* formatted; char* name; if (UCHARAT(RExC_parse) == '^') { @@ -12476,14 +13480,14 @@ parseit: * will have its name be <__NAME_i>. The design is * discussed in commit * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ - Newx(name, n + sizeof("_i__\n"), char); - - sprintf(name, "%s%.*s%s\n", - (FOLD) ? "__" : "", - (int)n, - RExC_parse, - (FOLD) ? "_i" : "" - ); + formatted = Perl_form(aTHX_ + "%s%.*s%s\n", + (FOLD) ? "__" : "", + (int)n, + RExC_parse, + (FOLD) ? "_i" : "" + ); + name = savepvn(formatted, strlen(formatted)); /* Look up the property name, and get its swash and * inversion list, if the property is found */ @@ -12508,11 +13512,13 @@ parseit: * otherwise add it to the list for run-time look up */ if (ret_invlist) { RExC_parse = e + 1; - vFAIL3("Property '%.*s' is unknown", (int) n, name); + vFAIL2utf8f( + "Property '%"UTF8f"' is unknown", + UTF8fARG(UTF, n, name)); } - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", (value == 'p' ? '+' : '!'), - name); + UTF8fARG(UTF, n, name)); has_user_defined_property = TRUE; /* We don't know yet, so have to assume that the @@ -12521,7 +13527,7 @@ parseit: * would cause things in to match * inappropriately, except that any \p{}, including * this one forces Unicode semantics, which means there - * is */ + * is no */ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; } else { @@ -12529,9 +13535,23 @@ parseit: /* Here, did get the swash and its inversion list. If * the swash is from a user-defined property, then this * whole character class should be regarded as such */ - has_user_defined_property = - (swash_init_flags - & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY); + if (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) + { + has_user_defined_property = TRUE; + } + else if + /* We warn on matching an above-Unicode code point + * if the match would return true, except don't + * warn for \p{All}, which has exactly one element + * = 0 */ + (_invlist_contains_cp(invlist, 0x110000) + && (! (_invlist_len(invlist) == 1 + && *invlist_array(invlist) == 0))) + { + warn_super = TRUE; + } + /* Invert if asking for the complement */ if (value == 'P') { @@ -12565,7 +13585,7 @@ parseit: case 'f': value = '\f'; break; case 'b': value = '\b'; break; case 'e': value = ASCII_TO_NATIVE('\033');break; - case 'a': value = ASCII_TO_NATIVE('\007');break; + case 'a': value = '\a'; break; case 'o': RExC_parse--; /* function expects to be pointed at the 'o' */ { @@ -12605,7 +13625,7 @@ parseit: goto recode_encoding; break; case 'c': - value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY); + value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': @@ -12677,31 +13697,8 @@ parseit: /* Here, we have the current token in 'value' */ - /* What matches in a locale is not known until runtime. This includes - * what the Posix classes (like \w, [:space:]) match. Room must be - * reserved (one time per class) to store such classes, either if Perl - * is compiled so that locale nodes always should have this space, or - * if there is such class info to be stored. The space will contain a - * bit for each named class that is to be matched against. This isn't - * needed for \p{} and pseudo-classes, as they are not affected by - * locale, and hence are dealt with separately */ - if (LOC - && ! need_class - && (ANYOF_LOCALE == ANYOF_CLASS - || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX))) - { - need_class = 1; - if (SIZE_ONLY) { - RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP; - } - else { - RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP; - ANYOF_CLASS_ZERO(ret); - } - ANYOF_FLAGS(ret) |= ANYOF_CLASS; - } - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + U8 classnum; /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a * literal, as is the character that began the false range, i.e. @@ -12712,16 +13709,19 @@ parseit: ? RExC_parse - rangebegin : 0; if (strict) { - vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin); + vFAIL2utf8f( + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); } else { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ - ckWARN4reg(RExC_parse, - "False [] range \"%*.*s\"", - w, w, rangebegin); + ckWARN2reg(RExC_parse, + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); (void)ReREFCNT_inc(RExC_rx_sv); cp_list = add_cp_to_invlist(cp_list, '-'); - cp_list = add_cp_to_invlist(cp_list, prevvalue); + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, + prevvalue); } } @@ -12729,16 +13729,89 @@ parseit: element_count += 2; /* So counts for three values */ } - if (! SIZE_ONLY) { - U8 classnum = namedclass_to_classnum(namedclass); - if (namedclass >= ANYOF_MAX) { /* If a special class */ + classnum = namedclass_to_classnum(namedclass); + + if (LOC && namedclass < ANYOF_POSIXL_MAX +#ifndef HAS_ISASCII + && classnum != _CC_ASCII +#endif + ) { + /* What the Posix classes (like \w, [:space:]) match in locale + * isn't knowable under locale until actual match time. Room + * must be reserved (one time per outer bracketed class) to + * store such classes. The space will contain a bit for each + * named class that is to be matched against. This isn't + * needed for \p{} and pseudo-classes, as they are not affected + * by locale, and hence are dealt with separately */ + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_POSIXL_ZERO(ret); + } + + /* See if it already matches the complement of this POSIX + * class */ + if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) + && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) + ? -1 + : 1))) + { + posixl_matches_all = TRUE; + break; /* No need to continue. Since it matches both + e.g., \w and \W, it matches everything, and the + bracketed class can be optimized into qr/./s */ + } + + /* Add this class to those that should be checked at runtime */ + ANYOF_POSIXL_SET(ret, namedclass); + + /* The above-Latin1 characters are not subject to locale rules. + * Just add them, in the second pass, to the + * unconditionally-matched list */ + if (! SIZE_ONLY) { + SV* scratch_list = NULL; + + /* Get the list of the above-Latin1 code points this + * matches */ + _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, + PL_XPosix_ptrs[classnum], + + /* Odd numbers are complements, like + * NDIGIT, NASCII, ... */ + namedclass % 2 != 0, + &scratch_list); + /* Checking if 'cp_list' is NULL first saves an extra + * clone. Its reference count will be decremented at the + * next union, etc, or if this is the only instance, at the + * end of the routine */ + if (! cp_list) { + cp_list = scratch_list; + } + else { + _invlist_union(cp_list, scratch_list, &cp_list); + SvREFCNT_dec_NN(scratch_list); + } + continue; /* Go get next character */ + } + } + else if (! SIZE_ONLY) { + + /* Here, not in pass1 (in that pass we skip calculating the + * contents of this class), and is /l, or is a POSIX class for + * which /l doesn't matter (or is a Unicode property, which is + * skipped here). */ + if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ - /* Here, should be \h, \H, \v, or \V. Neither /d nor - * /l make a difference in what these match. There - * would be problems if these characters had folds - * other than themselves, as cp_list is subject to - * folding. */ + /* Here, should be \h, \H, \v, or \V. None of /d, /i + * nor /l make a difference in what these match, + * therefore we just add what they match to cp_list. */ if (classnum != _CC_VERTSPACE) { assert( namedclass == ANYOF_HORIZWS || namedclass == ANYOF_NHORIZWS); @@ -12751,245 +13824,24 @@ parseit: _invlist_union_maybe_complement_2nd( cp_list, PL_XPosix_ptrs[classnum], - cBOOL(namedclass % 2), /* Complement if odd + namedclass % 2 != 0, /* Complement if odd (NHORIZWS, NVERTWS) */ &cp_list); } } - else if (classnum == _CC_ASCII) { -#ifdef HAS_ISASCII - if (LOC) { - ANYOF_CLASS_SET(ret, namedclass); - } - else -#endif /* Not isascii(); just use the hard-coded definition for it */ - _invlist_union_maybe_complement_2nd( - posixes, - PL_ASCII, - cBOOL(namedclass % 2), /* Complement if odd - (NASCII) */ - &posixes); - } - else { /* Garden variety class */ - - /* The ascii range inversion list */ - SV* ascii_source = PL_Posix_ptrs[classnum]; - - /* The full Latin1 range inversion list */ - SV* l1_source = PL_L1Posix_ptrs[classnum]; - - /* This code is structured into two major clauses. The - * first is for classes whose complete definitions may not - * already be known. It not, the Latin1 definition - * (guaranteed to already known) is used plus code is - * generated to load the rest at run-time (only if needed). - * If the complete definition is known, it drops down to - * the second clause, where the complete definition is - * known */ - - if (classnum < _FIRST_NON_SWASH_CC) { - - /* Here, the class has a swash, which may or not - * already be loaded */ - - /* The name of the property to use to match the full - * eXtended Unicode range swash for this character - * class */ - const char *Xname = swash_property_names[classnum]; - - /* If returning the inversion list, we can't defer - * getting this until runtime */ - if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) { - PL_utf8_swash_ptrs[classnum] = - _core_swash_init("utf8", Xname, &PL_sv_undef, - 1, /* binary */ - 0, /* not tr/// */ - NULL, /* No inversion list */ - NULL /* No flags */ - ); - assert(PL_utf8_swash_ptrs[classnum]); - } - if ( ! PL_utf8_swash_ptrs[classnum]) { - if (namedclass % 2 == 0) { /* A non-complemented - class */ - /* If not /a matching, there are code points we - * don't know at compile time. Arrange for the - * unknown matches to be loaded at run-time, if - * needed */ - if (! AT_LEAST_ASCII_RESTRICTED) { - Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n", - Xname); - } - if (LOC) { /* Under locale, set run-time - lookup */ - ANYOF_CLASS_SET(ret, namedclass); - } - else { - /* Add the current class's code points to - * the running total */ - _invlist_union(posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : l1_source, - &posixes); - } - } - else { /* A complemented class */ - if (AT_LEAST_ASCII_RESTRICTED) { - /* Under /a should match everything above - * ASCII, plus the complement of the set's - * ASCII matches */ - _invlist_union_complement_2nd(posixes, - ascii_source, - &posixes); - } - else { - /* Arrange for the unknown matches to be - * loaded at run-time, if needed */ - Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n", - Xname); - runtime_posix_matches_above_Unicode = TRUE; - if (LOC) { - ANYOF_CLASS_SET(ret, namedclass); - } - else { - - /* We want to match everything in - * Latin1, except those things that - * l1_source matches */ - SV* scratch_list = NULL; - _invlist_subtract(PL_Latin1, l1_source, - &scratch_list); - - /* Add the list from this class to the - * running total */ - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, - scratch_list, - &posixes); - SvREFCNT_dec_NN(scratch_list); - } - if (DEPENDS_SEMANTICS) { - ANYOF_FLAGS(ret) - |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - } - } - goto namedclass_done; - } - - /* Here, there is a swash loaded for the class. If no - * inversion list for it yet, get it */ - if (! PL_XPosix_ptrs[classnum]) { - PL_XPosix_ptrs[classnum] - = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]); - } - } - - /* Here there is an inversion list already loaded for the - * entire class */ - - if (namedclass % 2 == 0) { /* A non-complemented class, - like ANYOF_PUNCT */ - if (! LOC) { - /* For non-locale, just add it to any existing list - * */ - _invlist_union(posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : PL_XPosix_ptrs[classnum], - &posixes); - } - else { /* Locale */ - SV* scratch_list = NULL; - - /* For above Latin1 code points, we use the full - * Unicode range */ - _invlist_intersection(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - &scratch_list); - /* And set the output to it, adding instead if - * there already is an output. Checking if - * 'posixes' is NULL first saves an extra clone. - * Its reference count will be decremented at the - * next union, etc, or if this is the only - * instance, at the end of the routine */ - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } - -#ifndef HAS_ISBLANK - if (namedclass != ANYOF_BLANK) { -#endif - /* Set this class in the node for runtime - * matching */ - ANYOF_CLASS_SET(ret, namedclass); -#ifndef HAS_ISBLANK - } - else { - /* No isblank(), use the hard-coded ASCII-range - * blanks, adding them to the running total. */ - - _invlist_union(posixes, ascii_source, &posixes); - } -#endif - } - } - else { /* A complemented class, like ANYOF_NPUNCT */ - if (! LOC) { - _invlist_union_complement_2nd( - posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : PL_XPosix_ptrs[classnum], - &posixes); - /* Under /d, everything in the upper half of the - * Latin1 range matches this complement */ - if (DEPENDS_SEMANTICS) { - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - else { /* Locale */ - SV* scratch_list = NULL; - _invlist_subtract(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - &scratch_list); - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } -#ifndef HAS_ISBLANK - if (namedclass != ANYOF_NBLANK) { -#endif - ANYOF_CLASS_SET(ret, namedclass); -#ifndef HAS_ISBLANK - } - else { - /* Get the list of all code points in Latin1 - * that are not ASCII blanks, and add them to - * the running total */ - _invlist_subtract(PL_Latin1, ascii_source, - &scratch_list); - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } -#endif - } - } + else { /* Garden variety class. If is NASCII, NDIGIT, ... + complement and use nposixes */ + SV** posixes_ptr = namedclass % 2 == 0 + ? &posixes + : &nposixes; + SV** source_ptr = &PL_XPosix_ptrs[classnum]; + _invlist_union_maybe_complement_2nd( + *posixes_ptr, + *source_ptr, + namedclass % 2 != 0, + posixes_ptr); } - namedclass_done: continue; /* Go get next character */ } } /* end of namedclass \blah */ @@ -13009,7 +13861,9 @@ parseit: if (range) { if (prevvalue > value) /* b-a */ { const int w = RExC_parse - rangebegin; - Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); + vFAIL2utf8f( + "Invalid [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); range = 0; /* not a valid range */ } } @@ -13090,11 +13944,9 @@ parseit: value, foldbuf, &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) + FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED + ? FOLD_FLAGS_NOMIX_ASCII + : 0) ); /* Here, should be the first character of the @@ -13162,7 +14014,8 @@ parseit: /* Deal with this element of the class */ if (! SIZE_ONLY) { #ifndef EBCDIC - cp_list = _add_range_to_invlist(cp_list, prevvalue, value); + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); #else SV* this_range = _new_invlist(1); _append_range_to_invlist(this_range, prevvalue, value); @@ -13176,13 +14029,18 @@ parseit: * included. literal_endpoint==2 means both ends of the range used * a literal character, not \x{foo} */ if (literal_endpoint == 2 - && (prevvalue >= 'a' && value <= 'z') - || (prevvalue >= 'A' && value <= 'Z')) + && ((prevvalue >= 'a' && value <= 'z') + || (prevvalue >= 'A' && value <= 'Z'))) { - _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA], + _invlist_intersection(this_range, PL_ASCII, + &this_range); + + /* Since this above only contains ascii, the intersection of it + * with anything will still yield only ascii */ + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], &this_range); } - _invlist_union(cp_list, this_range, &cp_list); + _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); literal_endpoint = 0; #endif } @@ -13212,7 +14070,7 @@ parseit: #endif /* Look at the longest folds first */ - for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) { + for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { if (av_exists(multi_char_matches, cp_count)) { AV** this_array_ptr; @@ -13267,15 +14125,29 @@ parseit: return ret; } - /* If the character class contains only a single element, it may be - * optimizable into another node type which is smaller and runs faster. - * Check if this is the case for this class */ - if (element_count == 1 && ! ret_invlist) { + /* Here, we've gone through the entire class and dealt with multi-char + * folds. We are now in a position that we can do some checks to see if we + * can optimize this ANYOF node into a simpler one, even in Pass 1. + * Currently we only do two checks: + * 1) is in the unlikely event that the user has specified both, eg. \w and + * \W under /l, then the class matches everything. (This optimization + * is done only to make the optimizer code run later work.) + * 2) if the character class contains only a single element (including a + * single range), we see if there is an equivalent node for it. + * Other checks are possible */ + if (! ret_invlist /* Can't optimize if returning the constructed + inversion list */ + && (UNLIKELY(posixl_matches_all) || element_count == 1)) + { U8 op = END; U8 arg = 0; - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or - [:digit:] or \p{foo} */ + if (UNLIKELY(posixl_matches_all)) { + op = SANY; + } + else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like + \w or [:digit:] or \p{foo} + */ /* All named classes are mapped into POSIXish nodes, with its FLAG * argument giving which class it is */ @@ -13331,14 +14203,6 @@ parseit: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } -#ifndef HAS_ISBLANK - if (op == POSIXL - && (namedclass == ANYOF_BLANK - || namedclass == ANYOF_NBLANK)) - { - op = POSIXA; - } -#endif join_posix: /* The odd numbered ones are the complements of the @@ -13393,13 +14257,16 @@ parseit: /* To get locale nodes to not use the full ANYOF size would * require moving the code above that writes the portions * of it that aren't in other nodes to after this point. - * e.g. ANYOF_CLASS_SET */ + * e.g. ANYOF_POSIXL_SET */ RExC_size = orig_size; } } else { RExC_emit = (regnode *)orig_emit; if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } if (invert) { op += NPOSIXD - POSIXD; } @@ -13415,13 +14282,17 @@ parseit: *flagp |= HASWIDTH|SIMPLE; } else if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } RExC_parse = (char *) cur_parse; SvREFCNT_dec(posixes); + SvREFCNT_dec(nposixes); SvREFCNT_dec(cp_list); + SvREFCNT_dec(cp_foldable_list); return ret; } } @@ -13432,238 +14303,260 @@ parseit: /* If folding, we calculate all characters that could fold to or from the * ones already on the list */ - if (FOLD && cp_list) { - UV start, end; /* End points of code point ranges */ + if (cp_foldable_list) { + if (FOLD) { + UV start, end; /* End points of code point ranges */ + + SV* fold_intersection = NULL; + SV** use_list; + + /* Our calculated list will be for Unicode rules. For locale + * matching, we have to keep a separate list that is consulted at + * runtime only when the locale indicates Unicode rules. For + * non-locale, we just use to the general list */ + if (LOC) { + use_list = &only_utf8_locale_list; + } + else { + use_list = &cp_list; + } - SV* fold_intersection = NULL; + /* Only the characters in this class that participate in folds need + * be checked. Get the intersection of this class and all the + * possible characters that are foldable. This can quickly narrow + * down a large class */ + _invlist_intersection(PL_utf8_foldable, cp_foldable_list, + &fold_intersection); - /* If the highest code point is within Latin1, we can use the - * compiled-in Alphas list, and not have to go out to disk. This - * yields two false positives, the masculine and feminine ordinal - * indicators, which are weeded out below using the - * IS_IN_SOME_FOLD_L1() macro */ - if (invlist_highest(cp_list) < 256) { - _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list, - &fold_intersection); - } - else { + /* The folds for all the Latin1 characters are hard-coded into this + * program, but we have to go out to disk to get the others. */ + if (invlist_highest(cp_foldable_list) >= 256) { + + /* This is a hash that for a particular fold gives all + * characters that are involved in it */ + if (! PL_utf8_foldclosures) { - /* Here, there are non-Latin1 code points, so we will have to go - * fetch the list of all the characters that participate in folds - */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } - - /* This is a hash that for a particular fold gives all characters - * that are involved in it */ - if (! PL_utf8_foldclosures) { - - /* If we were unable to find any folds, then we likely won't be - * able to find the closures. So just create an empty list. - * Folding will effectively be restricted to the non-Unicode - * rules hard-coded into Perl. (This case happens legitimately - * during compilation of Perl itself before the Unicode tables - * are generated) */ - if (_invlist_len(PL_utf8_foldable) == 0) { - PL_utf8_foldclosures = newHV(); - } - else { /* If the folds haven't been read in, call a fold function * to force that */ if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; + U8 dummy[UTF8_MAXBYTES_CASE+1]; /* This string is just a short named one above \xff */ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); assert(PL_utf8_tofold); /* Verify that worked */ } - PL_utf8_foldclosures = - _swash_inversion_hash(PL_utf8_tofold); + PL_utf8_foldclosures + = _swash_inversion_hash(PL_utf8_tofold); } } - /* Only the characters in this class that participate in folds need - * be checked. Get the intersection of this class and all the - * possible characters that are foldable. This can quickly narrow - * down a large class */ - _invlist_intersection(PL_utf8_foldable, cp_list, - &fold_intersection); - } - - /* Now look at the foldable characters in this class individually */ - invlist_iterinit(fold_intersection); - while (invlist_iternext(fold_intersection, &start, &end)) { - UV j; - - /* Locale folding for Latin1 characters is deferred until runtime */ - if (LOC && start < 256) { - start = 256; - } - - /* Look at every character in the range */ - for (j = start; j <= end; j++) { - - U8 foldbuf[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - SV** listp; - - if (j < 256) { - - /* We have the latin1 folding rules hard-coded here so that - * an innocent-looking character class, like /[ks]/i won't - * have to go out to disk to find the possible matches. - * XXX It would be better to generate these via regen, in - * case a new version of the Unicode standard adds new - * mappings, though that is not really likely, and may be - * caught by the default: case of the switch below. */ - - if (IS_IN_SOME_FOLD_L1(j)) { - - /* ASCII is always matched; non-ASCII is matched only - * under Unicode rules */ - if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) { - cp_list = - add_cp_to_invlist(cp_list, PL_fold_latin1[j]); - } - else { - depends_list = - add_cp_to_invlist(depends_list, PL_fold_latin1[j]); + /* Now look at the foldable characters in this class individually */ + invlist_iterinit(fold_intersection); + while (invlist_iternext(fold_intersection, &start, &end)) { + UV j; + + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + SV** listp; + + if (j < 256) { + + /* We have the latin1 folding rules hard-coded here so + * that an innocent-looking character class, like + * /[ks]/i won't have to go out to disk to find the + * possible matches. XXX It would be better to + * generate these via regen, in case a new version of + * the Unicode standard adds new mappings, though that + * is not really likely, and may be caught by the + * default: case of the switch below. */ + + if (IS_IN_SOME_FOLD_L1(j)) { + + /* ASCII is always matched; non-ASCII is matched + * only under Unicode rules (which could happen + * under /l if the locale is a UTF-8 one */ + if (isASCII(j) || ! DEPENDS_SEMANTICS) { + *use_list = add_cp_to_invlist(*use_list, + PL_fold_latin1[j]); + } + else { + depends_list = + add_cp_to_invlist(depends_list, + PL_fold_latin1[j]); + } } - } - if (HAS_NONLATIN1_FOLD_CLOSURE(j) - && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) - { - /* Certain Latin1 characters have matches outside - * Latin1. To get here, is one of those - * characters. None of these matches is valid for - * ASCII characters under /aa, which is why the 'if' - * just above excludes those. These matches only - * happen when the target string is utf8. The code - * below adds the single fold closures for to the - * inversion list. */ - switch (j) { - case 'k': - case 'K': - cp_list = - add_cp_to_invlist(cp_list, KELVIN_SIGN); - break; - case 's': - case 'S': - cp_list = add_cp_to_invlist(cp_list, + if (HAS_NONLATIN1_FOLD_CLOSURE(j) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + { + /* Certain Latin1 characters have matches outside + * Latin1. To get here, is one of those + * characters. None of these matches is valid for + * ASCII characters under /aa, which is why the 'if' + * just above excludes those. These matches only + * happen when the target string is utf8. The code + * below adds the single fold closures for to the + * inversion list. */ + + switch (j) { + case 'k': + case 'K': + *use_list = + add_cp_to_invlist(*use_list, KELVIN_SIGN); + break; + case 's': + case 'S': + *use_list = add_cp_to_invlist(*use_list, LATIN_SMALL_LETTER_LONG_S); - break; - case MICRO_SIGN: - cp_list = add_cp_to_invlist(cp_list, + break; + case MICRO_SIGN: + *use_list = add_cp_to_invlist(*use_list, GREEK_CAPITAL_LETTER_MU); - cp_list = add_cp_to_invlist(cp_list, - GREEK_SMALL_LETTER_MU); - break; - case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: - case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - cp_list = - add_cp_to_invlist(cp_list, ANGSTROM_SIGN); - break; - case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - cp_list = add_cp_to_invlist(cp_list, + *use_list = add_cp_to_invlist(*use_list, + GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *use_list = + add_cp_to_invlist(*use_list, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *use_list = add_cp_to_invlist(*use_list, LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); - break; - case LATIN_SMALL_LETTER_SHARP_S: - cp_list = add_cp_to_invlist(cp_list, + break; + case LATIN_SMALL_LETTER_SHARP_S: + *use_list = add_cp_to_invlist(*use_list, LATIN_CAPITAL_LETTER_SHARP_S); - break; - case 'F': case 'f': - case 'I': case 'i': - case 'L': case 'l': - case 'T': case 't': - case 'A': case 'a': - case 'H': case 'h': - case 'J': case 'j': - case 'N': case 'n': - case 'W': case 'w': - case 'Y': case 'y': - /* These all are targets of multi-character - * folds from code points that require UTF8 to - * express, so they can't match unless the - * target string is in UTF-8, so no action here - * is necessary, as regexec.c properly handles - * the general case for UTF-8 matching and - * multi-char folds */ - break; - default: - /* Use deprecated warning to increase the - * chances of this being output */ - ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); - break; + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character + * folds from code points that require UTF8 + * to express, so they can't match unless + * the target string is in UTF-8, so no + * action here is necessary, as regexec.c + * properly handles the general case for + * UTF-8 matching and multi-char folds */ + break; + default: + /* Use deprecated warning to increase the + * chances of this being output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); + break; + } } + continue; } - continue; - } - /* Here is an above Latin1 character. We don't have the rules - * hard-coded for it. First, get its fold. This is the simple - * fold, as the multi-character folds have been handled earlier - * and separated out */ - _to_uni_fold_flags(j, foldbuf, &foldlen, - ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); - - /* Single character fold of above Latin1. Add everything in - * its fold closure to the list that this node should match. - * The fold closures data structure is a hash with the keys - * being the UTF-8 of every character that is folded to, like - * 'k', and the values each an array of all code points that - * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. - * Multi-character folds are not included */ - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) foldbuf, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_len(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c = SvUV(*c_p); - - /* /aa doesn't allow folds between ASCII and non-; /l - * doesn't allow them between above and below 256 */ - if ((ASCII_FOLD_RESTRICTED - && (isASCII(c) != isASCII(j))) - || (LOC && c < 256)) { - continue; - } + /* Here is an above Latin1 character. We don't have the + * rules hard-coded for it. First, get its fold. This is + * the simple fold, as the multi-character folds have been + * handled earlier and separated out */ + _to_uni_fold_flags(j, foldbuf, &foldlen, + (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0); + + /* Single character fold of above Latin1. Add everything in + * its fold closure to the list that this node should match. + * The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c = SvUV(*c_p); - /* Folds involving non-ascii Latin1 characters - * under /d are added to a separate list */ - if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) - { - cp_list = add_cp_to_invlist(cp_list, c); - } - else { - depends_list = add_cp_to_invlist(depends_list, c); + /* /aa doesn't allow folds between ASCII and non- */ + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j)))) + { + continue; + } + + /* Folds under /l which cross the 255/256 boundary + * are added to a separate list. (These are valid + * only when the locale is UTF-8.) */ + if (c < 256 && LOC) { + *use_list = add_cp_to_invlist(*use_list, c); + continue; + } + + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + /* Similarly folds involving non-ascii Latin1 + * characters under /d are added to their list */ + depends_list = add_cp_to_invlist(depends_list, + c); + } } } } } + SvREFCNT_dec_NN(fold_intersection); } - SvREFCNT_dec_NN(fold_intersection); + + /* Now that we have finished adding all the folds, there is no reason + * to keep the foldable list separate */ + _invlist_union(cp_list, cp_foldable_list, &cp_list); + SvREFCNT_dec_NN(cp_foldable_list); } /* And combine the result (if any) with any inversion list from posix * classes. The lists are kept separate up to now because we don't want to * fold the classes (folding of those is automatically handled by the swash * fetching code) */ - if (posixes) { + if (posixes || nposixes) { + if (posixes && AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, nothing above ASCII matches these */ + _invlist_intersection(posixes, + PL_XPosix_ptrs[_CC_ASCII], + &posixes); + } + if (nposixes) { + if (DEPENDS_SEMANTICS) { + /* Under /d, everything in the upper half of the Latin1 range + * matches these complements */ + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; + } + else if (AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, everything above ASCII matches these + * complements */ + _invlist_union_complement_2nd(nposixes, + PL_XPosix_ptrs[_CC_ASCII], + &nposixes); + } + if (posixes) { + _invlist_union(posixes, nposixes, &posixes); + SvREFCNT_dec_NN(nposixes); + } + else { + posixes = nposixes; + } + } if (! DEPENDS_SEMANTICS) { if (cp_list) { _invlist_union(cp_list, posixes, &cp_list); @@ -13677,10 +14570,8 @@ parseit: /* Under /d, we put into a separate list the Latin1 things that * match only when the target string is utf8 */ SV* nonascii_but_latin1_properties = NULL; - _invlist_intersection(posixes, PL_Latin1, + _invlist_intersection(posixes, PL_UpperLatin1, &nonascii_but_latin1_properties); - _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII, - &nonascii_but_latin1_properties); _invlist_subtract(posixes, nonascii_but_latin1_properties, &posixes); if (cp_list) { @@ -13714,7 +14605,6 @@ parseit: * , because having a Unicode property forces Unicode * semantics */ if (properties) { - bool warn_super = ! has_user_defined_property; if (cp_list) { /* If it matters to the final outcome, see if a non-property @@ -13725,14 +14615,8 @@ parseit: * are using above-Unicode code points indicates they should know * the issues involved */ if (warn_super) { - bool non_prop_matches_above_Unicode = - runtime_posix_matches_above_Unicode - | (invlist_highest(cp_list) > PERL_UNICODE_MAX); - if (invert) { - non_prop_matches_above_Unicode = - ! non_prop_matches_above_Unicode; - } - warn_super = ! non_prop_matches_above_Unicode; + warn_super = ! (invert + ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); } _invlist_union(properties, cp_list, &cp_list); @@ -13743,7 +14627,7 @@ parseit: } if (warn_super) { - OP(ret) = ANYOF_WARN_SUPER; + ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; } } @@ -13756,12 +14640,32 @@ parseit: * shouldn't. Therefore we can't invert folded locale now, as it won't be * folded until runtime */ + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching). We know to set the flag if we have a non-NULL list for UTF-8 + * locales, or the class matches at least one 0-255 range code point */ + if (LOC && FOLD) { + if (only_utf8_locale_list) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + else if (cp_list) { /* Look to see if there a 0-255 code point is in + the list */ + UV start, end; + invlist_iterinit(cp_list); + if (invlist_iternext(cp_list, &start, &end) && start < 256) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + invlist_iterfinish(cp_list); + } + } + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known * at compile time. Besides not inverting folded locale now, we can't * invert if there are things such as \w, which aren't known until runtime * */ if (invert - && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS))) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) && ! depends_list && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { @@ -13791,15 +14695,6 @@ parseit: return orig_emit; } - /* If we didn't do folding, it's because some information isn't available - * until runtime; set the run-time fold flag for these. (We don't have to - * worry about properties folding, as that is taken care of by the swash - * fetching) */ - if (FOLD && LOC) - { - ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; - } - /* Some character classes are equivalent to other nodes. Such nodes take * up less room and generally fewer operations to execute than ANYOF nodes. * Above, we checked for and optimized into some such equivalents for @@ -13818,8 +14713,13 @@ parseit: if (cp_list && ! invert && ! depends_list - && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS) - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + + /* We don't optimize if we are supposed to make sure all non-Unicode + * code points raise a warning, as only ANYOF nodes have this check. + * */ + && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) { UV start, end; U8 op = END; /* The optimzation node-type */ @@ -13843,7 +14743,7 @@ parseit: && (start < 256 || UTF)) { /* Here, the list contains a single code point. Can optimize - * into an EXACT node */ + * into an EXACTish node */ value = start; @@ -13873,12 +14773,6 @@ parseit: } } else { - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } if (_invlist_contains_cp(PL_utf8_foldable, value)) { op = EXACT; } @@ -13918,7 +14812,9 @@ parseit: RExC_parse = (char *)cur_parse; if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } SvREFCNT_dec_NN(cp_list); @@ -13931,53 +14827,8 @@ parseit: * for things that belong in the bitmap, put them there, and delete from * . While we are at it, see if everything above 255 is in the * list, and if so, set a flag to speed up execution */ - ANYOF_BITMAP_ZERO(ret); - if (cp_list) { - - /* This gets set if we actually need to modify things */ - bool change_invlist = FALSE; - - UV start, end; - - /* Start looking through */ - invlist_iterinit(cp_list); - while (invlist_iternext(cp_list, &start, &end)) { - UV high; - int i; - - if (end == UV_MAX && start <= 256) { - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; - } - - /* Quit if are above what we should change */ - if (start > 255) { - break; - } - change_invlist = TRUE; - - /* Set all the bits in the range, up to the max that we are doing */ - high = (end < 255) ? end : 255; - for (i = start; i <= (int) high; i++) { - if (! ANYOF_BITMAP_TEST(ret, i)) { - ANYOF_BITMAP_SET(ret, i); - } - } - } - invlist_iterfinish(cp_list); - - /* Done with loop; remove any code points that are in the bitmap from - * */ - if (change_invlist) { - _invlist_subtract(cp_list, PL_Latin1, &cp_list); - } - - /* If have completely emptied it, remove it completely */ - if (_invlist_len(cp_list) == 0) { - SvREFCNT_dec_NN(cp_list); - cp_list = NULL; - } - } + populate_ANYOF_from_invlist(ret, &cp_list); if (invert) { ANYOF_FLAGS(ret) |= ANYOF_INVERT; @@ -13994,6 +14845,7 @@ parseit: else { cp_list = depends_list; } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; } /* If there is a swash and more than one element, we can't use the swash in @@ -14003,56 +14855,104 @@ parseit: swash = NULL; } - if (! cp_list - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - { - ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); + set_ANYOF_arg(pRExC_state, ret, cp_list, + (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv : NULL, + only_utf8_locale_list, + swash, has_user_defined_property); + + *flagp |= HASWIDTH|SIMPLE; + + if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + + return ret; +} + +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + +STATIC void +S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, + regnode* const node, + SV* const cp_list, + SV* const runtime_defns, + SV* const only_utf8_locale_list, + SV* const swash, + const bool has_user_defined_property) +{ + /* Sets the arg field of an ANYOF-type node 'node', using information about + * the node passed-in. If there is nothing outside the node's bitmap, the + * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to + * the count returned by add_data(), having allocated and stored an array, + * av, that that count references, as follows: + * av[0] stores the character class description in its textual form. + * This is used later (regexec.c:Perl_regclass_swash()) to + * initialize the appropriate swash, and is also useful for dumping + * the regnode. This is set to &PL_sv_undef if the textual + * description is not needed at run-time (as happens if the other + * elements completely define the class) + * av[1] if &PL_sv_undef, is a placeholder to later contain the swash + * computed from av[0]. But if no further computation need be done, + * the swash is stored here now (and av[0] is &PL_sv_undef). + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead + * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. + * (Otherwise everything needed is already in av[0] and av[1]) + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ + + UV n; + + PERL_ARGS_ASSERT_SET_ANYOF_ARG; + + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); + ARG_SET(node, ANYOF_NONBITMAP_EMPTY); } else { - /* av[0] stores the character class description in its textual form: - * used later (regexec.c:Perl_regclass_swash()) to initialize the - * appropriate swash, and is also useful for dumping the regnode. - * av[1] if NULL, is a placeholder to later contain the swash computed - * from av[0]. But if no further computation need be done, the - * swash is stored there now. - * av[2] stores the cp_list inversion list for use in addition or - * instead of av[0]; used only if av[1] is NULL - * av[3] is set if any component of the class is from a user-defined - * property; used only if av[1] is NULL */ AV * const av = newAV(); SV *rv; - av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - ? SvREFCNT_inc(listsv) : &PL_sv_undef); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + av_store(av, 0, (runtime_defns) + ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); if (swash) { av_store(av, 1, swash); SvREFCNT_dec_NN(cp_list); } else { - av_store(av, 1, NULL); + av_store(av, 1, &PL_sv_undef); if (cp_list) { - av_store(av, 2, cp_list); - av_store(av, 3, newSVuv(has_user_defined_property)); + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); } } + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + rv = newRV_noinc(MUTABLE_SV(av)); - n = add_data(pRExC_state, 1, "s"); + n = add_data(pRExC_state, STR_WITH_LEN("s")); RExC_rxi->data->data[n] = (void*)rv; - ARG_SET(ret, n); + ARG_SET(node, n); } - - *flagp |= HASWIDTH|SIMPLE; - return ret; } -#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* reg_skipcomment() Absorbs an /x style # comments from the input stream. Returns true if there is more text remaining in the stream. - Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment + Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment terminates the pattern without including a newline. Note its the callers responsibility to ensure that we are @@ -14075,7 +14975,7 @@ S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) if (!ended) { /* we ran off the end of the pattern without ending the comment, so we have to add an \n when wrapping */ - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; return 0; } else return 1; @@ -14157,7 +15057,8 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", "reg_node", __LINE__, PL_reg_name[op], (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] @@ -14214,7 +15115,8 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reganode", __LINE__, PL_reg_name[op], @@ -14233,7 +15135,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) /* - reguni - emit (if appropriate) a Unicode character */ -STATIC STRLEN +PERL_STATIC_INLINE STRLEN S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) { dVAR; @@ -14294,7 +15196,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) StructCopy(--src, --dst, regnode); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD 20010112 */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", "reg_insert", __LINE__, PL_reg_name[op], @@ -14313,7 +15216,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) place = opnd; /* Op node, where operand used to be. */ #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reginsert", __LINE__, PL_reg_name[op], @@ -14338,7 +15242,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) */ /* TODO: All three parms should be const */ STATIC void -S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) { dVAR; regnode *scan; @@ -14359,7 +15264,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tail" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), @@ -14397,7 +15302,8 @@ to control which is which. /* TODO: All four parms should be const */ STATIC U8 -S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) { dVAR; regnode *scan; @@ -14420,8 +15326,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN if (PL_regkind[OP(scan)] == EXACT) { - bool has_exactf_sharp_s; /* Unexamined in this routine */ - if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1)) + bool unfolded_multi_char; /* Unexamined in this routine */ + if (join_exact(pRExC_state, scan, &min, + &unfolded_multi_char, 1, val, depth+1)) return EXACT; } #endif @@ -14429,10 +15336,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, switch (OP(scan)) { case EXACT: case EXACTF: + case EXACTFA_NO_TRIE: case EXACTFA: case EXACTFU: case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); @@ -14447,7 +15354,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), @@ -14460,8 +15367,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, DEBUG_PARSE_r({ SV * const mysv_val=sv_newmortal(); DEBUG_PARSE_MSG(""); - regprop(RExC_rx, mysv_val, val); - PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n", + regprop(RExC_rx, mysv_val, val, NULL); + PerlIO_printf(Perl_debug_log, + "~ attach to %s (%"IVdf") offset to %"IVdf"\n", SvPV_nolen_const(mysv_val), (IV)REG_NODE_NUM(val), (IV)(val - scan) @@ -14489,7 +15397,9 @@ S_regdump_intflags(pTHX_ const char *lead, const U32 flags) int bit; int set=0; - for (bit=0; bit<32; bit++) { + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bitcheck_substr == r->float_substr && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored")); - if (r->extflags & RXf_NOSCAN) + if (r->intflags & PREGf_NOSCAN) PerlIO_printf(Perl_debug_log, " noscan"); if (r->extflags & RXf_CHECK_ALL) PerlIO_printf(Perl_debug_log, " isall"); @@ -14611,22 +15523,22 @@ Perl_regdump(pTHX_ const regexp *r) PerlIO_printf(Perl_debug_log, ") "); if (ri->regstclass) { - regprop(r, sv, ri->regstclass); + regprop(r, sv, ri->regstclass, NULL); PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); } - if (r->extflags & RXf_ANCH) { + if (r->intflags & PREGf_ANCH) { PerlIO_printf(Perl_debug_log, "anchored"); - if (r->extflags & RXf_ANCH_BOL) + if (r->intflags & PREGf_ANCH_BOL) PerlIO_printf(Perl_debug_log, "(BOL)"); - if (r->extflags & RXf_ANCH_MBOL) + if (r->intflags & PREGf_ANCH_MBOL) PerlIO_printf(Perl_debug_log, "(MBOL)"); - if (r->extflags & RXf_ANCH_SBOL) + if (r->intflags & PREGf_ANCH_SBOL) PerlIO_printf(Perl_debug_log, "(SBOL)"); - if (r->extflags & RXf_ANCH_GPOS) + if (r->intflags & PREGf_ANCH_GPOS) PerlIO_printf(Perl_debug_log, "(GPOS)"); PerlIO_putc(Perl_debug_log, ' '); } - if (r->extflags & RXf_GPOS_SEEN) + if (r->intflags & PREGf_GPOS_SEEN) PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); if (r->intflags & PREGf_SKIP) PerlIO_printf(Perl_debug_log, "plus "); @@ -14648,21 +15560,11 @@ Perl_regdump(pTHX_ const regexp *r) } /* -- regprop - printable representation of opcode +- regprop - printable representation of opcode, with run time support */ -#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ -STMT_START { \ - if (do_sep) { \ - Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ - if (flags & ANYOF_INVERT) \ - /*make sure the invert info is in each */ \ - sv_catpvs(sv, "^"); \ - do_sep = 0; \ - } \ -} STMT_END void -Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) { #ifdef DEBUGGING dVAR; @@ -14678,10 +15580,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) || _CC_VERTSPACE != 16 #error Need to adjust order of anyofs[] #endif - "[\\w]", - "[\\W]", - "[\\d]", - "[\\D]", + "\\w", + "\\W", + "\\d", + "\\D", "[:alpha:]", "[:^alpha:]", "[:lower:]", @@ -14698,8 +15600,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^graph:]", "[:cased:]", "[:^cased:]", - "[\\s]", - "[\\S]", + "\\s", + "\\S", "[:blank:]", "[:^blank:]", "[:xdigit:]", @@ -14710,8 +15612,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^cntrl:]", "[:ascii:]", "[:^ascii:]", - "[\\v]", - "[\\V]" + "\\v", + "\\V" }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -14723,7 +15625,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ /* It would be nice to FAIL() here, but this may be called from regexec.c, and it would be hard to supply pRExC_state. */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(o), (int)REGNODE_MAX); sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[OP(o)]; @@ -14754,16 +15657,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); DEBUG_TRIE_COMPILE_r( - Perl_sv_catpvf(aTHX_ sv, - "", - (UV)trie->startstate, - (IV)trie->statecount-1, /* -1 because of the unused 0 element */ - (UV)trie->wordcount, - (UV)trie->minlen, - (UV)trie->maxlen, - (UV)TRIE_CHARCOUNT(trie), - (UV)trie->uniquecharcount - ) + Perl_sv_catpvf(aTHX_ sv, + "", + (UV)trie->startstate, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ); ); if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { sv_catpvs(sv, "["); @@ -14780,7 +15683,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); - else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { + else if (k == REF || k == OPEN || k == CLOSE + || k == GROUPP || OP(o)==ACCEPT) + { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { if ( k != REF || (OP(o) < NREF)) { @@ -14804,20 +15709,36 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } } } + if ( k == REF && reginfo) { + U32 n = ARG(o); /* which paren pair */ + I32 ln = prog->offs[n].start; + if (prog->lastparen < n || ln == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } } else if (k == GOSUB) - Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */ + /* Paren and offset */ + Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); else if (k == VERB) { if (!o->flags) Perl_sv_catpvf(aTHX_ sv, ":%"SVf, SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) - Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + /* 2: embedded, otherwise 1 */ + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; - if (flags & ANYOF_LOCALE) + if (flags & ANYOF_LOCALE_FLAGS) sv_catpvs(sv, "{loc}"); if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); @@ -14828,84 +15749,117 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* output what the standard cp 0-255 bitmap matches */ do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); - EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - /* output any special charclass tests (used entirely under use locale) */ - if (ANYOF_CLASS_TEST_ANY_SET(o)) { + /* output any special charclass tests (used entirely under use + * locale) * */ + if (ANYOF_POSIXL_TEST_ANY_SET(o)) { int i; - for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) { - if (ANYOF_CLASS_TEST(o,i)) { + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(o,i)) { sv_catpv(sv, anyofs[i]); do_sep = 1; } } } - EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - - if (flags & ANYOF_NON_UTF8_LATIN1_ALL) { - sv_catpvs(sv, "{non-utf8-latin1-all}"); - } - - /* output information about the unicode matching */ - if (flags & ANYOF_UNICODE_ALL) - sv_catpvs(sv, "{unicode_all}"); - else if (ANYOF_NONBITMAP(o)) { - SV *lv; /* Set if there is something outside the bit map. */ - SV * sw; - bool byte_output = FALSE; /* If something in the bitmap has been - output */ - - if (flags & ANYOF_NONBITMAP_NON_UTF8) { - sv_catpvs(sv, "{outside bitmap}"); + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) + { + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + if (flags & ANYOF_INVERT) + /*make sure the invert info is in each */ + sv_catpvs(sv, "^"); } - else { - sv_catpvs(sv, "{utf8}"); + + if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { + sv_catpvs(sv, "{non-utf8-latin1-all}"); } - /* Get the stuff that wasn't in the bitmap */ - sw = regclass_swash(prog, o, FALSE, &lv, NULL); - if (lv && lv != &PL_sv_undef) { - char *s = savesvpv(lv); - char * const origs = s; + /* output information about the unicode matching */ + if (flags & ANYOF_ABOVE_LATIN1_ALL) + sv_catpvs(sv, "{unicode_all}"); + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { + SV *lv; /* Set if there is something outside the bit map. */ + bool byte_output = FALSE; /* If something in the bitmap has + been output */ + SV *only_utf8_locale; + + /* Get the stuff that wasn't in the bitmap */ + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); + if (lv && lv != &PL_sv_undef) { + char *s = savesvpv(lv); + char * const origs = s; + + while (*s && *s != '\n') + s++; - while (*s && *s != '\n') - s++; + if (*s == '\n') { + const char * const t = ++s; - if (*s == '\n') { - const char * const t = ++s; + if (flags & ANYOF_NONBITMAP_NON_UTF8) { + sv_catpvs(sv, "{outside bitmap}"); + } + else { + sv_catpvs(sv, "{utf8}"); + } - if (byte_output) { - sv_catpvs(sv, " "); - } + if (byte_output) { + sv_catpvs(sv, " "); + } - while (*s) { - if (*s == '\n') { + while (*s) { + if (*s == '\n') { - /* Truncate very long output */ - if (s - origs > 256) { - Perl_sv_catpvf(aTHX_ sv, - "%.*s...", - (int) (s - origs - 1), - t); - goto out_dump; + /* Truncate very long output */ + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } + *s = ' '; } - *s = ' '; - } - else if (*s == '\t') { - *s = '-'; + else if (*s == '\t') { + *s = '-'; + } + s++; } - s++; + if (s[-1] == ' ') + s[-1] = 0; + + sv_catpv(sv, t); } - if (s[-1] == ' ') - s[-1] = 0; - sv_catpv(sv, t); - } + out_dump: - out_dump: + Safefree(origs); + SvREFCNT_dec_NN(lv); + } - Safefree(origs); - SvREFCNT_dec_NN(lv); + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } + } + invlist_iterfinish(only_utf8_locale); + } } } @@ -14913,11 +15867,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; - if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { - Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + if (index < C_ARRAY_LENGTH(anyofs)) { + if (*anyofs[index] != '[') { + sv_catpv(sv, "["); + } + sv_catpv(sv, anyofs[index]); + if (*anyofs[index] != '[') { + sv_catpv(sv, "]"); + } } else { - sv_catpv(sv, anyofs[index]); + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); } } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) @@ -14927,9 +15887,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) PERL_UNUSED_ARG(sv); PERL_UNUSED_ARG(o); PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); #endif /* DEBUGGING */ } + + SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ @@ -15198,7 +16161,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) } break; default: - Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]); + Perl_croak(aTHX_ "panic: regfree data code '%c'", + ri->data->what[n]); } } Safefree(ri->data->what); @@ -15331,7 +16295,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) len = ProgLen(ri); - Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), + char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); reti->num_code_blocks = ri->num_code_blocks; @@ -15373,9 +16338,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) break; case 'f': /* This is cheating. */ - Newx(d->data[i], 1, struct regnode_charclass_class); - StructCopy(ri->data->data[i], d->data[i], - struct regnode_charclass_class); + Newx(d->data[i], 1, regnode_ssc); + StructCopy(ri->data->data[i], d->data[i], regnode_ssc); reti->regstclass = (regnode*)d->data[i]; break; case 'T': @@ -15395,7 +16359,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) d->data[i] = ri->data->data[i]; break; default: - Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]); + Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", + ri->data->what[i]); } } @@ -15435,7 +16400,8 @@ Perl_regnext(pTHX_ regnode *p) return(NULL); if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX); + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(p), (int)REGNODE_MAX); } offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); @@ -15447,7 +16413,7 @@ Perl_regnext(pTHX_ regnode *p) #endif STATIC void -S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) +S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) { va_list args; STRLEN l1 = strlen(pat1); @@ -15466,20 +16432,15 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) Copy(pat2, buf + l1, l2 , char); buf[l1 + l2] = '\n'; buf[l1 + l2 + 1] = '\0'; -#ifdef I_STDARG - /* ANSI variant takes additional second argument */ va_start(args, pat2); -#else - va_start(args); -#endif msv = vmess(buf, &args); va_end(args); message = SvPV_const(msv,l1); if (l1 > 512) l1 = 512; Copy(message, buf, l1 , char); - buf[l1-1] = '\0'; /* Overwrite \n */ - Perl_croak(aTHX_ "%s", buf); + /* l1-1 to avoid \n */ + Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); } /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ @@ -15497,7 +16458,8 @@ Perl_save_re_context(pTHX) U32 i; for (i = 1; i <= RX_NPARENS(rx); i++) { char digits[TYPE_CHARS(long)]; - const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); + const STRLEN len = my_snprintf(digits, sizeof(digits), + "%lu", (long)i); GV *const *const gvp = (GV**)hv_fetch(PL_defstash, digits, len, 0); @@ -15519,19 +16481,6 @@ S_put_byte(pTHX_ SV *sv, int c) { PERL_ARGS_ASSERT_PUT_BYTE; - /* Our definition of isPRINT() ignores locales, so only bytes that are - not part of UTF-8 are considered printable. I assume that the same - holds for UTF-EBCDIC. - Also, code point 255 is not printable in either (it's E0 in EBCDIC, - which Wikipedia says: - - EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all - ones (binary 1111 1111, hexadecimal FF). It is similar, but not - identical, to the ASCII delete (DEL) or rubout control character. ... - it is typically mapped to hexadecimal code 9F, in order to provide a - unique character mapping in both directions) - - So the old condition can be simplified to !isPRINT(c) */ if (!isPRINT(c)) { switch (c) { case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; @@ -15553,6 +16502,48 @@ S_put_byte(pTHX_ SV *sv, int c) } } +STATIC void +S_put_range(pTHX_ SV *sv, UV start, UV end) +{ + + /* Appends to 'sv' a displayable version of the range of code points from + * 'start' to 'end' */ + + assert(start <= end); + + PERL_ARGS_ASSERT_PUT_RANGE; + + if (end - start < 3) { /* Individual chars in short ranges */ + for (; start <= end; start++) + put_byte(sv, start); + } + else if ( end > 255 + || ! isALPHANUMERIC(start) + || ! isALPHANUMERIC(end) + || isDIGIT(start) != isDIGIT(end) + || isUPPER(start) != isUPPER(end) + || isLOWER(start) != isLOWER(end) + + /* This final test should get optimized out except on EBCDIC + * platforms, where it causes ranges that cross discontinuities + * like i/j to be shown as hex instead of the misleading, + * e.g. H-K (since that range includes more than H, I, J, K). + * */ + || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start)) + { + Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", + start, + (end < 256) ? end : 255); + } + else { /* Here, the ends of the range are both digits, or both uppercase, + or both lowercase; and there's no discontinuity in the range + (which could happen on EBCDIC platforms) */ + put_byte(sv, start); + sv_catpvs(sv, "-"); + put_byte(sv, end); + } +} + STATIC bool S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) { @@ -15561,50 +16552,27 @@ S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) * output anything */ int i; - int rangestart = -1; bool has_output_anything = FALSE; PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; - for (i = 0; i <= 256; i++) { + for (i = 0; i < 256; i++) { if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - int j = i - 1; - if (i <= rangestart + 3) { /* Individual chars in short ranges */ - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - } - else if ( j > 255 - || ! isALPHANUMERIC(rangestart) - || ! isALPHANUMERIC(j) - || isDIGIT(rangestart) != isDIGIT(j) - || isUPPER(rangestart) != isUPPER(j) - || isLOWER(rangestart) != isLOWER(j) - - /* This final test should get optimized out except - * on EBCDIC platforms, where it causes ranges that - * cross discontinuities like i/j to be shown as hex - * instead of the misleading, e.g. H-K (since that - * range includes more than H, I, J, K). */ - || (j - rangestart) - != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart)) - { - Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}", - rangestart, - (j < 256) ? j : 255); - } - else { /* Here, the ends of the range are both digits, or both - uppercase, or both lowercase; and there's no - discontinuity in the range (which could happen on EBCDIC - platforms) */ - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, j); - } - rangestart = -1; + + /* The character at index i should be output. Find the next + * character that should NOT be output */ + int j; + for (j = i + 1; j <= 256; j++) { + if (! BITMAP_TEST((U8 *) bitmap, j)) { + break; + } + } + + /* Everything between them is a single range that should be output + * */ + put_range(sv, i, j - 1); has_output_anything = TRUE; + i = j; } } @@ -15612,12 +16580,15 @@ S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) } #define CLEAR_OPTSTART \ - if (optstart) STMT_START { \ - DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ - optstart=NULL; \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ + " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ } STMT_END -#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); +#define DUMPUNTIL(b,e) \ + CLEAR_OPTSTART; \ + node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); STATIC const regnode * S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, @@ -15659,14 +16630,15 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else CLEAR_OPTSTART; - regprop(r, sv, node); + regprop(r, sv, node, NULL); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); if (OP(node) != OPTIMIZED) { if (next == NULL) /* Next ptr. */ PerlIO_printf(Perl_debug_log, " (0)"); - else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) + else if (PL_regkind[(U8)op] == BRANCH + && PL_regkind[OP(next)] != BRANCH ) PerlIO_printf(Perl_debug_log, " (FAIL)"); else PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); @@ -15699,7 +16671,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const reg_trie_data * const trie = (reg_trie_data*)ri->data->data[optrie]; #ifdef DEBUGGING - AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); + AV *const trie_words + = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); #endif const regnode *nextbranch= NULL; I32 word_idx; @@ -15709,18 +16682,22 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, PerlIO_printf(Perl_debug_log, "%*s%s ", (int)(2*(indent+3)), "", - elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, - PL_colors[0], PL_colors[1], - (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELLIPSES | - PERL_PV_PRETTY_LTGT + elem_ptr + ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), + SvCUR(*elem_ptr), 60, + PL_colors[0], PL_colors[1], + (SvUTF8(*elem_ptr) + ? PERL_PV_ESCAPE_UNI + : 0) + | PERL_PV_PRETTY_ELLIPSES + | PERL_PV_PRETTY_LTGT ) - : "???" + : "???" ); if (trie->jump) { U16 dist= trie->jump[word_idx+1]; PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", - (UV)((dist ? this_trie + dist : next) - start)); + (UV)((dist ? this_trie + dist : next) - start)); if (dist) { if (!nextbranch) nextbranch= this_trie + trie->jump[0]; @@ -15750,8 +16727,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else if (PL_regkind[(U8)op] == ANYOF) { /* arglen 1 + class block */ - node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS) - ? ANYOF_CLASS_SKIP : ANYOF_SKIP); + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); node = NEXTOPER(node); } else if (PL_regkind[(U8)op] == EXACT) { diff --git a/src/5019003/regexec.c b/src/5020000/regexec.c similarity index 81% rename from src/5019003/regexec.c rename to src/5020000/regexec.c index 66466c7..5bf42fe 100644 --- a/src/5019003/regexec.c +++ b/src/5020000/regexec.c @@ -37,16 +37,6 @@ #include "re_top.h" #endif -/* At least one required character in the target string is expressible only in - * UTF-8. */ -static const char* const non_utf8_target_but_utf8_required - = "Can't match, because target string needs to be in UTF-8\n"; - -#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ - goto target; \ -} STMT_END - /* * pregcomp and pregexec -- regsub and regerror are not used in perl * @@ -94,16 +84,28 @@ static const char* const non_utf8_target_but_utf8_required #include "inline_invlist.c" #include "unicode_constants.h" +#ifdef DEBUGGING +/* At least one required character in the target string is expressible only in + * UTF-8. */ +static const char* const non_utf8_target_but_utf8_required + = "Can't match, because target string needs to be in UTF-8\n"; +#endif + +#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ + goto target; \ +} STMT_END + #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #ifndef STATIC #define STATIC static #endif -/* Valid for non-utf8 strings: avoids the reginclass +/* Valid only for non-utf8 strings: avoids the reginclass * call if there are no complications: i.e., if everything matchable is * straight forward in the bitmap */ -#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \ : ANYOF_BITMAP_TEST(p,*(c))) /* @@ -118,6 +120,7 @@ static const char* const non_utf8_target_but_utf8_required ? reghop3((U8*)pos, off, \ (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ : (U8*)(pos + off)) + #define HOPBACKc(pos, off) \ (char*)(reginfo->is_utf8_target \ ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \ @@ -128,6 +131,24 @@ static const char* const non_utf8_target_but_utf8_required #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) +/* lim must be +ve. Returns NULL on overshoot */ +#define HOPMAYBE3(pos,off,lim) \ + (reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ + : ((U8*)pos + off <= lim) \ + ? (U8*)pos + off \ + : NULL) + +/* like HOP3, but limits the result to <= lim even for the non-utf8 case. + * off must be >=0; args should be vars rather than expressions */ +#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ + ? reghop3((U8*)(pos), off, (U8*)(lim)) \ + : (U8*)((pos + off) > lim ? lim : (pos + off))) + +#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ + ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ + : (U8*)(pos + off)) +#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ #define NEXTCHR_IS_EOS (nextchr < 0) @@ -140,11 +161,11 @@ static const char* const non_utf8_target_but_utf8_required SET_nextchr -#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \ +#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \ if (!swash_ptr) { \ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \ swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \ - 1, 0, NULL, &flags); \ + 1, 0, invlist, &flags); \ assert(swash_ptr); \ } \ } STMT_END @@ -153,28 +174,33 @@ static const char* const non_utf8_target_but_utf8_required #ifdef DEBUGGING # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ property_name, \ + invlist, \ utf8_char_in_property) \ - LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \ assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); #else # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ property_name, \ + invlist, \ utf8_char_in_property) \ - LOAD_UTF8_CHARCLASS(swash_ptr, property_name) + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) #endif #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ PL_utf8_swash_ptrs[_CC_WORDCHAR], \ - swash_property_names[_CC_WORDCHAR], \ - GREEK_SMALL_LETTER_IOTA_UTF8) + "", \ + PL_XPosix_ptrs[_CC_WORDCHAR], \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ STMT_START { \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ "_X_regular_begin", \ - GREEK_SMALL_LETTER_IOTA_UTF8); \ + NULL, \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ "_X_extend", \ + NULL, \ COMBINING_GRAVE_ACCENT_UTF8); \ } STMT_END @@ -191,14 +217,14 @@ static const char* const non_utf8_target_but_utf8_required * although it may be done at run time beause of the REF possibility - more * investigation required. -- demerphq */ -#define JUMPABLE(rn) ( \ - OP(rn) == OPEN || \ +#define JUMPABLE(rn) ( \ + OP(rn) == OPEN || \ (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ - OP(rn) == EVAL || \ - OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ - OP(rn) == PLUS || OP(rn) == MINMOD || \ - OP(rn) == KEEPS || \ - (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ + OP(rn) == EVAL || \ + OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ + OP(rn) == PLUS || OP(rn) == MINMOD || \ + OP(rn) == KEEPS || \ + (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ ) #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) @@ -208,13 +234,13 @@ static const char* const non_utf8_target_but_utf8_required /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -224,7 +250,7 @@ static const char* const non_utf8_target_but_utf8_required Search for mandatory following text node; for lookahead, the text must follow but for lookbehind (rn->flags != 0) we skip to the next step. */ -#define FIND_NEXT_IMPT(rn) STMT_START { \ +#define FIND_NEXT_IMPT(rn) STMT_START { \ while (JUMPABLE(rn)) { \ const OPCODE type = OP(rn); \ if (type == SUSPEND || PL_regkind[type] == CURLY) \ @@ -275,8 +301,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) PERL_ARGS_ASSERT_REGCPPUSH; if (paren_elems_to_push < 0) - Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0", - paren_elems_to_push); + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i", + paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS); if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf @@ -297,8 +323,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) ); for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ - SSPUSHINT(rex->offs[p].end); - SSPUSHINT(rex->offs[p].start); + SSPUSHIV(rex->offs[p].end); + SSPUSHIV(rex->offs[p].start); SSPUSHINT(rex->offs[p].start_tmp); DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", @@ -370,10 +396,10 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) ); paren = *maxopenparen_p; for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { - I32 tmps; + SSize_t tmps; rex->offs[paren].start_tmp = SSPOPINT; - rex->offs[paren].start = SSPOPINT; - tmps = SSPOPINT; + rex->offs[paren].start = SSPOPIV; + tmps = SSPOPIV; if (paren <= rex->lastparen) rex->offs[paren].end = tmps; DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, @@ -485,7 +511,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) } else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { return isFOO_lc(classnum, - TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1))); + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); } if (classnum < _FIRST_NON_SWASH_CC) { @@ -493,8 +519,11 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) /* Initialize the swash unless done already */ if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", - swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags); + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) @@ -527,7 +556,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) */ I32 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, - char *strbeg, I32 minend, SV *screamer, U32 nosave) + char *strbeg, SSize_t minend, SV *screamer, U32 nosave) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -544,64 +573,70 @@ Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, } #endif -/* - * Need to implement the following flags for reg_anch: - * - * USE_INTUIT_NOML - Useful to call re_intuit_start() first - * USE_INTUIT_ML - * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer - * INTUIT_AUTORITATIVE_ML - * INTUIT_ONCE_NOML - Intuit can match in one location only. - * INTUIT_ONCE_ML - * - * Another flag for this function: SECOND_TIME (so that float substrs - * with giant delta may be not rechecked). - */ - -/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend. - Otherwise, only SvCUR(sv) is used to get strbeg. */ -/* XXXX Some places assume that there is a fixed substring. - An update may be needed if optimizer marks as "INTUITable" - RExen without fixed substrings. Similarly, it is assumed that - lengths of all the strings are no more than minlen, thus they - cannot come from lookahead. - (Or minlen should take into account lookahead.) - NOTE: Some of this comment is not correct. minlen does now take account - of lookahead/behind. Further research is required. -- demerphq -*/ - -/* A failure to find a constant substring means that there is no need to make - an expensive call to REx engine, thus we celebrate a failure. Similarly, - finding a substring too deep into the string means that fewer calls to - regtry() should be needed. - - REx compiler's optimizer found 4 possible hints: - a) Anchored substring; - b) Fixed substring; - c) Whether we are anchored (beginning-of-line or \G); - d) First node (of those at offset 0) which may distinguish positions; - We use a)b)d) and multiline-part of c), and try to find a position in the - string which does not contradict any of them. - */ - -/* Most of decisions we do here should have been done at compile time. - The nodes of the REx which we used for the search should have been - deleted from the finite automaton. */ - -/* args: - * rx: the regex to match against - * sv: the SV being matched: only used for utf8 flag; the string - * itself is accessed via the pointers below. Note that on - * something like an overloaded SV, SvPOK(sv) may be false - * and the string pointers may point to something unrelated to - * the SV itself. - * strbeg: real beginning of string - * strpos: the point in the string at which to begin matching - * strend: pointer to the byte following the last char of the string - * flags currently unused; set to 0 - * data: currently unused; set to NULL +/* re_intuit_start(): + * + * Based on some optimiser hints, try to find the earliest position in the + * string where the regex could match. + * + * rx: the regex to match against + * sv: the SV being matched: only used for utf8 flag; the string + * itself is accessed via the pointers below. Note that on + * something like an overloaded SV, SvPOK(sv) may be false + * and the string pointers may point to something unrelated to + * the SV itself. + * strbeg: real beginning of string + * strpos: the point in the string at which to begin matching + * strend: pointer to the byte following the last char of the string + * flags currently unused; set to 0 + * data: currently unused; set to NULL + * + * The basic idea of re_intuit_start() is to use some known information + * about the pattern, namely: + * + * a) the longest known anchored substring (i.e. one that's at a + * constant offset from the beginning of the pattern; but not + * necessarily at a fixed offset from the beginning of the + * string); + * b) the longest floating substring (i.e. one that's not at a constant + * offset from the beginning of the pattern); + * c) Whether the pattern is anchored to the string; either + * an absolute anchor: /^../, or anchored to \n: /^.../m, + * or anchored to pos(): /\G/; + * d) A start class: a real or synthetic character class which + * represents which characters are legal at the start of the pattern; + * + * to either quickly reject the match, or to find the earliest position + * within the string at which the pattern might match, thus avoiding + * running the full NFA engine at those earlier locations, only to + * eventually fail and retry further along. + * + * Returns NULL if the pattern can't match, or returns the address within + * the string which is the earliest place the match could occur. + * + * The longest of the anchored and floating substrings is called 'check' + * and is checked first. The other is called 'other' and is checked + * second. The 'other' substring may not be present. For example, + * + * /(abc|xyz)ABC\d{0,3}DEFG/ + * + * will have + * + * check substr (float) = "DEFG", offset 6..9 chars + * other substr (anchored) = "ABC", offset 3..3 chars + * stclass = [ax] + * + * Be aware that during the course of this function, sometimes 'anchored' + * refers to a substring being anchored relative to the start of the + * pattern, and sometimes to the pattern itself being anchored relative to + * the string. For example: + * + * /\dabc/: "abc" is anchored to the pattern; + * /^\dabc/: "abc" is anchored to the pattern and the string; + * /\d+abc/: "abc" is anchored to neither the pattern nor the string; + * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, + * but the pattern is anchored to the string. */ char * @@ -616,34 +651,60 @@ Perl_re_intuit_start(pTHX_ { dVAR; struct regexp *const prog = ReANY(rx); - I32 start_shift = 0; + SSize_t start_shift = prog->check_offset_min; /* Should be nonnegative! */ - I32 end_shift = 0; - char *s; + SSize_t end_shift = 0; + /* current lowest pos in string where the regex can start matching */ + char *rx_origin = strpos; SV *check; - char *t; const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ - I32 ml_anch; - char *other_last = NULL; /* other substr checked before this */ + U8 other_ix = 1 - prog->substrs->check_ix; + bool ml_anch = 0; + char *other_last = strpos;/* latest pos 'other' substr already checked to */ char *check_at = NULL; /* check substr found at this pos */ - char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/ const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; RXi_GET_DECL(prog,progi); regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ regmatch_info *const reginfo = ®info_buf; -#ifdef DEBUGGING - const char * const i_strpos = strpos; -#endif GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_INTUIT_START; PERL_UNUSED_ARG(flags); PERL_UNUSED_ARG(data); - /* CHR_DIST() would be more correct here but it makes things slow. */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: trying to determine minimum start position...\n")); + + /* for now, assume that all substr offsets are positive. If at some point + * in the future someone wants to do clever things with look-behind and + * -ve offsets, they'll need to fix up any code in this function + * which uses these offsets. See the thread beginning + * <20140113145929.GF27210@iabyn.com> + */ + assert(prog->substrs->data[0].min_offset >= 0); + assert(prog->substrs->data[0].max_offset >= 0); + assert(prog->substrs->data[1].min_offset >= 0); + assert(prog->substrs->data[1].max_offset >= 0); + assert(prog->substrs->data[2].min_offset >= 0); + assert(prog->substrs->data[2].max_offset >= 0); + + /* for now, assume that if both present, that the floating substring + * doesn't start before the anchored substring. + * If you break this assumption (e.g. doing better optimisations + * with lookahead/behind), then you'll need to audit the code in this + * function carefully first + */ + assert( + ! ( (prog->anchored_utf8 || prog->anchored_substr) + && (prog->float_utf8 || prog->float_substr)) + || (prog->float_min_offset >= prog->anchored_offset)); + + /* byte rather than char calculation for efficiency. It fails + * to quickly reject some cases that can't match, but will reject + * them later after doing full char arithmetic */ if (prog->minlen > strend - strpos) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "String too short... [re_intuit_start]\n")); + " String too short...\n")); goto fail; } @@ -668,448 +729,511 @@ Perl_re_intuit_start(pTHX_ } check = prog->check_substr; } - if ((prog->extflags & RXf_ANCH) /* Match at beg-of-str or after \n */ - && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */ - { - ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE) - || ( (prog->extflags & RXf_ANCH_BOL) - && !multiline ) ); /* Check after \n? */ - - if (!ml_anch) { - if ( !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ - && (strpos != strbeg)) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); - goto fail; + + /* dump the various substring data */ + DEBUG_OPTIMISE_MORE_r({ + int i; + for (i=0; i<=2; i++) { + SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr + : prog->substrs->data[i].substr); + if (!sv) + continue; + + PerlIO_printf(Perl_debug_log, + " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf + " useful=%"IVdf" utf8=%d [%s]\n", + i, + (IV)prog->substrs->data[i].min_offset, + (IV)prog->substrs->data[i].max_offset, + (IV)prog->substrs->data[i].end_shift, + BmUSEFUL(sv), + utf8_target ? 1 : 0, + SvPEEK(sv)); } - if (prog->check_offset_min == prog->check_offset_max - && !(prog->extflags & RXf_CANY_SEEN) - && ! multiline) /* /m can cause \n's to match that aren't - accounted for in the string max length. - See [perl #115242] */ - { - /* Substring at constant offset from beg-of-str... */ - I32 slen; + }); - s = HOP3c(strpos, prog->check_offset_min, strend); + if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ - if (SvTAIL(check)) { - slen = SvCUR(check); /* >= 1 */ + /* ml_anch: check after \n? + * + * A note about IMPLICIT: on an un-anchored pattern beginning + * with /.*.../, these flags will have been added by the + * compiler: + * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL + * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL + */ + ml_anch = (prog->intflags & PREGf_ANCH_MBOL) + && !(prog->intflags & PREGf_IMPLICIT); - if ( strend - s > slen || strend - s < slen - 1 - || (strend - s == slen && strend[-1] != '\n')) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); - goto fail_finish; + if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { + /* we are only allowed to match at BOS or \G */ + + /* trivially reject if there's a BOS anchor and we're not at BOS. + * + * Note that we don't try to do a similar quick reject for + * \G, since generally the caller will have calculated strpos + * based on pos() and gofs, so the string is already correctly + * anchored by definition; and handling the exceptions would + * be too fiddly (e.g. REXEC_IGNOREPOS). + */ + if ( strpos != strbeg + && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Not at start...\n")); + goto fail; + } + + /* in the presence of an anchor, the anchored (relative to the + * start of the regex) substr must also be anchored relative + * to strpos. So quickly reject if substr isn't found there. + * This works for \G too, because the caller will already have + * subtracted gofs from pos, and gofs is the offset from the + * \G to the start of the regex. For example, in /.abc\Gdef/, + * where substr="abcdef", pos()=3, gofs=4, offset_min=1: + * caller will have set strpos=pos()-4; we look for the substr + * at position pos()-4+1, which lines up with the "a" */ + + if (prog->check_offset_min == prog->check_offset_max + && !(prog->intflags & PREGf_CANY_SEEN)) + { + /* Substring at constant offset from beg-of-str... */ + SSize_t slen = SvCUR(check); + char *s = HOP3c(strpos, prog->check_offset_min, strend); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Looking for check substr at fixed offset %"IVdf"...\n", + (IV)prog->check_offset_min)); + + if (SvTAIL(check)) { + /* In this case, the regex is anchored at the end too. + * Unless it's a multiline match, the lengths must match + * exactly, give or take a \n. NB: slen >= 1 since + * the last char of check is \n */ + if (!multiline + && ( strend - s > slen + || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n'))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too long...\n")); + goto fail_finish; + } + /* Now should match s[0..slen-2] */ + slen--; } - /* Now should match s[0..slen-2] */ - slen--; if (slen && (*SvPVX_const(check) != *s - || (slen > 1 - && memNE(SvPVX_const(check), s, slen)))) { - report_neq: - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); + || (slen > 1 && memNE(SvPVX_const(check), s, slen)))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String not equal...\n")); goto fail_finish; } + + check_at = s; + goto success_at_start; } - else if (*SvPVX_const(check) != *s - || ((slen = SvCUR(check)) > 1 - && memNE(SvPVX_const(check), s, slen))) - goto report_neq; - check_at = s; - goto success_at_start; - } - } - /* Match is anchored, but substr is not anchored wrt beg-of-str. */ - s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->check_end_shift; - - if (!ml_anch) { - const I32 end = prog->check_offset_max + CHR_SVLEN(check) - - (SvTAIL(check) != 0); - const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; - - if (end_shift < eshift) - end_shift = eshift; } } - else { /* Can match at random position */ - ml_anch = 0; - s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->check_end_shift; - /* end shift should be non negative here */ - } + end_shift = prog->check_end_shift; -#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */ +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", (IV)end_shift, RX_PRECOMP(prog)); #endif restart: - /* Find a possible match in the region s..strend by looking for - the "check" substring in the region corrected by start/end_shift. */ + + /* This is the (re)entry point of the main loop in this function. + * The goal of this loop is to: + * 1) find the "check" substring in the region rx_origin..strend + * (adjusted by start_shift / end_shift). If not found, reject + * immediately. + * 2) If it exists, look for the "other" substr too if defined; for + * example, if the check substr maps to the anchored substr, then + * check the floating substr, and vice-versa. If not found, go + * back to (1) with rx_origin suitably incremented. + * 3) If we find an rx_origin position that doesn't contradict + * either of the substrings, then check the possible additional + * constraints on rx_origin of /^.../m or a known start class. + * If these fail, then depending on which constraints fail, jump + * back to here, or to various other re-entry points further along + * that skip some of the first steps. + * 4) If we pass all those tests, update the BmUSEFUL() count on the + * substring. If the start position was determined to be at the + * beginning of the string - so, not rejected, but not optimised, + * since we have to run regmatch from position 0 - decrement the + * BmUSEFUL() count. Otherwise increment it. + */ + + + /* first, look for the 'check' substring */ { - I32 srch_start_shift = start_shift; - I32 srch_end_shift = end_shift; U8* start_point; U8* end_point; - if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { - srch_end_shift -= ((strbeg - s) - srch_start_shift); - srch_start_shift = strbeg - s; - } - DEBUG_OPTIMISE_MORE_r({ - PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n", - (IV)prog->check_offset_min, - (IV)srch_start_shift, - (IV)srch_end_shift, - (IV)prog->check_end_shift); - }); - if (prog->extflags & RXf_CANY_SEEN) { - start_point= (U8*)(s + srch_start_shift); - end_point= (U8*)(strend - srch_end_shift); + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, + " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf + " Start shift: %"IVdf" End shift %"IVdf + " Real end Shift: %"IVdf"\n", + (IV)(rx_origin - strpos), + (IV)prog->check_offset_min, + (IV)start_shift, + (IV)end_shift, + (IV)prog->check_end_shift); + }); + + if (prog->intflags & PREGf_CANY_SEEN) { + start_point= (U8*)(rx_origin + start_shift); + end_point= (U8*)(strend - end_shift); + if (start_point > end_point) + goto fail_finish; } else { - start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend); - end_point= HOP3(strend, -srch_end_shift, strbeg); + end_point = HOP3(strend, -end_shift, strbeg); + start_point = HOPMAYBE3(rx_origin, start_shift, end_point); + if (!start_point) + goto fail_finish; + } + + + /* If the regex is absolutely anchored to either the start of the + * string (BOL,SBOL) or to pos() (ANCH_GPOS), then + * check_offset_max represents an upper bound on the string where + * the substr could start. For the ANCH_GPOS case, we assume that + * the caller of intuit will have already set strpos to + * pos()-gofs, so in this case strpos + offset_max will still be + * an upper bound on the substr. + */ + if (!ml_anch + && prog->intflags & PREGf_ANCH + && prog->check_offset_max != SSize_t_MAX) + { + SSize_t len = SvCUR(check) - !!SvTAIL(check); + const char * const anchor = + (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); + + /* do a bytes rather than chars comparison. It's conservative; + * so it skips doing the HOP if the result can't possibly end + * up earlier than the old value of end_point. + */ + if ((char*)end_point - anchor > prog->check_offset_max) { + end_point = HOP3lim((U8*)anchor, + prog->check_offset_max, + end_point -len) + + len; + } } + DEBUG_OPTIMISE_MORE_r({ - PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n", + PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n", (int)(end_point - start_point), (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), start_point); }); - s = fbm_instr( start_point, end_point, + check_at = fbm_instr( start_point, end_point, check, multiline ? FBMrf_MULTILINE : 0); - } - /* Update the count-of-usability, remove useless subpatterns, - unshift s. */ - - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(check), RE_SV_DUMPLEN(check), 30); - PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s", - (s ? "Found" : "Did not find"), - (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) - ? "anchored" : "floating"), - quoted, - RE_SV_TAIL(check), - (s ? " at offset " : "...\n") ); - }); - if (!s) - goto fail_finish; - /* Finish the diagnostic message */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); + /* Update the count-of-usability, remove useless subpatterns, + unshift s. */ - /* XXX dmq: first branch is for positive lookbehind... - Our check string is offset from the beginning of the pattern. - So we need to do any stclass tests offset forward from that - point. I think. :-( - */ + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(check), RE_SV_DUMPLEN(check), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s", + (check_at ? "Found" : "Did not find"), + (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) + ? "anchored" : "floating"), + quoted, + RE_SV_TAIL(check), + (check_at ? " at offset " : "...\n") ); + }); + if (!check_at) + goto fail_finish; + /* Finish the diagnostic message */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) ); + /* set rx_origin to the minimum position where the regex could start + * matching, given the constraint of the just-matched check substring. + * But don't set it lower than previously. + */ - check_at=s; + if (check_at - rx_origin > prog->check_offset_max) + rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); + } - /* Got a candidate. Check MBOL anchoring, and the *other* substr. - Start with the other substr. - XXXX no SCREAM optimization yet - and a very coarse implementation - XXXX /ttx+/ results in anchored="ttx", floating="x". floating will - *always* match. Probably should be marked during compile... - Probably it is right to do no SCREAM here... - */ + /* now look for the 'other' substring if defined */ - if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8) - : (prog->float_substr && prog->anchored_substr)) + if (utf8_target ? prog->substrs->data[other_ix].utf8_substr + : prog->substrs->data[other_ix].substr) { /* Take into account the "other" substring. */ - /* XXXX May be hopelessly wrong for UTF... */ - if (!other_last) - other_last = strpos; - if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) { - do_other_anchored: - { - char * const last = HOP3c(s, -start_shift, strbeg); - char *last1, *last2; - char * const saved_s = s; - SV* must; - - t = s - prog->check_offset_max; - if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!utf8_target - || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos)) - && t > strpos))) - NOOP; - else - t = strpos; - t = HOP3c(t, prog->anchored_offset, strend); - if (t < other_last) /* These positions already checked */ - t = other_last; - last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); - if (last < last1) - last1 = last; - /* XXXX It is not documented what units *_offsets are in. - We assume bytes, but this is clearly wrong. - Meaning this code needs to be carefully reviewed for errors. - dmq. - */ + char *last, *last1; + char *s; + SV* must; + struct reg_substr_datum *other; + + do_other_substr: + other = &prog->substrs->data[other_ix]; + + /* if "other" is anchored: + * we've previously found a floating substr starting at check_at. + * This means that the regex origin must lie somewhere + * between min (rx_origin): HOP3(check_at, -check_offset_max) + * and max: HOP3(check_at, -check_offset_min) + * (except that min will be >= strpos) + * So the fixed substr must lie somewhere between + * HOP3(min, anchored_offset) + * HOP3(max, anchored_offset) + SvCUR(substr) + */ - /* On end-of-str: see comment below. */ - must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr; - if (must == &PL_sv_undef) { - s = (char*)NULL; - DEBUG_r(must = prog->anchored_utf8); /* for debug */ - } - else - s = fbm_instr( - (unsigned char*)t, - HOP3(HOP3(last1, prog->anchored_offset, strend) - + SvCUR(must), -(SvTAIL(must)!=0), strbeg), - must, - multiline ? FBMrf_MULTILINE : 0 - ); - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(must), RE_SV_DUMPLEN(must), 30); - PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s", - (s ? "Found" : "Contradicts"), - quoted, RE_SV_TAIL(must)); - }); + /* if "other" is floating + * Calculate last1, the absolute latest point where the + * floating substr could start in the string, ignoring any + * constraints from the earlier fixed match. It is calculated + * as follows: + * + * strend - prog->minlen (in chars) is the absolute latest + * position within the string where the origin of the regex + * could appear. The latest start point for the floating + * substr is float_min_offset(*) on from the start of the + * regex. last1 simply combines thee two offsets. + * + * (*) You might think the latest start point should be + * float_max_offset from the regex origin, and technically + * you'd be correct. However, consider + * /a\d{2,4}bcd\w/ + * Here, float min, max are 3,5 and minlen is 7. + * This can match either + * /a\d\dbcd\w/ + * /a\d\d\dbcd\w/ + * /a\d\d\d\dbcd\w/ + * In the first case, the regex matches minlen chars; in the + * second, minlen+1, in the third, minlen+2. + * In the first case, the floating offset is 3 (which equals + * float_min), in the second, 4, and in the third, 5 (which + * equals float_max). In all cases, the floating string bcd + * can never start more than 4 chars from the end of the + * string, which equals minlen - float_min. As the substring + * starts to match more than float_min from the start of the + * regex, it makes the regex match more than minlen chars, + * and the two cancel each other out. So we can always use + * float_min - minlen, rather than float_max - minlen for the + * latest position in the string. + * + * Note that -minlen + float_min_offset is equivalent (AFAIKT) + * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift + */ + assert(prog->minlen >= other->min_offset); + last1 = HOP3c(strend, + other->min_offset - prog->minlen, strbeg); - if (!s) { - if (last1 >= last2) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", giving up...\n")); - goto fail_finish; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", trying floating at offset %ld...\n", - (long)(HOP3c(saved_s, 1, strend) - i_strpos))); - other_last = HOP3c(last1, prog->anchored_offset+1, strend); - s = HOP3c(last, 1, strend); - goto restart; - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", - (long)(s - i_strpos))); - t = HOP3c(s, -prog->anchored_offset, strbeg); - other_last = HOP3c(s, 1, strend); - s = saved_s; - if (t == strpos) - goto try_at_start; - goto try_at_offset; - } - } + if (other_ix) {/* i.e. if (other-is-float) */ + /* last is the latest point where the floating substr could + * start, *given* any constraints from the earlier fixed + * match. This constraint is that the floating string starts + * <= float_max_offset chars from the regex origin (rx_origin). + * If this value is less than last1, use it instead. + */ + assert(rx_origin <= last1); + last = + /* this condition handles the offset==infinity case, and + * is a short-cut otherwise. Although it's comparing a + * byte offset to a char length, it does so in a safe way, + * since 1 char always occupies 1 or more bytes, + * so if a string range is (last1 - rx_origin) bytes, + * it will be less than or equal to (last1 - rx_origin) + * chars; meaning it errs towards doing the accurate HOP3 + * rather than just using last1 as a short-cut */ + (last1 - rx_origin) < other->max_offset + ? last1 + : (char*)HOP3lim(rx_origin, other->max_offset, last1); } - else { /* Take into account the floating substring. */ - char *last, *last1; - char * const saved_s = s; - SV* must; - - t = HOP3c(s, -start_shift, strbeg); - last1 = last = - HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg); - if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset) - last = HOP3c(t, prog->float_max_offset, strend); - s = HOP3c(t, prog->float_min_offset, strend); - if (s < other_last) - s = other_last; - /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ - must = utf8_target ? prog->float_utf8 : prog->float_substr; - /* fbm_instr() takes into account exact value of end-of-str - if the check is SvTAIL(ed). Since false positives are OK, - and end-of-str is not later than strend we are OK. */ - if (must == &PL_sv_undef) { - s = (char*)NULL; - DEBUG_r(must = prog->float_utf8); /* for debug message */ - } - else - s = fbm_instr((unsigned char*)s, - (unsigned char*)last + SvCUR(must) - - (SvTAIL(must)!=0), - must, multiline ? FBMrf_MULTILINE : 0); - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(must), RE_SV_DUMPLEN(must), 30); - PerlIO_printf(Perl_debug_log, "%s floating substr %s%s", - (s ? "Found" : "Contradicts"), - quoted, RE_SV_TAIL(must)); - }); - if (!s) { - if (last1 == last) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", giving up...\n")); - goto fail_finish; - } + else { + assert(strpos + start_shift <= check_at); + last = HOP4c(check_at, other->min_offset - start_shift, + strbeg, strend); + } + + s = HOP3c(rx_origin, other->min_offset, strend); + if (s < other_last) /* These positions already checked */ + s = other_last; + + must = utf8_target ? other->utf8_substr : other->substr; + assert(SvPOK(must)); + s = fbm_instr( + (unsigned char*)s, + (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0), + must, + multiline ? FBMrf_MULTILINE : 0 + ); + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s", + s ? "Found" : "Contradicts", + other_ix ? "floating" : "anchored", + quoted, RE_SV_TAIL(must)); + }); + + + if (!s) { + /* last1 is latest possible substr location. If we didn't + * find it before there, we never will */ + if (last >= last1) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", trying anchored starting at offset %ld...\n", - (long)(saved_s + 1 - i_strpos))); - other_last = last; - s = HOP3c(t, 1, strend); - goto restart; + ", giving up...\n")); + goto fail_finish; + } + + /* try to find the check substr again at a later + * position. Maybe next time we'll find the "other" substr + * in range too */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", trying %s at offset %ld...\n", + (other_ix ? "floating" : "anchored"), + (long)(HOP3c(check_at, 1, strend) - strpos))); + + other_last = HOP3c(last, 1, strend) /* highest failure */; + rx_origin = + other_ix /* i.e. if other-is-float */ + ? HOP3c(rx_origin, 1, strend) + : HOP4c(last, 1 - other->min_offset, strbeg, strend); + goto restart; + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + + if (other_ix) { /* if (other-is-float) */ + /* other_last is set to s, not s+1, since its possible for + * a floating substr to fail first time, then succeed + * second time at the same floating position; e.g.: + * "-AB--AABZ" =~ /\wAB\d*Z/ + * The first time round, anchored and float match at + * "-(AB)--AAB(Z)" then fail on the initial \w character + * class. Second time round, they match at "-AB--A(AB)(Z)". + */ + other_last = s; } else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", - (long)(s - i_strpos))); - other_last = s; /* Fix this later. --Hugo */ - s = saved_s; - if (t == strpos) - goto try_at_start; - goto try_at_offset; + rx_origin = HOP3c(s, -other->min_offset, strbeg); + other_last = HOP3c(s, 1, strend); } } } + else { + DEBUG_OPTIMISE_MORE_r( + PerlIO_printf(Perl_debug_log, + " Check-only match: offset min:%"IVdf" max:%"IVdf + " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf + " strend-strpos:%"IVdf"\n", + (IV)prog->check_offset_min, + (IV)prog->check_offset_max, + (IV)(check_at-strpos), + (IV)(rx_origin-strpos), + (IV)(rx_origin-check_at), + (IV)(strend-strpos) + ) + ); + } + postprocess_substr_matches: - t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos); + /* handle the extra constraint of /^.../m if present */ - DEBUG_OPTIMISE_MORE_r( - PerlIO_printf(Perl_debug_log, - "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n", - (IV)prog->check_offset_min, - (IV)prog->check_offset_max, - (IV)(s-strpos), - (IV)(t-strpos), - (IV)(t-s), - (IV)(strend-strpos) - ) - ); + if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { + char *s; - if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!utf8_target - || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos))) - && t > strpos))) - { - /* Fixed substring is found far enough so that the match - cannot start at strpos. */ - try_at_offset: - if (ml_anch && t[-1] != '\n') { - /* Eventually fbm_*() should handle this, but often - anchored_offset is not 0, so this check will not be wasted. */ - /* XXXX In the code below we prefer to look for "^" even in - presence of anchored substrings. And we search even - beyond the found float position. These pessimizations - are historical artefacts only. */ - find_anchor: - while (t < strend - prog->minlen) { - if (*t == '\n') { - if (t < check_at - prog->check_offset_min) { - if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { - /* Since we moved from the found position, - we definitely contradict the found anchored - substr. Due to the above check we do not - contradict "check" substr. - Thus we can arrive here only if check substr - is float. Redo checking for "other"=="fixed". - */ - strpos = t + 1; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); - goto do_other_anchored; - } - /* We don't contradict the found floating substring. */ - /* XXXX Why not check for STCLASS? */ - s = t + 1; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(s - i_strpos))); - goto set_useful; - } - /* Position contradicts check-string */ - /* XXXX probably better to look for check-string - than for "\n", so one should lower the limit for t? */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos))); - other_last = strpos = s = t + 1; - goto restart; - } - t++; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", - PL_colors[0], PL_colors[1])); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for /^/m anchor")); + + /* we have failed the constraint of a \n before rx_origin. + * Find the next \n, if any, even if it's beyond the current + * anchored and/or floating substrings. Whether we should be + * scanning ahead for the next \n or the next substr is debatable. + * On the one hand you'd expect rare substrings to appear less + * often than \n's. On the other hand, searching for \n means + * we're effectively flipping been check_substr and "\n" on each + * iteration as the current "rarest" string candidate, which + * means for example that we'll quickly reject the whole string if + * hasn't got a \n, rather than trying every substr position + * first + */ + + s = HOP3c(strend, - prog->minlen, strpos); + if (s <= rx_origin || + ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Did not find /%s^%s/m...\n", + PL_colors[0], PL_colors[1])); goto fail_finish; } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n", - PL_colors[0], PL_colors[1])); - } - s = t; - set_useful: - ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ - } - else { - /* The found string does not prohibit matching at strpos, - - no optimization of calling REx engine can be performed, - unless it was an MBOL and we are not after MBOL, - or a future STCLASS check will fail this. */ - try_at_start: - /* Even in this situation we may use MBOL flag if strpos is offset - wrt the start of the string. */ - if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n' - /* May be due to an implicit anchor of m{.*foo} */ - && !(prog->intflags & PREGf_IMPLICIT)) + + /* earliest possible origin is 1 char after the \n. + * (since *rx_origin == '\n', it's safe to ++ here rather than + * HOP(rx_origin, 1)) */ + rx_origin++; + + if (prog->substrs->check_ix == 0 /* check is anchored */ + || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) { - t = strpos; - goto find_anchor; + /* Position contradicts check-string; either because + * check was anchored (and thus has no wiggle room), + * or check was float and rx_origin is above the float range */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + goto restart; } - DEBUG_EXECUTE_r( if (ml_anch) - PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n", - (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]); - ); - success_at_start: - if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */ - && (utf8_target ? ( - prog->check_utf8 /* Could be deleted already */ - && --BmUSEFUL(prog->check_utf8) < 0 - && (prog->check_utf8 == prog->float_utf8) - ) : ( - prog->check_substr /* Could be deleted already */ - && --BmUSEFUL(prog->check_substr) < 0 - && (prog->check_substr == prog->float_substr) - ))) - { - /* If flags & SOMETHING - do not do it many times on the same match */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); - /* XXX Does the destruction order has to change with utf8_target? */ - SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); - SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); - prog->check_substr = prog->check_utf8 = NULL; /* disable */ - prog->float_substr = prog->float_utf8 = NULL; /* clear */ - check = NULL; /* abort */ - s = strpos; - /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag - see http://bugs.activestate.com/show_bug.cgi?id=87173 */ - if (prog->intflags & PREGf_IMPLICIT) - prog->extflags &= ~RXf_ANCH_MBOL; - /* XXXX This is a remnant of the old implementation. It - looks wasteful, since now INTUIT can use many - other heuristics. */ - prog->extflags &= ~RXf_USE_INTUIT; - /* XXXX What other flags might need to be cleared in this branch? */ + + /* if we get here, the check substr must have been float, + * is in range, and we may or may not have had an anchored + * "other" substr which still contradicts */ + assert(prog->substrs->check_ix); /* check is float */ + + if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { + /* whoops, the anchored "other" substr exists, so we still + * contradict. On the other hand, the float "check" substr + * didn't contradict, so just retry the anchored "other" + * substr */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos), + (long)(rx_origin - strpos + prog->anchored_offset))); + goto do_other_substr; } - else - s = strpos; + + /* success: we don't contradict the found floating substring + * (and there's no anchored substr). */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " (multiline anchor test skipped)\n")); } - /* Last resort... */ - /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ - /* trie stclasses are too expensive to use here, we are better off to - leave it to regmatch itself */ + success_at_start: + + + /* if we have a starting character class, then test that extra constraint. + * (trie stclasses are too expensive to use here, we are better off to + * leave it to regmatch itself) */ + if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { - /* minlen == 0 is possible if regstclass is \b or \B, - and the fixed substr is ''$. - Since minlen is already taken into account, s+1 is before strend; - accidentally, minlen >= 1 guaranties no false positives at s + 1 - even for \b or \B. But (minlen? 1 : 0) below assumes that - regstclass does not come from lookahead... */ - /* If regstclass takes bytelength more than 1: If charlength==1, OK. - This leaves EXACTF-ish only, which are dealt with in find_byclass(). */ const U8* const str = (U8*)STRING(progi->regstclass); + /* XXX this value could be pre-computed */ const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT ? (reginfo->is_utf8_pat @@ -1117,106 +1241,190 @@ Perl_re_intuit_start(pTHX_ : STR_LEN(progi->regstclass)) : 1); char * endpos; + char *s; + /* latest pos that a matching float substr constrains rx start to */ + char *rx_max_float = NULL; + + /* if the current rx_origin is anchored, either by satisfying an + * anchored substring constraint, or a /^.../m constraint, then we + * can reject the current origin if the start class isn't found + * at the current position. If we have a float-only match, then + * rx_origin is constrained to a range; so look for the start class + * in that range. if neither, then look for the start class in the + * whole rest of the string */ + + /* XXX DAPM it's not clear what the minlen test is for, and why + * it's not used in the floating case. Nothing in the test suite + * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. + * Here are some old comments, which may or may not be correct: + * + * minlen == 0 is possible if regstclass is \b or \B, + * and the fixed substr is ''$. + * Since minlen is already taken into account, rx_origin+1 is + * before strend; accidentally, minlen >= 1 guaranties no false + * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : + * 0) below assumes that regstclass does not come from lookahead... + * If regstclass takes bytelength more than 1: If charlength==1, OK. + * This leaves EXACTF-ish only, which are dealt with in + * find_byclass(). + */ + if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) - endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend); - else if (prog->float_substr || prog->float_utf8) - endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend); + endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend); + else if (prog->float_substr || prog->float_utf8) { + rx_max_float = HOP3c(check_at, -start_shift, strbeg); + endpos= HOP3c(rx_max_float, cl_l, strend); + } else endpos= strend; - if (checked_upto < s) - checked_upto = s; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for class: start_shift: %"IVdf" check_at: %"IVdf + " rx_origin: %"IVdf" endpos: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), + (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); - t = s; - s = find_byclass(prog, progi->regstclass, checked_upto, endpos, + s = find_byclass(prog, progi->regstclass, rx_origin, endpos, reginfo); - if (s) { - checked_upto = s; - } else { -#ifdef DEBUGGING - const char *what = NULL; -#endif + if (!s) { if (endpos == strend) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Could not match STCLASS...\n") ); + " Could not match STCLASS...\n") ); goto fail; } DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "This position contradicts STCLASS...\n") ); - if ((prog->extflags & RXf_ANCH) && !ml_anch) + " This position contradicts STCLASS...\n") ); + if ((prog->intflags & PREGf_ANCH) && !ml_anch + && !(prog->intflags & PREGf_IMPLICIT)) goto fail; - checked_upto = HOPBACKc(endpos, start_shift); - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + /* Contradict one of substrings */ if (prog->anchored_substr || prog->anchored_utf8) { - if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) { - DEBUG_EXECUTE_r( what = "anchored" ); - hop_and_restart: - s = HOP3c(t, 1, strend); - if (s + start_shift + end_shift > strend) { - /* XXXX Should be taken into account earlier? */ + if (prog->substrs->check_ix == 1) { /* check is float */ + /* Have both, check_string is floating */ + assert(rx_origin + start_shift <= check_at); + if (rx_origin + start_shift != check_at) { + /* not at latest position float substr could match: + * Recheck anchored substring, but not floating. + * The condition above is in bytes rather than + * chars for efficiency. It's conservative, in + * that it errs on the side of doing 'goto + * do_other_substr', where a more accurate + * char-based calculation will be done */ DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Could not match STCLASS...\n") ); - goto fail; + " Looking for anchored substr starting at offset %ld...\n", + (long)(other_last - strpos)) ); + goto do_other_substr; } - if (!check) - goto giveup; + } + } + else { + /* float-only */ + + if (ml_anch) { + /* In the presence of ml_anch, we might be able to + * find another \n without breaking the current float + * constraint. */ + + /* strictly speaking this should be HOP3c(..., 1, ...), + * but since we goto a block of code that's going to + * search for the next \n if any, its safe here */ + rx_origin++; DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for %s substr starting at offset %ld...\n", - what, (long)(s + start_shift - i_strpos)) ); - goto restart; + " Looking for /%s^%s/m starting at offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos)) ); + goto postprocess_substr_matches; } - /* Have both, check_string is floating */ - if (t + start_shift >= check_at) /* Contradicts floating=check */ - goto retry_floating_check; - /* Recheck anchored substring, but not floating... */ - s = check_at; - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for anchored substr starting at offset %ld...\n", - (long)(other_last - i_strpos)) ); - goto do_other_anchored; - } - /* Another way we could have checked stclass at the - current position only: */ - if (ml_anch) { - s = t = t + 1; - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for /%s^%s/m starting at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(t - i_strpos)) ); - goto try_at_offset; + + /* strictly speaking this can never be true; but might + * be if we ever allow intuit without substrings */ + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + goto fail; + + rx_origin = rx_max_float; } - if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ + + /* at this point, any matching substrings have been + * contradicted. Start again... */ + + rx_origin = HOP3c(rx_origin, 1, strend); + + /* uses bytes rather than char calculations for efficiency. + * It's conservative: it errs on the side of doing 'goto restart', + * where there is code that does a proper char-based test */ + if (rx_origin + start_shift + end_shift > strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); goto fail; - /* Check is floating substring. */ - retry_floating_check: - t = check_at - start_shift; - DEBUG_EXECUTE_r( what = "floating" ); - goto hop_and_restart; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for %s substr starting at offset %ld...\n", + (prog->substrs->check_ix ? "floating" : "anchored"), + (long)(rx_origin + start_shift - strpos)) ); + goto restart; } - if (t != s) { + + /* Success !!! */ + + if (rx_origin != s) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "By STCLASS: moving %ld --> %ld\n", - (long)(t - i_strpos), (long)(s - i_strpos)) + " By STCLASS: moving %ld --> %ld\n", + (long)(rx_origin - strpos), (long)(s - strpos)) ); } else { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "Does not contradict STCLASS...\n"); + " Does not contradict STCLASS...\n"); ); } } - giveup: - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n", - PL_colors[4], (check ? "Guessed" : "Giving up"), - PL_colors[5], (long)(s - i_strpos)) ); - return s; + + /* Decide whether using the substrings helped */ + + if (rx_origin != strpos) { + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n")); + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + } + else { + /* The found rx_origin position does not prohibit matching at + * strpos, so calling intuit didn't gain us anything. Decrement + * the BmUSEFUL() count on the check substring, and if we reach + * zero, free it. */ + if (!(prog->intflags & PREGf_NAUGHTY) + && (utf8_target ? ( + prog->check_utf8 /* Could be deleted already */ + && --BmUSEFUL(prog->check_utf8) < 0 + && (prog->check_utf8 == prog->float_utf8) + ) : ( + prog->check_substr /* Could be deleted already */ + && --BmUSEFUL(prog->check_substr) < 0 + && (prog->check_substr == prog->float_substr) + ))) + { + /* If flags & SOMETHING - do not do it many times on the same match */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n")); + /* XXX Does the destruction order has to change with utf8_target? */ + SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); + SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); + prog->check_substr = prog->check_utf8 = NULL; /* disable */ + prog->float_substr = prog->float_utf8 = NULL; /* clear */ + check = NULL; /* abort */ + /* XXXX This is a remnant of the old implementation. It + looks wasteful, since now INTUIT can use many + other heuristics. */ + prog->extflags &= ~RXf_USE_INTUIT; + } + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) ); + + return rx_origin; fail_finish: /* Substring not found */ if (prog->check_substr || prog->check_utf8) /* could be removed already */ @@ -1227,46 +1435,57 @@ Perl_re_intuit_start(pTHX_ return NULL; } + #define DECL_TRIE_TYPE(scan) \ - const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ trie_type = ((scan->flags == EXACT) \ ? (utf8_target ? trie_utf8 : trie_plain) \ - : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + : (scan->flags == EXACTFA) \ + ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ -STMT_START { \ +STMT_START { \ STRLEN skiplen; \ + U8 flags = FOLD_FLAGS_FULL; \ switch (trie_type) { \ + case trie_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALL THROUGH */ \ case trie_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ - uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \ + uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ len = UTF8SKIP(uc); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ + case trie_latin_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALL THROUGH */ \ case trie_latin_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ case trie_utf8: \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ break; \ case trie_plain: \ uvc = (UV)*uc; \ @@ -1337,28 +1556,28 @@ REXEC_FBC_SCAN( \ tmp = 1; \ ) -#define REXEC_FBC_TRYIT \ +#define REXEC_FBC_TRYIT \ if ((reginfo->intuit || regtry(reginfo, &s))) \ goto got_it #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ - if (utf8_target) { \ + if (utf8_target) { \ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ } \ else { \ REXEC_FBC_CLASS_SCAN(CoNd); \ } -#define DUMP_EXEC_POS(li,s,doutf8) \ +#define DUMP_EXEC_POS(li,s,doutf8) \ dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ startpos, doutf8) -#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ +#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ tmp = TEST_NON_UTF8(tmp); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1367,18 +1586,19 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ ); \ -#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ +#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ if (s == reginfo->strbeg) { \ tmp = '\n'; \ } \ else { \ U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \ + tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ + 0, UTF8_ALLOW_DEFAULT); \ } \ tmp = TeSt1_UtF8; \ - LOAD_UTF8_CHARCLASS_ALNUM(); \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! (TeSt2_UtF8)) { \ + if (tmp == ! (TeSt2_UtF8)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1413,9 +1633,9 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ * one, and compare it with the wordness of this one. If they differ, we have * a boundary. At the beginning of the string, pretend that the previous * character was a new-line */ -#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ +#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ if (utf8_target) { \ - UTF8_CODE \ + UTF8_CODE \ } \ else { /* Not utf8 */ \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ @@ -1430,7 +1650,7 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ ); \ } \ - if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ + if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ goto got_it; /* We know what class REx starts with. Try to find this position... */ @@ -1469,11 +1689,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* We know what class it must start with. */ switch (OP(c)) { case ANYOF: - case ANYOF_SYNTHETIC: - case ANYOF_WARN_SUPER: if (utf8_target) { REXEC_FBC_UTF8_CLASS_SCAN( - reginclass(prog, c, (U8*)s, utf8_target)); + reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); } else { REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); @@ -1488,6 +1706,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: if (is_utf8_pat || utf8_target) { utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; @@ -1497,10 +1718,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, folder = foldEQ_latin1; /* /a, except the sharp s one which */ goto do_exactf_non_utf8; /* isn't dealt with by these */ - case EXACTF: + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); if (utf8_target) { - - /* regcomp.c already folded this if pattern is in UTF-8 */ utf8_fold_flags = 0; goto do_exactf_utf8; } @@ -1509,8 +1729,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto do_exactf_non_utf8; case EXACTFL: - if (is_utf8_pat || utf8_target) { - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { + utf8_fold_flags = FOLDEQ_LOCALE; goto do_exactf_utf8; } fold_array = PL_fold_locale; @@ -1523,7 +1743,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } goto do_exactf_utf8; - case EXACTFU_TRICKYFOLD: case EXACTFU: if (is_utf8_pat || utf8_target) { utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -1556,7 +1775,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * characters, and there are only 2 availabe, we know without * trying that it will fail; so don't start a match past the * required minimum number from the far end */ - e = HOP3c(strend, -((I32)ln), s); + e = HOP3c(strend, -((SSize_t)ln), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1602,7 +1821,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * only 2 are left, it's guaranteed to fail, so don't start a * match that would require us to go beyond the end of the string */ - e = HOP3c(strend, -((I32)lnc), s); + e = HOP3c(strend, -((SSize_t)lnc), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1628,15 +1847,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; } case BOUNDL: - RXp_MATCH_TAINTED_on(prog); FBC_BOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case NBOUNDL: - RXp_MATCH_TAINTED_on(prog); FBC_NBOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case BOUND: @@ -1683,7 +1900,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* FALLTHROUGH */ case POSIXL: - RXp_MATCH_TAINTED_on(prog); REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); break; @@ -1747,7 +1963,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, classnum))) || (UTF8_IS_DOWNGRADEABLE_START(*s) && to_complement ^ cBOOL( - _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)), + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), classnum)))) { if (tmp && (reginfo->intuit || regtry(reginfo, &s))) @@ -1805,8 +2022,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = - _core_swash_init("utf8", swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } /* This is a copy of the loop above for swash classes, though using the @@ -2098,9 +2317,9 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, } else #endif { - I32 min = 0; - I32 max = strend - strbeg; - I32 sublen; + SSize_t min = 0; + SSize_t max = strend - strbeg; + SSize_t sublen; if ( (flags & REXEC_COPY_SKIP_POST) && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ @@ -2180,7 +2399,8 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; * from going quadratic */ if (SvPOKp(sv) && SvPVX(sv) == strbeg) - sv_pos_b2u(sv, &(prog->subcoffset)); + prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, + SV_GMAGIC|SV_CONST_RETURN); else prog->subcoffset = utf8_length((U8*)strbeg, (U8*)(strbeg+prog->suboffset)); @@ -2203,7 +2423,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, */ I32 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, - char *strbeg, I32 minend, SV *sv, void *data, U32 flags) + char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -2220,9 +2440,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, char *s; regnode *c; char *startpos; - I32 minlen; /* must match at least this many chars */ - I32 dontbother = 0; /* how many characters not to try at end */ - I32 end_shift = 0; /* Same for the end. */ /* CC */ + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ const bool utf8_target = cBOOL(DO_UTF8(sv)); I32 multiline; RXi_GET_DECL(prog,progi); @@ -2248,7 +2467,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, startpos = stringarg; - if (prog->extflags & RXf_GPOS_SEEN) { + if (prog->intflags & PREGf_GPOS_SEEN) { MAGIC *mg; /* set reginfo->ganch, the position where \G can match */ @@ -2257,15 +2476,16 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, (flags & REXEC_IGNOREPOS) ? stringarg /* use start pos rather than pos() */ : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) - ? strbeg + mg->mg_len /* Defined pos() */ + /* Defined pos(): */ + ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) : strbeg; /* pos() not defined; use start of string */ DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg)); + "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg))); /* in the presence of \G, we may need to start looking earlier in * the string than the suggested start point of stringarg: - * if gofs->prog is set, then that's a known, fixed minimum + * if prog->gofs is set, then that's a known, fixed minimum * offset, such as * /..\G/: gofs = 2 * /ab|c\G/: gofs = 1 @@ -2273,7 +2493,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, * to the start of the string, e.g. /w+\G/ */ - if (prog->extflags & RXf_ANCH_GPOS) { + if (prog->intflags & PREGf_ANCH_GPOS) { startpos = reginfo->ganch - prog->gofs; if (startpos < ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) @@ -2289,7 +2509,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, else startpos -= prog->gofs; } - else if (prog->extflags & RXf_GPOS_FLOAT) + else if (prog->intflags & PREGf_GPOS_FLOAT) startpos = strbeg; } @@ -2329,7 +2549,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, && (s < stringarg)) { /* this should only be possible under \G */ - assert(prog->extflags & RXf_GPOS_SEEN); + assert(prog->intflags & PREGf_GPOS_SEEN); DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); goto phooey; @@ -2379,15 +2599,16 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* see how far we have to get to not match where we matched before */ reginfo->till = stringarg + minend; - if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) { + if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after S_cleanup_regmatch_info_aux has executed (registered by SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies magic belonging to this SV. Not newSVsv, either, as it does not COW. */ + assert(!IS_PADGV(sv)); reginfo->sv = newSV(0); - sv_setsv(reginfo->sv, sv); + SvSetSV_nosteal(reginfo->sv, sv); SAVEFREESV(reginfo->sv); } @@ -2464,11 +2685,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* Simplest case: anchored match need be tried only once. */ /* [unless only anchor is BOL and multiline is set] */ - if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) { + if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { if (s == startpos && regtry(reginfo, &s)) goto got_it; - else if (multiline || (prog->intflags & PREGf_IMPLICIT) - || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */ + else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */ { char *end; @@ -2542,8 +2762,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } /* end search for newline */ } /* end anchored/multiline check string search */ goto phooey; - } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) + } else if (prog->intflags & PREGf_ANCH_GPOS) { + /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ + assert(prog->intflags & PREGf_GPOS_SEEN); /* For anchored \G, the only position it can match from is * (ganch-gofs); we already set startpos to this above; if intuit * moved us on from there, we can't possibly succeed */ @@ -2604,8 +2826,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, || ((prog->float_substr != NULL || prog->float_utf8 != NULL) && prog->float_max_offset < strend - s)) { SV *must; - I32 back_max; - I32 back_min; + SSize_t back_max; + SSize_t back_min; char *last; char *last1; /* Last position checked before */ #ifdef DEBUGGING @@ -2650,7 +2872,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, last = strend; } else { last = HOP3c(strend, /* Cannot start after this */ - -(I32)(CHR_SVLEN(must) + -(SSize_t)(CHR_SVLEN(must) - (SvTAIL(must) != 0) + back_min), strbeg); } if (s > reginfo->strbeg) @@ -2660,10 +2882,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* XXXX check_substr already used to find "s", can optimize if check_substr==must. */ - dontbother = end_shift; + dontbother = 0; strend = HOPc(strend, -dontbother); while ( (s <= last) && - (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), + (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), (unsigned char*)strend, must, multiline ? FBMrf_MULTILINE : 0)) ) { DEBUG_EXECUTE_r( did_match = 1 ); @@ -2716,7 +2938,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, c); + regprop(prog, prop, c, reginfo); { RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), s,strend-s,60); @@ -2859,7 +3081,7 @@ got_it: && (prog->offs[0].start < stringarg - strbeg)) { /* this should only be possible under \G */ - assert(prog->extflags & RXf_GPOS_SEEN); + assert(prog->intflags & PREGf_GPOS_SEEN); DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); goto phooey; @@ -2921,7 +3143,7 @@ phooey: /* Set which rex is pointed to by PL_reg_curpm, handling ref counting. * Do inc before dec, in case old and new rex are the same */ -#define SET_reg_curpm(Re2) \ +#define SET_reg_curpm(Re2) \ if (reginfo->info_aux_eval) { \ (void)ReREFCNT_inc(Re2); \ ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ @@ -2939,7 +3161,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) CHECKPOINT lastcp; REGEXP *const rx = reginfo->prog; regexp *const prog = ReANY(rx); - I32 result; + SSize_t result; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -3186,11 +3408,11 @@ regmatch(), slabs allocated since entry are freed. #define DEBUG_STATE_pp(pp) \ DEBUG_STATE_r({ \ - DUMP_EXEC_POS(locinput, scan, utf8_target); \ + DUMP_EXEC_POS(locinput, scan, utf8_target); \ PerlIO_printf(Perl_debug_log, \ " %*s"pp" %s%s%s%s%s\n", \ depth*2, "", \ - PL_reg_name[st->resume_state], \ + PL_reg_name[st->resume_state], \ ((st==yes_state||st==mark_state) ? "[" : ""), \ ((st==yes_state) ? "Y" : ""), \ ((st==mark_state) ? "M" : ""), \ @@ -3394,6 +3616,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, dVAR; U8 *pat = (U8*)STRING(text_node); + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; if (OP(text_node) == EXACT) { @@ -3413,136 +3636,193 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, c2 = c1 = valid_utf8_to_uvchr(pat, NULL); } } - else /* an EXACTFish node */ - if ((is_utf8_pat - && is_MULTI_CHAR_FOLD_utf8_safe(pat, - pat + STR_LEN(text_node))) - || (!is_utf8_pat - && is_MULTI_CHAR_FOLD_latin1_safe(pat, - pat + STR_LEN(text_node)))) - { - /* Multi-character folds require more context to sort out. Also - * PL_utf8_foldclosures used below doesn't handle them, so have to be - * handled outside this routine */ - use_chrtest_void = TRUE; - } - else { /* an EXACTFish node which doesn't begin with a multi-char fold */ - c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; - if (c1 > 256) { - /* Load the folds hash, if not already done */ - SV** listp; - if (! PL_utf8_foldclosures) { - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - - /* Force loading this by folding an above-Latin1 char */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); - assert(PL_utf8_tofold); /* Verify that worked */ + else { /* an EXACTFish node */ + U8 *pat_end = pat + STR_LEN(text_node); + + /* An EXACTFL node has at least some characters unfolded, because what + * they match is not known until now. So, now is the time to fold + * the first few of them, as many as are needed to determine 'c1' and + * 'c2' later in the routine. If the pattern isn't UTF-8, we only need + * to fold if in a UTF-8 locale, and then only the Sharp S; everything + * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we + * need to fold as many characters as a single character can fold to, + * so that later we can check if the first ones are such a multi-char + * fold. But, in such a pattern only locale-problematic characters + * aren't folded, so we can skip this completely if the first character + * in the node isn't one of the tricky ones */ + if (OP(text_node) == EXACTFL) { + + if (! is_utf8_pat) { + if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) + { + folded[0] = folded[1] = 's'; + pat = folded; + pat_end = folded + 2; } - PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); - } - - /* The fold closures data structure is a hash with the keys being - * the UTF-8 of every character that is folded to, like 'k', and - * the values each an array of all code points that fold to its - * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are - * not included */ - if ((! (listp = hv_fetch(PL_utf8_foldclosures, - (char *) pat, - UTF8SKIP(pat), - FALSE)))) - { - /* Not found in the hash, therefore there are no folds - * containing it, so there is only a single character that - * could match */ - c2 = c1; } - else { /* Does participate in folds */ - AV* list = (AV*) *listp; - if (av_len(list) != 1) { + else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { + U8 *s = pat; + U8 *d = folded; + int i; - /* If there aren't exactly two folds to this, it is outside - * the scope of this function */ - use_chrtest_void = TRUE; + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD_LC(*s); + s++; + } + else { + STRLEN len; + _to_utf8_fold_flags(s, + d, + &len, + FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); + d += len; + s += UTF8SKIP(s); + } } - else { /* There are two. Get them */ - SV** c_p = av_fetch(list, 0, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + + pat = folded; + pat_end = d; + } + } + + if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) + || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) + { + /* Multi-character folds require more context to sort out. Also + * PL_utf8_foldclosures used below doesn't handle them, so have to + * be handled outside this routine */ + use_chrtest_void = TRUE; + } + else { /* an EXACTFish node which doesn't begin with a multi-char fold */ + c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; + if (c1 > 256) { + /* Load the folds hash, if not already done */ + SV** listp; + if (! PL_utf8_foldclosures) { + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* Force loading this by folding an above-Latin1 char */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ } - c1 = SvUV(*c_p); + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); + } - c_p = av_fetch(list, 1, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + /* The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((! (listp = hv_fetch(PL_utf8_foldclosures, + (char *) pat, + UTF8SKIP(pat), + FALSE)))) + { + /* Not found in the hash, therefore there are no folds + * containing it, so there is only a single character that + * could match */ + c2 = c1; + } + else { /* Does participate in folds */ + AV* list = (AV*) *listp; + if (av_tindex(list) != 1) { + + /* If there aren't exactly two folds to this, it is + * outside the scope of this function */ + use_chrtest_void = TRUE; } - c2 = SvUV(*c_p); - - /* Folds that cross the 255/256 boundary are forbidden if - * EXACTFL, or EXACTFA and one is ASCIII. Since the - * pattern character is above 256, and its only other match - * is below 256, the only legal match will be to itself. - * We have thrown away the original, so have to compute - * which is the one above 255 */ - if ((c1 < 256) != (c2 < 256)) { - if (OP(text_node) == EXACTFL - || (OP(text_node) == EXACTFA - && (isASCII(c1) || isASCII(c2)))) - { - if (c1 < 256) { - c1 = c2; - } - else { - c2 = c1; + else { /* There are two. Get them */ + SV** c_p = av_fetch(list, 0, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c1 = SvUV(*c_p); + + c_p = av_fetch(list, 1, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c2 = SvUV(*c_p); + + /* Folds that cross the 255/256 boundary are forbidden + * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and + * one is ASCIII. Since the pattern character is above + * 256, and its only other match is below 256, the only + * legal match will be to itself. We have thrown away + * the original, so have to compute which is the one + * above 255 */ + if ((c1 < 256) != (c2 < 256)) { + if ((OP(text_node) == EXACTFL + && ! IN_UTF8_CTYPE_LOCALE) + || ((OP(text_node) == EXACTFA + || OP(text_node) == EXACTFA_NO_TRIE) + && (isASCII(c1) || isASCII(c2)))) + { + if (c1 < 256) { + c1 = c2; + } + else { + c2 = c1; + } } } } } } - } - else /* Here, c1 is < 255 */ - if (utf8_target - && HAS_NONLATIN1_FOLD_CLOSURE(c1) - && OP(text_node) != EXACTFL - && (OP(text_node) != EXACTFA || ! isASCII(c1))) - { - /* Here, there could be something above Latin1 in the target which - * folds to this character in the pattern. All such cases except - * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters - * involved in their folds, so are outside the scope of this - * function */ - if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { - c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; - } - else { - use_chrtest_void = TRUE; + else /* Here, c1 is < 255 */ + if (utf8_target + && HAS_NONLATIN1_FOLD_CLOSURE(c1) + && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) + && ((OP(text_node) != EXACTFA + && OP(text_node) != EXACTFA_NO_TRIE) + || ! isASCII(c1))) + { + /* Here, there could be something above Latin1 in the target + * which folds to this character in the pattern. All such + * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more + * than two characters involved in their folds, so are outside + * the scope of this function */ + if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + } + else { + use_chrtest_void = TRUE; + } } - } - else { /* Here nothing above Latin1 can fold to the pattern character */ - switch (OP(text_node)) { + else { /* Here nothing above Latin1 can fold to the pattern + character */ + switch (OP(text_node)) { - case EXACTFL: /* /l rules */ - c2 = PL_fold_locale[c1]; - break; + case EXACTFL: /* /l rules */ + c2 = PL_fold_locale[c1]; + break; - case EXACTF: - if (! utf8_target) { /* /d rules */ - c2 = PL_fold[c1]; + case EXACTF: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + if (! utf8_target) { /* /d rules */ + c2 = PL_fold[c1]; + break; + } + /* FALLTHROUGH */ + /* /u rules for all these. This happens to work for + * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA_NO_TRIE: /* This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: + c2 = PL_fold_latin1[c1]; break; - } - /* FALLTHROUGH */ - /* /u rules for all these. This happens to work for - * EXACTFA as nothing in Latin1 folds to ASCII */ - case EXACTFA: - case EXACTFU_TRICKYFOLD: - case EXACTFU_SS: - case EXACTFU: - c2 = PL_fold_latin1[c1]; - break; - default: - Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); - assert(0); /* NOTREACHED */ + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); + assert(0); /* NOTREACHED */ + } } } } @@ -3584,7 +3864,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } /* returns -1 on failure, $+[0] on success */ -STATIC I32 +STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { #if PERL_VERSION < 9 && !defined(PERL_CORE) @@ -3602,7 +3882,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) regnode *scan; regnode *next; U32 n = 0; /* general value; init to avoid compiler warning */ - I32 ln = 0; /* len or last; init to avoid compiler warning */ + SSize_t ln = 0; /* len or last; init to avoid compiler warning */ char *locinput = startpos; char *pushinput; /* where to continue after a PUSH */ I32 nextchr; /* is always set to UCHARAT(locinput) */ @@ -3630,7 +3910,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) during a successful match */ U32 lastopen = 0; /* last open we saw */ bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; - SV* const oreplsv = GvSV(PL_replgv); + SV* const oreplsv = GvSVn(PL_replgv); /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop * iteration, and are not preserved or restored by state pushes/pops @@ -3660,6 +3940,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) GET_RE_DEBUG_FLAGS_DECL; #endif + /* protect against undef(*^R) */ + SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ multicall_oldcatch = 0; multicall_cv = NULL; @@ -3685,7 +3968,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SV * const prop = sv_newmortal(); regnode *rnext=regnext(scan); DUMP_EXEC_POS( locinput, scan, utf8_target ); - regprop(rex, prop, scan); + regprop(rex, prop, scan, reginfo); PerlIO_printf(Perl_debug_log, "%3"IVdf":%*s%s(%"IVdf")\n", @@ -3708,7 +3991,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); switch (state_num) { - case BOL: /* /^../ */ + case BOL: /* /^../ */ + case SBOL: /* /^../s */ if (locinput == reginfo->strbeg) break; sayNO; @@ -3721,11 +4005,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } sayNO; - case SBOL: /* /^../s */ - if (locinput == reginfo->strbeg) - break; - sayNO; - case GPOS: /* \G */ if (locinput == reginfo->ganch) break; @@ -3743,16 +4022,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO_SILENT; assert(0); /*NOTREACHED*/ - case EOL: /* /..$/ */ - goto seol; - case MEOL: /* /..$/m */ if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; break; + case EOL: /* /..$/ */ + /* FALL THROUGH */ case SEOL: /* /..$/s */ - seol: if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; if (reginfo->strend - locinput > 1) @@ -4050,7 +4327,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (chars) { if (utf8_target) { - uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, uniflags); uc += len; } @@ -4063,7 +4340,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (foldlen) { if (!--chars) break; - uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, uniflags); uscan += len; foldlen -= len; @@ -4154,7 +4431,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) l++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { sayNO; } l += 2; @@ -4177,7 +4455,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) { + if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { sayNO; } s += 2; @@ -4207,27 +4486,31 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const char * s; U32 fold_utf8_flags; - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; - fold_utf8_flags = FOLDEQ_UTF8_LOCALE; + fold_utf8_flags = FOLDEQ_LOCALE; goto do_exactf; case EXACTFU_SS: /* /\x{df}/iu */ - case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */ case EXACTFU: /* /abc/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; goto do_exactf; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: /* /abc/iaa */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; - case EXACTF: /* /abc/i */ + case EXACTF: /* /abc/i This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); folder = foldEQ; fold_array = PL_fold; fold_utf8_flags = 0; @@ -4236,7 +4519,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) { + if (utf8_target + || is_utf8_pat + || state_num == EXACTFU_SS + || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) + { /* Either target or the pattern are utf8, or has the issue where * the fold lengths may differ. */ const char * const l = locinput; @@ -4271,8 +4558,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * have to set the FLAGS fields of these */ case BOUNDL: /* /\b/l */ case NBOUNDL: /* /\B/l */ - RX_MATCH_TAINTED_on(reginfo->prog); - /* FALL THROUGH */ case BOUND: /* /\b/ */ case BOUNDU: /* /\b/u */ case BOUNDA: /* /\b/a */ @@ -4290,7 +4575,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 * const r = reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); - ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); + ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, + 0, uniflags); } if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { ln = isWORDCHAR_uni(ln); @@ -4303,7 +4589,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } else { - ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln)); + ln = isWORDCHAR_LC_uvchr(ln); n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); } } @@ -4352,11 +4638,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; case ANYOF: /* /[abc]/ */ - case ANYOF_WARN_SUPER: if (NEXTCHR_IS_EOS) sayNO; if (utf8_target) { - if (!reginclass(rex, scan, (U8*)locinput, utf8_target)) + if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + utf8_target)) sayNO; locinput += UTF8SKIP(locinput); } @@ -4378,10 +4664,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (NEXTCHR_IS_EOS) sayNO; - /* The locale hasn't influenced the outcome before this, so defer - * tainting until now */ - RX_MATCH_TAINTED_on(reginfo->prog); - /* Use isFOO_lc() for characters within Latin1. (Note that * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else * wouldn't be invariant) */ @@ -4392,7 +4674,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), - (U8) TWO_BYTE_UTF8_TO_UNI(nextchr, + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)))))) { sayNO; @@ -4473,7 +4755,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr, + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)), FLAGS(scan))))) { @@ -4491,8 +4773,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", - swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } if (! (to_complement ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], @@ -4754,11 +5037,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 *fold_array; UV utf8_fold_flags; - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; type = REFFL; - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + utf8_fold_flags = FOLDEQ_LOCALE; goto do_nref; case NREFFA: /* /\g{name}/iaa */ @@ -4799,10 +5081,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto do_nref_ref_common; case REFFL: /* /\1/il */ - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + utf8_fold_flags = FOLDEQ_LOCALE; goto do_ref; case REFFA: /* /\1/iaa */ @@ -4842,8 +5123,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = reginfo->strbeg + ln; if (type != REF /* REF can do byte comparison */ - && (utf8_target || type == REFFU)) - { /* XXX handle REFFL better */ + && (utf8_target || type == REFFU || type == REFFL)) + { char * limit = reginfo->strend; /* This call case insensitively compares the entire buffer @@ -4917,7 +5198,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) startpoint = rei->program+1; ST.close_paren = 0; } + + /* Save all the positions seen so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + + /* and then jump to the code we share with EVAL */ goto eval_recurse_doit; + assert(0); /* NOTREACHED */ case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ @@ -5029,8 +5317,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) rex->offs[0].end = locinput - reginfo->strbeg; if (reginfo->info_aux_eval->pos_magic) - reginfo->info_aux_eval->pos_magic->mg_len - = locinput - reginfo->strbeg; + MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, + reginfo->sv, reginfo->strbeg, + locinput - reginfo->strbeg); if (sv_yes_mark) { SV *sv_mrk = get_sv("REGMARK", 1); @@ -5068,20 +5357,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) else { /* /(??{}) */ /* if its overloaded, let the regex compiler handle * it; otherwise extract regex, or stringify */ + if (SvGMAGICAL(ret)) + ret = sv_mortalcopy(ret); if (!SvAMAGIC(ret)) { SV *sv = ret; if (SvROK(sv)) sv = SvRV(sv); if (SvTYPE(sv) == SVt_REGEXP) re_sv = (REGEXP*) sv; - else if (SvSMAGICAL(sv)) { - MAGIC *mg = mg_find(sv, PERL_MAGIC_qr); + else if (SvSMAGICAL(ret)) { + MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); if (mg) re_sv = (REGEXP *) mg->mg_obj; } - /* force any magic, undef warnings here */ - if (!re_sv) { + /* force any undef warnings here */ + if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { ret = sv_mortalcopy(ret); (void) SvPV_force_nolen(ret); } @@ -5135,17 +5426,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) pm_flags); if (!(SvFLAGS(ret) - & (SVs_TEMP | SVs_PADTMP | SVf_READONLY - | SVs_GMG))) { + & (SVs_TEMP | SVs_GMG | SVf_ROK)) + && (!SvPADTMP(ret) || SvREADONLY(ret))) { /* This isn't a first class regexp. Instead, it's caching a regexp onto an existing, Perl visible scalar. */ sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); } - /* safe to do now that any $1 etc has been - * interpolated into the new pattern string and - * compiled */ - S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); } SAVEFREESV(re_sv); re = ReANY(re_sv); @@ -5155,6 +5442,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) re->sublen = rex->sublen; re->suboffset = rex->suboffset; re->subcoffset = rex->subcoffset; + re->lastparen = 0; + re->lastcloseparen = 0; rei = RXi_GET(re); DEBUG_EXECUTE_r( debug_start_match(re_sv, utf8_target, locinput, @@ -5162,18 +5451,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) ); startpoint = rei->program + 1; ST.close_paren = 0; /* only used for GOSUB */ - - eval_recurse_doit: /* Share code with GOSUB below this line */ - /* run the pattern returned from (??{...}) */ - - /* Save *all* the positions. */ + /* Save all the seen positions so far. */ ST.cp = regcppush(rex, 0, maxopenparen); REGCP_SET(ST.lastcp); - - re->lastparen = 0; - re->lastcloseparen = 0; - + /* and set maxopenparen to 0, since we are starting a "fresh" match */ maxopenparen = 0; + /* run the pattern returned from (??{...}) */ + + eval_recurse_doit: /* Share code with GOSUB below this line + * At this point we expect the stack context to be + * set up correctly */ /* invalidate the S-L poscache. We're now executing a * different set of WHILEM ops (and their associated @@ -5185,6 +5472,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * pattern again */ reginfo->poscache_maxiter = 0; + /* the new regexp might have a different is_utf8_pat than we do */ is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); ST.prev_rex = rex_sv; @@ -5209,7 +5497,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SET_reg_curpm(rex_sv); rex = ReANY(rex_sv); rexi = RXi_GET(rex); - regcpblow(ST.cp); + { + /* preserve $^R across LEAVE's. See Bug 121070. */ + SV *save_sv= GvSV(PL_replgv); + SvREFCNT_inc(save_sv); + regcpblow(ST.cp); /* LEAVE in disguise */ + sv_setsv(GvSV(PL_replgv), save_sv); + SvREFCNT_dec(save_sv); + } cur_eval = ST.prev_eval; cur_curlyx = ST.prev_curlyx; @@ -5559,10 +5854,10 @@ NULL if (reginfo->poscache_iter-- == 0) { /* initialise cache */ - const I32 size = (reginfo->poscache_maxiter + 7)/8; + const SSize_t size = (reginfo->poscache_maxiter + 7)/8; regmatch_info_aux *const aux = reginfo->info_aux; if (aux->poscache) { - if ((I32)reginfo->poscache_size < size) { + if ((SSize_t)reginfo->poscache_size < size) { Renew(aux->poscache, size, char); reginfo->poscache_size = size; } @@ -5580,7 +5875,7 @@ NULL if (reginfo->poscache_iter < 0) { /* have we already failed at this position? */ - I32 offset, mask; + SSize_t offset, mask; reginfo->poscache_iter = -1; /* stop eventual underflow */ offset = (scan->flags & 0xf) - 1 @@ -5905,7 +6200,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n", + "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", (int)(REPORT_CODE_OFF+(depth*2)),"", valid_utf8_to_uvchr((U8 *) locinput, NULL), valid_utf8_to_uvchr(ST.c1_utf8, NULL), @@ -5919,7 +6214,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n", + "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", (int)(REPORT_CODE_OFF+(depth*2)),"", (int) nextchr, ST.c1, ST.c2) ); @@ -6658,6 +6953,10 @@ yes: * When popping the save stack, all these locals would be undone; * bypass this by setting the outermost saved $^R to the latest * value */ + /* I dont know if this is needed or works properly now. + * see code related to PL_replgv elsewhere in this file. + * Yves + */ if (oreplsv != GvSV(PL_replgv)) sv_setsv(oreplsv, GvSV(PL_replgv)); } @@ -6860,7 +7159,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple loop */ - c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); while (scan < loceol && UCHARAT(scan) == c) { scan++; } @@ -6887,21 +7186,23 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; case EXACTFL: - RXp_MATCH_TAINTED_on(prog); - utf8_flags = FOLDEQ_UTF8_LOCALE; + utf8_flags = FOLDEQ_LOCALE; goto do_exactf; - case EXACTF: - utf8_flags = 0; - goto do_exactf; + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + utf8_flags = 0; + goto do_exactf; case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -6965,11 +7266,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, break; } case ANYOF: - case ANYOF_WARN_SUPER: if (utf8_target) { while (hardcount < max && scan < loceol - && reginclass(prog, p, (U8*)scan, utf8_target)) + && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; @@ -6987,7 +7287,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* FALLTHROUGH */ case POSIXL: - RXp_MATCH_TAINTED_on(prog); if (! utf8_target) { while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan))) @@ -7087,8 +7386,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan, - *(scan + 1)), + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), classnum)))) { break; @@ -7170,8 +7469,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = _core_swash_init( - "utf8", swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + "utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } while (hardcount < max && scan < loceol @@ -7239,7 +7540,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, GET_RE_DEBUG_FLAGS_DECL; DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, p); + regprop(prog, prop, p, reginfo); PerlIO_printf(Perl_debug_log, "%*s %s can match %"IVdf" times out of %"IVdf"...\n", REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); @@ -7265,14 +7566,18 @@ Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, *altsvp = NULL; } - return newSVsv(core_regclass_swash(prog, node, doinit, listsvp)); + return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL)); } -#endif -STATIC SV * -S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp) +SV * +Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, + const regnode* node, + bool doinit, + SV** listsvp, + SV** only_utf8_locale_ptr) { - /* Returns the swash for the input 'node' in the regex 'prog'. + /* For internal core use only. + * Returns the swash for the input 'node' in the regex 'prog'. * If is 'true', will attempt to create the swash if not already * done. * If is non-null, will return the printable contents of the @@ -7290,9 +7595,10 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit RXi_GET_DECL(prog,progi); const struct reg_data * const data = prog ? progi->data : NULL; - PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH; + PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; - assert(ANYOF_NONBITMAP(node)); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); if (data && data->count) { const U32 n = ARG(node); @@ -7305,25 +7611,38 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit si = *ary; /* ary[0] = the string to initialize the swash with */ - /* Elements 2 and 3 are either both present or both absent. [2] is - * any inversion list generated at compile time; [3] indicates if + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if * that inversion list has any user-defined properties in it. */ - if (av_len(av) >= 2) { - invlist = ary[2]; - if (SvUV(ary[3])) { - swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + if (av_tindex(av) >= 2) { + if (only_utf8_locale_ptr + && ary[2] + && ary[2] != &PL_sv_undef) + { + *only_utf8_locale_ptr = ary[2]; + } + else { + *only_utf8_locale_ptr = NULL; + } + + if (av_tindex(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } + } + else { + invlist = NULL; } - } - else { - invlist = NULL; } /* Element [1] is reserved for the set-up swash. If already there, * return it; if not, create it and store it there */ - if (SvROK(ary[1])) { + if (ary[1] && SvROK(ary[1])) { sw = ary[1]; } - else if (si && doinit) { + else if (doinit && ((si && si != &PL_sv_undef) + || (invlist && invlist != &PL_sv_undef))) { sw = _core_swash_init("utf8", /* the utf8 package */ "", /* nameless */ @@ -7362,12 +7681,14 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit return sw; } +#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ /* - reginclass - determine if a character falls into a character class n is the ANYOF regnode p is the target string + p_end points to one byte beyond the end of the target string utf8_target tells whether p is in UTF-8. Returns true if matched; false otherwise. @@ -7379,7 +7700,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit */ STATIC bool -S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target) +S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) { dVAR; const char flags = ANYOF_FLAGS(n); @@ -7392,7 +7713,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_IS_INVARIANT() works even if not in UTF-8 */ if (! UTF8_IS_INVARIANT(c) && utf8_target) { STRLEN c_len = 0; - c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len, + c = utf8n_to_uvchr(p, p_end - p, &c_len, (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for @@ -7405,21 +7726,19 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const if (c < 256) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; - else if (flags & ANYOF_NON_UTF8_LATIN1_ALL + else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL && ! utf8_target && ! isASCII(c)) { match = TRUE; } - else if (flags & ANYOF_LOCALE) { - RXp_MATCH_TAINTED_on(prog); - - if ((flags & ANYOF_LOC_FOLD) - && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) - { - match = TRUE; + else if (flags & ANYOF_LOCALE_FLAGS) { + if (flags & ANYOF_LOC_FOLD) { + if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { + match = TRUE; + } } - else if (ANYOF_CLASS_TEST_ANY_SET(n)) { + if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { /* The data structure is arranged so bits 0, 2, 4, ... are set * if the class includes the Posix character class given by @@ -7453,8 +7772,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const int count = 0; int to_complement = 0; + while (count < ANYOF_MAX) { - if (ANYOF_CLASS_TEST(n, count) + if (ANYOF_POSIXL_TEST(n, count) && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) { match = TRUE; @@ -7467,27 +7787,22 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } } + /* If the bitmap didn't (or couldn't) match, and something outside the - * bitmap could match, try that. Locale nodes specify completely the - * behavior of code points in the bit map (otherwise, a utf8 target would - * cause them to be treated as Unicode and not locale), except in - * the very unlikely event when this node is a synthetic start class, which - * could be a combination of locale and non-locale nodes. So allow locale - * to match for the synthetic start class, which will give a false - * positive that will be resolved when the match is done again as not part - * of the synthetic start class */ + * bitmap could match, try that. */ if (!match) { - if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { + if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { match = TRUE; /* Everything above 255 matches */ } - else if (ANYOF_NONBITMAP(n) - && ((flags & ANYOF_NONBITMAP_NON_UTF8) - || (utf8_target - && (c >=256 - || (! (flags & ANYOF_LOCALE)) - || OP(n) == ANYOF_SYNTHETIC)))) + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && (flags & ANYOF_UTF8)) + || ((flags & ANYOF_LOC_FOLD) + && IN_UTF8_CTYPE_LOCALE + && ARG(n) != ANYOF_NONBITMAP_EMPTY)) { - SV * const sw = core_regclass_swash(prog, n, TRUE, 0); + SV* only_utf8_locale = NULL; + SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale); if (sw) { U8 * utf8_p; if (utf8_target) { @@ -7504,23 +7819,32 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const /* If we allocated a string above, free it */ if (! utf8_target) Safefree(utf8_p); } + if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { + match = _invlist_contains_cp(only_utf8_locale, c); + } } if (UNICODE_IS_SUPER(c) - && OP(n) == ANYOF_WARN_SUPER + && (flags & ANYOF_WARN_SUPER) && ckWARN_d(WARN_NON_UNICODE)) { Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), - "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c); + "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c); } } +#if ANYOF_INVERT != 1 + /* Depending on compiler optimization cBOOL takes time, so if don't have to + * use it, don't */ +# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, +#endif + /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ - return cBOOL(flags & ANYOF_INVERT) ^ match; + return (flags & ANYOF_INVERT) ^ match; } STATIC U8 * -S_reghop3(U8 *s, I32 off, const U8* lim) +S_reghop3(U8 *s, SSize_t off, const U8* lim) { /* return the position 'off' UTF-8 characters away from 's', forward if * 'off' >= 0, backwards if negative. But don't go outside of position @@ -7549,13 +7873,8 @@ S_reghop3(U8 *s, I32 off, const U8* lim) return s; } -#ifdef XXX_dmq -/* there are a bunch of places where we use two reghop3's that should - be replaced with this routine. but since thats not done yet - we ifdef it out - dmq -*/ STATIC U8 * -S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) +S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) { dVAR; @@ -7579,10 +7898,12 @@ S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) } return s; } -#endif + +/* like reghop3, but returns NULL on overrun, rather than returning last + * char pos */ STATIC U8 * -S_reghopmaybe3(U8* s, I32 off, const U8* lim) +S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) { dVAR; @@ -7650,6 +7971,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) } eval_state->pos_magic = mg; eval_state->pos = mg->mg_len; + eval_state->pos_flags = mg->mg_flags; } else eval_state->pos_magic = NULL; @@ -7666,7 +7988,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) /* this regexp is also owned by the new PL_reg_curpm, which will try to free it. */ av_push(PL_regex_padav, repointer); - PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav); + PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); PL_regex_pad = AvARRAY(PL_regex_padav); } #endif @@ -7724,7 +8046,12 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg) RXp_MATCH_COPIED_on(rex); } if (eval_state->pos_magic) + { eval_state->pos_magic->mg_len = eval_state->pos; + eval_state->pos_magic->mg_flags = + (eval_state->pos_magic->mg_flags & ~MGf_BYTES) + | (eval_state->pos_flags & MGf_BYTES); + } PL_curpm = eval_state->curpm; } diff --git a/src/5019003/dquote_static.c b/src/5021000/dquote_static.c similarity index 90% rename from src/5019003/dquote_static.c rename to src/5021000/dquote_static.c index d5241ca..752b399 100644 --- a/src/5019003/dquote_static.c +++ b/src/5021000/dquote_static.c @@ -46,44 +46,44 @@ S_regcurly(pTHX_ const char *s, */ STATIC char -S_grok_bslash_c(pTHX_ const char source, const bool utf8, const bool output_warning) +S_grok_bslash_c(pTHX_ const char source, const bool output_warning) { U8 result; - if (utf8) { - /* Trying to deprecate non-ASCII usages. This construct has never - * worked for a utf8 variant. So, even though are accepting non-ASCII - * Latin1 in 5.14, no need to make them work under utf8 */ + if (! isPRINT_A(source)) { + const char msg[] = "Character following \"\\c\" must be printable ASCII"; if (! isASCII(source)) { - Perl_croak(aTHX_ "Character following \"\\c\" must be ASCII"); + Perl_croak(aTHX_ "%s", msg); + } + else if (output_warning) { /* Unprintables can be removed in v5.22 */ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), "%s", + msg); } } + else if (source == '{') { + assert(isPRINT_A(toCTRL('{'))); - result = toCTRL(source); - if (! isASCII(source)) { - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), - "Character following \"\\c\" must be ASCII"); + /* diag_listed_as: Use "%s" instead of "%s" */ + Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", toCTRL('{')); } - else if (! isCNTRL(result) && output_warning) { - if (source == '{') { - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), - "\"\\c{\" is deprecated and is more clearly written as \";\""); - } - else { - U8 clearer[3]; - U8 i = 0; - if (! isWORDCHAR(result)) { - clearer[i++] = '\\'; - } - clearer[i++] = result; - clearer[i++] = '\0'; - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "\"\\c%c\" is more clearly written simply as \"%s\"", - source, - clearer); + result = toCTRL(source); + if (output_warning && ! isCNTRL_L1(result)) { + /* We use isCNTRL_L1 above and not simply isCNTRL, because on EBCDIC + * machines, things like \cT map into a C1 control. */ + U8 clearer[3]; + U8 i = 0; + if (! isWORDCHAR(result)) { + clearer[i++] = '\\'; } + clearer[i++] = result; + clearer[i++] = '\0'; + + Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), + "\"\\c%c\" is more clearly written simply as \"%s\"", + source, + clearer); } return result; diff --git a/src/5019002/inline_invlist.c b/src/5021000/inline_invlist.c similarity index 95% rename from src/5019002/inline_invlist.c rename to src/5021000/inline_invlist.c index 68ef83e..de3cecd 100644 --- a/src/5019002/inline_invlist.c +++ b/src/5021000/inline_invlist.c @@ -25,6 +25,8 @@ S_get_invlist_offset_addr(pTHX_ SV* invlist) PERL_ARGS_ASSERT_GET_INVLIST_OFFSET_ADDR; + assert(SvTYPE(invlist) == SVt_INVLIST); + return &(((XINVLIST*) SvANY(invlist))->is_offset); } @@ -36,6 +38,8 @@ S__invlist_len(pTHX_ SV* const invlist) PERL_ARGS_ASSERT__INVLIST_LEN; + assert(SvTYPE(invlist) == SVt_INVLIST); + return (SvCUR(invlist) == 0) ? 0 : FROM_INTERNAL_SIZE(SvCUR(invlist)) - *get_invlist_offset_addr(invlist); diff --git a/src/5019001/orig/dquote_static.c b/src/5021000/orig/dquote_static.c similarity index 88% rename from src/5019001/orig/dquote_static.c rename to src/5021000/orig/dquote_static.c index da1b5b9..bb1bd4a 100644 --- a/src/5019001/orig/dquote_static.c +++ b/src/5021000/orig/dquote_static.c @@ -46,44 +46,44 @@ S_regcurly(pTHX_ const char *s, */ STATIC char -S_grok_bslash_c(pTHX_ const char source, const bool utf8, const bool output_warning) +S_grok_bslash_c(pTHX_ const char source, const bool output_warning) { U8 result; - if (utf8) { - /* Trying to deprecate non-ASCII usages. This construct has never - * worked for a utf8 variant. So, even though are accepting non-ASCII - * Latin1 in 5.14, no need to make them work under utf8 */ - if (! isASCII(source)) { - Perl_croak(aTHX_ "Character following \"\\c\" must be ASCII"); + if (! isPRINT_A(source)) { + const char msg[] = "Character following \"\\c\" must be printable ASCII"; + if (! isASCII(source)) { + Perl_croak(aTHX_ "%s", msg); + } + else if (output_warning) { /* Unprintables can be removed in v5.22 */ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), "%s", + msg); } } + else if (source == '{') { + assert(isPRINT_A(toCTRL('{'))); - result = toCTRL(source); - if (! isASCII(source)) { - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), - "Character following \"\\c\" must be ASCII"); + /* diag_listed_as: Use "%s" instead of "%s" */ + Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", toCTRL('{')); } - else if (! isCNTRL(result) && output_warning) { - if (source == '{') { - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), - "\"\\c{\" is deprecated and is more clearly written as \";\""); - } - else { - U8 clearer[3]; - U8 i = 0; - if (! isWORDCHAR(result)) { - clearer[i++] = '\\'; - } - clearer[i++] = result; - clearer[i++] = '\0'; - - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "\"\\c%c\" is more clearly written simply as \"%s\"", - source, - clearer); - } + + result = toCTRL(source); + if (output_warning && ! isCNTRL_L1(result)) { + /* We use isCNTRL_L1 above and not simply isCNTRL, because on EBCDIC + * machines, things like \cT map into a C1 control. */ + U8 clearer[3]; + U8 i = 0; + if (! isWORDCHAR(result)) { + clearer[i++] = '\\'; + } + clearer[i++] = result; + clearer[i++] = '\0'; + + Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), + "\"\\c%c\" is more clearly written simply as \"%s\"", + source, + clearer); } return result; diff --git a/src/5019002/orig/inline_invlist.c b/src/5021000/orig/inline_invlist.c similarity index 95% rename from src/5019002/orig/inline_invlist.c rename to src/5021000/orig/inline_invlist.c index 470659b..1aea9f7 100644 --- a/src/5019002/orig/inline_invlist.c +++ b/src/5021000/orig/inline_invlist.c @@ -25,6 +25,8 @@ S_get_invlist_offset_addr(pTHX_ SV* invlist) PERL_ARGS_ASSERT_GET_INVLIST_OFFSET_ADDR; + assert(SvTYPE(invlist) == SVt_INVLIST); + return &(((XINVLIST*) SvANY(invlist))->is_offset); } @@ -36,6 +38,8 @@ S__invlist_len(pTHX_ SV* const invlist) PERL_ARGS_ASSERT__INVLIST_LEN; + assert(SvTYPE(invlist) == SVt_INVLIST); + return (SvCUR(invlist) == 0) ? 0 : FROM_INTERNAL_SIZE(SvCUR(invlist)) - *get_invlist_offset_addr(invlist); diff --git a/src/5019002/orig/regcomp.c b/src/5021000/orig/regcomp.c similarity index 73% rename from src/5019002/orig/regcomp.c rename to src/5021000/orig/regcomp.c index b40425f..eaee604 100644 --- a/src/5019002/orig/regcomp.c +++ b/src/5021000/orig/regcomp.c @@ -81,7 +81,7 @@ #define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" -extern const struct regexp_engine my_reg_engine; +EXTERN_C const struct regexp_engine my_reg_engine; #else # include "regcomp.h" #endif @@ -91,51 +91,46 @@ extern const struct regexp_engine my_reg_engine; #include "inline_invlist.c" #include "unicode_constants.h" -#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) -#ifdef op -#undef op -#endif /* op */ - -#ifdef MSDOS -# if defined(BUGGY_MSC6) - /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ -# pragma optimize("a",off) - /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ -# pragma optimize("w",on ) -# endif /* BUGGY_MSC6 */ -#endif /* MSDOS */ - #ifndef STATIC #define STATIC static #endif -typedef struct RExC_state_t { +struct RExC_state_t { U32 flags; /* RXf_* are we folding, multilining? */ U32 pm_flags; /* PMf_* stuff from the calling PMOP */ char *precomp; /* uncompiled string. */ REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ - regexp_internal *rxi; /* internal data for regexp object pprivate field */ + regexp_internal *rxi; /* internal data for regexp object + pprivate field */ char *start; /* Start of input for compile */ char *end; /* End of input for compile */ char *parse; /* Input-scan pointer. */ - I32 whilem_seen; /* number of WHILEM in this expr */ + SSize_t whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ - regnode *emit_bound; /* First regnode outside of the allocated space */ + regnode *emit_bound; /* First regnode outside of the + allocated space */ regnode *emit; /* Code-emit pointer; if = &emit_dummy, implies compiling, so don't emit */ - regnode emit_dummy; /* placeholder for emit to point to */ + regnode_ssc emit_dummy; /* placeholder for emit to point to; + large enough for the largest + non-EXACTish node, so can use it as + scratch in pass1 */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ U32 seen; - I32 size; /* Code size. */ - I32 npar; /* Capture buffer count, (OPEN). */ - I32 cpar; /* Capture buffer count, (CLOSE). */ - I32 nestroot; /* root parens we are in - used by accept */ + SSize_t size; /* Code size. */ + I32 npar; /* Capture buffer count, (OPEN) plus + one. ("par" 0 is the whole + pattern)*/ + I32 nestroot; /* root parens we are in - used by + accept */ I32 extralen; I32 seen_zerolen; regnode **open_parens; /* pointers to open parens */ @@ -149,18 +144,23 @@ typedef struct RExC_state_t { rules, even if the pattern is not in utf8 */ HV *paren_names; /* Paren names */ - + regnode **recurse; /* Recurse regops */ I32 recurse_count; /* Number of recurse regops */ + U8 *study_chunk_recursed; /* bitmap of which parens we have moved + through */ + U32 study_chunk_recursed_bytes; /* bytes in bitmap */ I32 in_lookbehind; I32 contains_locale; + I32 contains_i; I32 override_recoding; I32 in_multi_char_class; struct reg_code_block *code_blocks; /* positions of literal (?{}) within pattern */ int num_code_blocks; /* size of code_blocks[] */ int code_index; /* next code_blocks[] slot */ -#if ADD_TO_REGEXEC + SSize_t maxlen; /* mininum possible number of chars in string to match */ +#ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif @@ -173,7 +173,7 @@ typedef struct RExC_state_t { #define RExC_lastnum (pRExC_state->lastnum) #define RExC_paren_name_list (pRExC_state->paren_name_list) #endif -} RExC_state_t; +}; #define RExC_flags (pRExC_state->flags) #define RExC_pm_flags (pRExC_state->pm_flags) @@ -186,7 +186,8 @@ typedef struct RExC_state_t { #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) #ifdef RE_TRACK_PATTERN_OFFSETS -#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the + others */ #endif #define RExC_emit (pRExC_state->emit) #define RExC_emit_dummy (pRExC_state->emit_dummy) @@ -196,6 +197,7 @@ typedef struct RExC_state_t { #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) #define RExC_npar (pRExC_state->npar) #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) @@ -209,8 +211,12 @@ typedef struct RExC_state_t { #define RExC_paren_names (pRExC_state->paren_names) #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) +#define RExC_study_chunk_recursed_bytes \ + (pRExC_state->study_chunk_recursed_bytes) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) #define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_contains_i (pRExC_state->contains_i) #define RExC_override_recoding (pRExC_state->override_recoding) #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) @@ -219,9 +225,6 @@ typedef struct RExC_state_t { #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s, FALSE))) -#ifdef SPSTART -#undef SPSTART /* dratted cpp namespace... */ -#endif /* * Flags to be passed up and down. */ @@ -267,106 +270,129 @@ typedef struct RExC_state_t { #define namedclass_to_classnum(class) ((int) ((class) / 2)) #define classnum_to_namedclass(classnum) ((classnum) * 2) +#define _invlist_union_complement_2nd(a, b, output) \ + _invlist_union_maybe_complement_2nd(a, b, TRUE, output) +#define _invlist_intersection_complement_2nd(a, b, output) \ + _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) + /* About scan_data_t. During optimisation we recurse through the regexp program performing various inplace (keyhole style) optimisations. In addition study_chunk and scan_commit populate this data structure with information about - what strings MUST appear in the pattern. We look for the longest + what strings MUST appear in the pattern. We look for the longest string that must appear at a fixed location, and we look for the longest string that may appear at a floating location. So for instance in the pattern: - + /FOO[xX]A.*B[xX]BAR/ - + Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating strings (because they follow a .* construct). study_chunk will identify both FOO and BAR as being the longest fixed and floating strings respectively. - + The strings can be composites, for instance - + /(f)(o)(o)/ - + will result in a composite fixed substring 'foo'. - + For each string some basic information is maintained: - + - offset or min_offset This is the position the string must appear at, or not before. It also implicitly (when combined with minlenp) tells us how many characters must match before the string we are searching for. Likewise when combined with minlenp and the length of the string it - tells us how many characters must appear after the string we have + tells us how many characters must appear after the string we have found. - + - max_offset Only used for floating strings. This is the rightmost point that - the string can appear at. If set to I32 max it indicates that the + the string can appear at. If set to SSize_t_MAX it indicates that the string can occur infinitely far to the right. - + - minlenp A pointer to the minimum number of characters of the pattern that the string was found inside. This is important as in the case of positive - lookahead or positive lookbehind we can have multiple patterns + lookahead or positive lookbehind we can have multiple patterns involved. Consider - + /(?=FOO).*F/ - + The minimum length of the pattern overall is 3, the minimum length of the lookahead part is 3, but the minimum length of the part that - will actually match is 1. So 'FOO's minimum length is 3, but the + will actually match is 1. So 'FOO's minimum length is 3, but the minimum length for the F is 1. This is important as the minimum length - is used to determine offsets in front of and behind the string being + is used to determine offsets in front of and behind the string being looked for. Since strings can be composites this is the length of the pattern at the time it was committed with a scan_commit. Note that the length is calculated by study_chunk, so that the minimum lengths - are not known until the full pattern has been compiled, thus the + are not known until the full pattern has been compiled, thus the pointer to the value. - + - lookbehind - + In the case of lookbehind the string being searched for can be - offset past the start point of the final matching string. + offset past the start point of the final matching string. If this value was just blithely removed from the min_offset it would invalidate some of the calculations for how many chars must match before or after (as they are derived from min_offset and minlen and - the length of the string being searched for). + the length of the string being searched for). When the final pattern is compiled and the data is moved from the scan_data_t structure into the regexp structure the information - about lookbehind is factored in, with the information that would - have been lost precalculated in the end_shift field for the + about lookbehind is factored in, with the information that would + have been lost precalculated in the end_shift field for the associated string. The fields pos_min and pos_delta are used to store the minimum offset - and the delta to the maximum offset at the current point in the pattern. + and the delta to the maximum offset at the current point in the pattern. */ typedef struct scan_data_t { /*I32 len_min; unused */ /*I32 len_delta; unused */ - I32 pos_min; - I32 pos_delta; + SSize_t pos_min; + SSize_t pos_delta; SV *last_found; - I32 last_end; /* min value, <0 unless valid. */ - I32 last_start_min; - I32 last_start_max; + SSize_t last_end; /* min value, <0 unless valid. */ + SSize_t last_start_min; + SSize_t last_start_max; SV **longest; /* Either &l_fixed, or &l_float. */ SV *longest_fixed; /* longest fixed string found in pattern */ - I32 offset_fixed; /* offset where it starts */ - I32 *minlen_fixed; /* pointer to the minlen relevant to the string */ + SSize_t offset_fixed; /* offset where it starts */ + SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ I32 lookbehind_fixed; /* is the position of the string modfied by LB */ SV *longest_float; /* longest floating string found in pattern */ - I32 offset_float_min; /* earliest point in string it can appear */ - I32 offset_float_max; /* latest point in string it can appear */ - I32 *minlen_float; /* pointer to the minlen relevant to the string */ - I32 lookbehind_float; /* is the position of the string modified by LB */ + SSize_t offset_float_min; /* earliest point in string it can appear */ + SSize_t offset_float_max; /* latest point in string it can appear */ + SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ + SSize_t lookbehind_float; /* is the pos of the string modified by LB */ I32 flags; I32 whilem_c; - I32 *last_closep; - struct regnode_charclass_class *start_class; + SSize_t *last_closep; + regnode_ssc *start_class; } scan_data_t; +/* The below is perhaps overboard, but this allows us to save a test at the + * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' + * and 'a' differ by a single bit; the same with the upper and lower case of + * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; + * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and + * then inverts it to form a mask, with just a single 0, in the bit position + * where the upper- and lowercase differ. XXX There are about 40 other + * instances in the Perl core where this micro-optimization could be used. + * Should decide if maintenance cost is worse, before changing those + * + * Returns a boolean as to whether or not 'v' is either a lowercase or + * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a + * compile-time constant, the generated code is better than some optimizing + * compilers figure out, amounting to a mask and test. The results are + * meaningless if 'c' is not one of [A-Za-z] */ +#define isARG2_lower_or_UPPER_ARG1(c, v) \ + (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) + /* * Forward declarations for pregcomp()'s friends. */ @@ -380,13 +406,8 @@ static const scan_data_t zero_scan_data = #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) -#ifdef NO_UNARY_PLUS -# define SF_FIX_SHIFT_EOL (0+2) -# define SF_FL_SHIFT_EOL (0+4) -#else -# define SF_FIX_SHIFT_EOL (+2) -# define SF_FL_SHIFT_EOL (+4) -#endif +#define SF_FIX_SHIFT_EOL (+2) +#define SF_FL_SHIFT_EOL (+4) #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) @@ -404,22 +425,32 @@ static const scan_data_t zero_scan_data = #define SCF_WHILEM_VISITED_POS 0x2000 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ -#define SCF_SEEN_ACCEPT 0x8000 +#define SCF_SEEN_ACCEPT 0x8000 #define SCF_TRIE_DOING_RESTUDY 0x10000 #define UTF cBOOL(RExC_utf8) /* The enums for all these are ordered so things work out correctly */ #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) -#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ + == REGEX_DEPENDS_CHARSET) #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) -#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) -#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) -#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) -#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ + >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) +/* For programs that want to be strictly Unicode compatible by dying if any + * attempt is made to match a non-Unicode code point against a Unicode + * property. */ +#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) + #define OOB_NAMEDCLASS -1 /* There is no code point that is out-of-bounds, so this is problematic. But @@ -442,7 +473,12 @@ static const scan_data_t zero_scan_data = #define MARKER1 "<-- HERE" /* marker as it appears in the description */ #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ -#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" +#define REPORT_LOCATION " in regex; marked by " MARKER1 \ + " in m/%"UTF8f MARKER2 "%"UTF8f"/" + +#define REPORT_LOCATION_ARGS(offset) \ + UTF8fARG(UTF, offset, RExC_precomp), \ + UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) /* * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given @@ -464,12 +500,12 @@ static const scan_data_t zero_scan_data = } STMT_END #define FAIL(msg) _FAIL( \ - Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ - msg, (int)len, RExC_precomp, ellipses)) + Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ + msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) #define FAIL2(msg,arg) _FAIL( \ - Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ - arg, (int)len, RExC_precomp, ellipses)) + Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ + arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) /* * Simple_vFAIL -- like FAIL, but marks the current location in the scan @@ -477,7 +513,7 @@ static const scan_data_t zero_scan_data = #define Simple_vFAIL(m) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -494,8 +530,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL2(m,a1) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -513,8 +549,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL3(m, a1, a2) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -531,8 +567,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vFAIL4(m,a1,a2,a3) STMT_START { \ @@ -541,80 +577,90 @@ static const scan_data_t zero_scan_data = Simple_vFAIL4(m, a1, a2, a3); \ } STMT_END +/* A specialized version of vFAIL2 that works with UTF8f */ +#define vFAIL2utf8f(m, a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + /* m is not necessarily a "literal string", in this macro */ #define reg_warn_non_literal_string(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNreg(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN_dep(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNregdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN2reg_d(loc,m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ m REPORT_LOCATION, \ - a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN2reg(loc, m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN3(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN3reg(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ } STMT_END @@ -623,7 +669,7 @@ static const scan_data_t zero_scan_data = if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ } STMT_END -/* Macros for recording node offsets. 20001227 mjd@plover.com +/* Macros for recording node offsets. 20001227 mjd@plover.com * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in * element 2*n-1 of the array. Element #2n holds the byte length node #n. * Element 0 holds the number n. @@ -636,8 +682,8 @@ static const scan_data_t zero_scan_data = #define Set_Node_Length_To_R(node,len) #define Set_Node_Length(node,len) #define Set_Node_Cur_Length(node,start) -#define Node_Offset(n) -#define Node_Length(n) +#define Node_Offset(n) +#define Node_Length(n) #define Set_Node_Offset_Length(node,offset,len) #define ProgLen(ri) ri->u.proglen #define SetProgLen(ri,x) ri->u.proglen = x @@ -649,7 +695,8 @@ static const scan_data_t zero_scan_data = MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ __LINE__, (int)(node), (int)(byte))); \ if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", \ + (int)(node)); \ } else { \ RExC_offsets[2*(node)-1] = (byte); \ } \ @@ -665,7 +712,8 @@ static const scan_data_t zero_scan_data = MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ __LINE__, (int)(node), (int)(len))); \ if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \ + Perl_croak(aTHX_ "value of node is %d in Length macro", \ + (int)(node)); \ } else { \ RExC_offsets[2*(node)] = (len); \ } \ @@ -691,6 +739,49 @@ static const scan_data_t zero_scan_data = #define EXPERIMENTAL_INPLACESCAN #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ +#define DEBUG_RExC_seen() \ + DEBUG_OPTIMISE_MORE_r({ \ + PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ + \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ + \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ + \ + if (RExC_seen & REG_GPOS_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ + \ + if (RExC_seen & REG_CANY_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ + \ + if (RExC_seen & REG_RECURSE_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ + \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ + \ + if (RExC_seen & REG_VERBARG_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ + \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ + \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ + \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ + \ + if (RExC_seen & REG_GOSTART_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ + PerlIO_printf(Perl_debug_log,"\n"); \ + }); + #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ PerlIO_printf(Perl_debug_log, \ @@ -730,7 +821,8 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ floating substrings if needed. */ STATIC void -S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf) +S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, + SSize_t *minlenp, int is_inf) { const STRLEN l = CHR_SVLEN(data->last_found); const STRLEN old_l = CHR_SVLEN(*data->longest); @@ -754,9 +846,12 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min data->offset_float_min = l ? data->last_start_min : data->pos_min; data->offset_float_max = (l ? data->last_start_max - : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta)); - if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX) - data->offset_float_max = I32_MAX; + : (data->pos_delta == SSize_t_MAX + ? SSize_t_MAX + : data->pos_min + data->pos_delta)); + if (is_inf + || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) + data->offset_float_max = SSize_t_MAX; if (data->flags & SF_BEFORE_EOL) data->flags |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); @@ -780,299 +875,592 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min DEBUG_STUDYDATA("commit: ",data,0); } -/* These macros set, clear and test whether the synthetic start class ('ssc', - * given by the parameter) matches an empty string (EOS). This uses the - * 'next_off' field in the node, to save a bit in the flags field. The ssc - * stands alone, so there is never a next_off, so this field is otherwise - * unused. The EOS information is used only for compilation, but theoretically - * it could be passed on to the execution code. This could be used to store - * more than one bit of information, but only this one is currently used. */ -#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END -#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END -#define TEST_SSC_EOS(node) cBOOL((node)->next_off) - -/* Can match anything (initialization) */ +/* An SSC is just a regnode_charclass_posix with an extra field: the inversion + * list that describes which code points it matches */ + +STATIC void +S_ssc_anything(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to match an empty string or any code point */ + + PERL_ARGS_ASSERT_SSC_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ + _append_range_to_invlist(ssc->invlist, 0, UV_MAX); + ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ +} + +STATIC int +S_ssc_is_anything(pTHX_ const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' can match the empty string and any code + * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys + * us anything: if the function returns TRUE, 'ssc' hasn't been restricted + * in any way, so there's no point in using it */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { + return FALSE; + } + + /* See if the list consists solely of the range 0 - Infinity */ + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (ret) { + return TRUE; + } + + /* If e.g., both \w and \W are set, matches everything */ + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { + return TRUE; + } + } + } + + return FALSE; +} + STATIC void -S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) { - PERL_ARGS_ASSERT_CL_ANYTHING; + /* Initializes the SSC 'ssc'. This includes setting it to match an empty + * string, any code point, or any posix class under locale */ + + PERL_ARGS_ASSERT_SSC_INIT; - ANYOF_BITMAP_SETALL(cl); - cl->flags = ANYOF_UNICODE_ALL; - SET_SSC_EOS(cl); + Zero(ssc, 1, regnode_ssc); + set_ANYOF_SYNTHETIC(ssc); + ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); + ssc_anything(ssc); /* If any portion of the regex is to operate under locale rules, * initialization includes it. The reason this isn't done for all regexes * is that the optimizer was written under the assumption that locale was * all-or-nothing. Given the complexity and lack of documentation in the - * optimizer, and that there are inadequate test cases for locale, so many + * optimizer, and that there are inadequate test cases for locale, many * parts of it may not work properly, it is safest to avoid locale unless * necessary. */ if (RExC_contains_locale) { - ANYOF_CLASS_SETALL(cl); /* /l uses class */ - cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD; + ANYOF_POSIXL_SETALL(ssc); } else { - ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */ + ANYOF_POSIXL_ZERO(ssc); } } -/* Can match anything (initialization) */ STATIC int -S_cl_is_anything(const struct regnode_charclass_class *cl) +S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, + const regnode_ssc *ssc) { - int value; + /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only + * to the list of code points matched, and locale posix classes; hence does + * not check its flags) */ - PERL_ARGS_ASSERT_CL_IS_ANYTHING; + UV start, end; + bool ret; - for (value = 0; value < ANYOF_MAX; value += 2) - if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) - return 1; - if (!(cl->flags & ANYOF_UNICODE_ALL)) - return 0; - if (!ANYOF_BITMAP_TESTALLSET((const void*)cl)) - return 0; - return 1; + PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (! ret) { + return FALSE; + } + + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { + return FALSE; + } + + return TRUE; } -/* Can match anything (initialization) */ -STATIC void -S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +STATIC SV* +S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, + const regnode_charclass* const node) { - PERL_ARGS_ASSERT_CL_INIT; + /* Returns a mortal inversion list defining which code points are matched + * by 'node', which is of type ANYOF. Handles complementing the result if + * appropriate. If some code points aren't knowable at this time, the + * returned list must, and will, contain every code point that is a + * possibility. */ + + SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; + unsigned int i; + const U32 n = ARG(node); + bool new_node_has_latin1 = FALSE; + + PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; + + /* Look at the data structure created by S_set_ANYOF_arg() */ + if (n != ANYOF_NONBITMAP_EMPTY) { + SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + assert(RExC_rxi->data->what[n] == 's'); + + if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ + invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]))); + } + else if (ary[0] && ary[0] != &PL_sv_undef) { + + /* Here, no compile-time swash, and there are things that won't be + * known until runtime -- we have to assume it could be anything */ + return _add_range_to_invlist(invlist, 0, UV_MAX); + } + else if (ary[3] && ary[3] != &PL_sv_undef) { + + /* Here no compile-time swash, and no run-time only data. Use the + * node's inversion list */ + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; + } + } + + /* An ANYOF node contains a bitmap for the first 256 code points, and an + * inversion list for the others, but if there are code points that should + * match only conditionally on the target string being UTF-8, those are + * placed in the inversion list, and not the bitmap. Since there are + * circumstances under which they could match, they are included in the + * SSC. But if the ANYOF node is to be inverted, we have to exclude them + * here, so that when we invert below, the end result actually does include + * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here + * before we add the unconditionally matched code points */ + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_intersection_complement_2nd(invlist, + PL_UpperLatin1, + &invlist); + } + + /* Add in the points from the bit map */ + for (i = 0; i < 256; i++) { + if (ANYOF_BITMAP_TEST(node, i)) { + invlist = add_cp_to_invlist(invlist, i); + new_node_has_latin1 = TRUE; + } + } + + /* If this can match all upper Latin1 code points, have to add them + * as well */ + if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + _invlist_union(invlist, PL_UpperLatin1, &invlist); + } + + /* Similarly for these */ + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + } - Zero(cl, 1, struct regnode_charclass_class); - cl->type = ANYOF; - cl_anything(pRExC_state, cl); - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_invert(invlist); + } + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + + /* Under /li, any 0-255 could fold to any other 0-255, depending on the + * locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { + _invlist_union_maybe_complement_2nd(invlist, + only_utf8_locale_invlist, + ANYOF_FLAGS(node) & ANYOF_INVERT, + &invlist); + } + + return invlist; } /* These two functions currently do the exact same thing */ -#define cl_init_zero S_cl_init +#define ssc_init_zero ssc_init + +#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) +#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) + +/* 'AND' a given class with another one. Can create false positives. 'ssc' + * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ -/* 'AND' a given class with another one. Can create false positives. 'cl' - * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if - * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void -S_cl_and(struct regnode_charclass_class *cl, - const struct regnode_charclass_class *and_with) +S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *and_with) { - PERL_ARGS_ASSERT_CL_AND; + /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either + * another SSC or a regular ANYOF class. Can create false positives. */ + + SV* anded_cp_list; + U8 anded_flags; + + PERL_ARGS_ASSERT_SSC_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(and_with)) { + anded_cp_list = ((regnode_ssc *)and_with)->invlist; + anded_flags = ANYOF_FLAGS(and_with); + + /* XXX This is a kludge around what appears to be deficiencies in the + * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, + * there are paths through the optimizer where it doesn't get weeded + * out when it should. And if we don't make some extra provision for + * it like the code just below, it doesn't get added when it should. + * This solution is to add it only when AND'ing, which is here, and + * only when what is being AND'ed is the pristine, original node + * matching anything. Thus it is like adding it to ssc_anything() but + * only when the result is to be AND'ed. Probably the same solution + * could be adopted for the same problem we have with /l matching, + * which is solved differently in S_ssc_init(), and that would lead to + * fewer false positives than that solution has. But if this solution + * creates bugs, the consequences are only that a warning isn't raised + * that should be; while the consequences for having /l bugs is + * incorrect matches */ + if (ssc_is_anything((regnode_ssc *)and_with)) { + anded_flags |= ANYOF_WARN_SUPER; + } + } + else { + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } - assert(PL_regkind[and_with->type] == ANYOF); + ANYOF_FLAGS(ssc) &= anded_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'and-with'; P2, its posix classes. + * 'and_with' may be inverted. When not inverted, we have the situation of + * computing: + * (C1 | P1) & (C2 | P2) + * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) + * <= ((C1 & C2) | P1 | P2) + * Alternatively, the last few steps could be: + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) + * <= (C1 | C2 | (P1 & P2)) + * We favor the second approach if either P1 or P2 is non-empty. This is + * because these components are a barrier to doing optimizations, as what + * they match cannot be known until the moment of matching as they are + * dependent on the current locale, 'AND"ing them likely will reduce or + * eliminate them. + * But we can do better if we know that C1,P1 are in their initial state (a + * frequent occurrence), each matching everything: + * () & (C2 | P2) = C2 | P2 + * Similarly, if C2,P2 are in their initial state (again a frequent + * occurrence), the result is a no-op + * (C1 | P1) & () = C1 | P1 + * + * Inverted, we have + * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) + * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) + * <= (C1 & ~C2) | (P1 & ~P2) + * */ - /* I (khw) am not sure all these restrictions are necessary XXX */ - if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) - && !(ANYOF_CLASS_TEST_ANY_SET(cl)) - && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(and_with->flags & ANYOF_LOC_FOLD) - && !(cl->flags & ANYOF_LOC_FOLD)) { - int i; + if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(and_with)) + { + unsigned int i; - if (and_with->flags & ANYOF_INVERT) - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] &= ~and_with->bitmap[i]; - else - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] &= and_with->bitmap[i]; - } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ - - if (and_with->flags & ANYOF_INVERT) { - - /* Here, the and'ed node is inverted. Get the AND of the flags that - * aren't affected by the inversion. Those that are affected are - * handled individually below */ - U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS; - cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS); - cl->flags |= affected_flags; - - /* We currently don't know how to deal with things that aren't in the - * bitmap, but we know that the intersection is no greater than what - * is already in cl, so let there be false positives that get sorted - * out after the synthetic start class succeeds, and the node is - * matched for real. */ - - /* The inversion of these two flags indicate that the resulting - * intersection doesn't have them */ - if (and_with->flags & ANYOF_UNICODE_ALL) { - cl->flags &= ~ANYOF_UNICODE_ALL; - } - if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) { - cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; - } - } - else { /* and'd node is not inverted */ - U8 outside_bitmap_but_not_utf8; /* Temp variable */ - - if (! ANYOF_NONBITMAP(and_with)) { - - /* Here 'and_with' doesn't match anything outside the bitmap - * (except possibly ANYOF_UNICODE_ALL), which means the - * intersection can't either, except for ANYOF_UNICODE_ALL, in - * which case we don't know what the intersection is, but it's no - * greater than what cl already has, so can just leave it alone, - * with possible false positives */ - if (! (and_with->flags & ANYOF_UNICODE_ALL)) { - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); - cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8; - } - } - else if (! ANYOF_NONBITMAP(cl)) { - - /* Here, 'and_with' does match something outside the bitmap, and cl - * doesn't have a list of things to match outside the bitmap. If - * cl can match all code points above 255, the intersection will - * be those above-255 code points that 'and_with' matches. If cl - * can't match all Unicode code points, it means that it can't - * match anything outside the bitmap (since the 'if' that got us - * into this block tested for that), so we leave the bitmap empty. - */ - if (cl->flags & ANYOF_UNICODE_ALL) { - ARG_SET(cl, ARG(and_with)); + ssc_intersection(ssc, + anded_cp_list, + FALSE /* Has already been inverted */ + ); - /* and_with's ARG may match things that don't require UTF8. - * And now cl's will too, in spite of this being an 'and'. See - * the comments below about the kludge */ - cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8; - } - } - else { - /* Here, both 'and_with' and cl match something outside the - * bitmap. Currently we do not do the intersection, so just match - * whatever cl had at the beginning. */ - } - - - /* Take the intersection of the two sets of flags. However, the - * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a - * kludge around the fact that this flag is not treated like the others - * which are initialized in cl_anything(). The way the optimizer works - * is that the synthetic start class (SSC) is initialized to match - * anything, and then the first time a real node is encountered, its - * values are AND'd with the SSC's with the result being the values of - * the real node. However, there are paths through the optimizer where - * the AND never gets called, so those initialized bits are set - * inappropriately, which is not usually a big deal, as they just cause - * false positives in the SSC, which will just mean a probably - * imperceptible slow down in execution. However this bit has a - * higher false positive consequence in that it can cause utf8.pm, - * utf8_heavy.pl ... to be loaded when not necessary, which is a much - * bigger slowdown and also causes significant extra memory to be used. - * In order to prevent this, the code now takes a different tack. The - * bit isn't set unless some part of the regular expression needs it, - * but once set it won't get cleared. This means that these extra - * modules won't get loaded unless there was some path through the - * pattern that would have required them anyway, and so any false - * positives that occur by not ANDing them out when they could be - * aren't as severe as they would be if we treated this bit like all - * the others */ - outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags) - & ANYOF_NONBITMAP_NON_UTF8; - cl->flags &= and_with->flags; - cl->flags |= outside_bitmap_but_not_utf8; + /* If either P1 or P2 is empty, the intersection will be also; can skip + * the loop */ + if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { + ANYOF_POSIXL_ZERO(ssc); + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + + /* Note that the Posix class component P from 'and_with' actually + * looks like: + * P = Pa | Pb | ... | Pn + * where each component is one posix class, such as in [\w\s]. + * Thus + * ~P = ~(Pa | Pb | ... | Pn) + * = ~Pa & ~Pb & ... & ~Pn + * <= ~Pa | ~Pb | ... | ~Pn + * The last is something we can easily calculate, but unfortunately + * is likely to have many false positives. We could do better + * in some (but certainly not all) instances if two classes in + * P have known relationships. For example + * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: + * So + * :lower: & :print: = :lower: + * And similarly for classes that must be disjoint. For example, + * since \s and \w can have no elements in common based on rules in + * the POSIX standard, + * \w & ^\S = nothing + * Unfortunately, some vendor locales do not meet the Posix + * standard, in particular almost everything by Microsoft. + * The loop below just changes e.g., \w into \W and vice versa */ + + regnode_charclass_posixl temp; + int add = 1; /* To calculate the index of the complement */ + + ANYOF_POSIXL_ZERO(&temp); + for (i = 0; i < ANYOF_MAX; i++) { + assert(i % 2 != 0 + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); + + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { + ANYOF_POSIXL_SET(&temp, i + add); + } + add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ + } + ANYOF_POSIXL_AND(&temp, ssc); + + } /* else ssc already has no posixes */ + } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC + in its initial state */ + else if (! is_ANYOF_SYNTHETIC(and_with) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) + { + /* But if 'ssc' is in its initial state, the result is just 'and_with'; + * copy it over 'ssc' */ + if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { + if (is_ANYOF_SYNTHETIC(and_with)) { + StructCopy(and_with, ssc, regnode_ssc); + } + else { + ssc->invlist = anded_cp_list; + ANYOF_POSIXL_ZERO(ssc); + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); + } + } + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + { + /* One or the other of P1, P2 is non-empty. */ + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } + ssc_union(ssc, anded_cp_list, FALSE); + } + else { /* P1 = P2 = empty */ + ssc_intersection(ssc, anded_cp_list, FALSE); + } } } -/* 'OR' a given class with another one. Can create false positives. 'cl' - * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if - * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void -S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) +S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *or_with) { - PERL_ARGS_ASSERT_CL_OR; - - if (or_with->flags & ANYOF_INVERT) { - - /* Here, the or'd node is to be inverted. This means we take the - * complement of everything not in the bitmap, but currently we don't - * know what that is, so give up and match anything */ - if (ANYOF_NONBITMAP(or_with)) { - cl_anything(pRExC_state, cl); - } - /* We do not use - * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) - * <= (B1 | !B2) | (CL1 | !CL2) - * which is wasteful if CL2 is small, but we ignore CL2: - * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 - * XXXX Can we handle case-fold? Unclear: - * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = - * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) - */ - else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(or_with->flags & ANYOF_LOC_FOLD) - && !(cl->flags & ANYOF_LOC_FOLD) ) { - int i; + /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either + * another SSC or a regular ANYOF class. Can create false positives if + * 'or_with' is to be inverted. */ - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] |= ~or_with->bitmap[i]; - } /* XXXX: logic is complicated otherwise */ - else { - cl_anything(pRExC_state, cl); - } + SV* ored_cp_list; + U8 ored_flags; - /* And, we can just take the union of the flags that aren't affected - * by the inversion */ - cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS; + PERL_ARGS_ASSERT_SSC_OR; - /* For the remaining flags: - ANYOF_UNICODE_ALL and inverted means to not match anything above - 255, which means that the union with cl should just be - what cl has in it, so can ignore this flag - ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord - is 127-255 to match them, but then invert that, so the - union with cl should just be what cl has in it, so can - ignore this flag - */ - } else { /* 'or_with' is not inverted */ - /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ - if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && (!(or_with->flags & ANYOF_LOC_FOLD) - || (cl->flags & ANYOF_LOC_FOLD)) ) { - int i; + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(or_with)) { + ored_cp_list = ((regnode_ssc*) or_with)->invlist; + ored_flags = ANYOF_FLAGS(or_with); + } + else { + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); + ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + } + + ANYOF_FLAGS(ssc) |= ored_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'or-with'; P2, its posix classes. + * 'or_with' may be inverted. When not inverted, we have the simple + * situation of computing: + * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) + * If P1|P2 yields a situation with both a class and its complement are + * set, like having both \w and \W, this matches all code points, and we + * can delete these from the P component of the ssc going forward. XXX We + * might be able to delete all the P components, but I (khw) am not certain + * about this, and it is better to be safe. + * + * Inverted, we have + * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) + * <= (C1 | P1) | ~C2 + * <= (C1 | ~C2) | P1 + * (which results in actually simpler code than the non-inverted case) + * */ - /* OR char bitmap and class bitmap separately */ - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] |= or_with->bitmap[i]; - if (or_with->flags & ANYOF_CLASS) { - ANYOF_CLASS_OR(or_with, cl); + if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(or_with)) + { + /* We ignore P2, leaving P1 going forward */ + } /* else Not inverted */ + else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + unsigned int i; + for (i = 0; i < ANYOF_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) + { + ssc_match_all_cp(ssc); + ANYOF_POSIXL_CLEAR(ssc, i); + ANYOF_POSIXL_CLEAR(ssc, i+1); + } } - } - else { /* XXXX: logic is complicated, leave it along for a moment. */ - cl_anything(pRExC_state, cl); - } + } + } - if (ANYOF_NONBITMAP(or_with)) { + ssc_union(ssc, + ored_cp_list, + FALSE /* Already has been inverted */ + ); +} - /* Use the added node's outside-the-bit-map match if there isn't a - * conflict. If there is a conflict (both nodes match something - * outside the bitmap, but what they match outside is not the same - * pointer, and hence not easily compared until XXX we extend - * inversion lists this far), give up and allow the start class to - * match everything outside the bitmap. If that stuff is all above - * 255, can just set UNICODE_ALL, otherwise caould be anything. */ - if (! ANYOF_NONBITMAP(cl)) { - ARG_SET(cl, ARG(or_with)); - } - else if (ARG(cl) != ARG(or_with)) { +PERL_STATIC_INLINE void +S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_UNION; - if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) { - cl_anything(pRExC_state, cl); - } - else { - cl->flags |= ANYOF_UNICODE_ALL; - } - } - } + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_union_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_intersection(pTHX_ regnode_ssc *ssc, + SV* const invlist, + const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_INTERSECTION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_intersection_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) +{ + PERL_ARGS_ASSERT_SSC_ADD_RANGE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); +} + +PERL_STATIC_INLINE void +S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) +{ + /* AND just the single code point 'cp' into the SSC 'ssc' */ + + SV* cp_list = _new_invlist(2); + + PERL_ARGS_ASSERT_SSC_CP_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + cp_list = add_cp_to_invlist(cp_list, cp); + ssc_intersection(ssc, cp_list, + FALSE /* Not inverted */ + ); + SvREFCNT_dec_NN(cp_list); +} + +PERL_STATIC_INLINE void +S_ssc_clear_locale(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to not match any locale things */ + + PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ANYOF_POSIXL_ZERO(ssc); + ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; +} + +STATIC void +S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* The inversion list in the SSC is marked mortal; now we need a more + * permanent copy, which is stored the same way that is done in a regular + * ANYOF node, with the first 256 code points in a bit map */ + + SV* invlist = invlist_clone(ssc->invlist); + + PERL_ARGS_ASSERT_SSC_FINALIZE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* The code in this file assumes that all but these flags aren't relevant + * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the + * time we reach here */ + assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + + populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); + + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); + + /* Make sure is clone-safe */ + ssc->invlist = NULL; - /* Take the union */ - cl->flags |= or_with->flags; + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; } + + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); } #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) -#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 ) +#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ + ? (TRIE_LIST_CUR( idx ) - 1) \ + : 0 ) #ifdef DEBUGGING @@ -1115,13 +1503,13 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, for( state = 0 ; state < trie->uniquecharcount ; state++ ) { SV ** const tmp = av_fetch( revcharmap, state, 0); if ( tmp ) { - PerlIO_printf( Perl_debug_log, "%*s", + PerlIO_printf( Perl_debug_log, "%*s", colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, PL_colors[0], PL_colors[1], (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR - ) + PERL_PV_ESCAPE_FIRSTCHAR + ) ); } } @@ -1135,10 +1523,12 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, for( state = 1 ; state < trie->statecount ; state++ ) { const U32 base = trie->states[ state ].trans.base; - PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); + PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", + (int)depth * 2 + 2,"", (UV)state); if ( trie->states[ state ].wordnum ) { - PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum ); + PerlIO_printf( Perl_debug_log, " W%4X", + trie->states[ state ].wordnum ); } else { PerlIO_printf( Perl_debug_log, "%6s", "" ); } @@ -1150,19 +1540,23 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, while( ( base + ofs < trie->uniquecharcount ) || ( base + ofs - trie->uniquecharcount < trie->lasttrans - && trie->trans[ base + ofs - trie->uniquecharcount ].check != state)) + && trie->trans[ base + ofs - trie->uniquecharcount ].check + != state)) ofs++; PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { - if ( ( base + ofs >= trie->uniquecharcount ) && - ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && - trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + if ( ( base + ofs >= trie->uniquecharcount ) + && ( base + ofs - trie->uniquecharcount + < trie->lasttrans ) + && trie->trans[ base + ofs + - trie->uniquecharcount ].check == state ) { PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, - (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next ); + (UV)trie->trans[ base + ofs + - trie->uniquecharcount ].next ); } else { PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); } @@ -1173,17 +1567,18 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, } PerlIO_printf( Perl_debug_log, "\n" ); } - PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, ""); + PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", + (int)depth*2, ""); for (word=1; word <= trie->wordcount; word++) { PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", (int)word, (int)(trie->wordinfo[word].prev), (int)(trie->wordinfo[word].len)); } PerlIO_printf(Perl_debug_log, "\n" ); -} +} /* Dumps a fully constructed but uncompressed trie in list form. - List tries normally only are used for construction when the number of + List tries normally only are used for construction when the number of possible chars (trie->uniquecharcount) is very high. Used for debugging make_trie(). */ @@ -1203,10 +1598,10 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", "------:-----+-----------------\n" ); - + for( state=1 ; state < next_alloc ; state ++ ) { U16 charid; - + PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", (int)depth * 2 + 2,"", (UV)state ); if ( ! trie->states[ state ].wordnum ) { @@ -1217,31 +1612,33 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, ); } for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { - SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); + SV ** const tmp = av_fetch( revcharmap, + TRIE_LIST_ITEM(state,charid).forid, 0); if ( tmp ) { PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, - PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), + colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + | PERL_PV_ESCAPE_FIRSTCHAR ) , TRIE_LIST_ITEM(state,charid).forid, (UV)TRIE_LIST_ITEM(state,charid).newstate ); - if (!(charid % 10)) + if (!(charid % 10)) PerlIO_printf(Perl_debug_log, "\n%*s| ", (int)((depth * 2) + 14), ""); } } PerlIO_printf( Perl_debug_log, "\n"); } -} +} /* Dumps a fully constructed but uncompressed trie in table form. - This is the normal DFA style state transition table, with a few - twists to facilitate compression later. + This is the normal DFA style state transition table, with a few + twists to facilitate compression later. Used for debugging make_trie(). */ STATIC void @@ -1256,24 +1653,24 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; - + /* print out the table precompression so that we can do a visual check that they are identical. */ - + PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { SV ** const tmp = av_fetch( revcharmap, charid, 0); if ( tmp ) { - PerlIO_printf( Perl_debug_log, "%*s", + PerlIO_printf( Perl_debug_log, "%*s", colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, PL_colors[0], PL_colors[1], (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR - ) + PERL_PV_ESCAPE_FIRSTCHAR + ) ); } } @@ -1288,7 +1685,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { - PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", + PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", (int)depth * 2 + 2,"", (UV)TRIE_NODENUM( state ) ); @@ -1300,9 +1697,11 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); } if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { - PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); + PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", + (UV)trie->trans[ state ].check ); } else { - PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, + PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", + (UV)trie->trans[ state ].check, trie->states[ TRIE_NODENUM( state ) ].wordnum ); } } @@ -1421,7 +1820,7 @@ and would end up looking like: 8: EXACT (10) 10: END(0) - d = uvuni_to_utf8_flags(d, uv, 0); + d = uvchr_to_utf8_flags(d, uv, 0); is the recommended Unicode-aware way of saying @@ -1433,7 +1832,7 @@ is the recommended Unicode-aware way of saying if (UTF) { \ SV *zlopp = newSV(7); /* XXX: optimize me */ \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \ + unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ SvPOK_on(zlopp); \ SvUTF8_on(zlopp); \ @@ -1444,31 +1843,28 @@ is the recommended Unicode-aware way of saying } \ } STMT_END -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - /* if it is UTF then it is either already folded, or does not need folding */ \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \ - } \ - else if (folder == PL_fold_latin1) { \ - /* if we use this folder we have to obey unicode rules on latin-1 data */ \ - if ( foldlen > 0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - scan += len; \ - len = 0; \ - } else { \ - len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ - skiplen = UNISKIP(uvc); \ - foldlen -= skiplen; \ - scan = foldbuf + skiplen; \ - } \ - } else { \ - /* raw data, will be folded later if needed */ \ - uvc = (U32)*uc; \ - len = 1; \ - } \ +/* This gets the next character from the input, folding it if not already + * folded. */ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need \ + * folding */ \ + uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* This folder implies Unicode rules, which in the range expressible \ + * by not UTF is the lower case, with the two exceptions, one of \ + * which should have been taken care of before calling this */ \ + assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ + uvc = toLOWER_L1(*uc); \ + if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ + len = 1; \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ } STMT_END @@ -1511,7 +1907,8 @@ is the recommended Unicode-aware way of saying \ if ( noper_next < tail ) { \ if (!trie->jump) \ - trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ trie->jump[curword] = (U16)(noper_next - convert); \ if (!jumper) \ jumper = noper_next; \ @@ -1546,7 +1943,9 @@ is the recommended Unicode-aware way of saying #define MADE_EXACT_TRIE 4 STATIC I32 -S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) +S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + regnode *first, regnode *last, regnode *tail, + U32 word_count, U32 flags, U32 depth) { dVAR; /* first pass, loop through and scan words */ @@ -1554,7 +1953,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs HV *widecharmap = NULL; AV *revcharmap = newAV(); regnode *cur; - const U32 uniflags = UTF8_ALLOW_DEFAULT; STRLEN len = 0; UV uvc = 0; U16 curword = 0; @@ -1567,13 +1965,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 * folder = NULL; #ifdef DEBUGGING - const U32 data_slot = add_data( pRExC_state, 4, "tuuu" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu")); AV *trie_words = NULL; /* along with revcharmap, this only used during construction but both are * useful during debugging so we store them in the struct when debugging. */ #else - const U32 data_slot = add_data( pRExC_state, 2, "tu" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); STRLEN trie_charcount=0; #endif SV *re_trie_maxbuff; @@ -1588,10 +1986,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs case EXACT: break; case EXACTFA: case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; - case EXACTFL: folder = PL_fold_locale; break; default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); } @@ -1615,14 +2011,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); } DEBUG_TRIE_COMPILE_r({ - PerlIO_printf( Perl_debug_log, - "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", - (int)depth * 2 + 2, "", - REG_NODE_NUM(startbranch),REG_NODE_NUM(first), - REG_NODE_NUM(last), REG_NODE_NUM(tail), - (int)depth); + PerlIO_printf( Perl_debug_log, + "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + (int)depth * 2 + 2, "", + REG_NODE_NUM(startbranch),REG_NODE_NUM(first), + REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); }); - + /* Find the node we are going to overwrite */ if ( first == startbranch && OP( last ) != BRANCH ) { /* whole branch chain */ @@ -1631,7 +2026,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* branch sub-chain */ convert = NEXTOPER( first ); } - + /* -- First loop and Setup -- We first traverse the branches and scan each word to determine if it @@ -1640,9 +2035,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs have unique chars. We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store Unicode characters. We use the - native representation of the character value as the key and IV's for the - coded index. + (trie->charmap) and we use a an HV* to store Unicode characters. We use + the native representation of the character value as the key and IV's for + the coded index. *TODO* If we keep track of how many times each character is used we can remap the columns so that the table compression later on is more @@ -1659,13 +2054,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regnode *noper = NEXTOPER( cur ); const U8 *uc = (U8*)STRING( noper ); const U8 *e = uc + STR_LEN( noper ); - STRLEN foldlen = 0; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - STRLEN skiplen = 0; - const U8 *scan = (U8*)NULL; + int foldlen = 0; U32 wordlen = 0; /* required init */ - STRLEN chars = 0; - bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ + STRLEN minchars = 0; + STRLEN maxchars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the + bitmap?*/ if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -1685,13 +2079,77 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regardless of encoding */ if (OP( noper ) == EXACTFU_SS) { /* false positives are ok, so just set this */ - TRIE_BITMAP_SET(trie,0xDF); + TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); } } - for ( ; uc < e ; uc += len ) { + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; - chars++; + + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; + } + else { + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because for + * non-UTF, the latin1_safe macro is smart enough to account + * for all the unfolded characters, and because for UTF, the + * string will already have been folded earlier in the + * compilation process */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); + } + } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } + } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ if ( uvc < 256 ) { if ( folder ) { U8 folded= folder[ (U8) uvc ]; @@ -1715,13 +2173,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !UTF ) { /* store first byte of utf8 representation of variant codepoints */ - if (! UNI_IS_INVARIANT(uvc)) { + if (! UVCHR_IS_INVARIANT(uvc)) { TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); } } set_bit = 0; /* We've done our bit :-) */ } } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + SV** svpp; if ( !widecharmap ) widecharmap = newHV(); @@ -1736,30 +2202,22 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs TRIE_STORE_REVCHAR(uvc); } } - } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ if( cur == first ) { - trie->minlen = chars; - trie->maxlen = chars; - } else if (chars < trie->minlen) { - trie->minlen = chars; - } else if (chars > trie->maxlen) { - trie->maxlen = chars; + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; } - if (OP( noper ) == EXACTFU_SS) { - /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/ - if (trie->minlen > 1) - trie->minlen= 1; - } - if (OP( noper ) == EXACTFU_TRICKYFOLD) { - /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" - * - We assume that any such sequence might match a 2 byte string */ - if (trie->minlen > 2 ) - trie->minlen= 2; - } - } /* end first pass */ DEBUG_TRIE_COMPILE_r( - PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + PerlIO_printf( Perl_debug_log, + "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", (int)depth * 2 + 2,"", ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, @@ -1791,7 +2249,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); prev_states[1] = 0; - if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) { + if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) + > SvIV(re_trie_maxbuff) ) + { /* Second Pass -- Array Of Lists Representation @@ -1806,7 +2266,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STRLEN transcount = 1; - DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, "%*sCompiling trie using list compiler\n", (int)depth * 2 + 2, "")); @@ -1823,11 +2283,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ U16 charid = 0; /* sanity init */ - U8 *scan = (U8*)NULL; /* sanity init */ - STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - STRLEN skiplen = 0; if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -1846,14 +2302,18 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( uvc < 256 ) { charid = trie->charmap[ uvc ]; } else { - SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + SV** const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); if ( !svpp ) { charid = 0; } else { charid=(U16)SvIV( *svpp ); } } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ if ( charid ) { U16 check; @@ -1863,8 +2323,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !trie->states[ state ].trans.list ) { TRIE_LIST_NEW( state ); } - for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { - if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { + for ( check = 1; + check <= TRIE_LIST_USED( state ); + check++ ) + { + if ( TRIE_LIST_ITEM( state, check ).forid + == charid ) + { newstate = TRIE_LIST_ITEM( state, check ).newstate; break; } @@ -1886,7 +2351,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } /* end second pass */ /* next alloc is the NEXT state to be allocated */ - trie->statecount = next_alloc; + trie->statecount = next_alloc; trie->states = (reg_trie_state *) PerlMemShared_realloc( trie->states, next_alloc @@ -1934,7 +2399,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->trans, transcount * sizeof(reg_trie_trans) ); - Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans ); + Zero( trie->trans + (transcount / 2), + transcount / 2, + reg_trie_trans ); } base = trie->uniquecharcount + tp - minid; if ( maxid == minid ) { @@ -1942,22 +2409,27 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs for ( ; zp < tp ; zp++ ) { if ( ! trie->trans[ zp ].next ) { base = trie->uniquecharcount + zp - minid; - trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ zp ].next = TRIE_LIST_ITEM( state, + 1).newstate; trie->trans[ zp ].check = state; set = 1; break; } } if ( !set ) { - trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ tp ].next = TRIE_LIST_ITEM( state, + 1).newstate; trie->trans[ tp ].check = state; tp++; zp = tp; } } else { for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { - const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid; - trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate; + const U32 tid = base + - trie->uniquecharcount + + TRIE_LIST_ITEM( state, idx ).forid; + trie->trans[ tid ].next = TRIE_LIST_ITEM( state, + idx ).newstate; trie->trans[ tid ].check = state; } tp += ( maxid - minid + 1 ); @@ -1977,26 +2449,26 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* Second Pass -- Flat Table Representation. - we dont use the 0 slot of either trans[] or states[] so we add 1 to each. - We know that we will need Charcount+1 trans at most to store the data - (one row per char at worst case) So we preallocate both structures - assuming worst case. + we dont use the 0 slot of either trans[] or states[] so we add 1 to + each. We know that we will need Charcount+1 trans at most to store + the data (one row per char at worst case) So we preallocate both + structures assuming worst case. We then construct the trie using only the .next slots of the entry structs. - We use the .check field of the first entry of the node temporarily to - make compression both faster and easier by keeping track of how many non - zero fields are in the node. + We use the .check field of the first entry of the node temporarily + to make compression both faster and easier by keeping track of how + many non zero fields are in the node. Since trans are numbered from 1 any 0 pointer in the table is a FAIL transition. - There are two terms at use here: state as a TRIE_NODEIDX() which is a - number representing the first entry of the node, and state as a - TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and - TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there - are 2 entrys per node. eg: + There are two terms at use here: state as a TRIE_NODEIDX() which is + a number representing the first entry of the node, and state as a + TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) + and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) + if there are 2 entrys per node. eg: A B A B 1. 2 4 1. 3 7 @@ -2004,12 +2476,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs 3. 0 0 5. 0 0 4. 0 0 7. 0 0 - The table is internally in the right hand, idx form. However as we also - have to deal with the states array which is indexed by nodenum we have to - use TRIE_NODENUM() to convert. + The table is internally in the right hand, idx form. However as we + also have to deal with the states array which is indexed by nodenum + we have to use TRIE_NODENUM() to convert. */ - DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, "%*sCompiling trie using table compiler\n", (int)depth * 2 + 2, "")); @@ -2033,12 +2505,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U16 charid = 0; /* sanity init */ U32 accept_state = 0; /* sanity init */ - U8 *scan = (U8*)NULL; /* sanity init */ - STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ - STRLEN skiplen = 0; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -2057,7 +2525,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( uvc < 256 ) { charid = trie->charmap[ uvc ]; } else { - SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + SV* const * const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); charid = svpp ? (U16)SvIV(*svpp) : 0; } if ( charid ) { @@ -2073,7 +2544,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } else { Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ } } accept_state = TRIE_NODENUM( state ); @@ -2160,7 +2632,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U32 used = trie->trans[ stateidx ].check; trie->trans[ stateidx ].check = 0; - for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) { + for ( charid = 0; + used && charid < trie->uniquecharcount; + charid++ ) + { if ( flag || trie->trans[ stateidx + charid ].next ) { if ( trie->trans[ stateidx + charid ].next ) { if (o_used == 1) { @@ -2169,8 +2644,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs break; } } - trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ; - trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->states[ state ].trans.base + = zp + + trie->uniquecharcount + - charid ; + trie->trans[ zp ].next + = SAFE_TRIE_NODENUM( trie->trans[ stateidx + + charid ].next ); trie->trans[ zp ].check = state; if ( ++zp > pos ) pos = zp; break; @@ -2179,9 +2659,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( !flag ) { flag = 1; - trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ; + trie->states[ state ].trans.base + = pos + trie->uniquecharcount - charid ; } - trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->trans[ pos ].next + = SAFE_TRIE_NODENUM( + trie->trans[ stateidx + charid ].next ); trie->trans[ pos ].check = state; pos++; } @@ -2192,19 +2675,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->states, laststate * sizeof(reg_trie_state) ); DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf( Perl_debug_log, - "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", - (int)depth * 2 + 2,"", - (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), - (IV)next_alloc, - (IV)pos, - ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); + PerlIO_printf( Perl_debug_log, + "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + (int)depth * 2 + 2,"", + (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + + 1 ), + (IV)next_alloc, + (IV)pos, + ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); ); } /* end table compress */ } DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", + PerlIO_printf(Perl_debug_log, + "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", (int)depth * 2 + 2, "", (UV)trie->statecount, (UV)trie->lasttrans) @@ -2214,10 +2699,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->trans, trie->lasttrans * sizeof(reg_trie_trans) ); - { /* Modify the program and insert the new TRIE node */ + { /* Modify the program and insert the new TRIE node */ U8 nodetype =(U8)(flags & 0xFF); char *str=NULL; - + #ifdef DEBUGGING regnode *optimize = NULL; #ifdef RE_TRACK_PATTERN_OFFSETS @@ -2255,12 +2740,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs }); } DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", + PerlIO_printf(Perl_debug_log, + "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", (int)depth * 2 + 2, "", (UV)mjd_offset, (UV)mjd_nodelen) ); #endif - /* But first we check to see if there is a common prefix we can + /* But first we check to see if there is a common prefix we can split out as an EXACT and put in front of the TRIE node. */ trie->startstate= 1; if ( trie->bitmap && !widecharmap && !trie->jump ) { @@ -2319,11 +2805,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlIO_printf( Perl_debug_log, "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", (int)depth * 2 + 2, "", - (UV)state, (UV)idx, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, + (UV)state, (UV)idx, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, PL_colors[0], PL_colors[1], (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR + PERL_PV_ESCAPE_FIRSTCHAR ) ); }); @@ -2336,7 +2822,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs while (len--) *str++ = *ch++; } else { -#ifdef DEBUGGING +#ifdef DEBUGGING if (state>1) DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); #endif @@ -2387,17 +2873,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } } } - if (!jumper) - jumper = last; + if (!jumper) + jumper = last; if ( trie->maxlen ) { NEXT_OFF( convert ) = (U16)(tail - convert); ARG_SET( convert, data_slot ); - /* Store the offset to the first unabsorbed branch in - jump[0], which is otherwise unused by the jump logic. + /* Store the offset to the first unabsorbed branch in + jump[0], which is otherwise unused by the jump logic. We use this when dumping a trie and during optimisation. */ - if (trie->jump) + if (trie->jump) trie->jump[0] = (U16)(nextbranch - convert); - + /* If the start state is not accepting (meaning there is no empty string/NOTHING) * and there is a bitmap * and the first "jump target" node we found leaves enough room @@ -2412,17 +2898,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); PerlMemShared_free(trie->bitmap); trie->bitmap= NULL; - } else + } else OP( convert ) = TRIE; /* store the type in the flags */ convert->flags = nodetype; DEBUG_r({ - optimize = convert - + NODE_STEP_REGNODE + optimize = convert + + NODE_STEP_REGNODE + regarglen[ OP( convert ) ]; }); - /* XXX We really should free up the resource in trie now, + /* XXX We really should free up the resource in trie now, as we won't use them - (which resources?) dmq */ } /* needed for dumping*/ @@ -2432,8 +2918,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs while ( ++opt < optimize) { Set_Node_Offset_Length(opt,0,0); } - /* - Try to clean up some of the debris left after the + /* + Try to clean up some of the debris left after the optimisation. */ while( optimize < jumper ) { @@ -2488,32 +2974,37 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #else SvREFCNT_dec_NN(revcharmap); #endif - return trie->jump - ? MADE_JUMP_TRIE - : trie->startstate>1 - ? MADE_EXACT_TRIE + return trie->jump + ? MADE_JUMP_TRIE + : trie->startstate>1 + ? MADE_EXACT_TRIE : MADE_TRIE; } STATIC void S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) { -/* The Trie is constructed and compressed now so we can build a fail array if it's needed +/* The Trie is constructed and compressed now so we can build a fail array if + * it's needed - This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the - "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and + 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, + Ullman 1985/88 ISBN 0-201-10088-6 - We find the fail state for each state in the trie, this state is the longest proper - suffix of the current state's 'word' that is also a proper prefix of another word in our - trie. State 1 represents the word '' and is thus the default fail state. This allows - the DFA not to have to restart after its tried and failed a word at a given point, it - simply continues as though it had been matching the other word in the first place. + We find the fail state for each state in the trie, this state is the longest + proper suffix of the current state's 'word' that is also a proper prefix of + another word in our trie. State 1 represents the word '' and is thus the + default fail state. This allows the DFA not to have to restart after its + tried and failed a word at a given point, it simply continues as though it + had been matching the other word in the first place. Consider 'abcdgu'=~/abcdefg|cdgu/ - When we get to 'd' we are still matching the first word, we would encounter 'g' which would - fail, which would bring us to the state representing 'd' in the second word where we would - try 'g' and succeed, proceeding to match 'cdgu'. + When we get to 'd' we are still matching the first word, we would encounter + 'g' which would fail, which would bring us to the state representing 'd' in + the second word where we would try 'g' and succeed, proceeding to match + 'cdgu'. */ /* add a fail transition */ const U32 trie_offset = ARG(source); @@ -2528,7 +3019,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode U32 base = trie->states[ 1 ].trans.base; U32 *fail; reg_ac_data *aho; - const U32 data_slot = add_data( pRExC_state, 1, "T" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; @@ -2592,7 +3083,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode fail[ 0 ] = fail[ 1 ] = 0; DEBUG_TRIE_COMPILE_r({ PerlIO_printf(Perl_debug_log, - "%*sStclass Failtable (%"UVuf" states): 0", + "%*sStclass Failtable (%"UVuf" states): 0", (int)(depth * 2), "", (UV)numstates ); for( q_read=1; q_read%3d: %s (%d)\n", \ (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ Next ? (REG_NODE_NUM(Next)) : 0 ); \ @@ -2639,49 +3119,58 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * * If a node is to match under /i (folded), the number of characters it matches * can be different than its character length if it contains a multi-character - * fold. *min_subtract is set to the total delta of the input nodes. + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. * - * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF - * and contains LATIN SMALL LETTER SHARP S + * And *unfolded_multi_char is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when whether the fold is valid or + * not won't be known until runtime; namely for EXACTF nodes that contain LATIN + * SMALL LETTER SHARP S, as only if the target string being matched against + * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose + * folding rules depend on the locale in force at runtime. (Multi-char folds + * whose components are all above the Latin1 range are not run-time locale + * dependent, and have already been folded by the time this function is + * called.) * * This is as good a place as any to discuss the design of handling these * multi-character fold sequences. It's been wrong in Perl for a very long * time. There are three code points in Unicode whose multi-character folds * were long ago discovered to mess things up. The previous designs for * dealing with these involved assigning a special node for them. This - * approach doesn't work, as evidenced by this example: + * approach doesn't always work, as evidenced by this example: * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches - * Both these fold to "sss", but if the pattern is parsed to create a node that + * Both sides fold to "sss", but if the pattern is parsed to create a node that * would match just the \xDF, it won't be able to handle the case where a * successful match would have to cross the node's boundary. The new approach * that hopefully generally solves the problem generates an EXACTFU_SS node - * that is "sss". + * that is "sss" in this case. * * It turns out that there are problems with all multi-character folds, and not - * just these three. Now the code is general, for all such cases, but the - * three still have some special handling. The approach taken is: + * just these three. Now the code is general, for all such cases. The + * approach taken is: * 1) This routine examines each EXACTFish node that could contain multi- - * character fold sequences. It returns in *min_subtract how much to - * subtract from the the actual length of the string to get a real minimum - * match length; it is 0 if there are no multi-char folds. This delta is - * used by the caller to adjust the min length of the match, and the delta - * between min and max, so that the optimizer doesn't reject these - * possibilities based on size constraints. - * 2) Certain of these sequences require special handling by the trie code, - * so, if found, this code changes the joined node type to special ops: - * EXACTFU_TRICKYFOLD and EXACTFU_SS. - * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS + * character folded sequences. Since a single character can fold into + * such a sequence, the minimum match length for this node is less than + * the number of characters in the node. This routine returns in + * *min_subtract how many characters to subtract from the the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. + * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS * is used for an EXACTFU node that contains at least one "ss" sequence in * it. For non-UTF-8 patterns and strings, this is the only case where * there is a possible fold length change. That means that a regular * EXACTFU node without UTF-8 involvement doesn't have to concern itself * with length changes, and so can be processed faster. regexec.c takes * advantage of this. Generally, an EXACTFish node that is in UTF-8 is - * pre-folded by regcomp.c. This saves effort in regex matching. - * However, the pre-folding isn't done for non-UTF8 patterns because the - * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things - * down by forcing the pattern into UTF8 unless necessary. Also what - * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold + * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't + * known until runtime). This saves effort in regex matching. However, + * the pre-folding isn't done for non-UTF8 patterns because the fold of + * the MICRO SIGN requires UTF-8, and we don't want to slow things down by + * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, + * again, EXACTFL) nodes fold to isn't known until runtime. The fold * possibilities for the non-UTF8 patterns are quite simple, except for * the sharp s. All the ones that don't involve a UTF-8 target string are * members of a fold-pair, and arrays are set up for all of them so that @@ -2689,45 +3178,63 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * this file makes sure that in EXACTFU nodes, the sharp s gets folded to * 'ss', even if the pattern isn't UTF-8. This avoids the issues * described in the next item. - * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the - * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a - * UTF-8 pattern.) An assumption that the optimizer part of regexec.c - * (probably unwittingly, in Perl_regexec_flags()) makes is that a - * character in the pattern corresponds to at most a single character in - * the target string. (And I do mean character, and not byte here, unlike - * other parts of the documentation that have never been updated to - * account for multibyte Unicode.) sharp s in EXACTF nodes can match the - * two character string 'ss'; in EXACTFA nodes it can match - * "\x{17F}\x{17F}". These violate the assumption, and they are the only - * instances where it is violated. I'm reluctant to try to change the - * assumption, as the code involved is impenetrable to me (khw), so - * instead the code here punts. This routine examines (when the pattern - * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a - * boolean indicating whether or not the node contains a sharp s. When it - * is true, the caller sets a flag that later causes the optimizer in this - * file to not set values for the floating and fixed string lengths, and - * thus avoids the optimizer code in regexec.c that makes the invalid + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes + * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL + * nodes, violate the assumption, and they are the only instances where it + * is violated. I'm reluctant to try to change the assumption, as the + * code involved is impenetrable to me (khw), so instead the code here + * punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid * assumption. Thus, there is no optimization based on string lengths for - * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s. - * (The reason the assumption is wrong only in these two cases is that all - * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all - * other folds to their expanded versions. We can't prefold sharp s to - * 'ss' in EXACTF nodes because we don't know at compile time if it - * actually matches 'ss' or not. It will match iff the target string is - * in UTF-8, unlike the EXACTFU nodes, where it always matches; and - * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8 - * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem; - * but in a non-UTF8 pattern, folding it to that above-Latin1 string would - * require the pattern to be forced into UTF-8, the overhead of which we - * want to avoid.) - */ - -#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFA where it never does. In an EXACTFA node in + * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) + * + * Similarly, the code that generates tries doesn't currently handle + * not-already-folded multi-char folds, and it looks like a pain to change + * that. Therefore, trie generation of EXACTFA nodes with the sharp s + * doesn't work. Instead, such an EXACTFA is turned into a new regnode, + * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people + * using /iaa matching will be doing so almost entirely with ASCII + * strings, so this should rarely be encountered in practice */ + +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) { +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, + UV *min_subtract, bool *unfolded_multi_char, + U32 flags,regnode *val, U32 depth) +{ /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; @@ -2756,7 +3263,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b && NEXT_OFF(n) && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { - + if (OP(n) == TAIL || n > next) stringok = 0; if (PL_regkind[OP(n)] == NOTHING) { @@ -2773,12 +3280,13 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b const unsigned int oldl = STR_LEN(scan); regnode * const nnext = regnext(n); - /* XXX I (khw) kind of doubt that this works on platforms where - * U8_MAX is above 255 because of lots of other assumptions */ + /* XXX I (khw) kind of doubt that this works on platforms (should + * Perl ever run on one) where U8_MAX is above 255 because of lots + * of other assumptions */ /* Don't join if the sum can't fit into a single node */ if (oldl + STR_LEN(n) > U8_MAX) break; - + DEBUG_PEEP("merg",n,depth); merged++; @@ -2809,7 +3317,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b } *min_subtract = 0; - *has_exactf_sharp_s = FALSE; + *unfolded_multi_char = FALSE; /* Here, all the adjacent mergeable EXACTish nodes have been merged. We * can now analyze for sequences of problematic code points. (Prior to @@ -2817,15 +3325,68 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ if (OP(scan) != EXACT) { - const U8 * const s0 = (U8*) STRING(scan); - const U8 * s = s0; - const U8 * const s_end = s0 + STR_LEN(scan); + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ /* One pass is made over the node's string looking for all the - * possibilities. to avoid some tests in the loop, there are two main + * possibilities. To avoid some tests in the loop, there are two main * cases, for UTF-8 patterns (which can't have EXACTF nodes) and * non-UTF-8 */ if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *unfolded_multi_char = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ /* Examine the string for a multi-character fold sequence. UTF-8 * patterns have all characters pre-folded by the time this code is @@ -2833,60 +3394,32 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b while (s < s_end - 1) /* Can stop 1 before the end, as minimum length sequence we are looking for is 2 */ { - int count = 0; + int count = 0; /* How many characters in a multi-char fold */ int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); if (! len) { /* Not a multi-char fold: get next char */ s += UTF8SKIP(s); continue; } - /* Nodes with 'ss' require special handling, except for EXACTFL - * and EXACTFA for which there is no multi-char fold to this */ + /* Nodes with 'ss' require special handling, except for + * EXACTFA-ish for which there is no multi-char fold to this */ if (len == 2 && *s == 's' && *(s+1) == 's' - && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + && OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE) { count = 2; - OP(scan) = EXACTFU_SS; + if (OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; + } s += 2; } - else if (len == 6 /* len is the same in both ASCII and EBCDIC - for these */ - && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8 - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8, - 6) - || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8 - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8, - 6))) - { - count = 3; - - /* These two folds require special handling by trie's, so - * change the node type to indicate this. If EXACTFA and - * EXACTFL were ever to be handled by trie's, this would - * have to be changed. If this node has already been - * changed to EXACTFU_SS in this loop, leave it as is. (I - * (khw) think it doesn't matter in regexec.c for UTF - * patterns, but no need to change it */ - if (OP(scan) == EXACTFU) { - OP(scan) = EXACTFU_TRICKYFOLD; - } - s += 6; - } else { /* Here is a generic multi-char fold. */ - const U8* multi_end = s + len; - - /* Count how many characters in it. In the case of /l and - * /aa, no folds which contain ASCII code points are - * allowed, so check for those, and skip if found. (In - * EXACTFL, no folds are allowed to any Latin1 code point, - * not just ASCII. But there aren't any of these - * currently, nor ever likely, so don't take the time to - * test for them. The code that generates the - * is_MULTI_foo() macros croaks should one actually get put - * into Unicode .) */ - if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + U8* multi_end = s + len; + + /* Count how many characters in it. In the case of /aa, no + * folds which contain ASCII code points are allowed, so + * check for those, and skip if found. */ + if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { count = utf8_length(s, multi_end); s = multi_end; } @@ -2906,70 +3439,78 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b /* The delta is how long the sequence is minus 1 (1 is how long * the character that folds to the sequence is) */ - *min_subtract += count - 1; - next_iteration: ; + total_count_delta += count - 1; + next_iteration: ; } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); } else if (OP(scan) == EXACTFA) { /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char * fold to the ASCII range (and there are no existing ones in the * upper latin1 range). But, as outlined in the comments preceding - * this function, we need to flag any occurrences of the sharp s */ + * this function, we need to flag any occurrences of the sharp s. + * This character forbids trie formation (because of added + * complexity) */ while (s < s_end) { if (*s == LATIN_SMALL_LETTER_SHARP_S) { - *has_exactf_sharp_s = TRUE; + OP(scan) = EXACTFA_NO_TRIE; + *unfolded_multi_char = TRUE; break; } s++; continue; } } - else if (OP(scan) != EXACTFL) { - - /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the - * multi-char folds that are all Latin1. (This code knows that - * there are no current multi-char folds possible with EXACTFL, - * relying on fold_grind.t to catch any errors if the very unlikely - * event happens that some get added in future Unicode versions.) - * As explained in the comments preceding this function, we look - * also for the sharp s in EXACTF nodes; it can be in the final - * position. Otherwise we can stop looking 1 byte earlier because - * have to find at least two characters for a multi-fold */ - const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; - - /* The below is perhaps overboard, but this allows us to save a - * test each time through the loop at the expense of a mask. This - * is because on both EBCDIC and ASCII machines, 'S' and 's' differ - * by a single bit. On ASCII they are 32 apart; on EBCDIC, they - * are 64. This uses an exclusive 'or' to find that bit and then - * inverts it to form a mask, with just a single 0, in the bit - * position where 'S' and 's' differ. */ - const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); - const U8 s_masked = 's' & S_or_s_mask; + else { + + /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; while (s < upper) { int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); if (! len) { /* Not a multi-char fold. */ - if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) { - *has_exactf_sharp_s = TRUE; + *unfolded_multi_char = TRUE; } s++; continue; } if (len == 2 - && ((*s & S_or_s_mask) == s_masked) - && ((*(s+1) & S_or_s_mask) == s_masked)) + && isARG2_lower_or_UPPER_ARG1('s', *s) + && isARG2_lower_or_UPPER_ARG1('s', *(s+1))) { /* EXACTF nodes need to know that the minimum length * changed so that a sharp s in the string can match this * ss in the pattern, but they remain EXACTF nodes, as they * won't match this unless the target string is is UTF-8, - * which we don't know until runtime */ - if (OP(scan) != EXACTF) { + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { OP(scan) = EXACTFU_SS; } } @@ -3003,7 +3544,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b #define INIT_AND_WITHP \ assert(!and_withp); \ - Newx(and_withp,1,struct regnode_charclass_class); \ + Newx(and_withp,1, regnode_ssc); \ SAVEFREEPV(and_withp) /* this is a chain of data about sub patterns we are processing that @@ -3014,20 +3555,19 @@ typedef struct scan_frame { regnode *last; /* last node to process in this frame */ regnode *next; /* next node to process when last is reached */ struct scan_frame *prev; /*previous frame*/ + U32 prev_recursed_depth; I32 stop; /* what stopparen do we use */ } scan_frame; -#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) - -STATIC I32 +STATIC SSize_t S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, - I32 *minlenp, I32 *deltap, + SSize_t *minlenp, SSize_t *deltap, regnode *last, scan_data_t *data, I32 stopparen, - U8* recursed, - struct regnode_charclass_class *and_withp, + U32 recursed_depth, + regnode_ssc *and_withp, U32 flags, U32 depth) /* scanp: Start here (read-write). */ /* deltap: Write maxlen-minlen here. */ @@ -3038,17 +3578,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { dVAR; - I32 min = 0; /* There must be at least this number of characters to match */ + /* There must be at least this number of characters to match */ + SSize_t min = 0; I32 pars = 0, code; regnode *scan = *scanp, *next; - I32 delta = 0; + SSize_t delta = 0; int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); int is_inf_internal = 0; /* The studied chunk is infinite */ I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; scan_data_t data_fake; SV *re_trie_maxbuff = NULL; regnode *first_non_open = scan; - I32 stopmin = I32_MAX; + SSize_t stopmin = SSize_t_MAX; scan_frame *frame = NULL; GET_RE_DEBUG_FLAGS_DECL; @@ -3057,7 +3598,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #ifdef DEBUGGING StructCopy(&zero_scan_data, &data_fake, scan_data_t); #endif - if ( depth == 0 ) { while (first_non_open && OP(first_non_open) == OPEN) first_non_open=regnext(first_non_open); @@ -3069,15 +3609,40 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, UV min_subtract = 0; /* How mmany chars to subtract from the minimum node length to get a real minimum (because the folded version may be shorter) */ - bool has_exactf_sharp_s = FALSE; + bool unfolded_multi_char = FALSE; /* Peephole optimizer: */ - DEBUG_STUDYDATA("Peep:", data,depth); - DEBUG_PEEP("Peep",scan,depth); + DEBUG_OPTIMISE_MORE_r( + { + PerlIO_printf(Perl_debug_log, + "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ", + ((int) depth*2), "", (long)stopparen, + (unsigned long)depth, (unsigned long)recursed_depth); + if (recursed_depth) { + U32 i; + U32 j; + for ( j = 0 ; j < recursed_depth ; j++ ) { + PerlIO_printf(Perl_debug_log,"["); + for ( i = 0 ; i < (U32)RExC_npar ; i++ ) + PerlIO_printf(Perl_debug_log,"%d", + PAREN_TEST(RExC_study_chunk_recursed + + (j * RExC_study_chunk_recursed_bytes), i) + ? 1 : 0 + ); + PerlIO_printf(Perl_debug_log,"]"); + } + } + PerlIO_printf(Perl_debug_log,"\n"); + } + ); + DEBUG_STUDYDATA("Peep:", data, depth); + DEBUG_PEEP("Peep", scan, depth); + - /* Its not clear to khw or hv why this is done here, and not in the - * clauses that deal with EXACT nodes. khw's guess is that it's - * because of a previous design */ - JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); + /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ + * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled + * by a different invocation of reg() -- Yves + */ + JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ @@ -3110,24 +3675,29 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, || OP(scan) == IFTHEN) { next = regnext(scan); code = OP(scan); - /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ + /* demq: the op(next)==code check is to see if we have + * "branch-branch" AFAICT */ if (OP(next) == code || code == IFTHEN) { - /* NOTE - There is similar code to this block below for handling - TRIE nodes on a re-study. If you change stuff here check there - too. */ - I32 max1 = 0, min1 = I32_MAX, num = 0; - struct regnode_charclass_class accum; + /* NOTE - There is similar code to this block below for + * handling TRIE nodes on a re-study. If you change stuff here + * check there too. */ + SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; + regnode_ssc accum; regnode * const startbranch=scan; - if (flags & SCF_DO_SUBSTR) - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ - if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); while (OP(scan) == code) { - I32 deltanext, minnext, f = 0, fake; - struct regnode_charclass_class this_class; + SSize_t deltanext, minnext, fake; + I32 f = 0; + regnode_ssc this_class; num++; data_fake.flags = 0; @@ -3144,7 +3714,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (code != BRANCH) scan = NEXTOPER(scan); if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } @@ -3152,21 +3722,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, f |= SCF_WHILEM_VISITED_POS; /* we suppose the run is continuous, last=next...*/ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - next, &data_fake, - stopparen, recursed, NULL, f,depth+1); + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f,depth+1); if (min1 > minnext) min1 = minnext; - if (deltanext == I32_MAX) { + if (deltanext == SSize_t_MAX) { is_inf = is_inf_internal = 1; - max1 = I32_MAX; + max1 = SSize_t_MAX; } else if (max1 < minnext + deltanext) max1 = minnext + deltanext; scan = next; if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; if (data_fake.flags & SCF_SEEN_ACCEPT) { - if ( stopmin > minnext) + if ( stopmin > minnext) stopmin = min + min1; flags &= ~SCF_DO_SUBSTR; if (data) @@ -3178,63 +3748,64 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); } if (code == IFTHEN && num < 2) /* Empty ELSE branch */ min1 = 0; if (flags & SCF_DO_SUBSTR) { data->pos_min += min1; - if (data->pos_delta >= I32_MAX - (max1 - min1)) - data->pos_delta = I32_MAX; + if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) + data->pos_delta = SSize_t_MAX; else data->pos_delta += max1 - min1; if (max1 != min1 || is_inf) data->longest = &(data->longest_float); } min += min1; - if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0) - delta = I32_MAX; + if (delta == SSize_t_MAX + || SSize_t_MAX - delta - (max1 - min1) < 0) + delta = SSize_t_MAX; else delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); if (min1) { - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - cl_and(data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, - struct regnode_charclass_class); + StructCopy(&accum, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); } } - if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) { + if (PERL_ENABLE_TRIE_OPTIMISATION && + OP( startbranch ) == BRANCH ) + { /* demq. - Assuming this was/is a branch we are dealing with: 'scan' now - points at the item that follows the branch sequence, whatever - it is. We now start at the beginning of the sequence and look - for subsequences of + Assuming this was/is a branch we are dealing with: 'scan' + now points at the item that follows the branch sequence, + whatever it is. We now start at the beginning of the + sequence and look for subsequences of BRANCH->EXACT=>x1 BRANCH->EXACT=>x2 tail - which would be constructed from a pattern like /A|LIST|OF|WORDS/ + which would be constructed from a pattern like + /A|LIST|OF|WORDS/ If we can find such a subsequence we need to turn the first element into a trie and then add the subsequent branch exact @@ -3242,7 +3813,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, We have two cases - 1. patterns where the whole set of branches can be converted. + 1. patterns where the whole set of branches can be + converted. 2. patterns where only a subset can be converted. @@ -3253,7 +3825,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, 'BRANCH EXACT; BRANCH EXACT; BRANCH X' becomes BRANCH TRIE; BRANCH X; - There is an additional case, that being where there is a + There is an additional case, that being where there is a common prefix, which gets split out into an EXACT like node preceding the TRIE node. @@ -3279,7 +3851,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, U32 count=0; #ifdef DEBUGGING - SV * const mysv = sv_newmortal(); /* for dumping */ + SV * const mysv = sv_newmortal(); /* for dumping */ #endif /* var tail is used because there may be a TAIL regop in the way. Ie, the exacts will point to the @@ -3294,49 +3866,60 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, tail = regnext( tail ); } - + DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, tail ); + regprop(RExC_rx, mysv, tail, NULL); PerlIO_printf( Perl_debug_log, "%*s%s%s\n", - (int)depth * 2 + 2, "", - "Looking for TRIE'able sequences. Tail node is: ", - SvPV_nolen_const( mysv ) + (int)depth * 2 + 2, "", + "Looking for TRIE'able sequences. Tail node is: ", + SvPV_nolen_const( mysv ) ); }); - + /* Step through the branches cur represents each branch, - noper is the first thing to be matched as part of that branch + noper is the first thing to be matched as part + of that branch noper_next is the regnext() of that node. - We normally handle a case like this /FOO[xyz]|BAR[pqr]/ - via a "jump trie" but we also support building with NOJUMPTRIE, - which restricts the trie logic to structures like /FOO|BAR/. - - If noper is a trieable nodetype then the branch is a possible optimization - target. If we are building under NOJUMPTRIE then we require that noper_next - is the same as scan (our current position in the regex program). - - Once we have two or more consecutive such branches we can create a - trie of the EXACT's contents and stitch it in place into the program. - - If the sequence represents all of the branches in the alternation we - replace the entire thing with a single TRIE node. - - Otherwise when it is a subsequence we need to stitch it in place and - replace only the relevant branches. This means the first branch has - to remain as it is used by the alternation logic, and its next pointer, - and needs to be repointed at the item on the branch chain following - the last branch we have optimized away. - - This could be either a BRANCH, in which case the subsequence is internal, - or it could be the item following the branch sequence in which case the - subsequence is at the end (which does not necessarily mean the first node - is the start of the alternation). - - TRIE_TYPE(X) is a define which maps the optype to a trietype. + We normally handle a case like this + /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also + support building with NOJUMPTRIE, which restricts + the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is + a possible optimization target. If we are building + under NOJUMPTRIE then we require that noper_next is + the same as scan (our current position in the regex + program). + + Once we have two or more consecutive such branches + we can create a trie of the EXACT's contents and + stitch it in place into the program. + + If the sequence represents all of the branches in + the alternation we replace the entire thing with a + single TRIE node. + + Otherwise when it is a subsequence we need to + stitch it in place and replace only the relevant + branches. This means the first branch has to remain + as it is used by the alternation logic, and its + next pointer, and needs to be repointed at the item + on the branch chain following the last branch we + have optimized away. + + This could be either a BRANCH, in which case the + subsequence is internal, or it could be the item + following the branch sequence in which case the + subsequence is at the end (which does not + necessarily mean the first node is the start of the + alternation). + + TRIE_TYPE(X) is a define which maps the optype to a + trietype. optype | trietype ----------------+----------- @@ -3344,14 +3927,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, EXACT | EXACT EXACTFU | EXACTFU EXACTFU_SS | EXACTFU - EXACTFU_TRICKYFOLD | EXACTFU - EXACTFA | 0 + EXACTFA | EXACTFA */ #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ ( EXACT == (X) ) ? EXACT : \ - ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ + ( EXACTFA == (X) ) ? EXACTFA : \ 0 ) /* dont use tail as the end marker for this traverse */ @@ -3366,27 +3949,27 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); - regprop(RExC_rx, mysv, noper); + regprop(RExC_rx, mysv, noper, NULL); PerlIO_printf( Perl_debug_log, " -> %s", SvPV_nolen_const(mysv)); if ( noper_next ) { - regprop(RExC_rx, mysv, noper_next ); + regprop(RExC_rx, mysv, noper_next, NULL); PerlIO_printf( Perl_debug_log,"\t=> %s\t", SvPV_nolen_const(mysv)); } PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), - PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] + PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] ); }); - /* Is noper a trieable nodetype that can be merged with the - * current trie (if there is one)? */ + /* Is noper a trieable nodetype that can be merged + * with the current trie (if there is one)? */ if ( noper_trietype && ( @@ -3399,10 +3982,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif && count < U16_MAX) { - /* Handle mergable triable node - * Either we are the first node in a new trieable sequence, - * in which case we do some bookkeeping, otherwise we update - * the end pointer. */ + /* Handle mergable triable node Either we are + * the first node in a new trieable sequence, + * in which case we do some bookkeeping, + * otherwise we update the end pointer. */ if ( !first ) { first = cur; if ( noper_trietype == NOTHING ) { @@ -3415,8 +3998,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( noper_next_trietype ) { trietype = noper_next_trietype; } else if (noper_next_type) { - /* a NOTHING regop is 1 regop wide. We need at least two - * for a trie so we can't merge this in */ + /* a NOTHING regop is 1 regop wide. + * We need at least two for a trie + * so we can't merge this in */ first = NULL; } } else { @@ -3432,31 +4016,39 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle mergable triable node */ else { /* handle unmergable node - - * noper may either be a triable node which can not be tried - * together with the current trie, or a non triable node */ + * noper may either be a triable node which can + * not be tried together with the current trie, + * or a non triable node */ if ( last ) { - /* If last is set and trietype is not NOTHING then we have found - * at least two triable branch sequences in a row of a similar - * trietype so we can turn them into a trie. If/when we - * allow NOTHING to start a trie sequence this condition will be - * required, and it isn't expensive so we leave it in for now. */ + /* If last is set and trietype is not + * NOTHING then we have found at least two + * triable branch sequences in a row of a + * similar trietype so we can turn them + * into a trie. If/when we allow NOTHING to + * start a trie sequence this condition + * will be required, and it isn't expensive + * so we leave it in for now. */ if ( trietype && trietype != NOTHING ) make_trie( pRExC_state, - startbranch, first, cur, tail, count, - trietype, depth+1 ); - last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ + startbranch, first, cur, tail, + count, trietype, depth+1 ); + last = NULL; /* note: we clear/update + first, trietype etc below, + so we dont do it here */ } if ( noper_trietype #ifdef NOJUMPTRIE && noper_next == tail #endif ){ - /* noper is triable, so we can start a new trie sequence */ + /* noper is triable, so we can start a new + * trie sequence */ count = 1; first = cur; trietype = noper_trietype; } else if (first) { - /* if we already saw a first but the current node is not triable then we have + /* if we already saw a first but the + * current node is not triable then we have * to reset the first information. */ count = 0; first = NULL; @@ -3465,18 +4057,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle unmergable node */ } /* loop over branches */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, - "%*s- %s (%d) \n", (int)depth * 2 + 2, + "%*s- %s (%d) \n", + (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); if ( last && trietype ) { if ( trietype != NOTHING ) { - /* the last branch of the sequence was part of a trie, - * so we have to construct it here outside of the loop - */ - made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); + /* the last branch of the sequence was part of + * a trie, so we have to construct it here + * outside of the loop */ + made= make_trie( pRExC_state, startbranch, + first, scan, tail, count, + trietype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && startbranch == first) @@ -3486,20 +4081,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( startbranch == first && scan == tail ) { - RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; } } #endif } else { - /* at this point we know whatever we have is a NOTHING sequence/branch - * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING + /* at this point we know whatever we have is a + * NOTHING sequence/branch AND if 'startbranch' + * is 'first' then we can turn the whole thing + * into a NOTHING */ if ( startbranch == first ) { regnode *opt; - /* the entire thing is a NOTHING sequence, something like this: - * (?:|) So we can turn it into a plain NOTHING op. */ + /* the entire thing is a NOTHING sequence, + * something like this: (?:|) So we can + * turn it into a plain NOTHING op. */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d) \n", (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); @@ -3513,9 +4111,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } /* end if ( last) */ } /* TRIE_MAXBUF is non zero */ - + } /* do trie */ - + } else if ( code == BRANCHJ ) { /* single branch is optimized. */ scan = NEXTOPER(NEXTOPER(scan)); @@ -3527,9 +4125,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 paren; regnode *start; regnode *end; + U32 my_recursed_depth= recursed_depth; if (OP(scan) != SUSPEND) { - /* set the pointer */ + /* set the pointer */ if (OP(scan) == GOSUB) { paren = ARG(scan); RExC_recurse[ARG2L(scan)] = scan; @@ -3540,21 +4139,33 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, start = RExC_rxi->program + 1; end = RExC_opend; } - if (!recursed) { - Newxz(recursed, (((RExC_npar)>>3) +1), U8); - SAVEFREEPV(recursed); - } - if (!PAREN_TEST(recursed,paren+1)) { - PAREN_SET(recursed,paren+1); + if (!recursed_depth + || + !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) + ) { + if (!recursed_depth) { + Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); + } else { + Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed_bytes, U8); + } + /* we havent recursed into this paren yet, so recurse into it */ + DEBUG_STUDYDATA("set:", data,depth); + PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); + my_recursed_depth= recursed_depth + 1; Newx(newframe,1,scan_frame); } else { + DEBUG_STUDYDATA("inf:", data,depth); + /* some form of infinite recursion, assume infinite length + * */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } } else { @@ -3571,17 +4182,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, newframe->last = last; newframe->stop = stopparen; newframe->prev = frame; + newframe->prev_recursed_depth = recursed_depth; + + DEBUG_STUDYDATA("frame-new:",data,depth); + DEBUG_PEEP("fnew", scan, depth); frame = newframe; scan = start; stopparen = paren; last = end; + depth = depth + 1; + recursed_depth= my_recursed_depth; continue; } } else if (OP(scan) == EXACT) { - I32 l = STR_LEN(scan); + SSize_t l = STR_LEN(scan); UV uc; if (UTF) { const U8 * const s = (U8*)STRING(scan); @@ -3597,7 +4214,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data->last_end == -1) { /* Update the start info. */ data->last_start_min = data->pos_min; data->last_start_max = is_inf - ? I32_MAX : data->pos_min + data->pos_delta; + ? SSize_t_MAX : data->pos_min + data->pos_delta; } sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); if (UTF) @@ -3608,83 +4225,47 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) mg->mg_len += utf8_length((U8*)STRING(scan), - (U8*)STRING(scan)+STR_LEN(scan)); + (U8*)STRING(scan)+STR_LEN(scan)); } data->last_end = data->pos_min + l; data->pos_min += l; /* As in the first entry. */ data->flags &= ~SF_BEFORE_EOL; } + + /* ANDing the code point leaves at most it, and not in locale, and + * can't match null string */ if (flags & SCF_DO_STCLASS_AND) { - /* Check whether it is compatible with what we know already! */ - int compat = 1; - - - /* If compatible, we or it in below. It is compatible if is - * in the bitmp and either 1) its bit or its fold is set, or 2) - * it's for a locale. Even if there isn't unicode semantics - * here, at runtime there may be because of matching against a - * utf8 string, so accept a possible false positive for - * latin1-range folds */ - if (uc >= 0x100 || - (!(data->start_class->flags & ANYOF_LOCALE) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && (!(data->start_class->flags & ANYOF_LOC_FOLD) - || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) - ) - { - compat = 0; - } - ANYOF_CLASS_ZERO(data->start_class); - ANYOF_BITMAP_ZERO(data->start_class); - if (compat) - ANYOF_BITMAP_SET(data->start_class, uc); - else if (uc >= 0x100) { - int i; - - /* Some Unicode code points fold to the Latin1 range; as - * XXX temporary code, instead of figuring out if this is - * one, just assume it is and set all the start class bits - * that could be some such above 255 code point's fold - * which will generate fals positives. As the code - * elsewhere that does compute the fold settles down, it - * can be extracted out and re-used here */ - for (i = 0; i < 256; i++){ - if (HAS_NONLATIN1_FOLD_CLOSURE(i)) { - ANYOF_BITMAP_SET(data->start_class, i); - } - } - } - CLEAR_SSC_EOS(data->start_class); - if (uc < 0x100) - data->start_class->flags &= ~ANYOF_UNICODE_ALL; + ssc_cp_and(data->start_class, uc); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ssc_clear_locale(data->start_class); } else if (flags & SCF_DO_STCLASS_OR) { - /* false positive possible if the class is case-folded */ - if (uc < 0x100) - ANYOF_BITMAP_SET(data->start_class, uc); - else - data->start_class->flags |= ANYOF_UNICODE_ALL; - CLEAR_SSC_EOS(data->start_class); - cl_and(data->start_class, and_withp); + ssc_add_cp(data->start_class, uc); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; } else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ - I32 l = STR_LEN(scan); + SSize_t l = STR_LEN(scan); UV uc = *((U8*)STRING(scan)); + SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 + separate code points */ /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) { assert(data); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } if (UTF) { const U8 * const s = (U8 *)STRING(scan); uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } - if (has_exactf_sharp_s) { - RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + if (unfolded_multi_char) { + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; } min += l - min_subtract; assert (min >= 0); @@ -3699,99 +4280,95 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - if (flags & SCF_DO_STCLASS_AND) { - /* Check whether it is compatible with what we know already! */ - int compat = 1; - if (uc >= 0x100 || - (!(data->start_class->flags & ANYOF_LOCALE) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) - { - compat = 0; - } - ANYOF_CLASS_ZERO(data->start_class); - ANYOF_BITMAP_ZERO(data->start_class); - if (compat) { - ANYOF_BITMAP_SET(data->start_class, uc); - CLEAR_SSC_EOS(data->start_class); - if (OP(scan) == EXACTFL) { - /* XXX This set is probably no longer necessary, and - * probably wrong as LOCALE now is on in the initial - * state */ - data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD; - } - else { + if (OP(scan) == EXACTFL) { + + /* We don't know what the folds are; it could be anything. XXX + * Actually, we only support UTF-8 encoding for code points + * above Latin1, so we could know what those folds are. */ + EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, + 0, + UV_MAX); + } + else { /* Non-locale EXACTFish */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (flags & SCF_DO_STCLASS_AND) { + ssc_clear_locale(data->start_class); + } + if (uc < 256) { /* We know what the Latin1 folds are ... */ + if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we + know if anything folds + with this */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, + PL_fold_latin1[uc]); + if (OP(scan) != EXACTFA) { /* The folds below aren't + legal under /iaa */ + if (isARG2_lower_or_UPPER_ARG1('s', uc)) { + EXACTF_invlist + = add_cp_to_invlist(EXACTF_invlist, + LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + EXACTF_invlist + = add_cp_to_invlist(EXACTF_invlist, 's'); + EXACTF_invlist + = add_cp_to_invlist(EXACTF_invlist, 'S'); + } + } - /* Also set the other member of the fold pair. In case - * that unicode semantics is called for at runtime, use - * the full latin1 fold. (Can't do this for locale, - * because not known until runtime) */ - ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); - - /* All other (EXACTFL handled above) folds except under - * /iaa that include s, S, and sharp_s also may include - * the others */ - if (OP(scan) != EXACTFA) { - if (uc == 's' || uc == 'S') { - ANYOF_BITMAP_SET(data->start_class, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - ANYOF_BITMAP_SET(data->start_class, 's'); - ANYOF_BITMAP_SET(data->start_class, 'S'); - } - } - } - } - else if (uc >= 0x100) { - int i; - for (i = 0; i < 256; i++){ - if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) { - ANYOF_BITMAP_SET(data->start_class, i); - } - } - } + /* We also know if there are above-Latin1 code points + * that fold to this (none legal for ASCII and /iaa) */ + if ((! isASCII(uc) || OP(scan) != EXACTFA) + && HAS_NONLATIN1_FOLD_CLOSURE(uc)) + { + /* XXX We could know exactly what does fold to this + * if the reverse folds are loaded, as currently in + * S_regclass() */ + _invlist_union(EXACTF_invlist, + PL_AboveLatin1, + &EXACTF_invlist); + } + } + } + else { /* Non-locale, above Latin1. XXX We don't currently + know what participates in folds with this, so have + to assume anything could */ + + /* XXX We could know exactly what does fold to this if the + * reverse folds are loaded, as currently in S_regclass(). + * But we do know that under /iaa nothing in the ASCII + * range can participate */ + if (OP(scan) == EXACTFA) { + _invlist_union_complement_2nd(EXACTF_invlist, + PL_XPosix_ptrs[_CC_ASCII], + &EXACTF_invlist); + } + else { + EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, + 0, UV_MAX); + } + } + } + if (flags & SCF_DO_STCLASS_AND) { + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); } else if (flags & SCF_DO_STCLASS_OR) { - if (data->start_class->flags & ANYOF_LOC_FOLD) { - /* false positive possible if the class is case-folded. - Assume that the locale settings are the same... */ - if (uc < 0x100) { - ANYOF_BITMAP_SET(data->start_class, uc); - if (OP(scan) != EXACTFL) { - - /* And set the other member of the fold pair, but - * can't do that in locale because not known until - * run-time */ - ANYOF_BITMAP_SET(data->start_class, - PL_fold_latin1[uc]); - - /* All folds except under /iaa that include s, S, - * and sharp_s also may include the others */ - if (OP(scan) != EXACTFA) { - if (uc == 's' || uc == 'S') { - ANYOF_BITMAP_SET(data->start_class, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - ANYOF_BITMAP_SET(data->start_class, 's'); - ANYOF_BITMAP_SET(data->start_class, 'S'); - } - } - } - } - CLEAR_SSC_EOS(data->start_class); - } - cl_and(data->start_class, and_withp); + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); } else if (REGNODE_VARIES(OP(scan))) { - I32 mincount, maxcount, minnext, deltanext, fl = 0; - I32 f = flags, pos_before = 0; + SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; + I32 fl = 0, f = flags; regnode * const oscan = scan; - struct regnode_charclass_class this_class; - struct regnode_charclass_class *oclass = NULL; + regnode_ssc this_class; + regnode_ssc *oclass = NULL; I32 next_is_eval = 0; switch (PL_regkind[OP(scan)]) { @@ -3821,12 +4398,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan = NEXTOPER(scan); goto do_curly; } - is_inf = is_inf_internal = 1; - scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */ + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ data->longest = &(data->longest_float); } + is_inf = is_inf_internal = 1; + scan = regnext(scan); goto optimize_curly_tail; case CURLY: if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) @@ -3847,7 +4425,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, next_is_eval = (OP(scan) == EVAL); do_curly: if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */ + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ pos_before = data->pos_min; } if (data) { @@ -3857,7 +4437,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->flags |= SF_IS_INF; } if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); oclass = data->start_class; data->start_class = &this_class; f |= SCF_DO_STCLASS_AND; @@ -3876,36 +4456,36 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, f &= ~SCF_WHILEM_VISITED_POS; /* This will finish on WHILEM, setting scan, or on NULL: */ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - last, data, stopparen, recursed, NULL, - (mincount == 0 - ? (f & ~SCF_DO_SUBSTR) : f),depth+1); + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, + last, data, stopparen, recursed_depth, NULL, + (mincount == 0 + ? (f & ~SCF_DO_SUBSTR) + : f) + ,depth+1); if (flags & SCF_DO_STCLASS) data->start_class = oclass; if (mincount == 0 || minnext == 0) { if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); } else if (flags & SCF_DO_STCLASS_AND) { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&this_class, data->start_class, - struct regnode_charclass_class); + StructCopy(&this_class, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); - cl_and(data->start_class, and_withp); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); } else if (flags & SCF_DO_STCLASS_AND) - cl_and(data->start_class, &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); flags &= ~SCF_DO_STCLASS; } if (!scan) /* It was not CURLYX, but CURLY. */ @@ -3915,24 +4495,26 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && (next_is_eval || !(mincount == 0 && maxcount == 1)) && (minnext == 0) && (deltanext == 0) && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) - && maxcount <= REG_INFTY/3) /* Complement check for big count */ + && maxcount <= REG_INFTY/3) /* Complement check for big + count */ { /* Fatal warnings may leak the regexp without this: */ SAVEFREESV(RExC_rx_sv); ckWARNreg(RExC_parse, - "Quantifier unexpected on zero-length expression"); + "Quantifier unexpected on zero-length expression"); (void)ReREFCNT_inc(RExC_rx_sv); } min += minnext * mincount; - is_inf_internal |= deltanext == I32_MAX - || (maxcount == REG_INFTY && minnext + deltanext > 0); + is_inf_internal |= deltanext == SSize_t_MAX + || (maxcount == REG_INFTY && minnext + deltanext > 0); is_inf |= is_inf_internal; - if (is_inf) - delta = I32_MAX; - else - delta += (minnext + deltanext) * maxcount - minnext * mincount; - + if (is_inf) { + delta = SSize_t_MAX; + } else { + delta += (minnext + deltanext) * maxcount + - minnext * mincount; + } /* Try powerful optimization CURLYX => CURLYN. */ if ( OP(oscan) == CURLYX && data && data->flags & SF_IN_PAR @@ -3983,7 +4565,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && !(data->flags & SF_HAS_EVAL) && !deltanext /* atom is fixed width */ && minnext != 0 /* CURLYM can't handle zero width */ - && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */ + + /* Nor characters whose fold at run-time may be + * multi-character */ + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) ) { /* XXXX How to optimize if data == 0? */ /* Optimize to a simpler form. */ @@ -4030,7 +4615,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif /* Optimize again: */ study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, - NULL, stopparen, recursed, NULL, 0,depth+1); + NULL, stopparen, recursed_depth, NULL, 0,depth+1); } else oscan->flags = 0; @@ -4055,43 +4640,32 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, pars++; if (flags & SCF_DO_SUBSTR) { SV *last_str = NULL; + STRLEN last_chrs = 0; int counted = mincount != 0; - if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ -#if defined(SPARC64_GCC_WORKAROUND) - I32 b = 0; - STRLEN l = 0; - const char *s = NULL; - I32 old = 0; - - if (pos_before >= data->last_start_min) - b = pos_before; - else - b = data->last_start_min; - - l = 0; - s = SvPV_const(data->last_found, l); - old = b - data->last_start_min; - -#else - I32 b = pos_before >= data->last_start_min + if (data->last_end > 0 && mincount != 0) { /* Ends with a + string. */ + SSize_t b = pos_before >= data->last_start_min ? pos_before : data->last_start_min; STRLEN l; const char * const s = SvPV_const(data->last_found, l); - I32 old = b - data->last_start_min; -#endif + SSize_t old = b - data->last_start_min; if (UTF) old = utf8_hop((U8*)s, old) - (U8*)s; l -= old; /* Get the added string: */ last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { + SvGROW(last_str, (mincount * l) + 1); repeatcpy(SvPVX(last_str) + l, - SvPVX_const(last_str), l, mincount - 1); + SvPVX_const(last_str), l, + mincount - 1); SvCUR_set(last_str, SvCUR(last_str) * mincount); /* Add additional parts. */ SvCUR_set(data->last_found, @@ -4103,34 +4677,41 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str) - l; + mg->mg_len += last_chrs * (mincount-1); } + last_chrs *= mincount; data->last_end += l * (mincount - 1); } } else { /* start offset must point into the last copy */ data->last_start_min += minnext * (mincount - 1); - data->last_start_max += is_inf ? I32_MAX + data->last_start_max += is_inf ? SSize_t_MAX : (maxcount - 1) * (minnext + data->pos_delta); } } /* It is counted once already... */ data->pos_min += minnext * (mincount - counted); #if 0 -PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n", - counted, deltanext, I32_MAX, minnext, maxcount, mincount); -if (deltanext != I32_MAX) -PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta); +PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf + " SSize_t_MAX=%"UVdf" minnext=%"UVdf + " maxcount=%"UVdf" mincount=%"UVdf"\n", + (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, + (UV)mincount); +if (deltanext != SSize_t_MAX) +PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", + (UV)(-counted * deltanext + (minnext + deltanext) * maxcount + - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); #endif - if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta) - data->pos_delta = I32_MAX; + if (deltanext == SSize_t_MAX + || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) + data->pos_delta = SSize_t_MAX; else data->pos_delta += - counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount; if (mincount != maxcount) { /* Cannot extend fixed substrings found inside the group. */ - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); if (mincount && last_str) { SV * const sv = data->last_found; MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? @@ -4140,12 +4721,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext mg->mg_len = -1; sv_setsv(sv, last_str); data->last_end = data->pos_min; - data->last_start_min = - data->pos_min - CHR_SVLEN(last_str); + data->last_start_min = data->pos_min - last_chrs; data->last_start_max = is_inf - ? I32_MAX - : data->pos_min + data->pos_delta - - CHR_SVLEN(last_str); + ? SSize_t_MAX + : data->pos_min + data->pos_delta - last_chrs; } data->longest = &(data->longest_float); } @@ -4160,164 +4739,212 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext NEXT_OFF(oscan) += NEXT_OFF(next); } continue; - default: /* REF, and CLUMP only? */ + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", + OP(scan)); +#endif + case REF: + case CLUMP: if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; - if (flags & SCF_DO_STCLASS_OR) - cl_anything(pRExC_state, data->start_class); + if (flags & SCF_DO_STCLASS_OR) { + if (OP(scan) == CLUMP) { + /* Actually is any start char, but very few code points + * aren't start characters */ + ssc_match_all_cp(data->start_class); + } + else { + ssc_anything(data->start_class); + } + } flags &= ~SCF_DO_STCLASS; break; } } else if (OP(scan) == LNBREAK) { if (flags & SCF_DO_STCLASS) { - int value = 0; - CLEAR_SSC_EOS(data->start_class); /* No match on empty */ if (flags & SCF_DO_STCLASS_AND) { - for (value = 0; value < 256; value++) - if (!is_VERTWS_cp(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + ssc_intersection(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } - else { - for (value = 0; value < 256; value++) - if (is_VERTWS_cp(value)) - ANYOF_BITMAP_SET(data->start_class, value); + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], + FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg for + * 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } - if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; } min++; delta++; /* Because of the 2 char string cr-lf */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += 1; data->pos_delta += 1; data->longest = &(data->longest_float); } } else if (REGNODE_SIMPLE(OP(scan))) { - int value = 0; if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min++; } min++; if (flags & SCF_DO_STCLASS) { - int loop_max = 256; - CLEAR_SSC_EOS(data->start_class); /* No match on empty */ + bool invert = 0; + SV* my_invlist = sv_2mortal(_new_invlist(0)); + U8 namedclass; + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; /* Some of the logic below assumes that switching locale on will only add false positives. */ - switch (PL_regkind[OP(scan)]) { - U8 classnum; + switch (OP(scan)) { - case SANY: default: #ifdef DEBUGGING - Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); + Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", + OP(scan)); #endif - do_default: + case CANY: + case SANY: if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_match_all_cp(data->start_class); break; + case REG_ANY: - if (OP(scan) == SANY) - goto do_default; - if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ - value = (ANYOF_BITMAP_TEST(data->start_class,'\n') - || ANYOF_CLASS_TEST_ANY_SET(data->start_class)); - cl_anything(pRExC_state, data->start_class); + { + SV* REG_ANY_invlist = _new_invlist(2); + REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, + '\n'); + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert, hence all but \n + */ + ); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert */ + ); + ssc_clear_locale(data->start_class); + } + SvREFCNT_dec_NN(REG_ANY_invlist); } - if (flags & SCF_DO_STCLASS_AND || !value) - ANYOF_BITMAP_CLEAR(data->start_class,'\n'); break; - case ANYOF: + + case ANYOF: if (flags & SCF_DO_STCLASS_AND) - cl_and(data->start_class, - (struct regnode_charclass_class*)scan); + ssc_and(pRExC_state, data->start_class, + (regnode_charclass *) scan); else - cl_or(pRExC_state, data->start_class, - (struct regnode_charclass_class*)scan); + ssc_or(pRExC_state, data->start_class, + (regnode_charclass *) scan); break; - case POSIXA: - loop_max = 128; + + case NPOSIXL: + invert = 1; /* FALL THROUGH */ + case POSIXL: - case POSIXD: - case POSIXU: - classnum = FLAGS(scan); - if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1); - for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); - } - } - } - } - else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum)); + namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; + if (flags & SCF_DO_STCLASS_AND) { + bool was_there = cBOOL( + ANYOF_POSIXL_TEST(data->start_class, + namedclass)); + ANYOF_POSIXL_ZERO(data->start_class); + if (was_there) { /* Do an AND */ + ANYOF_POSIXL_SET(data->start_class, namedclass); } - else { - - /* Even if under locale, set the bits for non-locale - * in case it isn't a true locale-node. This will - * create false positives if it truly is locale */ - for (value = 0; value < loop_max; value++) { - if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); - } + /* No individual code points can now match */ + data->start_class->invlist + = sv_2mortal(_new_invlist(0)); + } + else { + int complement = namedclass + ((invert) ? -1 : 1); + + assert(flags & SCF_DO_STCLASS_OR); + + /* If the complement of this class was already there, + * the result is that they match all code points, + * (\d + \D == everything). Remove the classes from + * future consideration. Locale is not relevant in + * this case */ + if (ANYOF_POSIXL_TEST(data->start_class, complement)) { + ssc_match_all_cp(data->start_class); + ANYOF_POSIXL_CLEAR(data->start_class, namedclass); + ANYOF_POSIXL_CLEAR(data->start_class, complement); } + else { /* The usual case; just add this class to the + existing set */ + ANYOF_POSIXL_SET(data->start_class, namedclass); } - } - break; - case NPOSIXA: - loop_max = 128; + } + break; + + case NPOSIXA: /* For these, we always know the exact set of + what's matched */ + invert = 1; /* FALL THROUGH */ - case NPOSIXL: - case NPOSIXU: + case POSIXA: + if (FLAGS(scan) == _CC_ASCII) { + my_invlist = PL_XPosix_ptrs[_CC_ASCII]; + } + else { + _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)], + PL_XPosix_ptrs[_CC_ASCII], + &my_invlist); + } + goto join_posix; + case NPOSIXD: - classnum = FLAGS(scan); - if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum)); - for (value = 0; value < loop_max; value++) { - if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); - } - } - } - } - else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1); - } - else { + case NPOSIXU: + invert = 1; + /* FALL THROUGH */ + case POSIXD: + case POSIXU: + my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); + + /* NPOSIXD matches all upper Latin1 code points unless the + * target string being matched is UTF-8, which is + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); + } - /* Even if under locale, set the bits for non-locale in - * case it isn't a true locale-node. This will create - * false positives if it truly is locale */ - for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); - } - } - if (PL_regkind[OP(scan)] == NPOSIXD) { - data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - } - break; + join_posix: + + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, my_invlist, invert); + ssc_clear_locale(data->start_class); + } + else { + assert(flags & SCF_DO_STCLASS_OR); + ssc_union(data->start_class, my_invlist, invert); + } } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } @@ -4325,7 +4952,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->flags |= (OP(scan) == MEOL ? SF_BEFORE_MEOL : SF_BEFORE_SEOL); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } else if ( PL_regkind[OP(scan)] == BRANCHJ @@ -4344,11 +4971,12 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext DEBUG_STUDYDATA("OPFAIL",data,depth); /*DEBUG_PARSE_MSG("opfail");*/ - regprop(RExC_rx, mysv_val, upto); - PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", - SvPV_nolen_const(mysv_val), - (IV)REG_NODE_NUM(upto), - (IV)(upto - scan) + regprop(RExC_rx, mysv_val, upto, NULL); + PerlIO_printf(Perl_debug_log, + "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(upto), + (IV)(upto - scan) ); }); OP(scan) = OPFAIL; @@ -4358,16 +4986,16 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext scan= upto; continue; } - if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY + if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY || OP(scan) == UNLESSM ) { /* Negative Lookahead/lookbehind In this case we can't do fixed string optimisation. */ - I32 deltanext, minnext, fake = 0; + SSize_t deltanext, minnext, fake = 0; regnode *nscan; - struct regnode_charclass_class intrnl; + regnode_ssc intrnl; int f = 0; data_fake.flags = 0; @@ -4380,7 +5008,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.pos_delta = delta; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + ssc_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4388,14 +5016,16 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext f |= SCF_WHILEM_VISITED_POS; next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, - last, &data_fake, stopparen, recursed, NULL, f, depth+1); + minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, + last, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1); if (scan->flags) { if (deltanext) { FAIL("Variable length lookbehind not implemented"); } else if (minnext > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); } scan->flags = (U8)minnext; } @@ -4414,14 +5044,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext * *** HACK *** for now just treat as "no information". * See [perl #56690]. */ - cl_init(pRExC_state, data->start_class); + ssc_init(pRExC_state, data->start_class); } else { /* AND before and after: combine and continue */ - const int was = TEST_SSC_EOS(data->start_class); - - cl_and(data->start_class, &intrnl); - if (was) - SET_SSC_EOS(data->start_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } } } @@ -4434,26 +5060,26 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext length of the pattern, something we won't know about until after the recurse. */ - I32 deltanext, fake = 0; + SSize_t deltanext, fake = 0; regnode *nscan; - struct regnode_charclass_class intrnl; + regnode_ssc intrnl; int f = 0; - /* We use SAVEFREEPV so that when the full compile - is finished perl will clean up the allocated + /* We use SAVEFREEPV so that when the full compile + is finished perl will clean up the allocated minlens when it's all done. This way we don't have to worry about freeing them when we know they wont be used, which would be a pain. */ - I32 *minnextp; - Newx( minnextp, 1, I32 ); + SSize_t *minnextp; + Newx( minnextp, 1, SSize_t ); SAVEFREEPV(minnextp); if (data) { StructCopy(data, &data_fake, scan_data_t); if ((flags & SCF_DO_SUBSTR) && data->last_found) { f |= SCF_DO_SUBSTR; - if (scan->flags) - SCAN_COMMIT(pRExC_state, &data_fake,minlenp); + if (scan->flags) + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); data_fake.last_found=newSVsv(data->last_found); } } @@ -4465,7 +5091,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.flags |= SF_IS_INF; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + ssc_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4474,14 +5100,17 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, - last, &data_fake, stopparen, recursed, NULL, f,depth+1); + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, + &deltanext, last, &data_fake, + stopparen, recursed_depth, NULL, + f,depth+1); if (scan->flags) { if (deltanext) { FAIL("Variable length lookbehind not implemented"); } else if (*minnextp > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); } scan->flags = (U8)*minnextp; } @@ -4489,11 +5118,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext *minnextp += min; if (f & SCF_DO_STCLASS_AND) { - const int was = TEST_SSC_EOS(data.start_class); - - cl_and(data->start_class, &intrnl); - if (was) - SET_SSC_EOS(data->start_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -4504,10 +5129,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { if (RExC_rx->minlen<*minnextp) RExC_rx->minlen=*minnextp; - SCAN_COMMIT(pRExC_state, &data_fake, minnextp); + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); SvREFCNT_dec_NN(data_fake.last_found); - - if ( data_fake.minlen_fixed != minlenp ) + + if ( data_fake.minlen_fixed != minlenp ) { data->offset_fixed= data_fake.offset_fixed; data->minlen_fixed= data_fake.minlen_fixed; @@ -4548,7 +5173,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext } else if ( PL_regkind[OP(scan)] == ENDLIKE ) { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); flags &= ~SCF_DO_SUBSTR; } if (data && OP(scan)==ACCEPT) { @@ -4560,26 +5185,26 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } else if (OP(scan) == GPOS) { - if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) && - !(delta || is_inf || (data && data->pos_delta))) + if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && + !(delta || is_inf || (data && data->pos_delta))) { - if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR)) - RExC_rx->extflags |= RXf_ANCH_GPOS; - if (RExC_rx->gofs < (U32)min) + if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->intflags |= PREGf_ANCH_GPOS; + if (RExC_rx->gofs < (STRLEN)min) RExC_rx->gofs = min; } else { - RExC_rx->extflags |= RXf_GPOS_FLOAT; + RExC_rx->intflags |= PREGf_GPOS_FLOAT; RExC_rx->gofs = 0; - } + } } #ifdef TRIE_STUDY_OPT #ifdef FULL_TRIE_STUDY @@ -4590,26 +5215,28 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext regnode *trie_node= scan; regnode *tail= regnext(scan); reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; - I32 max1 = 0, min1 = I32_MAX; - struct regnode_charclass_class accum; + SSize_t max1 = 0, min1 = SSize_t_MAX; + regnode_ssc accum; - if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ - SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); - + ssc_init_zero(pRExC_state, &accum); + if (!trie->jump) { min1= trie->minlen; max1= trie->maxlen; } else { const regnode *nextbranch= NULL; U32 word; - - for ( word=1 ; word <= trie->wordcount ; word++) + + for ( word=1 ; word <= trie->wordcount ; word++) { - I32 deltanext=0, minnext=0, f = 0, fake; - struct regnode_charclass_class this_class; - + SSize_t deltanext=0, minnext=0, f = 0, fake; + regnode_ssc this_class; + data_fake.flags = 0; if (data) { data_fake.whilem_c = data->whilem_c; @@ -4619,40 +5246,39 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.last_closep = &fake; data_fake.pos_delta = delta; if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } if (flags & SCF_WHILEM_VISITED_POS) f |= SCF_WHILEM_VISITED_POS; - + if (trie->jump[word]) { if (!nextbranch) nextbranch = trie_node + trie->jump[0]; scan= trie_node + trie->jump[word]; /* We go from the jump point to the branch that follows - it. Note this means we need the vestigal unused branches - even though they arent otherwise used. - */ - minnext = study_chunk(pRExC_state, &scan, minlenp, - &deltanext, (regnode *)nextbranch, &data_fake, - stopparen, recursed, NULL, f,depth+1); + it. Note this means we need the vestigal unused + branches even though they arent otherwise used. */ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, (regnode *)nextbranch, &data_fake, + stopparen, recursed_depth, NULL, f,depth+1); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) nextbranch= regnext((regnode*)nextbranch); - - if (min1 > (I32)(minnext + trie->minlen)) + + if (min1 > (SSize_t)(minnext + trie->minlen)) min1 = minnext + trie->minlen; - if (deltanext == I32_MAX) { + if (deltanext == SSize_t_MAX) { is_inf = is_inf_internal = 1; - max1 = I32_MAX; - } else if (max1 < (I32)(minnext + deltanext + trie->maxlen)) + max1 = SSize_t_MAX; + } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) max1 = minnext + deltanext + trie->maxlen; - + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; if (data_fake.flags & SCF_SEEN_ACCEPT) { - if ( stopmin > min + min1) + if ( stopmin > min + min1) stopmin = min + min1; flags &= ~SCF_DO_SUBSTR; if (data) @@ -4664,7 +5290,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); } } if (flags & SCF_DO_SUBSTR) { @@ -4676,28 +5302,25 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); if (min1) { - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - cl_and(data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, - struct regnode_charclass_class); + StructCopy(&accum, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); } } scan= tail; @@ -4707,19 +5330,20 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext else if (PL_regkind[OP(scan)] == TRIE) { reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; U8*bang=NULL; - + min += trie->minlen; delta += (trie->maxlen - trie->minlen); flags &= ~SCF_DO_STCLASS; /* xxx */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += trie->minlen; data->pos_delta += (trie->maxlen - trie->minlen); if (trie->maxlen != trie->minlen) data->longest = &(data->longest_float); } if (trie->jump) /* no more substrings -- for now /grr*/ - flags &= ~SCF_DO_SUBSTR; + flags &= ~SCF_DO_SUBSTR; } #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ @@ -4727,10 +5351,24 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext /* Else: zero-length, ignore. */ scan = regnext(scan); } + /* If we are exiting a recursion we can unset its recursed bit + * and allow ourselves to enter it again - no danger of an + * infinite loop there. + if (stopparen > -1 && recursed) { + DEBUG_STUDYDATA("unset:", data,depth); + PAREN_UNSET( recursed, stopparen); + } + */ if (frame) { + DEBUG_STUDYDATA("frame-end:",data,depth); + DEBUG_PEEP("fend", scan, depth); + /* restore previous context */ last = frame->last; scan = frame->next; stopparen = frame->stop; + recursed_depth = frame->prev_recursed_depth; + depth = depth - 1; + frame = frame->prev; goto fake_study_recurse; } @@ -4740,9 +5378,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext DEBUG_STUDYDATA("pre-fin:",data,depth); *scanp = scan; - *deltap = is_inf_internal ? I32_MAX : delta; + *deltap = is_inf_internal ? SSize_t_MAX : delta; + if (flags & SCF_DO_SUBSTR && is_inf) - data->pos_delta = I32_MAX - data->pos_min; + data->pos_delta = SSize_t_MAX - data->pos_min; if (is_par > (I32)U8_MAX) is_par = 0; if (is_par && pars==1 && data) { @@ -4754,17 +5393,25 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->flags &= ~SF_IN_PAR; } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); if (flags & SCF_TRIE_RESTUDY) data->flags |= SCF_TRIE_RESTUDY; - + DEBUG_STUDYDATA("post-fin:",data,depth); - - return min < stopmin ? min : stopmin; + + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { + RExC_maxlen = final_minlen + delta; + } + return final_minlen; + } + /* not-reached */ } STATIC U32 -S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) +S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) { U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; @@ -4825,7 +5472,7 @@ Perl_reginitcolors(pTHX) } STMT_END #else #define CHECK_RESTUDY_GOTO_butfirst -#endif +#endif /* * pregcomp - compile a regular expression into internal code @@ -4834,7 +5481,7 @@ Perl_reginitcolors(pTHX) * scope */ -#ifndef PERL_IN_XSUB_RE +#ifndef PERL_IN_XSUB_RE /* return the currently in-scope regex engine (or the default if none) */ @@ -4847,7 +5494,7 @@ Perl_current_re_engine(pTHX) HV * const table = GvHV(PL_hintgv); SV **ptr; - if (!table) + if (!table || !(PL_hints & HINT_LOCALIZE_HH)) return &PL_core_reg_engine; ptr = hv_fetchs(table, "regcomp", FALSE); if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) @@ -4928,12 +5575,11 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, Newx(dst, *plen_p * 2 + 1, U8); while (s < *plen_p) { - const UV uv = NATIVE_TO_ASCII(src[s]); - if (UNI_IS_INVARIANT(uv)) - dst[d] = (U8)UTF_TO_NATIVE(uv); + if (NATIVE_BYTE_IS_INVARIANT(src[s])) + dst[d] = src[s]; else { - dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv); - dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv); + dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); + dst[d] = UTF8_EIGHT_BIT_LO(src[s]); } if (n < num_code_blocks) { if (!do_end && pRExC_state->code_blocks[n].start == s) { @@ -4997,6 +5643,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, STRLEN orig_patlen = 0; bool code = 0; SV *msv = use_delim ? delim : *svp; + if (!msv) msv = &PL_sv_undef; /* if we've got a delimiter, we go round the loop twice for each * svp slot (except the last), using the delimiter the second @@ -5015,21 +5662,21 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, * The code in this block is based on S_pushav() */ AV *const av = (AV*)msv; - const I32 maxarg = AvFILL(av) + 1; + const SSize_t maxarg = AvFILL(av) + 1; SV **array; if (oplist) { assert(oplist->op_type == OP_PADAV - || oplist->op_type == OP_RV2AV); + || oplist->op_type == OP_RV2AV); oplist = oplist->op_sibling;; } if (SvRMAGICAL(av)) { - U32 i; + SSize_t i; Newx(array, maxarg, SV*); SAVEFREEPV(array); - for (i=0; i < (U32)maxarg; i++) { + for (i=0; i < maxarg; i++) { SV ** const svp = av_fetch(av, i, FALSE); array[i] = svp ? *svp : &PL_sv_undef; } @@ -5319,7 +5966,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, { Safefree(pRExC_state->code_blocks); /* use croak_sv ? */ - Perl_croak_nocontext("%s", SvPV_nolen_const(errsv)); + Perl_croak_nocontext("%"SVf, SVfARG(errsv)); } } assert(SvROK(qr_ref)); @@ -5410,20 +6057,24 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, STATIC bool -S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol) +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, + SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, + SSize_t lookbehind, SSize_t offset, SSize_t *minlen, + STRLEN longest_length, bool eol, bool meol) { /* This is the common code for setting up the floating and fixed length * string data extracted from Perl_re_op_compile() below. Returns a boolean * as to whether succeeded or not */ - I32 t,ml; + I32 t; + SSize_t ml; if (! (longest_length || (eol /* Can't have SEOL and MULTI */ && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) ) - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - || (RExC_seen & REG_SEEN_EXACTF_SHARP_S)) + /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ + || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) { return FALSE; } @@ -5441,7 +6092,7 @@ S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, S follow this item. We calculate it ahead of time as once the lookbehind offset is added in we lose the ability to correctly calculate it.*/ - ml = minlen ? *(minlen) : (I32)longest_length; + ml = minlen ? *(minlen) : (SSize_t)longest_length; *rx_end_shift = ml - offset - longest_length + (SvTAIL(sv_longest) != 0) + lookbehind; @@ -5510,7 +6161,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, char *exp; regnode *scan; I32 flags; - I32 minlen = 0; + SSize_t minlen = 0; U32 rx_flags; SV *pat; SV *code_blocksv = NULL; @@ -5529,10 +6180,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, scan_data_t data; RExC_state_t RExC_state; RExC_state_t * const pRExC_state = &RExC_state; -#ifdef TRIE_STUDY_OPT +#ifdef TRIE_STUDY_OPT int restudied = 0; RExC_state_t copyRExC_state; -#endif +#endif GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_OP_COMPILE; @@ -5544,61 +6195,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, * having to test them each time otherwise */ if (! PL_AboveLatin1) { PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); - PL_ASCII = _new_invlist_C_array(ASCII_invlist); PL_Latin1 = _new_invlist_C_array(Latin1_invlist); - - PL_L1Posix_ptrs[_CC_ALPHANUMERIC] - = _new_invlist_C_array(L1PosixAlnum_invlist); - PL_Posix_ptrs[_CC_ALPHANUMERIC] - = _new_invlist_C_array(PosixAlnum_invlist); - - PL_L1Posix_ptrs[_CC_ALPHA] - = _new_invlist_C_array(L1PosixAlpha_invlist); - PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist); - - PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist); - PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist); - - /* Cased is the same as Alpha in the ASCII range */ - PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist); - PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist); - - PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist); - PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist); - - PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); - PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); - - PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist); - PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist); - - PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist); - PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist); - - PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist); - PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist); - - PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist); - PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist); - - PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist); - PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist); - PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist); - PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist); - - PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist); - PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist); - - PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist); - - PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist); - PL_L1Posix_ptrs[_CC_WORDCHAR] - = _new_invlist_C_array(L1PosixWord_invlist); - - PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist); - PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist); - - PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist); + PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); + PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); + PL_HasMultiCharFold = + _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); } #endif @@ -5714,6 +6315,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); RExC_uni_semantics = 0; RExC_contains_locale = 0; + RExC_contains_i = 0; pRExC_state->runtime_code_qr = NULL; DEBUG_COMPILE_r({ @@ -5735,11 +6337,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); /* return old regex if pattern hasn't changed */ - /* XXX: note in the below we have to check the flags as well as the pattern. + /* XXX: note in the below we have to check the flags as well as the + * pattern. * - * Things get a touch tricky as we have to compare the utf8 flag independently - * from the compile flags. - */ + * Things get a touch tricky as we have to compare the utf8 flag + * independently from the compile flags. */ if ( old_re && !recompile @@ -5756,10 +6358,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, rx_flags = orig_rx_flags; - if (initial_charset == REGEX_LOCALE_CHARSET) { - RExC_contains_locale = 1; + if (rx_flags & PMf_FOLD) { + RExC_contains_i = 1; } - else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ @@ -5787,6 +6389,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_sawback = 0; RExC_seen = 0; + RExC_maxlen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; @@ -5801,7 +6404,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_npar = 1; RExC_nestroot = 0; RExC_size = 0L; - RExC_emit = &RExC_emit_dummy; + RExC_emit = (regnode *) &RExC_emit_dummy; RExC_whilem_seen = 0; RExC_open_parens = NULL; RExC_close_parens = NULL; @@ -5811,6 +6414,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_paren_name_list = NULL; #endif RExC_recurse = NULL; + RExC_study_chunk_recursed = NULL; + RExC_study_chunk_recursed_bytes= 0; RExC_recurse_count = 0; pRExC_state->code_index = 0; @@ -5854,12 +6459,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ DEBUG_PARSE_r({ - PerlIO_printf(Perl_debug_log, + PerlIO_printf(Perl_debug_log, "Required size %"IVdf" nodes\n" - "Starting second pass (creation)\n", + "Starting second pass (creation)\n", (IV)RExC_size); - RExC_lastnum=0; - RExC_lastparse=NULL; + RExC_lastnum=0; + RExC_lastparse=NULL; }); /* The first pass could have found things that force Unicode semantics */ @@ -5878,8 +6483,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (RExC_whilem_seen > 15) RExC_whilem_seen = 15; - /* Allocate space and zero-initialize. Note, the two step process - of zeroing when in debug mode, thus anything assigned has to + /* Allocate space and zero-initialize. Note, the two step process + of zeroing when in debug mode, thus anything assigned has to happen after that */ rx = (REGEXP*) newSV_type(SVt_REGEXP); r = ReANY(rx); @@ -5889,10 +6494,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, FAIL("Regexp out of space"); #ifdef DEBUGGING /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ - Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char); -#else + Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char); +#else /* bulk initialize base fields with 0. */ - Zero(ri, sizeof(regexp_internal), char); + Zero(ri, sizeof(regexp_internal), char); #endif /* non-zero initialization begins here */ @@ -5916,14 +6522,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); - bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET); + bool has_charset = (get_regex_charset(r->extflags) + != REGEX_DEPENDS_CHARSET); /* The caret is output if there are any defaults: if not all the STD * flags are set, or if no character set specifier is needed */ bool has_default = (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) || ! has_charset); - bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) + == REG_RUN_ON_COMMENT_SEEN); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ @@ -5984,13 +6592,24 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ - - if (RExC_seen & REG_SEEN_RECURSE) { + + /* setup various meta data about recursion, this all requires + * RExC_npar to be correctly set, and a bit later on we clear it */ + if (RExC_seen & REG_RECURSE_SEEN) { Newxz(RExC_open_parens, RExC_npar,regnode *); SAVEFREEPV(RExC_open_parens); Newxz(RExC_close_parens,RExC_npar,regnode *); SAVEFREEPV(RExC_close_parens); } + if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { + /* Note, RExC_npar is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + + ((RExC_npar & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } /* Useful during FAIL. */ #ifdef RE_TRACK_PATTERN_OFFSETS @@ -6019,7 +6638,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, REGC((U8)REG_MAGIC, (char*) RExC_emit++); if (reg(pRExC_state, 0, &flags,1) == NULL) { - ReREFCNT_dec(rx); + ReREFCNT_dec(rx); Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); } /* XXXX To minimize changes to RE engine we always allocate @@ -6033,6 +6652,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, reStudy: r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; Zero(r->substrs, 1, struct reg_substr_data); + if (RExC_study_chunk_recursed) + Zero(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); #ifdef TRIE_STUDY_OPT if (!restudied) { @@ -6041,22 +6663,22 @@ reStudy: } else { U32 seen=RExC_seen; DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); - + RExC_state = copyRExC_state; - if (seen & REG_TOP_LEVEL_BRANCHES) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; else - RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; StructCopy(&zero_scan_data, &data, scan_data_t); } #else StructCopy(&zero_scan_data, &data, scan_data_t); -#endif +#endif /* Dig out information for optimizations. */ r->extflags = RExC_flags; /* was pm_op */ /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ - + if (UTF) SvUTF8_on(rx); /* Unicode in it? */ ri->regstclass = NULL; @@ -6066,20 +6688,21 @@ reStudy: /* testing for BRANCH here tells us whether there is "must appear" data in the pattern. If there is then we can use it for optimisations */ - if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */ - I32 fake; + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. + */ + SSize_t fake; STRLEN longest_float_length, longest_fixed_length; - struct regnode_charclass_class ch_class; /* pointed to by data */ + regnode_ssc ch_class; /* pointed to by data */ int stclass_flag; - I32 last_close = 0; /* pointed to by data */ + SSize_t last_close = 0; /* pointed to by data */ regnode *first= scan; regnode *first_next= regnext(first); /* * Skip introductions and multiplicators >= 1 - * so that we can extract the 'meat' of the pattern that must + * so that we can extract the 'meat' of the pattern that must * match in the large if() sequence following. * NOTE that EXACT is NOT covered here, as it is normally - * picked up by the optimiser separately. + * picked up by the optimiser separately. * * This is unfortunate as the optimiser isnt handling lookahead * properly currently. @@ -6096,7 +6719,7 @@ reStudy: (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) { - /* + /* * the only op that could be a regnode is PLUS, all the rest * will be regnode_1 or regnode_2. * @@ -6125,7 +6748,7 @@ reStudy: } #ifdef TRIE_STCLASS else if (PL_regkind[OP(first)] == TRIE && - ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) + ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) { regnode *trie_op; /* this can happen only on restudy */ @@ -6151,35 +6774,35 @@ reStudy: PL_regkind[OP(first)] == NBOUND) ri->regstclass = first; else if (PL_regkind[OP(first)] == BOL) { - r->extflags |= (OP(first) == MBOL - ? RXf_ANCH_MBOL + r->intflags |= (OP(first) == MBOL + ? PREGf_ANCH_MBOL : (OP(first) == SBOL - ? RXf_ANCH_SBOL - : RXf_ANCH_BOL)); + ? PREGf_ANCH_SBOL + : PREGf_ANCH_BOL)); first = NEXTOPER(first); goto again; } else if (OP(first) == GPOS) { - r->extflags |= RXf_ANCH_GPOS; + r->intflags |= PREGf_ANCH_GPOS; first = NEXTOPER(first); goto again; } else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks) + !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) { /* turn .* into ^.* with an implied $*=1 */ const int type = (OP(NEXTOPER(first)) == REG_ANY) - ? RXf_ANCH_MBOL - : RXf_ANCH_SBOL; - r->extflags |= type; - r->intflags |= PREGf_IMPLICIT; + ? PREGf_ANCH_MBOL + : PREGf_ANCH_SBOL; + r->intflags |= (type | PREGf_IMPLICIT); first = NEXTOPER(first); goto again; } - if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback) + if (sawplus && !sawminmod && !sawlookahead + && (!sawopen || !RExC_sawback) && !pRExC_state->num_code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -6221,15 +6844,17 @@ reStudy: SAVEFREESV(data.last_found); first = scan; if (!ri->regstclass) { - cl_init(pRExC_state, &ch_class); + ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; stclass_flag = SCF_DO_STCLASS_AND; } else /* XXXX Check for BOUND? */ stclass_flag = 0; data.last_closep = &last_close; - - minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ - &data, -1, NULL, NULL, + + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, + scan + RExC_size, /* Up to end */ + &data, -1, 0, NULL, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), 0); @@ -6241,9 +6866,11 @@ reStudy: if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen - && !(RExC_seen & REG_SEEN_VERBARG) - && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) + && !(RExC_seen & REG_VERBARG_SEEN) + && !(RExC_seen & REG_GPOS_SEEN) + ){ r->extflags |= RXf_CHECK_ALL; + } scan_commit(pRExC_state, &data,&minlen,0); longest_float_length = CHR_SVLEN(data.longest_float); @@ -6265,7 +6892,7 @@ reStudy: { r->float_min_offset = data.offset_float_min - data.lookbehind_float; r->float_max_offset = data.offset_float_max; - if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ + if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ r->float_max_offset -= data.lookbehind_float; SvREFCNT_inc_simple_void_NN(data.longest_float); } @@ -6303,126 +6930,141 @@ reStudy: if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag - && ! TEST_SSC_EOS(data.start_class) - && !cl_is_anything(data.start_class)) + && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && !ssc_is_anything(data.start_class)) { - const U32 n = add_data(pRExC_state, 1, "f"); - OP(data.start_class) = ANYOF_SYNTHETIC; + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); - Newx(RExC_rxi->data->data[n], 1, - struct regnode_charclass_class); + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); StructCopy(data.start_class, - (struct regnode_charclass_class*)RExC_rxi->data->data[n], - struct regnode_charclass_class); + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); + data.start_class = NULL; } - /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ + /* A temporary algorithm prefers floated substr to fixed one to dig + * more info. */ if (longest_fixed_length > longest_float_length) { + r->substrs->check_ix = 0; r->check_end_shift = r->anchored_end_shift; r->check_substr = r->anchored_substr; r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; - if (r->extflags & RXf_ANCH_SINGLE) - r->extflags |= RXf_NOSCAN; + if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) + r->intflags |= PREGf_NOSCAN; } else { + r->substrs->check_ix = 1; r->check_end_shift = r->float_end_shift; r->check_substr = r->float_substr; r->check_utf8 = r->float_utf8; r->check_offset_min = r->float_min_offset; r->check_offset_max = r->float_max_offset; } - /* XXXX Currently intuiting is not compatible with ANCH_GPOS. - This should be changed ASAP! */ - if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) { + if ((r->check_substr || r->check_utf8) ) { r->extflags |= RXf_USE_INTUIT; if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->extflags |= RXf_INTUIT_TAIL; } + r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) if ( (STRLEN)minlen < longest_float_length ) minlen= longest_float_length; if ( (STRLEN)minlen < longest_fixed_length ) - minlen= longest_fixed_length; + minlen= longest_fixed_length; */ } else { /* Several toplevels. Best we can is to set minlen. */ - I32 fake; - struct regnode_charclass_class ch_class; - I32 last_close = 0; + SSize_t fake; + regnode_ssc ch_class; + SSize_t last_close = 0; DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); scan = ri->program + 1; - cl_init(pRExC_state, &ch_class); + ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; data.last_closep = &last_close; - - minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, - &data, -1, NULL, NULL, - SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS - |(restudied ? SCF_TRIE_DOING_RESTUDY : 0), + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, + &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, + SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied + ? SCF_TRIE_DOING_RESTUDY + : 0), 0); - + CHECK_RESTUDY_GOTO_butfirst(NOOP); r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - if (! TEST_SSC_EOS(data.start_class) - && !cl_is_anything(data.start_class)) - { - const U32 n = add_data(pRExC_state, 1, "f"); - OP(data.start_class) = ANYOF_SYNTHETIC; + if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && ! ssc_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); - Newx(RExC_rxi->data->data[n], 1, - struct regnode_charclass_class); + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); StructCopy(data.start_class, - (struct regnode_charclass_class*)RExC_rxi->data->data[n], - struct regnode_charclass_class); + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); + data.start_class = NULL; } } + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + r->maxlen = REG_INFTY; + } + else { + r->maxlen = RExC_maxlen; + } + /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", - (IV)minlen, (IV)r->minlen); + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); }); r->minlenret = minlen; - if (r->minlen < minlen) + if (r->minlen < minlen) r->minlen = minlen; - - if (RExC_seen & REG_SEEN_GPOS) - r->extflags |= RXf_GPOS_SEEN; - if (RExC_seen & REG_SEEN_LOOKBEHIND) - r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */ + + if (RExC_seen & REG_GPOS_SEEN) + r->intflags |= PREGf_GPOS_SEEN; + if (RExC_seen & REG_LOOKBEHIND_SEEN) + r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the + lookbehind */ if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; - if (RExC_seen & REG_SEEN_CANY) - r->extflags |= RXf_CANY_SEEN; - if (RExC_seen & REG_SEEN_VERBARG) + if (RExC_seen & REG_CANY_SEEN) + r->intflags |= PREGf_CANY_SEEN; + if (RExC_seen & REG_VERBARG_SEEN) { r->intflags |= PREGf_VERBARG_SEEN; r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ } - if (RExC_seen & REG_SEEN_CUTGROUP) + if (RExC_seen & REG_CUTGROUP_SEEN) r->intflags |= PREGf_CUTGROUP_SEEN; if (pm_flags & PMf_USE_RE_EVAL) r->intflags |= PREGf_USE_RE_EVAL; @@ -6431,7 +7073,20 @@ reStudy: else RXp_PAREN_NAMES(r) = NULL; + /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED + * so it can be used in pp.c */ + if (r->intflags & PREGf_ANCH) + r->extflags |= RXf_IS_ANCHORED; + + { + /* this is used to identify "special" patterns that might result + * in Perl NOT calling the regex engine and instead doing the match "itself", + * particularly special cases in split//. By having the regex compiler + * do this pattern matching at a regop level (instead of by inspecting the pattern) + * we avoid weird issues with equivalent patterns resulting in different behavior, + * AND we allow non Perl engines to get the same optimizations by the setting the + * flags appropriately - Yves */ regnode *first = ri->program + 1; U8 fop = OP(first); regnode *next = NEXTOPER(first); @@ -6441,16 +7096,28 @@ reStudy: r->extflags |= RXf_NULL; else if (PL_regkind[fop] == BOL && nop == END) r->extflags |= RXf_START_ONLY; - else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END) + else if (fop == PLUS + && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE + && OP(regnext(first)) == END) r->extflags |= RXf_WHITE; - else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END ) + else if ( r->extflags & RXf_SPLIT + && fop == EXACT + && STR_LEN(first) == 1 + && *(STRING(first)) == ' ' + && OP(regnext(first)) == END ) r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); } + + if (RExC_contains_locale) { + RXp_EXTFLAGS(r) |= RXf_TAINTED; + } + #ifdef DEBUGGING if (RExC_paren_names) { - ri->name_list_idx = add_data( pRExC_state, 1, "a" ); - ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); + ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); + ri->data->data[ri->name_list_idx] + = (void*)SvREFCNT_inc(RExC_paren_name_list); } else #endif ri->name_list_idx = 0; @@ -6465,15 +7132,17 @@ reStudy: /* assume we don't need to swap parens around before we match */ DEBUG_DUMP_r({ + DEBUG_RExC_seen(); PerlIO_printf(Perl_debug_log,"Final program:\n"); regdump(r); }); #ifdef RE_TRACK_PATTERN_OFFSETS DEBUG_OFFSETS_r(if (ri->u.offsets) { - const U32 len = ri->u.offsets[0]; - U32 i; + const STRLEN len = ri->u.offsets[0]; + STRLEN i; GET_RE_DEBUG_FLAGS_DECL; - PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); + PerlIO_printf(Perl_debug_log, + "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", @@ -6533,7 +7202,8 @@ Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, else if (flags & RXapif_NEXTKEY) return reg_named_buff_nextkey(rx, flags); else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", + (int)flags); return NULL; } } @@ -6659,7 +7329,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) { SV *ret; AV *av; - I32 length; + SSize_t length; struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; @@ -6670,11 +7340,12 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) } else if (flags & RXapif_ONE) { ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); av = MUTABLE_AV(SvRV(ret)); - length = av_len(av); + length = av_tindex(av); SvREFCNT_dec_NN(ret); return newSViv(length + 1); } else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", + (int)flags); return NULL; } } @@ -6722,19 +7393,29 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, { struct regexp *const rx = ReANY(r); char *s = NULL; - I32 i = 0; - I32 s1, t1; + SSize_t i = 0; + SSize_t s1, t1; I32 n = paren; PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - - if ( ( n == RX_BUFF_IDX_CARET_PREMATCH + + if ( n == RX_BUFF_IDX_CARET_PREMATCH || n == RX_BUFF_IDX_CARET_FULLMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH - ) - && !(rx->extflags & RXf_PMf_KEEPCOPY) - ) - goto ret_undef; + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } if (!rx->subbeg) goto ret_undef; @@ -6750,14 +7431,14 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, i = rx->offs[0].start; s = rx->subbeg; } - else + else if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) && rx->offs[0].end != -1) { /* $', ${^POSTMATCH} */ s = rx->subbeg - rx->suboffset + rx->offs[0].end; i = rx->sublen + rx->suboffset - rx->offs[0].end; - } + } else if ( 0 <= n && n <= (I32)rx->nparens && (s1 = rx->offs[n].start) != -1 && @@ -6768,12 +7449,12 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, s = rx->subbeg + s1 - rx->suboffset; } else { goto ret_undef; - } + } assert(s >= rx->subbeg); - assert(rx->sublen >= (s - rx->subbeg) + i ); + assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); if (i >= 0) { -#if NO_TAINT_SUPPORT +#ifdef NO_TAINT_SUPPORT sv_setpvn(sv, s, i); #else const int oldtainted = TAINT_get; @@ -6781,7 +7462,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, sv_setpvn(sv, s, i); TAINT_set(oldtainted); #endif - if ( (rx->extflags & RXf_CANY_SEEN) + if ( (rx->intflags & PREGf_CANY_SEEN) ? (RXp_MATCH_UTF8(rx) && (!i || is_utf8_string((U8*)s, i))) : (RXp_MATCH_UTF8(rx)) ) @@ -6806,7 +7487,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, TAINT; SvTAINT(sv); } - } else + } else SvTAINTED_off(sv); } } else { @@ -6840,13 +7521,27 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + /* Some of this code was originally in C in F */ switch (paren) { case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - case RX_BUFF_IDX_PREMATCH: /* $` */ if (rx->offs[0].start != -1) { i = rx->offs[0].start; @@ -6859,8 +7554,6 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, return 0; case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; case RX_BUFF_IDX_POSTMATCH: /* $' */ if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; @@ -6872,13 +7565,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } return 0; - case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - - /* $& / ${^MATCH}, $1, $2, ... */ - default: + default: /* $& / ${^MATCH}, $1, $2, ... */ if (paren <= (I32)rx->nparens && (s1 = rx->offs[paren].start) != -1 && (t1 = rx->offs[paren].end) != -1) @@ -6935,7 +7622,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) PERL_ARGS_ASSERT_REG_SCAN_NAME; - if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + assert (RExC_parse <= RExC_end); + if (RExC_parse == RExC_end) NOOP; + else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { /* skip IDFIRST by using do...while */ if (UTF) do { @@ -6946,7 +7635,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) RExC_parse++; } while (isWORDCHAR(*RExC_parse)); } else { - RExC_parse++; /* so the <- from the vFAIL is after the offending character */ + RExC_parse++; /* so the <- from the vFAIL is after the offending + character */ vFAIL("Group name must start with a non-digit word character"); } if ( flags ) { @@ -7122,6 +7812,8 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) PERL_ARGS_ASSERT_INVLIST_SET_LEN; + assert(SvTYPE(invlist) == SVt_INVLIST); + SvCUR_set(invlist, (len == 0) ? 0 @@ -7137,6 +7829,8 @@ S_get_invlist_previous_index_addr(pTHX_ SV* invlist) PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; + assert(SvTYPE(invlist) == SVt_INVLIST); + return &(((XINVLIST*) SvANY(invlist))->prev_index); } @@ -7170,6 +7864,8 @@ S_invlist_max(pTHX_ SV* const invlist) PERL_ARGS_ASSERT_INVLIST_MAX; + assert(SvTYPE(invlist) == SVt_INVLIST); + /* Assumes worst case, in which the 0 element is not counted in the * inversion list, so subtracts 1 for that */ return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ @@ -7207,10 +7903,9 @@ Perl__new_invlist(pTHX_ IV initial_size) return new_list; } -#endif -STATIC SV* -S__new_invlist_C_array(pTHX_ const UV* const list) +SV* +Perl__new_invlist_C_array(pTHX_ const UV* const list) { /* Return a pointer to a newly constructed inversion list, initialized to * point to , which has to be in the exact correct inversion list @@ -7260,8 +7955,11 @@ S__new_invlist_C_array(pTHX_ const UV* const list) /* Initialize the iteration pointer. */ invlist_iterfinish(invlist); + SvREADONLY_on(invlist); + return invlist; } +#endif /* ifndef PERL_IN_XSUB_RE */ STATIC void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) @@ -7270,6 +7968,8 @@ S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) PERL_ARGS_ASSERT_INVLIST_EXTEND; + assert(SvTYPE(invlist) == SVt_INVLIST); + /* Add one to account for the zero element at the beginning which may not * be counted by the calling parameters */ SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); @@ -7280,15 +7980,16 @@ S_invlist_trim(pTHX_ SV* const invlist) { PERL_ARGS_ASSERT_INVLIST_TRIM; + assert(SvTYPE(invlist) == SVt_INVLIST); + /* Change the length of the inversion list to how many entries it currently * has */ SvPV_shrink_to_cur((SV *) invlist); } -#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) - STATIC void -S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) +S__append_range_to_invlist(pTHX_ SV* const invlist, + const UV start, const UV end) { /* Subject to change or removal. Append the range from 'start' to 'end' at * the end of the inversion list. The range must be above any existing @@ -7318,8 +8019,8 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) { Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", - array[final_element], start, - ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); + array[final_element], start, + ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); } /* Here, it is a legal append. If the new range begins with the first @@ -7462,7 +8163,8 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) } void -Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch) +Perl__invlist_populate_swatch(pTHX_ SV* const invlist, + const UV start, const UV end, U8* swatch) { /* populates a swatch of a swash the same way swatch_get() does in utf8.c, * but is used when the swash has an inversion list. This makes this much @@ -7555,14 +8257,16 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV } void -Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output) +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** output) { /* Take the union of two inversion lists and point to it. *output * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. The first list, - * , may be NULL, in which case a copy of the second list is returned. - * If is TRUE, the union is taken of the complement - * (inversion) of instead of b itself. + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *output will be made correspondingly + * mortal. The first list, , may be NULL, in which case a copy of the + * second list is returned. If is TRUE, the union is taken + * of the complement (inversion) of instead of b itself. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -7603,9 +8307,13 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b /* If either one is empty, the union is the other one */ if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + bool make_temp = FALSE; /* Should we mortalize the result? */ + if (*output == a) { if (a != NULL) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } } if (*output != b) { @@ -7614,18 +8322,27 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b _invlist_invert(*output); } } /* else *output already = b; */ + + if (make_temp) { + sv_2mortal(*output); + } return; } else if ((len_b = _invlist_len(b)) == 0) { + bool make_temp = FALSE; if (*output == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } /* The complement of an empty list is a list that has everything in it, * so the union with includes everything too */ if (complement_b) { if (a == *output) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } *output = _new_invlist(1); _append_range_to_invlist(*output, 0, UV_MAX); @@ -7634,6 +8351,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b *output = invlist_clone(a); } /* else *output already = a; */ + + if (make_temp) { + sv_2mortal(*output); + } return; } @@ -7773,24 +8494,36 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b } } - /* We may be removing a reference to one of the inputs */ + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ if (a == *output || b == *output) { assert(! invlist_is_iterating(*output)); - SvREFCNT_dec_NN(*output); + if ((SvTEMP(*output))) { + sv_2mortal(u); + } + else { + SvREFCNT_dec_NN(*output); + } } *output = u; + return; } void -Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i) +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** i) { /* Take the intersection of two inversion lists and point to it. *i * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. - * If is TRUE, the result will be the intersection of - * and the complement (or inversion) of instead of directly. + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *i will be made correspondingly mortal. + * The first list, , may be NULL, in which case an empty list is + * returned. If is TRUE, the result will be the + * intersection of and the complement (or inversion) of instead of + * directly. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -7826,8 +8559,9 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, assert(a != b); /* Special case if either one is empty */ - len_a = _invlist_len(a); + len_a = (a == NULL) ? 0 : _invlist_len(a); if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { + bool make_temp = FALSE; if (len_a != 0 && complement_b) { @@ -7836,25 +8570,39 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * must be every possible code point. Thus the intersection is * simply 'a'. */ if (*i != a) { - *i = invlist_clone(a); - if (*i == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } + + *i = invlist_clone(a); } /* else *i is already 'a' */ + + if (make_temp) { + sv_2mortal(*i); + } return; } /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The * intersection must be empty */ if (*i == a) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } else if (*i == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } *i = _new_invlist(0); + if (make_temp) { + sv_2mortal(*i); + } + return; } @@ -7959,7 +8707,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } /* The final length is what we've output so far plus what else is in the - * intersection. At most one of the subexpressions below will be non-zero */ + * intersection. At most one of the subexpressions below will be non-zero + * */ len_r = i_r; if (count >= 2) { len_r += (len_a - i_a) + (len_b - i_b); @@ -7984,13 +8733,21 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* We may be removing a reference to one of the inputs */ + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ if (a == *i || b == *i) { assert(! invlist_is_iterating(*i)); - SvREFCNT_dec_NN(*i); + if (SvTEMP(*i)) { + sv_2mortal(r); + } + else { + SvREFCNT_dec_NN(*i); + } } *i = r; + return; } @@ -8037,6 +8794,35 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) return invlist; } +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, + UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + _append_range_to_invlist(invlist, element0, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; +} + #endif PERL_STATIC_INLINE SV* @@ -8065,43 +8851,6 @@ Perl__invlist_invert(pTHX_ SV* const invlist) *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); } -void -Perl__invlist_invert_prop(pTHX_ SV* const invlist) -{ - /* Complement the input inversion list (which must be a Unicode property, - * all of which don't match above the Unicode maximum code point.) And - * Perl has chosen to not have the inversion match above that either. This - * adds a 0x110000 if the list didn't end with it, and removes it if it did - */ - - UV len; - UV* array; - - PERL_ARGS_ASSERT__INVLIST_INVERT_PROP; - - _invlist_invert(invlist); - - len = _invlist_len(invlist); - - if (len != 0) { /* If empty do nothing */ - array = invlist_array(invlist); - if (array[len - 1] != PERL_UNICODE_MAX + 1) { - /* Add 0x110000. First, grow if necessary */ - len++; - if (invlist_max(invlist) < len) { - invlist_extend(invlist, len); - array = invlist_array(invlist); - } - invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist)); - array[len - 1] = PERL_UNICODE_MAX + 1; - } - else { /* Remove the 0x110000 */ - invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist)); - } - } - - return; -} #endif PERL_STATIC_INLINE SV* @@ -8109,7 +8858,7 @@ S_invlist_clone(pTHX_ SV* const invlist) { /* Return a new inversion list that is a copy of the input one, which is - * unchanged */ + * unchanged. The new list will not be mortal even if the old one was. */ /* Need to allocate extra space to accommodate Perl's addition of a * trailing NUL to SvPV's, since it thinks they are always strings */ @@ -8134,6 +8883,8 @@ S_get_invlist_iter_addr(pTHX_ SV* invlist) PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; + assert(SvTYPE(invlist) == SVt_INVLIST); + return &(((XINVLIST*) SvANY(invlist))->iterator); } @@ -8267,42 +9018,59 @@ Perl__invlist_contents(pTHX_ SV* const invlist) } #endif -#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP +#ifndef PERL_IN_XSUB_RE void -Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header) +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, + const char * const indent, SV* const invlist) { - /* Dumps out the ranges in an inversion list. The string 'header' - * if present is output on a line before the first range */ + /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the + * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by + * the string 'indent'. The output looks like this: + [0] 0x000A .. 0x000D + [2] 0x0085 + [4] 0x2028 .. 0x2029 + [6] 0x3104 .. INFINITY + * This means that the first range of code points matched by the list are + * 0xA through 0xD; the second range contains only the single code point + * 0x85, etc. An inversion list is an array of UVs. Two array elements + * are used to define each range (except if the final range extends to + * infinity, only a single element is needed). The array index of the + * first element for the corresponding range is given in brackets. */ UV start, end; + STRLEN count = 0; PERL_ARGS_ASSERT__INVLIST_DUMP; - if (header && strlen(header)) { - PerlIO_printf(Perl_debug_log, "%s\n", header); - } if (invlist_is_iterating(invlist)) { - PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n"); + Perl_dump_indent(aTHX_ level, file, + "%sCan't dump inversion list because is in middle of iterating\n", + indent); return; } invlist_iterinit(invlist); while (invlist_iternext(invlist, &start, &end)) { if (end == UV_MAX) { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start); + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", + indent, (UV)count, start); } else if (end != start) { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", - start, end); + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", + indent, (UV)count, start, end); } else { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start); + Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", + indent, (UV)count, start); } + count += 2; } } #endif -#if 0 +#ifdef PERL_ARGS_ASSERT__INVLISTEQ bool S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) { @@ -8344,7 +9112,6 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -8373,7 +9140,7 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) /* End of inversion list object */ STATIC void -S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) +S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) { /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' * constructs, and updates RExC_flags with them. On input, RExC_parse @@ -8433,7 +9200,6 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) } cs = REGEX_LOCALE_CHARSET; has_charset_modifier = LOCALE_PAT_MOD; - RExC_contains_locale = 1; break; case UNICODE_PAT_MOD: if (has_charset_modifier) { @@ -8487,7 +9253,8 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); } else if (has_charset_modifier == *(RExC_parse - 1)) { - vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1)); + vFAIL2("Regexp modifier \"%c\" may not appear twice", + *(RExC_parse - 1)); } else { vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); @@ -8495,12 +9262,15 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) /*NOTREACHED*/ neg_modifier: RExC_parse++; - vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1)); + vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", + *(RExC_parse - 1)); /*NOTREACHED*/ case ONCE_PAT_MOD: /* 'o' */ case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { - const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; + const I32 wflagbit = *RExC_parse == 'o' + ? WASTED_O + : WASTED_G; if (! (wastedflags & wflagbit) ) { wastedflags |= wflagbit; /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ @@ -8553,13 +9323,17 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) RExC_flags |= posflags; RExC_flags &= ~negflags; set_regex_charset(&RExC_flags, cs); + if (RExC_flags & RXf_PMf_FOLD) { + RExC_contains_i = 1; + } return; /*NOTREACHED*/ default: fail_modifiers: - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", - RExC_parse-seqstart, seqstart); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); /*NOTREACHED*/ } @@ -8634,7 +9408,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) char *start_arg = NULL; unsigned char op = 0; int argok = 1; - int internal_argval = 0; /* internal_argval is only useful if !argok */ + int internal_argval = 0; /* internal_argval is only useful if + !argok */ if (has_intervening_patws && SIZE_ONLY) { ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated"); @@ -8650,9 +9425,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) verb_len = RExC_parse - start_verb; if ( start_arg ) { RExC_parse++; - while ( *RExC_parse && *RExC_parse != ')' ) + while ( *RExC_parse && *RExC_parse != ')' ) RExC_parse++; - if ( *RExC_parse != ')' ) + if ( *RExC_parse != ')' ) vFAIL("Unterminated verb pattern argument"); if ( RExC_parse == start_arg ) start_arg = NULL; @@ -8660,7 +9435,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if ( *RExC_parse != ')' ) vFAIL("Unterminated verb pattern"); } - + switch ( *start_verb ) { case 'A': /* (*ACCEPT) */ if ( memEQs(start_verb,verb_len,"ACCEPT") ) { @@ -8689,48 +9464,51 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if ( memEQs(start_verb,verb_len,"PRUNE") ) op = PRUNE; break; - case 'S': /* (*SKIP) */ - if ( memEQs(start_verb,verb_len,"SKIP") ) + case 'S': /* (*SKIP) */ + if ( memEQs(start_verb,verb_len,"SKIP") ) op = SKIP; break; case 'T': /* (*THEN) */ /* [19:06] :: is then */ if ( memEQs(start_verb,verb_len,"THEN") ) { op = CUTGROUP; - RExC_seen |= REG_SEEN_CUTGROUP; + RExC_seen |= REG_CUTGROUP_SEEN; } break; } if ( ! op ) { - RExC_parse++; - vFAIL3("Unknown verb pattern '%.*s'", - verb_len, start_verb); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL2utf8f( + "Unknown verb pattern '%"UTF8f"'", + UTF8fARG(UTF, verb_len, start_verb)); } if ( argok ) { if ( start_arg && internal_argval ) { vFAIL3("Verb pattern '%.*s' may not have an argument", - verb_len, start_verb); + verb_len, start_verb); } else if ( argok < 0 && !start_arg ) { vFAIL3("Verb pattern '%.*s' has a mandatory argument", - verb_len, start_verb); + verb_len, start_verb); } else { ret = reganode(pRExC_state, op, internal_argval); if ( ! internal_argval && ! SIZE_ONLY ) { if (start_arg) { - SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); - ARG(ret) = add_data( pRExC_state, 1, "S" ); + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); RExC_rxi->data->data[ARG(ret)]=(void*)sv; ret->flags = 0; } else { - ret->flags = 1; + ret->flags = 1; } - } + } } if (!internal_argval) - RExC_seen |= REG_SEEN_VERBARG; + RExC_seen |= REG_VERBARG_SEEN; } else if ( start_arg ) { vFAIL3("Verb pattern '%.*s' may not have an argument", - verb_len, start_verb); + verb_len, start_verb); } else { ret = reg_node(pRExC_state, op); } @@ -8757,17 +9535,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) goto named_recursion; } else if (paren == '=') { /* (?P=...) named backref */ - /* this pretty much dupes the code for \k in regatom(), if - you change this make sure you change that */ + /* this pretty much dupes the code for \k in + * regatom(), if you change this make sure you change that + * */ char* name_start = RExC_parse; U32 num = 0; SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); if (RExC_parse == name_start || *RExC_parse != ')') + /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.3s... not terminated",parse_start); if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -8792,12 +9572,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL3("Sequence (%.*s...) not recognized", + RExC_parse-seqstart, seqstart); /*NOTREACHED*/ case '<': /* (?<...) */ if (*RExC_parse == '!') paren = ','; - else if (*RExC_parse != '=') + else if (*RExC_parse != '=') named_capture: { /* (?<...>) */ char *name_start; @@ -8806,15 +9588,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '\'': /* (?'...') */ name_start= RExC_parse; svname = reg_scan_name(pRExC_state, - SIZE_ONLY ? /* reverse test from the others */ - REG_RSN_RETURN_NAME : - REG_RSN_RETURN_NULL); - if (RExC_parse == name_start) { - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); - /*NOTREACHED*/ - } - if (*RExC_parse != paren) + SIZE_ONLY /* reverse test from the others */ + ? REG_RSN_RETURN_NAME + : REG_RSN_RETURN_NULL); + if (RExC_parse == name_start || *RExC_parse != paren) vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); if (SIZE_ONLY) { @@ -8854,20 +9631,24 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } if ( count ) { - pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); + pv = (I32*)SvGROW(sv_dat, + SvCUR(sv_dat) + sizeof(I32)+1); SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); pv[count] = RExC_npar; SvIV_set(sv_dat, SvIVX(sv_dat) + 1); } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); - sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); + sv_setpvn(sv_dat, (char *)&(RExC_npar), + sizeof(I32)); SvIOK_on(sv_dat); SvIV_set(sv_dat, 1); } #ifdef DEBUGGING - /* Yes this does cause a memory leak in debugging Perls */ - if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) + /* Yes this does cause a memory leak in debugging Perls + * */ + if (!av_store(RExC_paren_name_list, + RExC_npar, SvREFCNT_inc(svname))) SvREFCNT_dec_NN(svname); #endif @@ -8877,7 +9658,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) paren = 1; goto capturing_parens; } - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; case '=': /* (?=...) */ @@ -8894,7 +9675,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '|': /* (?|...) */ /* branch reset, behave like a (?:...) except that buffers in alternations share the same numbers */ - paren = ':'; + paren = ':'; after_freeze = freeze_paren = RExC_npar; break; case ':': /* (?:...) */ @@ -8921,6 +9702,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); + RExC_seen |= REG_GOSTART_SEEN; *flagp |= POSTPONED; nextchar(pRExC_state); return ret; @@ -8935,6 +9717,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } + if (RExC_parse == RExC_end || *RExC_parse != ')') + vFAIL("Sequence (?&... not terminated"); goto gen_recurse_regop; assert(0); /* NOT REACHED */ case '+': @@ -8948,7 +9732,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { RExC_parse--; /* rewind to let it be handled later */ goto parse_flags; - } + } /*FALLTHROUGH */ case '1': case '2': case '3': case '4': /* (?1) */ case '5': case '6': case '7': case '8': case '9': @@ -8960,7 +9744,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; while (isDIGIT(*RExC_parse)) RExC_parse++; - if (*RExC_parse!=')') + if (*RExC_parse!=')') vFAIL("Expecting close bracket"); gen_recurse_regop: @@ -8994,11 +9778,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ARG2L_SET( ret, RExC_recurse_count++); RExC_emit++; DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); + "Recurse #%"UVuf" to %"IVdf"\n", + (UV)ARG(ret), (IV)ARG2L(ret))); } else { RExC_size++; } - RExC_seen |= REG_SEEN_RECURSE; + RExC_seen |= REG_RECURSE_SEEN; Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ @@ -9012,7 +9797,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) is_logical = 1; if (*RExC_parse != '{') { RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f( + "Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); /*NOTREACHED*/ } *flagp |= POSTPONED; @@ -9041,14 +9829,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { OP *o = cb->block; if (cb->src_regex) { - n = add_data(pRExC_state, 2, "rl"); + n = add_data(pRExC_state, STR_WITH_LEN("rl")); RExC_rxi->data->data[n] = (void*)SvREFCNT_inc((SV*)cb->src_regex); RExC_rxi->data->data[n+1] = (void*)o; } else { - n = add_data(pRExC_state, 1, - (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); + n = add_data(pRExC_state, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); RExC_rxi->data->data[n] = (void*)o; } } @@ -9086,7 +9874,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reg_node(pRExC_state, LOGICAL); if (!SIZE_ONLY) ret->flags = 1; - + tail = reg(pRExC_state, 1, &flag, depth+1); if (flag & RESTART_UTF8) { *flagp = RESTART_UTF8; @@ -9109,7 +9897,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) (ch == '>' ? '<' : ch)); RExC_parse++; if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -9139,15 +9927,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV *sv_dat; RExC_parse++; sv_dat = reg_scan_name(pRExC_state, - SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + SIZE_ONLY + ? REG_RSN_RETURN_NULL + : REG_RSN_RETURN_DATA); parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } - ret = reganode(pRExC_state,INSUBP,parno); + ret = reganode(pRExC_state,INSUBP,parno); goto insert_if_check_paren; } else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { /* (?(1)...) */ char c; + char *tmp; parno = atoi(RExC_parse++); while (isDIGIT(*RExC_parse)) @@ -9155,8 +9946,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, GROUPP, parno); insert_if_check_paren: - if ((c = *nextchar(pRExC_state)) != ')') + if (*(tmp = nextchar(pRExC_state)) != ')') { + /* nextchar also skips comments, so undo its work + * and skip over the the next character. + */ + RExC_parse = tmp; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; vFAIL("Switch condition not recognized"); + } insert_if: REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); br = regbranch(pRExC_state, &flags, 1,depth+1); @@ -9168,14 +9965,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); } else - REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); + REGTAIL(pRExC_state, br, reganode(pRExC_state, + LONGJMP, 0)); c = *nextchar(pRExC_state); if (flags&HASWIDTH) *flagp |= HASWIDTH; if (c == '|') { - if (is_define) + if (is_define) vFAIL("(?(DEFINE)....) does not allow branches"); - lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ + + /* Fake one for optimizer. */ + lastbr = reganode(pRExC_state, IFTHEN, 0); + if (!regbranch(pRExC_state, &flags, 1,depth+1)) { if (flags & RESTART_UTF8) { *flagp = RESTART_UTF8; @@ -9207,7 +10008,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } else { - vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unknown switch condition (?(...))"); } } case '[': /* (?[ ... ]) */ @@ -9236,16 +10038,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) capturing_parens: parno = RExC_npar; RExC_npar++; - + ret = reganode(pRExC_state, OPEN, parno); if (!SIZE_ONLY ){ - if (!RExC_nestroot) + if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE + if (RExC_seen & REG_RECURSE_SEEN && !RExC_open_parens[parno-1]) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Setting open paren #%"IVdf" to %d\n", + "Setting open paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ret))); RExC_open_parens[parno-1]= ret; } @@ -9257,7 +10059,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else /* ! paren */ ret = NULL; - + parse_rest: /* Pick up the branches, linking them together. */ parse_start = RExC_parse; /* MJD */ @@ -9298,7 +10100,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { ender = reganode(pRExC_state, LONGJMP,0); - REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ + + /* Append to the previous. */ + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); } if (SIZE_ONLY) RExC_extralen += 2; /* Account for LONGJMP. */ @@ -9306,7 +10110,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (freeze_paren) { if (RExC_npar > after_freeze) after_freeze = RExC_npar; - RExC_npar = freeze_paren; + RExC_npar = freeze_paren; } br = regbranch(pRExC_state, &flags, 0, depth+1); @@ -9330,14 +10134,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case 1: case 2: ender = reganode(pRExC_state, CLOSE, parno); - if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { + if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Setting close paren #%"IVdf" to %d\n", + "Setting close paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ender))); RExC_close_parens[parno-1]= ender; - if (RExC_nestroot == parno) + if (RExC_nestroot == parno) RExC_nestroot = 0; - } + } Set_Node_Offset(ender,RExC_parse+1); /* MJD */ Set_Node_Length(ender,1); /* MJD */ break; @@ -9362,8 +10166,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("lsbr"); - regprop(RExC_rx, mysv_val1, lastbr); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, lastbr, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(lastbr), @@ -9377,20 +10181,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (have_branch && !SIZE_ONLY) { char is_nothing= 1; if (depth==1) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; /* Hook the tails of the branches to the closing node. */ for (br = ret; br; br = regnext(br)) { const U8 op = PL_regkind[OP(br)]; if (op == BRANCH) { REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); - if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender) + if ( OP(NEXTOPER(br)) != NOTHING + || regnext(NEXTOPER(br)) != ender) is_nothing= 0; } else if (op == BRANCHJ) { REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); /* for now we always disable this optimisation * / - if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender) + if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING + || regnext(NEXTOPER(NEXTOPER(br))) != ender) */ is_nothing= 0; } @@ -9401,8 +10207,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("NADA"); - regprop(RExC_rx, mysv_val1, ret); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, ret, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(ret), @@ -9644,6 +10450,19 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, OPFAIL); return ret; } + else if (min == max + && RExC_parse < RExC_end + && (*RExC_parse == '?' || *RExC_parse == '+')) + { + if (SIZE_ONLY) { + ckWARN2reg(RExC_parse + 1, + "Useless use of greediness modifier '%c'", + *RExC_parse); + } + /* Absorb the modifier, so later code doesn't see nor use + * it */ + nextchar(pRExC_state); + } do_curly: if ((flags&SIMPLE)) { @@ -9685,6 +10504,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ARG1_SET(ret, (U16)min); ARG2_SET(ret, (U16)max); } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; goto nest_check; } @@ -9722,6 +10543,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, STAR, ret, depth+1); ret->flags = 0; RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '*') { min = 0; @@ -9731,6 +10553,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, PLUS, ret, depth+1); ret->flags = 0; RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '+') { min = 1; @@ -9743,10 +10566,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) nest_check: if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ - ckWARN3reg(RExC_parse, - "%.*s matches null string many times", - (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), - origparse); + ckWARN2reg(RExC_parse, + "%"UTF8f" matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); (void)ReREFCNT_inc(RExC_rx_sv); } @@ -9776,11 +10601,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } STATIC bool -S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, - const bool strict /* Apply stricter parsing rules? */ +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, + UV *valuep, I32 *flagp, U32 depth, bool in_char_class, + const bool strict /* Apply stricter parsing rules? */ ) { - + /* This is expected to be called by a parser routine that has recognized '\N' and needs to handle the rest. RExC_parse is expected to point at the first char following the N at the time of the call. On successful return, @@ -9835,7 +10661,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I more than one character */ GET_RE_DEBUG_FLAGS_DECL; - + PERL_ARGS_ASSERT_GROK_BSLASH_N; GET_RE_DEBUG_FLAGS; @@ -9843,7 +10669,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ /* The [^\n] meaning of \N ignores spaces and comments under the /x - * modifier. The other meaning does not */ + * modifier. The other meaning does not, so use a temporary until we find + * out which we are being called with */ p = (RExC_flags & RXf_PMf_EXTENDED) ? regwhite( pRExC_state, RExC_parse ) : RExC_parse; @@ -9853,17 +10680,18 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I if (*p != '{' || regcurly(p, FALSE)) { RExC_parse = p; if (! node_p) { - /* no bare \N in a charclass */ + /* no bare \N allowed in a charclass */ if (in_char_class) { vFAIL("\\N in a character class must be a named character: \\N{...}"); } return FALSE; } + RExC_parse--; /* Need to back off so nextchar() doesn't skip the + current char */ nextchar(pRExC_state); *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; - RExC_parse--; Set_Node_Length(*node_p, 1); /* MJD */ return TRUE; } @@ -9882,8 +10710,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ || ! (endbrace == RExC_parse /* nothing between the {} */ - || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */ - && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below + */ + && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) + */ { if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ vFAIL("\\N{NAME} must be resolved by the lexer"); @@ -10021,7 +10851,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I } FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", (UV) flags); - } + } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); RExC_parse = endbrace; @@ -10089,7 +10919,9 @@ S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) } PERL_STATIC_INLINE void -S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point) +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, + regnode *node, I32* flagp, STRLEN len, UV code_point, + bool downgradable) { /* This knows the details about sizing an EXACTish node, setting flags for * it (by setting <*flagp>, and potentially populating it with a single @@ -10104,48 +10936,111 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 * If is zero, the function assumes that the node is to contain only * the single character given by and calculates what * should be. In pass 1, it sizes the node appropriately. In pass 2, it - * additionally will populate the node's STRING with , if - * is 0. In both cases <*flagp> is appropriately set + * additionally will populate the node's STRING with or its + * fold if folding. + * + * In both cases <*flagp> is appropriately set * * It knows that under FOLD, the Latin Sharp S and UTF characters above * 255, must be folded (the former only when the rules indicate it can - * match 'ss') */ + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ bool len_passed_in = cBOOL(len != 0); U8 character[UTF8_MAXBYTES_CASE+1]; PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + /* Don't bother to check for downgrading in PASS1, as it doesn't make any + * sizing difference, and is extra work that is thrown away */ + if (downgradable && ! PASS2) { + downgradable = FALSE; + } + if (! len_passed_in) { if (UTF) { - if (FOLD && (! LOC || code_point > 255)) { - _to_uni_fold_flags(NATIVE_TO_UNI(code_point), + if (UNI_IS_INVARIANT(code_point)) { + if (LOC || ! FOLD) { /* /l defers folding until runtime */ + *character = (U8) code_point; + } + else { /* Here is /i and not /l (toFOLD() is defined on just + ASCII, which isn't the same thing as INVARIANT on + EBCDIC, but it works there, as the extra invariants + fold to themselves) */ + *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } + } + len = 1; + } + else if (FOLD && (! LOC + || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) + { /* Folding, and ok to do so now */ + UV folded = _to_uni_fold_flags( + code_point, character, &len, - FOLD_FLAGS_FULL | ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } + } + else if (code_point <= MAX_UTF8_TWO_BYTE) { + + /* Not folding this cp, and can output it directly */ + *character = UTF8_TWO_BYTE_HI(code_point); + *(character + 1) = UTF8_TWO_BYTE_LO(code_point); + len = 2; } else { uvchr_to_utf8( character, code_point); len = UTF8SKIP(character); } - } - else if (! FOLD - || code_point != LATIN_SMALL_LETTER_SHARP_S - || ASCII_FOLD_RESTRICTED - || ! AT_LEAST_UNI_SEMANTICS) - { + } /* Else pattern isn't UTF8. */ + else if (! FOLD) { *character = (U8) code_point; len = 1; - } - else { + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { *character = 's'; *(character + 1) = 's'; len = 2; } + else { + *character = LATIN_SMALL_LETTER_SHARP_S; + len = 1; + } } if (SIZE_ONLY) { @@ -10169,8 +11064,29 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 { *flagp |= SIMPLE; } + + /* The OP may not be well defined in PASS1 */ + if (PASS2 && OP(node) == EXACTFL) { + RExC_contains_locale = 1; + } +} + + +/* return atoi(p), unless it's too big to sensibly be a backref, + * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ + +static I32 +S_backref_value(char *p) +{ + char *q = p; + + for (;isDIGIT(*q); q++); /* calculate length of num */ + if (q - p == 0 || q - p > 9) + return I32_MAX; + return atoi(p); } + /* - regatom - the lowest level @@ -10231,7 +11147,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 by the other. Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with - TRYAGAIN. + TRYAGAIN. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be restarted. Otherwise does not return NULL. @@ -10328,7 +11244,8 @@ tryagain: *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", + (UV) flags); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; @@ -10376,7 +11293,7 @@ tryagain: goto finish_meta_pat; case 'G': ret = reg_node(pRExC_state, GPOS); - RExC_seen |= REG_SEEN_GPOS; + RExC_seen |= REG_GPOS_SEEN; *flagp |= SIMPLE; goto finish_meta_pat; case 'K': @@ -10387,7 +11304,7 @@ tryagain: * be necessary here to avoid cases of memory corruption, as * with: C<$_="x" x 80; s/x\K/y/> -- rgs */ - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -10401,7 +11318,7 @@ tryagain: goto finish_meta_pat; case 'C': ret = reg_node(pRExC_state, CANY); - RExC_seen |= REG_SEEN_CANY; + RExC_seen |= REG_CANY_SEEN; *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'X': @@ -10418,30 +11335,38 @@ tryagain: case 'b': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = BOUND + get_regex_charset(RExC_flags); if (op > BOUNDA) { /* /aa is same as /a */ op = BOUNDA; } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); } goto finish_meta_pat; case 'B': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = NBOUND + get_regex_charset(RExC_flags); if (op > NBOUNDA) { /* /aa is same as /a */ op = NBOUNDA; } + else if (op == NBOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); } goto finish_meta_pat; @@ -10485,6 +11410,9 @@ tryagain: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } + else if (op == POSIXL) { + RExC_contains_locale = 1; + } join_posix_op_known: @@ -10500,10 +11428,10 @@ tryagain: *flagp |= HASWIDTH|SIMPLE; /* FALL THROUGH */ - finish_meta_pat: + finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ - break; + break; case 'p': case 'P': { @@ -10533,7 +11461,7 @@ tryagain: nextchar(pRExC_state); } break; - case 'N': + case 'N': /* Handle \N and \N{NAME} with multiple code points here and not * below because it can be multicharacter. join_exact() will join * them up later on. Also this makes sure that things like @@ -10555,10 +11483,11 @@ tryagain: break; case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: - { - char ch= RExC_parse[1]; + { + char ch= RExC_parse[1]; if (ch != '<' && ch != '\'' && ch != '{') { RExC_parse++; + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.2s... not terminated",parse_start); } else { /* this pretty much dupes the code for (?P=...) in reg(), if @@ -10569,10 +11498,11 @@ tryagain: SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; if (RExC_parse == name_start || *RExC_parse != ch) + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.3s... not terminated",parse_start); if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -10599,15 +11529,16 @@ tryagain: } break; } - case 'g': + case 'g': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { I32 num; - bool isg = *RExC_parse == 'g'; - bool isrel = 0; bool hasbrace = 0; - if (isg) { + + if (*RExC_parse == 'g') { + bool isrel = 0; + RExC_parse++; if (*RExC_parse == '{') { RExC_parse++; @@ -10619,37 +11550,52 @@ tryagain: } if (hasbrace && !isDIGIT(*RExC_parse)) { if (isrel) RExC_parse--; - RExC_parse -= 2; + RExC_parse -= 2; goto parse_named_seq; - } } - num = atoi(RExC_parse); - if (isg && num == 0) { - if (*RExC_parse == '0') { + } + + num = S_backref_value(RExC_parse); + if (num == 0) vFAIL("Reference to invalid group 0"); + else if (num == I32_MAX) { + if (isDIGIT(*RExC_parse)) + vFAIL("Reference to nonexistent group"); + else + vFAIL("Unterminated \\g... pattern"); } - else { - vFAIL("Unterminated \\g... pattern"); + + if (isrel) { + num = RExC_npar - num; + if (num < 1) + vFAIL("Reference to nonexistent or unclosed group"); } } - if (isrel) { - num = RExC_npar - num; - if (num < 1) - vFAIL("Reference to nonexistent or unclosed group"); + else { + num = S_backref_value(RExC_parse); + /* bare \NNN might be backref or octal - if it is larger than or equal + * RExC_npar then it is assumed to be and octal escape. + * Note RExC_npar is +1 from the actual number of parens*/ + if (num == I32_MAX || (num > 9 && num >= RExC_npar + && *RExC_parse != '8' && *RExC_parse != '9')) + { + /* Probably a character specified in octal, e.g. \35 */ + goto defchar; + } } - if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9') - /* Probably a character specified in octal, e.g. \35 */ - goto defchar; - else { + + /* at this point RExC_parse definitely points to a backref + * number */ + { #ifdef RE_TRACK_PATTERN_OFFSETS char * const parse_start = RExC_parse - 1; /* MJD */ #endif while (isDIGIT(*RExC_parse)) RExC_parse++; if (hasbrace) { - if (*RExC_parse != '}') + if (*RExC_parse != '}') vFAIL("Unterminated \\g{...} pattern"); RExC_parse++; - } + } if (!SIZE_ONLY) { if (num > (I32)RExC_rx->nparens) vFAIL("Reference to nonexistent group"); @@ -10703,25 +11649,35 @@ tryagain: defchar: { STRLEN len = 0; - UV ender; + UV ender = 0; char *p; char *s; #define MAX_NODE_STRING_SIZE 127 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; char *s0; U8 upper_parse = MAX_NODE_STRING_SIZE; - STRLEN foldlen; - U8 node_type; + U8 node_type = compute_EXACTish(pRExC_state); bool next_is_quantifier; char * oldp = NULL; + /* We can convert EXACTF nodes to EXACTFU if they contain only + * characters that match identically regardless of the target + * string's UTF8ness. The reason to do this is that EXACTF is not + * trie-able, EXACTFU is. + * + * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * contain only above-Latin1 characters (hence must be in UTF8), + * which don't participate in folds with Latin1-range characters, + * as the latter's folds aren't known until runtime. (We don't + * need to figure this out until pass 2) */ + bool maybe_exactfu = PASS2 + && (node_type == EXACTF || node_type == EXACTFL); + /* If a folding node contains only code points that don't * participate in folds, it can be changed into an EXACT node, * which allows the optimizer more things to look for */ bool maybe_exact; - ender = 0; - node_type = compute_EXACTish(pRExC_state); ret = reg_node(pRExC_state, node_type); /* In pass1, folded, we use a temporary buffer instead of the @@ -10732,10 +11688,9 @@ tryagain: reparse: - /* We do the EXACTFish to EXACT node only if folding, and not if in - * locale, as whether a character folds or not isn't known until - * runtime */ - maybe_exact = FOLD && ! LOC; + /* We do the EXACTFish to EXACT node only if folding. (And we + * don't need to figure this out until pass 2) */ + maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -10800,7 +11755,8 @@ tryagain: case 's': case 'S': /* space class */ case 'v': case 'V': /* VERTWS */ case 'w': case 'W': /* word class */ - case 'X': /* eXtended Unicode "combining character sequence" */ + case 'X': /* eXtended Unicode "combining + character sequence" */ case 'z': case 'Z': /* End of line/string assertion */ --p; goto loopdone; @@ -10848,7 +11804,7 @@ tryagain: p++; break; case 'a': - ender = ASCII_TO_NATIVE('\007'); + ender = '\a'; p++; break; case 'o': @@ -10911,24 +11867,30 @@ tryagain: } case 'c': p++; - ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); + ender = grok_bslash_c(*p++, SIZE_ONLY); break; case '8': case '9': /* must be a backreference */ --p; goto loopdone; case '1': case '2': case '3':case '4': case '5': case '6': case '7': - /* When we parse backslash escapes there is ambiguity between - * backreferences and octal escapes. Any escape from \1 - \9 is - * a backreference, any multi-digit escape which does not start with - * 0 and which when evaluated as decimal could refer to an already - * parsed capture buffer is a backslash. Anything else is octal. + /* When we parse backslash escapes there is ambiguity + * between backreferences and octal escapes. Any escape + * from \1 - \9 is a backreference, any multi-digit + * escape which does not start with 0 and which when + * evaluated as decimal could refer to an already + * parsed capture buffer is a backslash. Anything else + * is octal. * - * Note this implies that \118 could be interpreted as 118 OR as - * "\11" . "8" depending on whether there were 118 capture buffers - * defined already in the pattern. - */ - if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar ) + * Note this implies that \118 could be interpreted as + * 118 OR as "\11" . "8" depending on whether there + * were 118 capture buffers defined already in the + * pattern. */ + + /* NOTE, RExC_npar is 1 more than the actual number of + * parens we have seen so far, hence the < RExC_npar below. */ + + if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) { /* Not to be treated as an octal constant, go find backref */ --p; @@ -10985,7 +11947,7 @@ tryagain: if (! SIZE_ONLY && RExC_flags & RXf_PMf_EXTENDED && ckWARN_d(WARN_DEPRECATED) - && is_PATWS_non_low(p, UTF)) + && is_PATWS_non_low_safe(p, RExC_end, UTF)) { vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), "Escape literal pattern white space under /x"); @@ -11023,7 +11985,10 @@ tryagain: goto loopdone; } - if (! FOLD) { + if (! FOLD /* The simple case, just append the literal */ + || (LOC /* Also don't fold for tricky chars under /l */ + && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) + { if (UTF) { const STRLEN unilen = reguni(pRExC_state, ender, s); if (unilen > 0) { @@ -11041,81 +12006,100 @@ tryagain: else { REGC((char)ender, s++); } + + /* Can get here if folding only if is one of the /l + * characters whose fold depends on the locale. The + * occurrence of any of these indicate that we can't + * simplify things */ + if (FOLD) { + maybe_exact = FALSE; + maybe_exactfu = FALSE; + } } - else /* FOLD */ + else /* FOLD */ if (! ( UTF /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S))) { - *(s++) = (char) ender; - maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender); - } - else { /* UTF */ - - /* Prime the casefolded buffer. Locale rules, which apply - * only to code points < 256, aren't known until execution, - * so for them, just output the original character using - * utf8. If we start to fold non-UTF patterns, be sure to - * update join_exact() */ - if (LOC && ender < 256) { - if (UNI_IS_INVARIANT(ender)) { - *s = (U8) ender; - foldlen = 1; - } else { - *s = UTF8_TWO_BYTE_HI(ender); - *(s + 1) = UTF8_TWO_BYTE_LO(ender); - foldlen = 2; + /* Here, are folding and are not UTF-8 encoded; therefore + * the character must be in the range 0-255, and is not /l + * (Not /l because we already handled these under /l in + * is_PROBLEMATIC_LOCALE_FOLD_cp */ + if (IS_IN_SOME_FOLD_L1(ender)) { + maybe_exact = FALSE; + + /* See if the character's fold differs between /d and + * /u. This includes the multi-char fold SHARP S to + * 'ss' */ + if (maybe_exactfu + && (PL_fold[ender] != PL_fold_latin1[ender] + || ender == LATIN_SMALL_LETTER_SHARP_S + || (len > 0 + && isARG2_lower_or_UPPER_ARG1('s', ender) + && isARG2_lower_or_UPPER_ARG1('s', + *(s-1))))) + { + maybe_exactfu = FALSE; } } + + /* Even when folding, we store just the input character, as + * we have an array that finds its fold quickly */ + *(s++) = (char) ender; + } + else { /* FOLD and UTF */ + /* Unlike the non-fold case, we do actually have to + * calculate the results here in pass 1. This is for two + * reasons, the folded length may be longer than the + * unfolded, and we have to calculate how many EXACTish + * nodes it will take; and we may run out of room in a node + * in the middle of a potential multi-char fold, and have + * to back off accordingly. (Hence we can't use REGC for + * the simple case just below.) */ + + UV folded; + if (isASCII(ender)) { + folded = toFOLD(ender); + *(s)++ = (U8) folded; + } else { - UV folded = _to_uni_fold_flags( - ender, - (U8 *) s, - &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) - ); - - /* If this node only contains non-folding code points - * so far, see if this new one is also non-folding */ - if (maybe_exact) { - if (folded != ender) { + STRLEN foldlen; + + folded = _to_uni_fold_flags( + ender, + (U8 *) s, + &foldlen, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + s += foldlen; + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len += foldlen - 1; + } + /* If this node only contains non-folding code points so + * far, see if this new one is also non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have to check + * further to see if anything folds to it */ + if (_invlist_contains_cp(PL_utf8_foldable, + ender)) + { maybe_exact = FALSE; } - else { - /* Here the fold is the original; we have - * to check further to see if anything - * folds to it */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", - "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = - _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } - if (_invlist_contains_cp(PL_utf8_foldable, - ender)) - { - maybe_exact = FALSE; - } - } } - ender = folded; } - s += foldlen; - - /* The loop increments each time, as all but this - * path (and one other) through it add a single byte to the - * EXACTish node. But this one has changed len to be the - * correct final value, so subtract one to cancel out the - * increment that follows */ - len += foldlen - 1; + ender = folded; } if (next_is_quantifier) { @@ -11164,9 +12148,8 @@ tryagain: if (! UTF) { - /* These two have no multi-char folds to non-UTF characters - */ - if (ASCII_FOLD_RESTRICTED || LOC) { + /* This has no multi-char folds to non-UTF characters */ + if (ASCII_FOLD_RESTRICTED) { goto loopdone; } @@ -11197,12 +12180,8 @@ tryagain: } } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - - /* No Latin1 characters participate in multi-char - * folds under /l */ - if (LOC - || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI( - *s, *(s+1)))) + if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + *s, *(s+1)))) { break; } @@ -11239,6 +12218,15 @@ tryagain: * do any better */ if (len == 0) { len = full_len; + + /* If the node ends in an 's' we make sure it stays EXACTF, + * as if it turns into an EXACTFU, it could later get + * joined with another 's' that would then wrongly match + * the sharp s */ + if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) + { + maybe_exactfu = FALSE; + } } else { /* Here, the node does contain some characters that aren't @@ -11297,14 +12285,26 @@ tryagain: if (len == 0) { OP(ret) = NOTHING; } - else{ - - /* If 'maybe_exact' is still set here, means there are no - * code points in the node that participate in folds */ - if (FOLD && maybe_exact) { - OP(ret) = EXACT; + else { + if (FOLD) { + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds; + * similarly for 'maybe_exactfu' and code points that match + * differently depending on UTF8ness of the target string + * (for /u), or depending on locale for /l */ + if (maybe_exact) { + OP(ret) = EXACT; + } + else if (maybe_exactfu) { + OP(ret) = EXACTFU; + } } - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); } RExC_parse = p - 1; @@ -11343,7 +12343,7 @@ S_regwhite( RExC_state_t *pRExC_state, char *p ) } } while (p < e); if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; } else break; @@ -11357,7 +12357,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) /* Returns the next non-pattern-white space, non-comment character (the * latter only if 'recognize_comment is true) in the string p, which is * ended by RExC_end. If there is no line break ending a comment, - * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */ + * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */ const char *e = RExC_end; PERL_ARGS_ASSERT_REGPATWS; @@ -11377,7 +12377,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) } } while (p < e); if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; } else break; @@ -11385,6 +12385,72 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) return p; } +STATIC void +S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) +{ + /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It + * sets up the bitmap and any flags, removing those code points from the + * inversion list, setting it to NULL should it become completely empty */ + + PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; + assert(PL_regkind[OP(node)] == ANYOF); + + ANYOF_BITMAP_ZERO(node); + if (*invlist_ptr) { + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through *invlist_ptr */ + invlist_iterinit(*invlist_ptr); + while (invlist_iternext(*invlist_ptr, &start, &end)) { + UV high; + int i; + + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; + } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } + + /* Quit if are above what we should change */ + if (start > 255) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < 255) ? end : 255; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(node, i)) { + ANYOF_BITMAP_SET(node, i); + } + } + } + invlist_iterfinish(*invlist_ptr); + + /* Done with loop; remove any code points that are in the bitmap from + * *invlist_ptr; similarly for code points above latin1 if we have a + * flag to match all of them anyways */ + if (change_invlist) { + _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + } + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + } + + /* If have completely emptied it, remove it completely */ + if (_invlist_len(*invlist_ptr) == 0) { + SvREFCNT_dec_NN(*invlist_ptr); + *invlist_ptr = NULL; + } + } +} + /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. Character classes ([:foo:]) can also be negated ([:^foo:]). Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. @@ -11503,8 +12569,9 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) } if (namedclass == OOB_NAMEDCLASS) - Simple_vFAIL3("POSIX class [:%.*s:] unknown", - t - s - 1, s + 1); + vFAIL2utf8f( + "POSIX class [:%"UTF8f":] unknown", + UTF8fARG(UTF, t - s - 1, s + 1)); /* The #defines are structured so each complement is +1 to * the normal one */ @@ -11592,8 +12659,9 @@ S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) } STATIC regnode * -S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth, - char * const oregcomp_parse) +S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, + I32 *flagp, U32 depth, + char * const oregcomp_parse) { /* Handle the (?[...]) construct to do set operations */ @@ -11629,7 +12697,10 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__REGEX_SETS), "The regex_sets feature is experimental" REPORT_LOCATION, - (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse); + UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), + UTF8fARG(UTF, + RExC_end - RExC_start - (RExC_parse - RExC_precomp), + RExC_precomp + (RExC_parse - RExC_precomp))); while (RExC_parse < RExC_end) { SV* current = NULL; @@ -12083,7 +13154,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ -#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len) +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ + (SvCUR(listsv) != initial_listsv_len) STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, @@ -12133,8 +13205,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ SV* properties = NULL; /* Code points that match \p{} \P{} */ - SV* posixes = NULL; /* Code points that match classes like, [:word:], - extended beyond the Latin1 range */ + SV* posixes = NULL; /* Code points that match classes like [:word:], + extended beyond the Latin1 range. These have to + be kept separate from other code points for much + of this function because their handling is + different under /i, and for most classes under + /d as well */ + SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept + separate for a while from the non-complemented + versions because of complications with /d + matching */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ AV * multi_char_matches = NULL; /* Code points that fold to more than one @@ -12161,11 +13241,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * string is in UTF-8. (Because is under /d) */ SV* depends_list = NULL; - /* inversion list of code points this node matches. For much of the - * function, it includes only those that match regardless of the utf8ness - * of the target string */ + /* Inversion list of code points this node matches regardless of things + * like locale, folding, utf8ness of the target string */ SV* cp_list = NULL; + /* Like cp_list, but code points on this list need to be checked for things + * that fold to/from them under /i */ + SV* cp_foldable_list = NULL; + + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + #ifdef EBCDIC /* In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ @@ -12173,14 +13260,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, #endif bool invert = FALSE; /* Is this class to be complemented */ - /* Is there any thing like \W or [:^digit:] that matches above the legal - * Unicode range? */ - bool runtime_posix_matches_above_Unicode = FALSE; + bool warn_super = ALWAYS_WARN_SUPER; regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; - const I32 orig_size = RExC_size; + const SSize_t orig_size = RExC_size; + bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -12201,9 +13287,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ANYOF_FLAGS(ret) = 0; RExC_emit += ANYOF_SKIP; - if (LOC) { - ANYOF_FLAGS(ret) |= ANYOF_LOCALE; - } listsv = newSVpvs_flags("# comment\n", SVs_TEMP); initial_listsv_len = SvCUR(listsv); SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ @@ -12325,7 +13408,7 @@ parseit: case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { - /* We only pay attention to the first char of + /* We only pay attention to the first char of multichar strings being returned. I kinda wonder if this makes sense as it does change the behaviour from earlier versions, OTOH that behaviour was broken @@ -12346,7 +13429,12 @@ parseit: char *e; /* We will handle any undefined properties ourselves */ - U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF; + U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF + /* And we actually would prefer to get + * the straight inversion list of the + * swash, since we will be accessing it + * anyway, to save a little time */ + |_CORE_SWASH_INIT_ACCEPT_INVLIST; if (RExC_parse >= RExC_end) vFAIL2("Empty \\%c{}", (U8)value); @@ -12369,6 +13457,7 @@ parseit: } if (!SIZE_ONLY) { SV* invlist; + char* formatted; char* name; if (UCHARAT(RExC_parse) == '^') { @@ -12389,14 +13478,14 @@ parseit: * will have its name be <__NAME_i>. The design is * discussed in commit * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ - Newx(name, n + sizeof("_i__\n"), char); - - sprintf(name, "%s%.*s%s\n", - (FOLD) ? "__" : "", - (int)n, - RExC_parse, - (FOLD) ? "_i" : "" - ); + formatted = Perl_form(aTHX_ + "%s%.*s%s\n", + (FOLD) ? "__" : "", + (int)n, + RExC_parse, + (FOLD) ? "_i" : "" + ); + name = savepvn(formatted, strlen(formatted)); /* Look up the property name, and get its swash and * inversion list, if the property is found */ @@ -12421,11 +13510,13 @@ parseit: * otherwise add it to the list for run-time look up */ if (ret_invlist) { RExC_parse = e + 1; - vFAIL3("Property '%.*s' is unknown", (int) n, name); + vFAIL2utf8f( + "Property '%"UTF8f"' is unknown", + UTF8fARG(UTF, n, name)); } - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", (value == 'p' ? '+' : '!'), - name); + UTF8fARG(UTF, n, name)); has_user_defined_property = TRUE; /* We don't know yet, so have to assume that the @@ -12434,7 +13525,7 @@ parseit: * would cause things in to match * inappropriately, except that any \p{}, including * this one forces Unicode semantics, which means there - * is */ + * is no */ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; } else { @@ -12442,10 +13533,24 @@ parseit: /* Here, did get the swash and its inversion list. If * the swash is from a user-defined property, then this * whole character class should be regarded as such */ - has_user_defined_property = - (swash_init_flags - & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY); - + if (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) + { + has_user_defined_property = TRUE; + } + else if + /* We warn on matching an above-Unicode code point + * if the match would return true, except don't + * warn for \p{All}, which has exactly one element + * = 0 */ + (_invlist_contains_cp(invlist, 0x110000) + && (! (_invlist_len(invlist) == 1 + && *invlist_array(invlist) == 0))) + { + warn_super = TRUE; + } + + /* Invert if asking for the complement */ if (value == 'P') { _invlist_union_complement_2nd(properties, @@ -12478,7 +13583,7 @@ parseit: case 'f': value = '\f'; break; case 'b': value = '\b'; break; case 'e': value = ASCII_TO_NATIVE('\033');break; - case 'a': value = ASCII_TO_NATIVE('\007');break; + case 'a': value = '\a'; break; case 'o': RExC_parse--; /* function expects to be pointed at the 'o' */ { @@ -12518,7 +13623,7 @@ parseit: goto recode_encoding; break; case 'c': - value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY); + value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': @@ -12590,31 +13695,8 @@ parseit: /* Here, we have the current token in 'value' */ - /* What matches in a locale is not known until runtime. This includes - * what the Posix classes (like \w, [:space:]) match. Room must be - * reserved (one time per class) to store such classes, either if Perl - * is compiled so that locale nodes always should have this space, or - * if there is such class info to be stored. The space will contain a - * bit for each named class that is to be matched against. This isn't - * needed for \p{} and pseudo-classes, as they are not affected by - * locale, and hence are dealt with separately */ - if (LOC - && ! need_class - && (ANYOF_LOCALE == ANYOF_CLASS - || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX))) - { - need_class = 1; - if (SIZE_ONLY) { - RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP; - } - else { - RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP; - ANYOF_CLASS_ZERO(ret); - } - ANYOF_FLAGS(ret) |= ANYOF_CLASS; - } - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + U8 classnum; /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a * literal, as is the character that began the false range, i.e. @@ -12625,16 +13707,19 @@ parseit: ? RExC_parse - rangebegin : 0; if (strict) { - vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin); + vFAIL2utf8f( + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); } else { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ - ckWARN4reg(RExC_parse, - "False [] range \"%*.*s\"", - w, w, rangebegin); + ckWARN2reg(RExC_parse, + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); (void)ReREFCNT_inc(RExC_rx_sv); cp_list = add_cp_to_invlist(cp_list, '-'); - cp_list = add_cp_to_invlist(cp_list, prevvalue); + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, + prevvalue); } } @@ -12642,16 +13727,89 @@ parseit: element_count += 2; /* So counts for three values */ } - if (! SIZE_ONLY) { - U8 classnum = namedclass_to_classnum(namedclass); - if (namedclass >= ANYOF_MAX) { /* If a special class */ + classnum = namedclass_to_classnum(namedclass); + + if (LOC && namedclass < ANYOF_POSIXL_MAX +#ifndef HAS_ISASCII + && classnum != _CC_ASCII +#endif + ) { + /* What the Posix classes (like \w, [:space:]) match in locale + * isn't knowable under locale until actual match time. Room + * must be reserved (one time per outer bracketed class) to + * store such classes. The space will contain a bit for each + * named class that is to be matched against. This isn't + * needed for \p{} and pseudo-classes, as they are not affected + * by locale, and hence are dealt with separately */ + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_POSIXL_ZERO(ret); + } + + /* See if it already matches the complement of this POSIX + * class */ + if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) + && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) + ? -1 + : 1))) + { + posixl_matches_all = TRUE; + break; /* No need to continue. Since it matches both + e.g., \w and \W, it matches everything, and the + bracketed class can be optimized into qr/./s */ + } + + /* Add this class to those that should be checked at runtime */ + ANYOF_POSIXL_SET(ret, namedclass); + + /* The above-Latin1 characters are not subject to locale rules. + * Just add them, in the second pass, to the + * unconditionally-matched list */ + if (! SIZE_ONLY) { + SV* scratch_list = NULL; + + /* Get the list of the above-Latin1 code points this + * matches */ + _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, + PL_XPosix_ptrs[classnum], + + /* Odd numbers are complements, like + * NDIGIT, NASCII, ... */ + namedclass % 2 != 0, + &scratch_list); + /* Checking if 'cp_list' is NULL first saves an extra + * clone. Its reference count will be decremented at the + * next union, etc, or if this is the only instance, at the + * end of the routine */ + if (! cp_list) { + cp_list = scratch_list; + } + else { + _invlist_union(cp_list, scratch_list, &cp_list); + SvREFCNT_dec_NN(scratch_list); + } + continue; /* Go get next character */ + } + } + else if (! SIZE_ONLY) { + + /* Here, not in pass1 (in that pass we skip calculating the + * contents of this class), and is /l, or is a POSIX class for + * which /l doesn't matter (or is a Unicode property, which is + * skipped here). */ + if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ - /* Here, should be \h, \H, \v, or \V. Neither /d nor - * /l make a difference in what these match. There - * would be problems if these characters had folds - * other than themselves, as cp_list is subject to - * folding. */ + /* Here, should be \h, \H, \v, or \V. None of /d, /i + * nor /l make a difference in what these match, + * therefore we just add what they match to cp_list. */ if (classnum != _CC_VERTSPACE) { assert( namedclass == ANYOF_HORIZWS || namedclass == ANYOF_NHORIZWS); @@ -12664,246 +13822,25 @@ parseit: _invlist_union_maybe_complement_2nd( cp_list, PL_XPosix_ptrs[classnum], - cBOOL(namedclass % 2), /* Complement if odd + namedclass % 2 != 0, /* Complement if odd (NHORIZWS, NVERTWS) */ &cp_list); } } - else if (classnum == _CC_ASCII) { -#ifdef HAS_ISASCII - if (LOC) { - ANYOF_CLASS_SET(ret, namedclass); - } - else -#endif /* Not isascii(); just use the hard-coded definition for it */ - _invlist_union_maybe_complement_2nd( - posixes, - PL_ASCII, - cBOOL(namedclass % 2), /* Complement if odd - (NASCII) */ - &posixes); - } - else { /* Garden variety class */ - - /* The ascii range inversion list */ - SV* ascii_source = PL_Posix_ptrs[classnum]; - - /* The full Latin1 range inversion list */ - SV* l1_source = PL_L1Posix_ptrs[classnum]; - - /* This code is structured into two major clauses. The - * first is for classes whose complete definitions may not - * already be known. It not, the Latin1 definition - * (guaranteed to already known) is used plus code is - * generated to load the rest at run-time (only if needed). - * If the complete definition is known, it drops down to - * the second clause, where the complete definition is - * known */ - - if (classnum < _FIRST_NON_SWASH_CC) { - - /* Here, the class has a swash, which may or not - * already be loaded */ - - /* The name of the property to use to match the full - * eXtended Unicode range swash for this character - * class */ - const char *Xname = swash_property_names[classnum]; - - /* If returning the inversion list, we can't defer - * getting this until runtime */ - if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) { - PL_utf8_swash_ptrs[classnum] = - _core_swash_init("utf8", Xname, &PL_sv_undef, - 1, /* binary */ - 0, /* not tr/// */ - NULL, /* No inversion list */ - NULL /* No flags */ - ); - assert(PL_utf8_swash_ptrs[classnum]); - } - if ( ! PL_utf8_swash_ptrs[classnum]) { - if (namedclass % 2 == 0) { /* A non-complemented - class */ - /* If not /a matching, there are code points we - * don't know at compile time. Arrange for the - * unknown matches to be loaded at run-time, if - * needed */ - if (! AT_LEAST_ASCII_RESTRICTED) { - Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n", - Xname); - } - if (LOC) { /* Under locale, set run-time - lookup */ - ANYOF_CLASS_SET(ret, namedclass); - } - else { - /* Add the current class's code points to - * the running total */ - _invlist_union(posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : l1_source, - &posixes); - } - } - else { /* A complemented class */ - if (AT_LEAST_ASCII_RESTRICTED) { - /* Under /a should match everything above - * ASCII, plus the complement of the set's - * ASCII matches */ - _invlist_union_complement_2nd(posixes, - ascii_source, - &posixes); - } - else { - /* Arrange for the unknown matches to be - * loaded at run-time, if needed */ - Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n", - Xname); - runtime_posix_matches_above_Unicode = TRUE; - if (LOC) { - ANYOF_CLASS_SET(ret, namedclass); - } - else { - - /* We want to match everything in - * Latin1, except those things that - * l1_source matches */ - SV* scratch_list = NULL; - _invlist_subtract(PL_Latin1, l1_source, - &scratch_list); - - /* Add the list from this class to the - * running total */ - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, - scratch_list, - &posixes); - SvREFCNT_dec_NN(scratch_list); - } - if (DEPENDS_SEMANTICS) { - ANYOF_FLAGS(ret) - |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - } - } - goto namedclass_done; - } - - /* Here, there is a swash loaded for the class. If no - * inversion list for it yet, get it */ - if (! PL_XPosix_ptrs[classnum]) { - PL_XPosix_ptrs[classnum] - = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]); - } - } - - /* Here there is an inversion list already loaded for the - * entire class */ - - if (namedclass % 2 == 0) { /* A non-complemented class, - like ANYOF_PUNCT */ - if (! LOC) { - /* For non-locale, just add it to any existing list - * */ - _invlist_union(posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : PL_XPosix_ptrs[classnum], - &posixes); - } - else { /* Locale */ - SV* scratch_list = NULL; - - /* For above Latin1 code points, we use the full - * Unicode range */ - _invlist_intersection(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - &scratch_list); - /* And set the output to it, adding instead if - * there already is an output. Checking if - * 'posixes' is NULL first saves an extra clone. - * Its reference count will be decremented at the - * next union, etc, or if this is the only - * instance, at the end of the routine */ - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } - -#ifndef HAS_ISBLANK - if (namedclass != ANYOF_BLANK) { -#endif - /* Set this class in the node for runtime - * matching */ - ANYOF_CLASS_SET(ret, namedclass); -#ifndef HAS_ISBLANK - } - else { - /* No isblank(), use the hard-coded ASCII-range - * blanks, adding them to the running total. */ - - _invlist_union(posixes, ascii_source, &posixes); - } -#endif - } - } - else { /* A complemented class, like ANYOF_NPUNCT */ - if (! LOC) { - _invlist_union_complement_2nd( - posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : PL_XPosix_ptrs[classnum], - &posixes); - /* Under /d, everything in the upper half of the - * Latin1 range matches this complement */ - if (DEPENDS_SEMANTICS) { - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - else { /* Locale */ - SV* scratch_list = NULL; - _invlist_subtract(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - &scratch_list); - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } -#ifndef HAS_ISBLANK - if (namedclass != ANYOF_NBLANK) { -#endif - ANYOF_CLASS_SET(ret, namedclass); -#ifndef HAS_ISBLANK - } - else { - /* Get the list of all code points in Latin1 - * that are not ASCII blanks, and add them to - * the running total */ - _invlist_subtract(PL_Latin1, ascii_source, - &scratch_list); - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } -#endif - } - } + else { /* Garden variety class. If is NASCII, NDIGIT, ... + complement and use nposixes */ + SV** posixes_ptr = namedclass % 2 == 0 + ? &posixes + : &nposixes; + SV** source_ptr = &PL_XPosix_ptrs[classnum]; + _invlist_union_maybe_complement_2nd( + *posixes_ptr, + *source_ptr, + namedclass % 2 != 0, + posixes_ptr); } - namedclass_done: - continue; /* Go get next character */ + continue; /* Go get next character */ } } /* end of namedclass \blah */ @@ -12922,7 +13859,9 @@ parseit: if (range) { if (prevvalue > value) /* b-a */ { const int w = RExC_parse - rangebegin; - Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); + vFAIL2utf8f( + "Invalid [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); range = 0; /* not a valid range */ } } @@ -13003,11 +13942,9 @@ parseit: value, foldbuf, &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) + FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED + ? FOLD_FLAGS_NOMIX_ASCII + : 0) ); /* Here, should be the first character of the @@ -13075,7 +14012,8 @@ parseit: /* Deal with this element of the class */ if (! SIZE_ONLY) { #ifndef EBCDIC - cp_list = _add_range_to_invlist(cp_list, prevvalue, value); + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); #else SV* this_range = _new_invlist(1); _append_range_to_invlist(this_range, prevvalue, value); @@ -13089,13 +14027,18 @@ parseit: * included. literal_endpoint==2 means both ends of the range used * a literal character, not \x{foo} */ if (literal_endpoint == 2 - && (prevvalue >= 'a' && value <= 'z') - || (prevvalue >= 'A' && value <= 'Z')) + && ((prevvalue >= 'a' && value <= 'z') + || (prevvalue >= 'A' && value <= 'Z'))) { - _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA], + _invlist_intersection(this_range, PL_ASCII, + &this_range); + + /* Since this above only contains ascii, the intersection of it + * with anything will still yield only ascii */ + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], &this_range); } - _invlist_union(cp_list, this_range, &cp_list); + _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); literal_endpoint = 0; #endif } @@ -13125,7 +14068,7 @@ parseit: #endif /* Look at the longest folds first */ - for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) { + for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { if (av_exists(multi_char_matches, cp_count)) { AV** this_array_ptr; @@ -13180,15 +14123,29 @@ parseit: return ret; } - /* If the character class contains only a single element, it may be - * optimizable into another node type which is smaller and runs faster. - * Check if this is the case for this class */ - if (element_count == 1 && ! ret_invlist) { + /* Here, we've gone through the entire class and dealt with multi-char + * folds. We are now in a position that we can do some checks to see if we + * can optimize this ANYOF node into a simpler one, even in Pass 1. + * Currently we only do two checks: + * 1) is in the unlikely event that the user has specified both, eg. \w and + * \W under /l, then the class matches everything. (This optimization + * is done only to make the optimizer code run later work.) + * 2) if the character class contains only a single element (including a + * single range), we see if there is an equivalent node for it. + * Other checks are possible */ + if (! ret_invlist /* Can't optimize if returning the constructed + inversion list */ + && (UNLIKELY(posixl_matches_all) || element_count == 1)) + { U8 op = END; U8 arg = 0; - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or - [:digit:] or \p{foo} */ + if (UNLIKELY(posixl_matches_all)) { + op = SANY; + } + else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like + \w or [:digit:] or \p{foo} + */ /* All named classes are mapped into POSIXish nodes, with its FLAG * argument giving which class it is */ @@ -13244,14 +14201,6 @@ parseit: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } -#ifndef HAS_ISBLANK - if (op == POSIXL - && (namedclass == ANYOF_BLANK - || namedclass == ANYOF_NBLANK)) - { - op = POSIXA; - } -#endif join_posix: /* The odd numbered ones are the complements of the @@ -13306,13 +14255,16 @@ parseit: /* To get locale nodes to not use the full ANYOF size would * require moving the code above that writes the portions * of it that aren't in other nodes to after this point. - * e.g. ANYOF_CLASS_SET */ + * e.g. ANYOF_POSIXL_SET */ RExC_size = orig_size; } } else { RExC_emit = (regnode *)orig_emit; if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } if (invert) { op += NPOSIXD - POSIXD; } @@ -13328,13 +14280,17 @@ parseit: *flagp |= HASWIDTH|SIMPLE; } else if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } RExC_parse = (char *) cur_parse; SvREFCNT_dec(posixes); + SvREFCNT_dec(nposixes); SvREFCNT_dec(cp_list); + SvREFCNT_dec(cp_foldable_list); return ret; } } @@ -13345,238 +14301,260 @@ parseit: /* If folding, we calculate all characters that could fold to or from the * ones already on the list */ - if (FOLD && cp_list) { - UV start, end; /* End points of code point ranges */ + if (cp_foldable_list) { + if (FOLD) { + UV start, end; /* End points of code point ranges */ + + SV* fold_intersection = NULL; + SV** use_list; + + /* Our calculated list will be for Unicode rules. For locale + * matching, we have to keep a separate list that is consulted at + * runtime only when the locale indicates Unicode rules. For + * non-locale, we just use to the general list */ + if (LOC) { + use_list = &only_utf8_locale_list; + } + else { + use_list = &cp_list; + } - SV* fold_intersection = NULL; + /* Only the characters in this class that participate in folds need + * be checked. Get the intersection of this class and all the + * possible characters that are foldable. This can quickly narrow + * down a large class */ + _invlist_intersection(PL_utf8_foldable, cp_foldable_list, + &fold_intersection); - /* If the highest code point is within Latin1, we can use the - * compiled-in Alphas list, and not have to go out to disk. This - * yields two false positives, the masculine and feminine ordinal - * indicators, which are weeded out below using the - * IS_IN_SOME_FOLD_L1() macro */ - if (invlist_highest(cp_list) < 256) { - _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list, - &fold_intersection); - } - else { + /* The folds for all the Latin1 characters are hard-coded into this + * program, but we have to go out to disk to get the others. */ + if (invlist_highest(cp_foldable_list) >= 256) { + + /* This is a hash that for a particular fold gives all + * characters that are involved in it */ + if (! PL_utf8_foldclosures) { - /* Here, there are non-Latin1 code points, so we will have to go - * fetch the list of all the characters that participate in folds - */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } - - /* This is a hash that for a particular fold gives all characters - * that are involved in it */ - if (! PL_utf8_foldclosures) { - - /* If we were unable to find any folds, then we likely won't be - * able to find the closures. So just create an empty list. - * Folding will effectively be restricted to the non-Unicode - * rules hard-coded into Perl. (This case happens legitimately - * during compilation of Perl itself before the Unicode tables - * are generated) */ - if (_invlist_len(PL_utf8_foldable) == 0) { - PL_utf8_foldclosures = newHV(); - } - else { /* If the folds haven't been read in, call a fold function * to force that */ if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; + U8 dummy[UTF8_MAXBYTES_CASE+1]; /* This string is just a short named one above \xff */ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); assert(PL_utf8_tofold); /* Verify that worked */ } - PL_utf8_foldclosures = - _swash_inversion_hash(PL_utf8_tofold); + PL_utf8_foldclosures + = _swash_inversion_hash(PL_utf8_tofold); } } - /* Only the characters in this class that participate in folds need - * be checked. Get the intersection of this class and all the - * possible characters that are foldable. This can quickly narrow - * down a large class */ - _invlist_intersection(PL_utf8_foldable, cp_list, - &fold_intersection); - } - - /* Now look at the foldable characters in this class individually */ - invlist_iterinit(fold_intersection); - while (invlist_iternext(fold_intersection, &start, &end)) { - UV j; - - /* Locale folding for Latin1 characters is deferred until runtime */ - if (LOC && start < 256) { - start = 256; - } - - /* Look at every character in the range */ - for (j = start; j <= end; j++) { - - U8 foldbuf[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - SV** listp; - - if (j < 256) { - - /* We have the latin1 folding rules hard-coded here so that - * an innocent-looking character class, like /[ks]/i won't - * have to go out to disk to find the possible matches. - * XXX It would be better to generate these via regen, in - * case a new version of the Unicode standard adds new - * mappings, though that is not really likely, and may be - * caught by the default: case of the switch below. */ - - if (IS_IN_SOME_FOLD_L1(j)) { - - /* ASCII is always matched; non-ASCII is matched only - * under Unicode rules */ - if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) { - cp_list = - add_cp_to_invlist(cp_list, PL_fold_latin1[j]); - } - else { - depends_list = - add_cp_to_invlist(depends_list, PL_fold_latin1[j]); + /* Now look at the foldable characters in this class individually */ + invlist_iterinit(fold_intersection); + while (invlist_iternext(fold_intersection, &start, &end)) { + UV j; + + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + SV** listp; + + if (j < 256) { + + /* We have the latin1 folding rules hard-coded here so + * that an innocent-looking character class, like + * /[ks]/i won't have to go out to disk to find the + * possible matches. XXX It would be better to + * generate these via regen, in case a new version of + * the Unicode standard adds new mappings, though that + * is not really likely, and may be caught by the + * default: case of the switch below. */ + + if (IS_IN_SOME_FOLD_L1(j)) { + + /* ASCII is always matched; non-ASCII is matched + * only under Unicode rules (which could happen + * under /l if the locale is a UTF-8 one */ + if (isASCII(j) || ! DEPENDS_SEMANTICS) { + *use_list = add_cp_to_invlist(*use_list, + PL_fold_latin1[j]); + } + else { + depends_list = + add_cp_to_invlist(depends_list, + PL_fold_latin1[j]); + } } - } - if (HAS_NONLATIN1_FOLD_CLOSURE(j) - && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) - { - /* Certain Latin1 characters have matches outside - * Latin1. To get here, is one of those - * characters. None of these matches is valid for - * ASCII characters under /aa, which is why the 'if' - * just above excludes those. These matches only - * happen when the target string is utf8. The code - * below adds the single fold closures for to the - * inversion list. */ - switch (j) { - case 'k': - case 'K': - cp_list = - add_cp_to_invlist(cp_list, KELVIN_SIGN); - break; - case 's': - case 'S': - cp_list = add_cp_to_invlist(cp_list, + if (HAS_NONLATIN1_FOLD_CLOSURE(j) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + { + /* Certain Latin1 characters have matches outside + * Latin1. To get here, is one of those + * characters. None of these matches is valid for + * ASCII characters under /aa, which is why the 'if' + * just above excludes those. These matches only + * happen when the target string is utf8. The code + * below adds the single fold closures for to the + * inversion list. */ + + switch (j) { + case 'k': + case 'K': + *use_list = + add_cp_to_invlist(*use_list, KELVIN_SIGN); + break; + case 's': + case 'S': + *use_list = add_cp_to_invlist(*use_list, LATIN_SMALL_LETTER_LONG_S); - break; - case MICRO_SIGN: - cp_list = add_cp_to_invlist(cp_list, - GREEK_CAPITAL_LETTER_MU); - cp_list = add_cp_to_invlist(cp_list, - GREEK_SMALL_LETTER_MU); - break; - case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: - case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - cp_list = - add_cp_to_invlist(cp_list, ANGSTROM_SIGN); - break; - case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - cp_list = add_cp_to_invlist(cp_list, + break; + case MICRO_SIGN: + *use_list = add_cp_to_invlist(*use_list, + GREEK_CAPITAL_LETTER_MU); + *use_list = add_cp_to_invlist(*use_list, + GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *use_list = + add_cp_to_invlist(*use_list, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *use_list = add_cp_to_invlist(*use_list, LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); - break; - case LATIN_SMALL_LETTER_SHARP_S: - cp_list = add_cp_to_invlist(cp_list, - LATIN_CAPITAL_LETTER_SHARP_S); - break; - case 'F': case 'f': - case 'I': case 'i': - case 'L': case 'l': - case 'T': case 't': - case 'A': case 'a': - case 'H': case 'h': - case 'J': case 'j': - case 'N': case 'n': - case 'W': case 'w': - case 'Y': case 'y': - /* These all are targets of multi-character - * folds from code points that require UTF8 to - * express, so they can't match unless the - * target string is in UTF-8, so no action here - * is necessary, as regexec.c properly handles - * the general case for UTF-8 matching and - * multi-char folds */ - break; - default: - /* Use deprecated warning to increase the - * chances of this being output */ - ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); - break; + break; + case LATIN_SMALL_LETTER_SHARP_S: + *use_list = add_cp_to_invlist(*use_list, + LATIN_CAPITAL_LETTER_SHARP_S); + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character + * folds from code points that require UTF8 + * to express, so they can't match unless + * the target string is in UTF-8, so no + * action here is necessary, as regexec.c + * properly handles the general case for + * UTF-8 matching and multi-char folds */ + break; + default: + /* Use deprecated warning to increase the + * chances of this being output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); + break; + } } + continue; } - continue; - } - /* Here is an above Latin1 character. We don't have the rules - * hard-coded for it. First, get its fold. This is the simple - * fold, as the multi-character folds have been handled earlier - * and separated out */ - _to_uni_fold_flags(j, foldbuf, &foldlen, - ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); - - /* Single character fold of above Latin1. Add everything in - * its fold closure to the list that this node should match. - * The fold closures data structure is a hash with the keys - * being the UTF-8 of every character that is folded to, like - * 'k', and the values each an array of all code points that - * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. - * Multi-character folds are not included */ - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) foldbuf, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_len(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c = SvUV(*c_p); - - /* /aa doesn't allow folds between ASCII and non-; /l - * doesn't allow them between above and below 256 */ - if ((ASCII_FOLD_RESTRICTED - && (isASCII(c) != isASCII(j))) - || (LOC && c < 256)) { - continue; - } + /* Here is an above Latin1 character. We don't have the + * rules hard-coded for it. First, get its fold. This is + * the simple fold, as the multi-character folds have been + * handled earlier and separated out */ + _to_uni_fold_flags(j, foldbuf, &foldlen, + (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0); + + /* Single character fold of above Latin1. Add everything in + * its fold closure to the list that this node should match. + * The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c = SvUV(*c_p); - /* Folds involving non-ascii Latin1 characters - * under /d are added to a separate list */ - if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) - { - cp_list = add_cp_to_invlist(cp_list, c); - } - else { - depends_list = add_cp_to_invlist(depends_list, c); + /* /aa doesn't allow folds between ASCII and non- */ + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j)))) + { + continue; + } + + /* Folds under /l which cross the 255/256 boundary + * are added to a separate list. (These are valid + * only when the locale is UTF-8.) */ + if (c < 256 && LOC) { + *use_list = add_cp_to_invlist(*use_list, c); + continue; + } + + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + /* Similarly folds involving non-ascii Latin1 + * characters under /d are added to their list */ + depends_list = add_cp_to_invlist(depends_list, + c); + } } } } } - } - SvREFCNT_dec_NN(fold_intersection); + SvREFCNT_dec_NN(fold_intersection); + } + + /* Now that we have finished adding all the folds, there is no reason + * to keep the foldable list separate */ + _invlist_union(cp_list, cp_foldable_list, &cp_list); + SvREFCNT_dec_NN(cp_foldable_list); } /* And combine the result (if any) with any inversion list from posix * classes. The lists are kept separate up to now because we don't want to * fold the classes (folding of those is automatically handled by the swash * fetching code) */ - if (posixes) { + if (posixes || nposixes) { + if (posixes && AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, nothing above ASCII matches these */ + _invlist_intersection(posixes, + PL_XPosix_ptrs[_CC_ASCII], + &posixes); + } + if (nposixes) { + if (DEPENDS_SEMANTICS) { + /* Under /d, everything in the upper half of the Latin1 range + * matches these complements */ + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; + } + else if (AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, everything above ASCII matches these + * complements */ + _invlist_union_complement_2nd(nposixes, + PL_XPosix_ptrs[_CC_ASCII], + &nposixes); + } + if (posixes) { + _invlist_union(posixes, nposixes, &posixes); + SvREFCNT_dec_NN(nposixes); + } + else { + posixes = nposixes; + } + } if (! DEPENDS_SEMANTICS) { if (cp_list) { _invlist_union(cp_list, posixes, &cp_list); @@ -13590,10 +14568,8 @@ parseit: /* Under /d, we put into a separate list the Latin1 things that * match only when the target string is utf8 */ SV* nonascii_but_latin1_properties = NULL; - _invlist_intersection(posixes, PL_Latin1, + _invlist_intersection(posixes, PL_UpperLatin1, &nonascii_but_latin1_properties); - _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII, - &nonascii_but_latin1_properties); _invlist_subtract(posixes, nonascii_but_latin1_properties, &posixes); if (cp_list) { @@ -13627,7 +14603,6 @@ parseit: * , because having a Unicode property forces Unicode * semantics */ if (properties) { - bool warn_super = ! has_user_defined_property; if (cp_list) { /* If it matters to the final outcome, see if a non-property @@ -13638,14 +14613,8 @@ parseit: * are using above-Unicode code points indicates they should know * the issues involved */ if (warn_super) { - bool non_prop_matches_above_Unicode = - runtime_posix_matches_above_Unicode - | (invlist_highest(cp_list) > PERL_UNICODE_MAX); - if (invert) { - non_prop_matches_above_Unicode = - ! non_prop_matches_above_Unicode; - } - warn_super = ! non_prop_matches_above_Unicode; + warn_super = ! (invert + ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); } _invlist_union(properties, cp_list, &cp_list); @@ -13656,7 +14625,7 @@ parseit: } if (warn_super) { - OP(ret) = ANYOF_WARN_SUPER; + ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; } } @@ -13669,12 +14638,32 @@ parseit: * shouldn't. Therefore we can't invert folded locale now, as it won't be * folded until runtime */ + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching). We know to set the flag if we have a non-NULL list for UTF-8 + * locales, or the class matches at least one 0-255 range code point */ + if (LOC && FOLD) { + if (only_utf8_locale_list) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + else if (cp_list) { /* Look to see if there a 0-255 code point is in + the list */ + UV start, end; + invlist_iterinit(cp_list); + if (invlist_iternext(cp_list, &start, &end) && start < 256) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + invlist_iterfinish(cp_list); + } + } + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known * at compile time. Besides not inverting folded locale now, we can't * invert if there are things such as \w, which aren't known until runtime * */ if (invert - && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS))) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) && ! depends_list && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { @@ -13704,15 +14693,6 @@ parseit: return orig_emit; } - /* If we didn't do folding, it's because some information isn't available - * until runtime; set the run-time fold flag for these. (We don't have to - * worry about properties folding, as that is taken care of by the swash - * fetching) */ - if (FOLD && LOC) - { - ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; - } - /* Some character classes are equivalent to other nodes. Such nodes take * up less room and generally fewer operations to execute than ANYOF nodes. * Above, we checked for and optimized into some such equivalents for @@ -13731,8 +14711,13 @@ parseit: if (cp_list && ! invert && ! depends_list - && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS) - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + + /* We don't optimize if we are supposed to make sure all non-Unicode + * code points raise a warning, as only ANYOF nodes have this check. + * */ + && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) { UV start, end; U8 op = END; /* The optimzation node-type */ @@ -13756,7 +14741,7 @@ parseit: && (start < 256 || UTF)) { /* Here, the list contains a single code point. Can optimize - * into an EXACT node */ + * into an EXACTish node */ value = start; @@ -13786,12 +14771,6 @@ parseit: } } else { - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } if (_invlist_contains_cp(PL_utf8_foldable, value)) { op = EXACT; } @@ -13831,7 +14810,9 @@ parseit: RExC_parse = (char *)cur_parse; if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } SvREFCNT_dec_NN(cp_list); @@ -13844,55 +14825,8 @@ parseit: * for things that belong in the bitmap, put them there, and delete from * . While we are at it, see if everything above 255 is in the * list, and if so, set a flag to speed up execution */ - ANYOF_BITMAP_ZERO(ret); - if (cp_list) { - - /* This gets set if we actually need to modify things */ - bool change_invlist = FALSE; - UV start, end; - - /* Start looking through */ - invlist_iterinit(cp_list); - while (invlist_iternext(cp_list, &start, &end)) { - UV high; - int i; - - if (end == UV_MAX && start <= 256) { - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; - } - - /* Quit if are above what we should change */ - if (start > 255) { - break; - } - - change_invlist = TRUE; - - /* Set all the bits in the range, up to the max that we are doing */ - high = (end < 255) ? end : 255; - for (i = start; i <= (int) high; i++) { - if (! ANYOF_BITMAP_TEST(ret, i)) { - ANYOF_BITMAP_SET(ret, i); - prevvalue = value; - value = i; - } - } - } - invlist_iterfinish(cp_list); - - /* Done with loop; remove any code points that are in the bitmap from - * */ - if (change_invlist) { - _invlist_subtract(cp_list, PL_Latin1, &cp_list); - } - - /* If have completely emptied it, remove it completely */ - if (_invlist_len(cp_list) == 0) { - SvREFCNT_dec_NN(cp_list); - cp_list = NULL; - } - } + populate_ANYOF_from_invlist(ret, &cp_list); if (invert) { ANYOF_FLAGS(ret) |= ANYOF_INVERT; @@ -13909,6 +14843,7 @@ parseit: else { cp_list = depends_list; } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; } /* If there is a swash and more than one element, we can't use the swash in @@ -13918,56 +14853,104 @@ parseit: swash = NULL; } - if (! cp_list - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - { - ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); + set_ANYOF_arg(pRExC_state, ret, cp_list, + (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv : NULL, + only_utf8_locale_list, + swash, has_user_defined_property); + + *flagp |= HASWIDTH|SIMPLE; + + if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + + return ret; +} + +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + +STATIC void +S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, + regnode* const node, + SV* const cp_list, + SV* const runtime_defns, + SV* const only_utf8_locale_list, + SV* const swash, + const bool has_user_defined_property) +{ + /* Sets the arg field of an ANYOF-type node 'node', using information about + * the node passed-in. If there is nothing outside the node's bitmap, the + * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to + * the count returned by add_data(), having allocated and stored an array, + * av, that that count references, as follows: + * av[0] stores the character class description in its textual form. + * This is used later (regexec.c:Perl_regclass_swash()) to + * initialize the appropriate swash, and is also useful for dumping + * the regnode. This is set to &PL_sv_undef if the textual + * description is not needed at run-time (as happens if the other + * elements completely define the class) + * av[1] if &PL_sv_undef, is a placeholder to later contain the swash + * computed from av[0]. But if no further computation need be done, + * the swash is stored here now (and av[0] is &PL_sv_undef). + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead + * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. + * (Otherwise everything needed is already in av[0] and av[1]) + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ + + UV n; + + PERL_ARGS_ASSERT_SET_ANYOF_ARG; + + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); + ARG_SET(node, ANYOF_NONBITMAP_EMPTY); } else { - /* av[0] stores the character class description in its textual form: - * used later (regexec.c:Perl_regclass_swash()) to initialize the - * appropriate swash, and is also useful for dumping the regnode. - * av[1] if NULL, is a placeholder to later contain the swash computed - * from av[0]. But if no further computation need be done, the - * swash is stored there now. - * av[2] stores the cp_list inversion list for use in addition or - * instead of av[0]; used only if av[1] is NULL - * av[3] is set if any component of the class is from a user-defined - * property; used only if av[1] is NULL */ AV * const av = newAV(); SV *rv; - av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - ? SvREFCNT_inc(listsv) : &PL_sv_undef); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + av_store(av, 0, (runtime_defns) + ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); if (swash) { av_store(av, 1, swash); SvREFCNT_dec_NN(cp_list); } else { - av_store(av, 1, NULL); + av_store(av, 1, &PL_sv_undef); if (cp_list) { - av_store(av, 2, cp_list); - av_store(av, 3, newSVuv(has_user_defined_property)); + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); } } + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + rv = newRV_noinc(MUTABLE_SV(av)); - n = add_data(pRExC_state, 1, "s"); + n = add_data(pRExC_state, STR_WITH_LEN("s")); RExC_rxi->data->data[n] = (void*)rv; - ARG_SET(ret, n); + ARG_SET(node, n); } - - *flagp |= HASWIDTH|SIMPLE; - return ret; } -#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* reg_skipcomment() Absorbs an /x style # comments from the input stream. Returns true if there is more text remaining in the stream. - Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment + Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment terminates the pattern without including a newline. Note its the callers responsibility to ensure that we are @@ -13990,7 +14973,7 @@ S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) if (!ended) { /* we ran off the end of the pattern without ending the comment, so we have to add an \n when wrapping */ - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; return 0; } else return 1; @@ -14071,14 +15054,15 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) FILL_ADVANCE_NODE(ptr, op); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", - "reg_node", __LINE__, + MJD_OFFSET_DEBUG( + ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", + "reg_node", __LINE__, PL_reg_name[op], - (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(RExC_emit - RExC_emit_start), (UV)(RExC_parse - RExC_start), - (UV)RExC_offsets[0])); + (UV)RExC_offsets[0])); Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); } #endif @@ -14102,16 +15086,16 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); RExC_size += 2; - /* + /* We can't do this: - - assert(2==regarglen[op]+1); + + assert(2==regarglen[op]+1); Anything larger than this has to allocate the extra amount. If we changed this to be: - + RExC_size += (1 + regarglen[op]); - + then it wouldn't matter. Its not clear what side effect might come from that so its not done so far. -- dmq @@ -14127,18 +15111,19 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) FILL_ADVANCE_NODE_ARG(ptr, op, arg); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reganode", __LINE__, PL_reg_name[op], - (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(RExC_emit - RExC_emit_start), (UV)(RExC_parse - RExC_start), - (UV)RExC_offsets[0])); + (UV)RExC_offsets[0])); Set_Cur_Node_Offset; } -#endif +#endif RExC_emit = ptr; return(ret); } @@ -14146,7 +15131,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) /* - reguni - emit (if appropriate) a Unicode character */ -STATIC STRLEN +PERL_STATIC_INLINE STRLEN S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) { dVAR; @@ -14207,30 +15192,32 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) StructCopy(--src, --dst, regnode); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD 20010112 */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", "reg_insert", __LINE__, PL_reg_name[op], - (UV)(dst - RExC_emit_start) > RExC_offsets[0] + (UV)(dst - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(src - RExC_emit_start), (UV)(dst - RExC_emit_start), - (UV)RExC_offsets[0])); + (UV)RExC_offsets[0])); Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); } #endif } - + place = opnd; /* Op node, where operand used to be. */ #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reginsert", __LINE__, PL_reg_name[op], - (UV)(place - RExC_emit_start) > RExC_offsets[0] + (UV)(place - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(place - RExC_emit_start), (UV)(RExC_parse - RExC_start), @@ -14238,7 +15225,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) Set_Node_Offset(place, RExC_parse); Set_Node_Length(place, 1); } -#endif +#endif src = NEXTOPER(place); FILL_ADVANCE_NODE(place, op); Zero(src, offset, regnode); @@ -14250,7 +15237,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) */ /* TODO: All three parms should be const */ STATIC void -S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) { dVAR; regnode *scan; @@ -14271,7 +15259,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tail" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), @@ -14297,7 +15285,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de - Look for optimizable sequences at the same time. - currently only looks for EXACT chains. -This is experimental code. The idea is to use this routine to perform +This is experimental code. The idea is to use this routine to perform in place optimizations on branches and groups as they are constructed, with the long term intention of removing optimization from study_chunk so that it is purely analytical. @@ -14309,7 +15297,8 @@ to control which is which. /* TODO: All four parms should be const */ STATIC U8 -S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) { dVAR; regnode *scan; @@ -14332,8 +15321,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN if (PL_regkind[OP(scan)] == EXACT) { - bool has_exactf_sharp_s; /* Unexamined in this routine */ - if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1)) + bool unfolded_multi_char; /* Unexamined in this routine */ + if (join_exact(pRExC_state, scan, &min, + &unfolded_multi_char, 1, val, depth+1)) return EXACT; } #endif @@ -14341,10 +15331,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, switch (OP(scan)) { case EXACT: case EXACTF: + case EXACTFA_NO_TRIE: case EXACTFA: case EXACTFU: case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); @@ -14359,7 +15349,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), @@ -14372,8 +15362,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, DEBUG_PARSE_r({ SV * const mysv_val=sv_newmortal(); DEBUG_PARSE_MSG(""); - regprop(RExC_rx, mysv_val, val); - PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n", + regprop(RExC_rx, mysv_val, val, NULL); + PerlIO_printf(Perl_debug_log, + "~ attach to %s (%"IVdf") offset to %"IVdf"\n", SvPV_nolen_const(mysv_val), (IV)REG_NODE_NUM(val), (IV)(val - scan) @@ -14401,7 +15392,9 @@ S_regdump_intflags(pTHX_ const char *lead, const U32 flags) int bit; int set=0; - for (bit=0; bit<32; bit++) { + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bitanchored_substr) { - RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), RE_SV_DUMPLEN(r->anchored_substr), 30); PerlIO_printf(Perl_debug_log, "anchored %s%s at %"IVdf" ", s, RE_SV_TAIL(r->anchored_substr), (IV)r->anchored_offset); } else if (r->anchored_utf8) { - RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), RE_SV_DUMPLEN(r->anchored_utf8), 30); PerlIO_printf(Perl_debug_log, "anchored utf8 %s%s at %"IVdf" ", s, RE_SV_TAIL(r->anchored_utf8), (IV)r->anchored_offset); - } + } if (r->float_substr) { - RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), RE_SV_DUMPLEN(r->float_substr), 30); PerlIO_printf(Perl_debug_log, "floating %s%s at %"IVdf"..%"UVuf" ", s, RE_SV_TAIL(r->float_substr), (IV)r->float_min_offset, (UV)r->float_max_offset); } else if (r->float_utf8) { - RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), RE_SV_DUMPLEN(r->float_utf8), 30); PerlIO_printf(Perl_debug_log, "floating utf8 %s%s at %"IVdf"..%"UVuf" ", @@ -14515,7 +15510,7 @@ Perl_regdump(pTHX_ const regexp *r) (r->check_substr == r->float_substr && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored")); - if (r->extflags & RXf_NOSCAN) + if (r->intflags & PREGf_NOSCAN) PerlIO_printf(Perl_debug_log, " noscan"); if (r->extflags & RXf_CHECK_ALL) PerlIO_printf(Perl_debug_log, " isall"); @@ -14523,22 +15518,22 @@ Perl_regdump(pTHX_ const regexp *r) PerlIO_printf(Perl_debug_log, ") "); if (ri->regstclass) { - regprop(r, sv, ri->regstclass); + regprop(r, sv, ri->regstclass, NULL); PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); } - if (r->extflags & RXf_ANCH) { + if (r->intflags & PREGf_ANCH) { PerlIO_printf(Perl_debug_log, "anchored"); - if (r->extflags & RXf_ANCH_BOL) + if (r->intflags & PREGf_ANCH_BOL) PerlIO_printf(Perl_debug_log, "(BOL)"); - if (r->extflags & RXf_ANCH_MBOL) + if (r->intflags & PREGf_ANCH_MBOL) PerlIO_printf(Perl_debug_log, "(MBOL)"); - if (r->extflags & RXf_ANCH_SBOL) + if (r->intflags & PREGf_ANCH_SBOL) PerlIO_printf(Perl_debug_log, "(SBOL)"); - if (r->extflags & RXf_ANCH_GPOS) + if (r->intflags & PREGf_ANCH_GPOS) PerlIO_printf(Perl_debug_log, "(GPOS)"); PerlIO_putc(Perl_debug_log, ' '); } - if (r->extflags & RXf_GPOS_SEEN) + if (r->intflags & PREGf_GPOS_SEEN) PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); if (r->intflags & PREGf_SKIP) PerlIO_printf(Perl_debug_log, "plus "); @@ -14560,21 +15555,11 @@ Perl_regdump(pTHX_ const regexp *r) } /* -- regprop - printable representation of opcode +- regprop - printable representation of opcode, with run time support */ -#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ -STMT_START { \ - if (do_sep) { \ - Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ - if (flags & ANYOF_INVERT) \ - /*make sure the invert info is in each */ \ - sv_catpvs(sv, "^"); \ - do_sep = 0; \ - } \ -} STMT_END void -Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) { #ifdef DEBUGGING dVAR; @@ -14590,10 +15575,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) || _CC_VERTSPACE != 16 #error Need to adjust order of anyofs[] #endif - "[\\w]", - "[\\W]", - "[\\d]", - "[\\D]", + "\\w", + "\\W", + "\\d", + "\\D", "[:alpha:]", "[:^alpha:]", "[:lower:]", @@ -14610,8 +15595,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^graph:]", "[:cased:]", "[:^cased:]", - "[\\s]", - "[\\S]", + "\\s", + "\\S", "[:blank:]", "[:^blank:]", "[:xdigit:]", @@ -14622,12 +15607,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^cntrl:]", "[:ascii:]", "[:^ascii:]", - "[\\v]", - "[\\V]" + "\\v", + "\\V" }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; - + PERL_ARGS_ASSERT_REGPROP; sv_setpvs(sv, ""); @@ -14635,16 +15620,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ /* It would be nice to FAIL() here, but this may be called from regexec.c, and it would be hard to supply pRExC_state. */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(o), (int)REGNODE_MAX); sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[OP(o)]; if (k == EXACT) { sv_catpvs(sv, " "); - /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) - * is a crude hack but it may be the best for now since - * we have no flag "this EXACTish node was UTF-8" + /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) + * is a crude hack but it may be the best for now since + * we have no flag "this EXACTish node was UTF-8" * --jhi */ pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], PERL_PV_ESCAPE_UNI_DETECT | @@ -14663,44 +15649,28 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) NULL; const reg_trie_data * const trie = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; - + Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); DEBUG_TRIE_COMPILE_r( - Perl_sv_catpvf(aTHX_ sv, - "", - (UV)trie->startstate, - (IV)trie->statecount-1, /* -1 because of the unused 0 element */ - (UV)trie->wordcount, - (UV)trie->minlen, - (UV)trie->maxlen, - (UV)TRIE_CHARCOUNT(trie), - (UV)trie->uniquecharcount - ) + Perl_sv_catpvf(aTHX_ sv, + "", + (UV)trie->startstate, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ); ); if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { - int i; - int rangestart = -1; - U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); sv_catpvs(sv, "["); - for (i = 0; i <= 256; i++) { - if (i < 256 && BITMAP_TEST(bitmap,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - rangestart = -1; - } - } + (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) + ? ANYOF_BITMAP(o) + : TRIE_BITMAP(trie)); sv_catpvs(sv, "]"); - } - + } + } else if (k == CURLY) { if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ @@ -14708,7 +15678,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); - else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { + else if (k == REF || k == OPEN || k == CLOSE + || k == GROUPP || OP(o)==ACCEPT) + { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { if ( k != REF || (OP(o) < NREF)) { @@ -14716,7 +15688,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) SV **name= av_fetch(list, ARG(o), 0 ); if (name) Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); - } + } else { AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); @@ -14731,22 +15703,37 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); } } - } - } else if (k == GOSUB) - Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */ + } + if ( k == REF && reginfo) { + U32 n = ARG(o); /* which paren pair */ + I32 ln = prog->offs[n].start; + if (prog->lastparen < n || ln == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } + } else if (k == GOSUB) + /* Paren and offset */ + Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); else if (k == VERB) { - if (!o->flags) - Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + if (!o->flags) + Perl_sv_catpvf(aTHX_ sv, ":%"SVf, SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) - Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + /* 2: embedded, otherwise 1 */ + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { - int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; - if (flags & ANYOF_LOCALE) + if (flags & ANYOF_LOCALE_FLAGS) sv_catpvs(sv, "{loc}"); if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); @@ -14755,140 +15742,137 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) sv_catpvs(sv, "^"); /* output what the standard cp 0-255 bitmap matches */ - for (i = 0; i <= 256; i++) { - if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - do_sep = 1; - rangestart = -1; - } - } - - EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - /* output any special charclass tests (used entirely under use locale) */ - if (ANYOF_CLASS_TEST_ANY_SET(o)) - for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) - if (ANYOF_CLASS_TEST(o,i)) { + do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); + + /* output any special charclass tests (used entirely under use + * locale) * */ + if (ANYOF_POSIXL_TEST_ANY_SET(o)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(o,i)) { sv_catpv(sv, anyofs[i]); do_sep = 1; } - - EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - - if (flags & ANYOF_NON_UTF8_LATIN1_ALL) { - sv_catpvs(sv, "{non-utf8-latin1-all}"); - } - - /* output information about the unicode matching */ - if (flags & ANYOF_UNICODE_ALL) - sv_catpvs(sv, "{unicode_all}"); - else if (ANYOF_NONBITMAP(o)) - sv_catpvs(sv, "{unicode}"); - if (flags & ANYOF_NONBITMAP_NON_UTF8) - sv_catpvs(sv, "{outside bitmap}"); - - if (ANYOF_NONBITMAP(o)) { - SV *lv; /* Set if there is something outside the bit map */ - SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL); - bool byte_output = FALSE; /* If something in the bitmap has been - output */ - - if (lv && lv != &PL_sv_undef) { - if (sw) { - U8 s[UTF8_MAXBYTES_CASE+1]; - - for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */ - uvchr_to_utf8(s, i); - - if (i < 256 - && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate - things already - output as part - of the bitmap */ - && swash_fetch(sw, s, TRUE)) - { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - byte_output = TRUE; - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) { - put_byte(sv, rangestart); - } - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i-1); - } - rangestart = -1; - } - } - } + } + } - { - char *s = savesvpv(lv); - char * const origs = s; + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) + { + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + if (flags & ANYOF_INVERT) + /*make sure the invert info is in each */ + sv_catpvs(sv, "^"); + } - while (*s && *s != '\n') - s++; + if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { + sv_catpvs(sv, "{non-utf8-latin1-all}"); + } - if (*s == '\n') { - const char * const t = ++s; + /* output information about the unicode matching */ + if (flags & ANYOF_ABOVE_LATIN1_ALL) + sv_catpvs(sv, "{unicode_all}"); + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { + SV *lv; /* Set if there is something outside the bit map. */ + bool byte_output = FALSE; /* If something in the bitmap has + been output */ + SV *only_utf8_locale; + + /* Get the stuff that wasn't in the bitmap */ + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); + if (lv && lv != &PL_sv_undef) { + char *s = savesvpv(lv); + char * const origs = s; + + while (*s && *s != '\n') + s++; + + if (*s == '\n') { + const char * const t = ++s; + + if (flags & ANYOF_NONBITMAP_NON_UTF8) { + sv_catpvs(sv, "{outside bitmap}"); + } + else { + sv_catpvs(sv, "{utf8}"); + } if (byte_output) { sv_catpvs(sv, " "); } - while (*s) { - if (*s == '\n') { + while (*s) { + if (*s == '\n') { /* Truncate very long output */ - if (s - origs > 256) { - Perl_sv_catpvf(aTHX_ sv, - "%.*s...", - (int) (s - origs - 1), - t); - goto out_dump; - } - *s = ' '; - } - else if (*s == '\t') { - *s = '-'; - } - s++; - } - if (s[-1] == ' ') - s[-1] = 0; + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } + *s = ' '; + } + else if (*s == '\t') { + *s = '-'; + } + s++; + } + if (s[-1] == ' ') + s[-1] = 0; - sv_catpv(sv, t); - } + sv_catpv(sv, t); + } - out_dump: + out_dump: - Safefree(origs); - } - SvREFCNT_dec_NN(lv); - } + Safefree(origs); + SvREFCNT_dec_NN(lv); + } + + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } + } + invlist_iterfinish(only_utf8_locale); + } + } } Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; - if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { - Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + if (index < C_ARRAY_LENGTH(anyofs)) { + if (*anyofs[index] != '[') { + sv_catpv(sv, "["); + } + sv_catpv(sv, anyofs[index]); + if (*anyofs[index] != '[') { + sv_catpv(sv, "]"); + } } else { - sv_catpv(sv, anyofs[index]); + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); } } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) @@ -14898,9 +15882,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) PERL_UNUSED_ARG(sv); PERL_UNUSED_ARG(o); PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); #endif /* DEBUGGING */ } + + SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ @@ -14930,17 +15917,17 @@ Perl_re_intuit_string(pTHX_ REGEXP * const r) return prog->check_substr ? prog->check_substr : prog->check_utf8; } -/* - pregfree() - - handles refcounting and freeing the perl core regexp structure. When - it is necessary to actually free the structure the first thing it +/* + pregfree() + + handles refcounting and freeing the perl core regexp structure. When + it is necessary to actually free the structure the first thing it does is call the 'free' method of the regexp_engine associated to - the regexp, allowing the handling of the void *pprivate; member - first. (This routine is not overridable by extensions, which is why + the regexp, allowing the handling of the void *pprivate; member + first. (This routine is not overridable by extensions, which is why the extensions free is called first.) - - See regdupe and regdupe_internal if you change anything here. + + See regdupe and regdupe_internal if you change anything here. */ #ifndef PERL_IN_XSUB_RE void @@ -14964,7 +15951,7 @@ Perl_pregfree2(pTHX_ REGEXP *rx) CALLREGFREE_PVT(rx); /* free the private data */ SvREFCNT_dec(RXp_PAREN_NAMES(r)); Safefree(r->xpv_len_u.xpvlenu_pv); - } + } if (r->substrs) { SvREFCNT_dec(r->anchored_substr); SvREFCNT_dec(r->anchored_utf8); @@ -14982,22 +15969,22 @@ Perl_pregfree2(pTHX_ REGEXP *rx) } /* reg_temp_copy() - + This is a hacky workaround to the structural issue of match results being stored in the regexp structure which is in turn stored in PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern could be PL_curpm in multiple contexts, and could require multiple result sets being associated with the pattern simultaneously, such as when doing a recursive match with (??{$qr}) - - The solution is to make a lightweight copy of the regexp structure + + The solution is to make a lightweight copy of the regexp structure when a qr// is returned from the code executed by (??{$qr}) this lightweight copy doesn't actually own any of its data except for - the starp/end and the actual regexp structure itself. - -*/ - - + the starp/end and the actual regexp structure itself. + +*/ + + REGEXP * Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) { @@ -15030,7 +16017,7 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) sv_force_normal(sv) is called. */ SvFAKE_on(ret_x); ret = ReANY(ret_x); - + SvFLAGS(ret_x) |= SvUTF8(rx); /* We share the same string buffer as the original regexp, on which we hold a reference count, incremented when mother_re is set below. @@ -15061,23 +16048,23 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) #endif ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); SvREFCNT_inc_void(ret->qr_anoncv); - + return ret_x; } #endif -/* regfree_internal() +/* regfree_internal() - Free the private data in a regexp. This is overloadable by - extensions. Perl takes care of the regexp structure in pregfree(), - this covers the *pprivate pointer which technically perl doesn't - know about, however of course we have to handle the - regexp_internal structure when no extension is in use. - - Note this is called before freeing anything in the regexp - structure. + Free the private data in a regexp. This is overloadable by + extensions. Perl takes care of the regexp structure in pregfree(), + this covers the *pprivate pointer which technically perl doesn't + know about, however of course we have to handle the + regexp_internal structure when no extension is in use. + + Note this is called before freeing anything in the regexp + structure. */ - + void Perl_regfree_internal(pTHX_ REGEXP * const rx) { @@ -15095,7 +16082,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RX_UTF8(rx), dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); - PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", + PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", PL_colors[4],PL_colors[5],s); } }); @@ -15129,7 +16116,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) case 'l': case 'L': break; - case 'T': + case 'T': { /* Aho Corasick add-on structure for a trie node. Used in stclass optimization only */ U32 refcount; @@ -15169,7 +16156,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) } break; default: - Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]); + Perl_croak(aTHX_ "panic: regfree data code '%c'", + ri->data->what[n]); } } Safefree(ri->data->what); @@ -15183,9 +16171,9 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t)) #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) -/* - re_dup - duplicate a regexp. - +/* + re_dup - duplicate a regexp. + This routine is expected to clone a given regexp structure. It is only compiled under USE_ITHREADS. @@ -15194,7 +16182,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) stored in the *pprivate pointer. This allows extensions to handle any duplication it needs to do. - See pregfree() and regfree_internal() if you change anything here. + See pregfree() and regfree_internal() if you change anything here. */ #if defined(USE_ITHREADS) #ifndef PERL_IN_XSUB_RE @@ -15205,7 +16193,7 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) I32 npar; const struct regexp *r = ReANY(sstr); struct regexp *ret = ReANY(dstr); - + PERL_ARGS_ASSERT_RE_DUP_GUTS; npar = r->nparens+1; @@ -15272,21 +16260,20 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) so we need to copy it locally. */ RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); ret->mother_re = NULL; - ret->gofs = 0; } #endif /* PERL_IN_XSUB_RE */ /* regdupe_internal() - + This is the internal complement to regdupe() which is used to copy the structure pointed to by the *pprivate pointer in the regexp. This is the core version of the extension overridable cloning hook. The regexp structure being duplicated will be copied by perl prior - to this and will be provided as the regexp *r argument, however + to this and will be provided as the regexp *r argument, however with the /old/ structures pprivate pointer value. Thus this routine may override any copying normally done by perl. - + It returns a pointer to the new regexp_internal structure. */ @@ -15300,10 +16287,11 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) RXi_GET_DECL(r,ri); PERL_ARGS_ASSERT_REGDUPE_INTERNAL; - + len = ProgLen(ri); - - Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); + + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), + char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); reti->num_code_blocks = ri->num_code_blocks; @@ -15345,9 +16333,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) break; case 'f': /* This is cheating. */ - Newx(d->data[i], 1, struct regnode_charclass_class); - StructCopy(ri->data->data[i], d->data[i], - struct regnode_charclass_class); + Newx(d->data[i], 1, regnode_ssc); + StructCopy(ri->data->data[i], d->data[i], regnode_ssc); reti->regstclass = (regnode*)d->data[i]; break; case 'T': @@ -15367,7 +16354,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) d->data[i] = ri->data->data[i]; break; default: - Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]); + Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", + ri->data->what[i]); } } @@ -15407,7 +16395,8 @@ Perl_regnext(pTHX_ regnode *p) return(NULL); if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX); + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(p), (int)REGNODE_MAX); } offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); @@ -15419,7 +16408,7 @@ Perl_regnext(pTHX_ regnode *p) #endif STATIC void -S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) +S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) { va_list args; STRLEN l1 = strlen(pat1); @@ -15438,20 +16427,15 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) Copy(pat2, buf + l1, l2 , char); buf[l1 + l2] = '\n'; buf[l1 + l2 + 1] = '\0'; -#ifdef I_STDARG - /* ANSI variant takes additional second argument */ va_start(args, pat2); -#else - va_start(args); -#endif msv = vmess(buf, &args); va_end(args); message = SvPV_const(msv,l1); if (l1 > 512) l1 = 512; Copy(message, buf, l1 , char); - buf[l1-1] = '\0'; /* Overwrite \n */ - Perl_croak(aTHX_ "%s", buf); + /* l1-1 to avoid \n */ + Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); } /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ @@ -15469,7 +16453,8 @@ Perl_save_re_context(pTHX) U32 i; for (i = 1; i <= RX_NPARENS(rx); i++) { char digits[TYPE_CHARS(long)]; - const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); + const STRLEN len = my_snprintf(digits, sizeof(digits), + "%lu", (long)i); GV *const *const gvp = (GV**)hv_fetch(PL_defstash, digits, len, 0); @@ -15491,26 +16476,18 @@ S_put_byte(pTHX_ SV *sv, int c) { PERL_ARGS_ASSERT_PUT_BYTE; - /* Our definition of isPRINT() ignores locales, so only bytes that are - not part of UTF-8 are considered printable. I assume that the same - holds for UTF-EBCDIC. - Also, code point 255 is not printable in either (it's E0 in EBCDIC, - which Wikipedia says: - - EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all - ones (binary 1111 1111, hexadecimal FF). It is similar, but not - identical, to the ASCII delete (DEL) or rubout control character. ... - it is typically mapped to hexadecimal code 9F, in order to provide a - unique character mapping in both directions) - - So the old condition can be simplified to !isPRINT(c) */ if (!isPRINT(c)) { - if (c < 256) { - Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c); - } - else { - Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); - } + switch (c) { + case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; + case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; + case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; + case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; + case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; + + default: + Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + break; + } } else { const char string = c; @@ -15520,25 +16497,104 @@ S_put_byte(pTHX_ SV *sv, int c) } } +STATIC void +S_put_range(pTHX_ SV *sv, UV start, UV end) +{ + + /* Appends to 'sv' a displayable version of the range of code points from + * 'start' to 'end' */ + + assert(start <= end); + + PERL_ARGS_ASSERT_PUT_RANGE; + + if (end - start < 3) { /* Individual chars in short ranges */ + for (; start <= end; start++) + put_byte(sv, start); + } + else if ( end > 255 + || ! isALPHANUMERIC(start) + || ! isALPHANUMERIC(end) + || isDIGIT(start) != isDIGIT(end) + || isUPPER(start) != isUPPER(end) + || isLOWER(start) != isLOWER(end) + + /* This final test should get optimized out except on EBCDIC + * platforms, where it causes ranges that cross discontinuities + * like i/j to be shown as hex instead of the misleading, + * e.g. H-K (since that range includes more than H, I, J, K). + * */ + || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start)) + { + Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", + start, + (end < 256) ? end : 255); + } + else { /* Here, the ends of the range are both digits, or both uppercase, + or both lowercase; and there's no discontinuity in the range + (which could happen on EBCDIC platforms) */ + put_byte(sv, start); + sv_catpvs(sv, "-"); + put_byte(sv, end); + } +} + +STATIC bool +S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually + * output anything */ + + int i; + bool has_output_anything = FALSE; + + PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; + + for (i = 0; i < 256; i++) { + if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) { + + /* The character at index i should be output. Find the next + * character that should NOT be output */ + int j; + for (j = i + 1; j <= 256; j++) { + if (! BITMAP_TEST((U8 *) bitmap, j)) { + break; + } + } + + /* Everything between them is a single range that should be output + * */ + put_range(sv, i, j - 1); + has_output_anything = TRUE; + i = j; + } + } + + return has_output_anything; +} #define CLEAR_OPTSTART \ - if (optstart) STMT_START { \ - DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ - optstart=NULL; \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ + " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ } STMT_END -#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); +#define DUMPUNTIL(b,e) \ + CLEAR_OPTSTART; \ + node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); STATIC const regnode * S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, - const regnode *last, const regnode *plast, + const regnode *last, const regnode *plast, SV* sv, I32 indent, U32 depth) { dVAR; U8 op = PSEUDO; /* Arbitrary non-END op. */ const regnode *next; const regnode *optstart= NULL; - + RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; @@ -15548,8 +16604,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, last ? last-start : 0,plast ? plast-start : 0); #endif - - if (plast && plast < last) + + if (plast && plast < last) last= plast; while (PL_regkind[op] != END && (!last || node < last)) { @@ -15569,20 +16625,21 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else CLEAR_OPTSTART; - regprop(r, sv, node); + regprop(r, sv, node, NULL); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); - - if (OP(node) != OPTIMIZED) { + + if (OP(node) != OPTIMIZED) { if (next == NULL) /* Next ptr. */ PerlIO_printf(Perl_debug_log, " (0)"); - else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) + else if (PL_regkind[(U8)op] == BRANCH + && PL_regkind[OP(next)] != BRANCH ) PerlIO_printf(Perl_debug_log, " (FAIL)"); - else + else PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); - (void)PerlIO_putc(Perl_debug_log, '\n'); + (void)PerlIO_putc(Perl_debug_log, '\n'); } - + after_print: if (PL_regkind[(U8)op] == BRANCHJ) { assert(next); @@ -15609,7 +16666,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const reg_trie_data * const trie = (reg_trie_data*)ri->data->data[optrie]; #ifdef DEBUGGING - AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); + AV *const trie_words + = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); #endif const regnode *nextbranch= NULL; I32 word_idx; @@ -15619,21 +16677,25 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, PerlIO_printf(Perl_debug_log, "%*s%s ", (int)(2*(indent+3)), "", - elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, - PL_colors[0], PL_colors[1], - (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELLIPSES | - PERL_PV_PRETTY_LTGT + elem_ptr + ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), + SvCUR(*elem_ptr), 60, + PL_colors[0], PL_colors[1], + (SvUTF8(*elem_ptr) + ? PERL_PV_ESCAPE_UNI + : 0) + | PERL_PV_PRETTY_ELLIPSES + | PERL_PV_PRETTY_LTGT ) - : "???" + : "???" ); if (trie->jump) { U16 dist= trie->jump[word_idx+1]; PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", - (UV)((dist ? this_trie + dist : next) - start)); + (UV)((dist ? this_trie + dist : next) - start)); if (dist) { if (!nextbranch) - nextbranch= this_trie + trie->jump[0]; + nextbranch= this_trie + trie->jump[0]; DUMPUNTIL(this_trie + dist, nextbranch); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) @@ -15660,8 +16722,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else if (PL_regkind[(U8)op] == ANYOF) { /* arglen 1 + class block */ - node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS) - ? ANYOF_CLASS_SKIP : ANYOF_SKIP); + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); node = NEXTOPER(node); } else if (PL_regkind[(U8)op] == EXACT) { @@ -15677,7 +16740,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, indent++; } CLEAR_OPTSTART; -#ifdef DEBUG_DUMPUNTIL +#ifdef DEBUG_DUMPUNTIL PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); #endif return node; diff --git a/src/5019002/orig/regexec.c b/src/5021000/orig/regexec.c similarity index 77% rename from src/5019002/orig/regexec.c rename to src/5021000/orig/regexec.c index 3869d04..362390b 100644 --- a/src/5019002/orig/regexec.c +++ b/src/5021000/orig/regexec.c @@ -37,16 +37,6 @@ #include "re_top.h" #endif -/* At least one required character in the target string is expressible only in - * UTF-8. */ -static const char* const non_utf8_target_but_utf8_required - = "Can't match, because target string needs to be in UTF-8\n"; - -#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ - goto target; \ -} STMT_END - /* * pregcomp and pregexec -- regsub and regerror are not used in perl * @@ -93,16 +83,28 @@ static const char* const non_utf8_target_but_utf8_required #include "inline_invlist.c" #include "unicode_constants.h" +#ifdef DEBUGGING +/* At least one required character in the target string is expressible only in + * UTF-8. */ +static const char* const non_utf8_target_but_utf8_required + = "Can't match, because target string needs to be in UTF-8\n"; +#endif + +#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ + goto target; \ +} STMT_END + #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #ifndef STATIC #define STATIC static #endif -/* Valid for non-utf8 strings: avoids the reginclass +/* Valid only for non-utf8 strings: avoids the reginclass * call if there are no complications: i.e., if everything matchable is * straight forward in the bitmap */ -#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \ : ANYOF_BITMAP_TEST(p,*(c))) /* @@ -117,6 +119,7 @@ static const char* const non_utf8_target_but_utf8_required ? reghop3((U8*)pos, off, \ (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ : (U8*)(pos + off)) + #define HOPBACKc(pos, off) \ (char*)(reginfo->is_utf8_target \ ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \ @@ -127,6 +130,24 @@ static const char* const non_utf8_target_but_utf8_required #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) +/* lim must be +ve. Returns NULL on overshoot */ +#define HOPMAYBE3(pos,off,lim) \ + (reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ + : ((U8*)pos + off <= lim) \ + ? (U8*)pos + off \ + : NULL) + +/* like HOP3, but limits the result to <= lim even for the non-utf8 case. + * off must be >=0; args should be vars rather than expressions */ +#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ + ? reghop3((U8*)(pos), off, (U8*)(lim)) \ + : (U8*)((pos + off) > lim ? lim : (pos + off))) + +#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ + ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ + : (U8*)(pos + off)) +#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ #define NEXTCHR_IS_EOS (nextchr < 0) @@ -139,11 +160,11 @@ static const char* const non_utf8_target_but_utf8_required SET_nextchr -#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \ +#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \ if (!swash_ptr) { \ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \ swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \ - 1, 0, NULL, &flags); \ + 1, 0, invlist, &flags); \ assert(swash_ptr); \ } \ } STMT_END @@ -152,28 +173,33 @@ static const char* const non_utf8_target_but_utf8_required #ifdef DEBUGGING # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ property_name, \ + invlist, \ utf8_char_in_property) \ - LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \ assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); #else # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ property_name, \ + invlist, \ utf8_char_in_property) \ - LOAD_UTF8_CHARCLASS(swash_ptr, property_name) + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) #endif #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ PL_utf8_swash_ptrs[_CC_WORDCHAR], \ - swash_property_names[_CC_WORDCHAR], \ - GREEK_SMALL_LETTER_IOTA_UTF8) + "", \ + PL_XPosix_ptrs[_CC_WORDCHAR], \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ STMT_START { \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ "_X_regular_begin", \ - GREEK_SMALL_LETTER_IOTA_UTF8); \ + NULL, \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ "_X_extend", \ + NULL, \ COMBINING_GRAVE_ACCENT_UTF8); \ } STMT_END @@ -190,14 +216,14 @@ static const char* const non_utf8_target_but_utf8_required * although it may be done at run time beause of the REF possibility - more * investigation required. -- demerphq */ -#define JUMPABLE(rn) ( \ - OP(rn) == OPEN || \ +#define JUMPABLE(rn) ( \ + OP(rn) == OPEN || \ (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ - OP(rn) == EVAL || \ - OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ - OP(rn) == PLUS || OP(rn) == MINMOD || \ - OP(rn) == KEEPS || \ - (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ + OP(rn) == EVAL || \ + OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ + OP(rn) == PLUS || OP(rn) == MINMOD || \ + OP(rn) == KEEPS || \ + (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ ) #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) @@ -207,13 +233,13 @@ static const char* const non_utf8_target_but_utf8_required /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -223,7 +249,7 @@ static const char* const non_utf8_target_but_utf8_required Search for mandatory following text node; for lookahead, the text must follow but for lookbehind (rn->flags != 0) we skip to the next step. */ -#define FIND_NEXT_IMPT(rn) STMT_START { \ +#define FIND_NEXT_IMPT(rn) STMT_START { \ while (JUMPABLE(rn)) { \ const OPCODE type = OP(rn); \ if (type == SUSPEND || PL_regkind[type] == CURLY) \ @@ -274,8 +300,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) PERL_ARGS_ASSERT_REGCPPUSH; if (paren_elems_to_push < 0) - Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0", - paren_elems_to_push); + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i", + paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS); if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf @@ -296,8 +322,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) ); for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ - SSPUSHINT(rex->offs[p].end); - SSPUSHINT(rex->offs[p].start); + SSPUSHIV(rex->offs[p].end); + SSPUSHIV(rex->offs[p].start); SSPUSHINT(rex->offs[p].start_tmp); DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", @@ -369,10 +395,10 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) ); paren = *maxopenparen_p; for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { - I32 tmps; + SSize_t tmps; rex->offs[paren].start_tmp = SSPOPINT; - rex->offs[paren].start = SSPOPINT; - tmps = SSPOPINT; + rex->offs[paren].start = SSPOPIV; + tmps = SSPOPIV; if (paren <= rex->lastparen) rex->offs[paren].end = tmps; DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, @@ -484,7 +510,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) } else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { return isFOO_lc(classnum, - TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1))); + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); } if (classnum < _FIRST_NON_SWASH_CC) { @@ -492,8 +518,11 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) /* Initialize the swash unless done already */ if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", - swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags); + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) @@ -526,7 +555,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) */ I32 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, - char *strbeg, I32 minend, SV *screamer, U32 nosave) + char *strbeg, SSize_t minend, SV *screamer, U32 nosave) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -543,68 +572,70 @@ Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, } #endif -/* - * Need to implement the following flags for reg_anch: - * - * USE_INTUIT_NOML - Useful to call re_intuit_start() first - * USE_INTUIT_ML - * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer - * INTUIT_AUTORITATIVE_ML - * INTUIT_ONCE_NOML - Intuit can match in one location only. - * INTUIT_ONCE_ML - * - * Another flag for this function: SECOND_TIME (so that float substrs - * with giant delta may be not rechecked). - */ - -/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ - -/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend. - Otherwise, only SvCUR(sv) is used to get strbeg. */ - -/* XXXX We assume that strpos is strbeg unless sv. */ -/* XXXX Some places assume that there is a fixed substring. - An update may be needed if optimizer marks as "INTUITable" - RExen without fixed substrings. Similarly, it is assumed that - lengths of all the strings are no more than minlen, thus they - cannot come from lookahead. - (Or minlen should take into account lookahead.) - NOTE: Some of this comment is not correct. minlen does now take account - of lookahead/behind. Further research is required. -- demerphq -*/ - -/* A failure to find a constant substring means that there is no need to make - an expensive call to REx engine, thus we celebrate a failure. Similarly, - finding a substring too deep into the string means that fewer calls to - regtry() should be needed. - - REx compiler's optimizer found 4 possible hints: - a) Anchored substring; - b) Fixed substring; - c) Whether we are anchored (beginning-of-line or \G); - d) First node (of those at offset 0) which may distinguish positions; - We use a)b)d) and multiline-part of c), and try to find a position in the - string which does not contradict any of them. - */ - -/* Most of decisions we do here should have been done at compile time. - The nodes of the REx which we used for the search should have been - deleted from the finite automaton. */ - -/* args: - * rx: the regex to match against - * sv: the SV being matched: only used for utf8 flag; the string - * itself is accessed via the pointers below. Note that on - * something like an overloaded SV, SvPOK(sv) may be false - * and the string pointers may point to something unrelated to - * the SV itself. - * strbeg: real beginning of string - * strpos: the point in the string at which to begin matching - * strend: pointer to the byte following the last char of the string - * flags currently unused; set to 0 - * data: currently unused; set to NULL +/* re_intuit_start(): + * + * Based on some optimiser hints, try to find the earliest position in the + * string where the regex could match. + * + * rx: the regex to match against + * sv: the SV being matched: only used for utf8 flag; the string + * itself is accessed via the pointers below. Note that on + * something like an overloaded SV, SvPOK(sv) may be false + * and the string pointers may point to something unrelated to + * the SV itself. + * strbeg: real beginning of string + * strpos: the point in the string at which to begin matching + * strend: pointer to the byte following the last char of the string + * flags currently unused; set to 0 + * data: currently unused; set to NULL + * + * The basic idea of re_intuit_start() is to use some known information + * about the pattern, namely: + * + * a) the longest known anchored substring (i.e. one that's at a + * constant offset from the beginning of the pattern; but not + * necessarily at a fixed offset from the beginning of the + * string); + * b) the longest floating substring (i.e. one that's not at a constant + * offset from the beginning of the pattern); + * c) Whether the pattern is anchored to the string; either + * an absolute anchor: /^../, or anchored to \n: /^.../m, + * or anchored to pos(): /\G/; + * d) A start class: a real or synthetic character class which + * represents which characters are legal at the start of the pattern; + * + * to either quickly reject the match, or to find the earliest position + * within the string at which the pattern might match, thus avoiding + * running the full NFA engine at those earlier locations, only to + * eventually fail and retry further along. + * + * Returns NULL if the pattern can't match, or returns the address within + * the string which is the earliest place the match could occur. + * + * The longest of the anchored and floating substrings is called 'check' + * and is checked first. The other is called 'other' and is checked + * second. The 'other' substring may not be present. For example, + * + * /(abc|xyz)ABC\d{0,3}DEFG/ + * + * will have + * + * check substr (float) = "DEFG", offset 6..9 chars + * other substr (anchored) = "ABC", offset 3..3 chars + * stclass = [ax] + * + * Be aware that during the course of this function, sometimes 'anchored' + * refers to a substring being anchored relative to the start of the + * pattern, and sometimes to the pattern itself being anchored relative to + * the string. For example: + * + * /\dabc/: "abc" is anchored to the pattern; + * /^\dabc/: "abc" is anchored to the pattern and the string; + * /\d+abc/: "abc" is anchored to neither the pattern nor the string; + * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, + * but the pattern is anchored to the string. */ char * @@ -619,34 +650,60 @@ Perl_re_intuit_start(pTHX_ { dVAR; struct regexp *const prog = ReANY(rx); - I32 start_shift = 0; + SSize_t start_shift = prog->check_offset_min; /* Should be nonnegative! */ - I32 end_shift = 0; - char *s; + SSize_t end_shift = 0; + /* current lowest pos in string where the regex can start matching */ + char *rx_origin = strpos; SV *check; - char *t; const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ - I32 ml_anch; - char *other_last = NULL; /* other substr checked before this */ + U8 other_ix = 1 - prog->substrs->check_ix; + bool ml_anch = 0; + char *other_last = strpos;/* latest pos 'other' substr already checked to */ char *check_at = NULL; /* check substr found at this pos */ - char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/ const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; RXi_GET_DECL(prog,progi); regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ regmatch_info *const reginfo = ®info_buf; -#ifdef DEBUGGING - const char * const i_strpos = strpos; -#endif GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_INTUIT_START; PERL_UNUSED_ARG(flags); PERL_UNUSED_ARG(data); - /* CHR_DIST() would be more correct here but it makes things slow. */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: trying to determine minimum start position...\n")); + + /* for now, assume that all substr offsets are positive. If at some point + * in the future someone wants to do clever things with look-behind and + * -ve offsets, they'll need to fix up any code in this function + * which uses these offsets. See the thread beginning + * <20140113145929.GF27210@iabyn.com> + */ + assert(prog->substrs->data[0].min_offset >= 0); + assert(prog->substrs->data[0].max_offset >= 0); + assert(prog->substrs->data[1].min_offset >= 0); + assert(prog->substrs->data[1].max_offset >= 0); + assert(prog->substrs->data[2].min_offset >= 0); + assert(prog->substrs->data[2].max_offset >= 0); + + /* for now, assume that if both present, that the floating substring + * doesn't start before the anchored substring. + * If you break this assumption (e.g. doing better optimisations + * with lookahead/behind), then you'll need to audit the code in this + * function carefully first + */ + assert( + ! ( (prog->anchored_utf8 || prog->anchored_substr) + && (prog->float_utf8 || prog->float_substr)) + || (prog->float_min_offset >= prog->anchored_offset)); + + /* byte rather than char calculation for efficiency. It fails + * to quickly reject some cases that can't match, but will reject + * them later after doing full char arithmetic */ if (prog->minlen > strend - strpos) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "String too short... [re_intuit_start]\n")); + " String too short...\n")); goto fail; } @@ -671,551 +728,702 @@ Perl_re_intuit_start(pTHX_ } check = prog->check_substr; } - if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */ - ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE) - || ( (prog->extflags & RXf_ANCH_BOL) - && !multiline ) ); /* Check after \n? */ - - if (!ml_anch) { - if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */ - && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ - && (strpos != strbeg)) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); - goto fail; - } - if (prog->check_offset_min == prog->check_offset_max - && !(prog->extflags & RXf_CANY_SEEN) - && ! multiline) /* /m can cause \n's to match that aren't - accounted for in the string max length. - See [perl #115242] */ - { - /* Substring at constant offset from beg-of-str... */ - I32 slen; - - s = HOP3c(strpos, prog->check_offset_min, strend); - - if (SvTAIL(check)) { - slen = SvCUR(check); /* >= 1 */ - if ( strend - s > slen || strend - s < slen - 1 - || (strend - s == slen && strend[-1] != '\n')) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); - goto fail_finish; - } - /* Now should match s[0..slen-2] */ - slen--; - if (slen && (*SvPVX_const(check) != *s - || (slen > 1 - && memNE(SvPVX_const(check), s, slen)))) { - report_neq: - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); - goto fail_finish; - } + /* dump the various substring data */ + DEBUG_OPTIMISE_MORE_r({ + int i; + for (i=0; i<=2; i++) { + SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr + : prog->substrs->data[i].substr); + if (!sv) + continue; + + PerlIO_printf(Perl_debug_log, + " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf + " useful=%"IVdf" utf8=%d [%s]\n", + i, + (IV)prog->substrs->data[i].min_offset, + (IV)prog->substrs->data[i].max_offset, + (IV)prog->substrs->data[i].end_shift, + BmUSEFUL(sv), + utf8_target ? 1 : 0, + SvPEEK(sv)); + } + }); + + if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ + + /* ml_anch: check after \n? + * + * A note about IMPLICIT: on an un-anchored pattern beginning + * with /.*.../, these flags will have been added by the + * compiler: + * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL + * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL + */ + ml_anch = (prog->intflags & PREGf_ANCH_MBOL) + && !(prog->intflags & PREGf_IMPLICIT); + + if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { + /* we are only allowed to match at BOS or \G */ + + /* trivially reject if there's a BOS anchor and we're not at BOS. + * + * Note that we don't try to do a similar quick reject for + * \G, since generally the caller will have calculated strpos + * based on pos() and gofs, so the string is already correctly + * anchored by definition; and handling the exceptions would + * be too fiddly (e.g. REXEC_IGNOREPOS). + */ + if ( strpos != strbeg + && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Not at start...\n")); + goto fail; } - else if (*SvPVX_const(check) != *s - || ((slen = SvCUR(check)) > 1 - && memNE(SvPVX_const(check), s, slen))) - goto report_neq; - check_at = s; - goto success_at_start; - } - } - /* Match is anchored, but substr is not anchored wrt beg-of-str. */ - s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->check_end_shift; - - if (!ml_anch) { - const I32 end = prog->check_offset_max + CHR_SVLEN(check) - - (SvTAIL(check) != 0); - const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; - if (end_shift < eshift) - end_shift = eshift; + /* in the presence of an anchor, the anchored (relative to the + * start of the regex) substr must also be anchored relative + * to strpos. So quickly reject if substr isn't found there. + * This works for \G too, because the caller will already have + * subtracted gofs from pos, and gofs is the offset from the + * \G to the start of the regex. For example, in /.abc\Gdef/, + * where substr="abcdef", pos()=3, gofs=4, offset_min=1: + * caller will have set strpos=pos()-4; we look for the substr + * at position pos()-4+1, which lines up with the "a" */ + + if (prog->check_offset_min == prog->check_offset_max + && !(prog->intflags & PREGf_CANY_SEEN)) + { + /* Substring at constant offset from beg-of-str... */ + SSize_t slen = SvCUR(check); + char *s = HOP3c(strpos, prog->check_offset_min, strend); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Looking for check substr at fixed offset %"IVdf"...\n", + (IV)prog->check_offset_min)); + + if (SvTAIL(check)) { + /* In this case, the regex is anchored at the end too. + * Unless it's a multiline match, the lengths must match + * exactly, give or take a \n. NB: slen >= 1 since + * the last char of check is \n */ + if (!multiline + && ( strend - s > slen + || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n'))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too long...\n")); + goto fail_finish; + } + /* Now should match s[0..slen-2] */ + slen--; + } + if (slen && (*SvPVX_const(check) != *s + || (slen > 1 && memNE(SvPVX_const(check), s, slen)))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String not equal...\n")); + goto fail_finish; + } + + check_at = s; + goto success_at_start; + } } } - else { /* Can match at random position */ - ml_anch = 0; - s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->check_end_shift; - - /* end shift should be non negative here */ - } -#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */ + end_shift = prog->check_end_shift; + +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", (IV)end_shift, RX_PRECOMP(prog)); #endif restart: - /* Find a possible match in the region s..strend by looking for - the "check" substring in the region corrected by start/end_shift. */ + /* This is the (re)entry point of the main loop in this function. + * The goal of this loop is to: + * 1) find the "check" substring in the region rx_origin..strend + * (adjusted by start_shift / end_shift). If not found, reject + * immediately. + * 2) If it exists, look for the "other" substr too if defined; for + * example, if the check substr maps to the anchored substr, then + * check the floating substr, and vice-versa. If not found, go + * back to (1) with rx_origin suitably incremented. + * 3) If we find an rx_origin position that doesn't contradict + * either of the substrings, then check the possible additional + * constraints on rx_origin of /^.../m or a known start class. + * If these fail, then depending on which constraints fail, jump + * back to here, or to various other re-entry points further along + * that skip some of the first steps. + * 4) If we pass all those tests, update the BmUSEFUL() count on the + * substring. If the start position was determined to be at the + * beginning of the string - so, not rejected, but not optimised, + * since we have to run regmatch from position 0 - decrement the + * BmUSEFUL() count. Otherwise increment it. + */ + + + /* first, look for the 'check' substring */ + { - I32 srch_start_shift = start_shift; - I32 srch_end_shift = end_shift; U8* start_point; U8* end_point; - if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { - srch_end_shift -= ((strbeg - s) - srch_start_shift); - srch_start_shift = strbeg - s; - } - DEBUG_OPTIMISE_MORE_r({ - PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n", - (IV)prog->check_offset_min, - (IV)srch_start_shift, - (IV)srch_end_shift, - (IV)prog->check_end_shift); - }); + + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, + " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf + " Start shift: %"IVdf" End shift %"IVdf + " Real end Shift: %"IVdf"\n", + (IV)(rx_origin - strpos), + (IV)prog->check_offset_min, + (IV)start_shift, + (IV)end_shift, + (IV)prog->check_end_shift); + }); - if (prog->extflags & RXf_CANY_SEEN) { - start_point= (U8*)(s + srch_start_shift); - end_point= (U8*)(strend - srch_end_shift); + if (prog->intflags & PREGf_CANY_SEEN) { + start_point= (U8*)(rx_origin + start_shift); + end_point= (U8*)(strend - end_shift); + if (start_point > end_point) + goto fail_finish; } else { - start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend); - end_point= HOP3(strend, -srch_end_shift, strbeg); + end_point = HOP3(strend, -end_shift, strbeg); + start_point = HOPMAYBE3(rx_origin, start_shift, end_point); + if (!start_point) + goto fail_finish; } + + + /* If the regex is absolutely anchored to either the start of the + * string (BOL,SBOL) or to pos() (ANCH_GPOS), then + * check_offset_max represents an upper bound on the string where + * the substr could start. For the ANCH_GPOS case, we assume that + * the caller of intuit will have already set strpos to + * pos()-gofs, so in this case strpos + offset_max will still be + * an upper bound on the substr. + */ + if (!ml_anch + && prog->intflags & PREGf_ANCH + && prog->check_offset_max != SSize_t_MAX) + { + SSize_t len = SvCUR(check) - !!SvTAIL(check); + const char * const anchor = + (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); + + /* do a bytes rather than chars comparison. It's conservative; + * so it skips doing the HOP if the result can't possibly end + * up earlier than the old value of end_point. + */ + if ((char*)end_point - anchor > prog->check_offset_max) { + end_point = HOP3lim((U8*)anchor, + prog->check_offset_max, + end_point -len) + + len; + } + } + DEBUG_OPTIMISE_MORE_r({ - PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n", + PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n", (int)(end_point - start_point), (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), start_point); }); - s = fbm_instr( start_point, end_point, + check_at = fbm_instr( start_point, end_point, check, multiline ? FBMrf_MULTILINE : 0); + + /* Update the count-of-usability, remove useless subpatterns, + unshift s. */ + + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(check), RE_SV_DUMPLEN(check), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s", + (check_at ? "Found" : "Did not find"), + (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) + ? "anchored" : "floating"), + quoted, + RE_SV_TAIL(check), + (check_at ? " at offset " : "...\n") ); + }); + + if (!check_at) + goto fail_finish; + /* Finish the diagnostic message */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) ); + + /* set rx_origin to the minimum position where the regex could start + * matching, given the constraint of the just-matched check substring. + * But don't set it lower than previously. + */ + + if (check_at - rx_origin > prog->check_offset_max) + rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); } - /* Update the count-of-usability, remove useless subpatterns, - unshift s. */ - - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(check), RE_SV_DUMPLEN(check), 30); - PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s", - (s ? "Found" : "Did not find"), - (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) - ? "anchored" : "floating"), - quoted, - RE_SV_TAIL(check), - (s ? " at offset " : "...\n") ); - }); - if (!s) - goto fail_finish; - /* Finish the diagnostic message */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); - /* XXX dmq: first branch is for positive lookbehind... - Our check string is offset from the beginning of the pattern. - So we need to do any stclass tests offset forward from that - point. I think. :-( - */ - - - - check_at=s; - - - /* Got a candidate. Check MBOL anchoring, and the *other* substr. - Start with the other substr. - XXXX no SCREAM optimization yet - and a very coarse implementation - XXXX /ttx+/ results in anchored="ttx", floating="x". floating will - *always* match. Probably should be marked during compile... - Probably it is right to do no SCREAM here... - */ + /* now look for the 'other' substring if defined */ - if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8) - : (prog->float_substr && prog->anchored_substr)) + if (utf8_target ? prog->substrs->data[other_ix].utf8_substr + : prog->substrs->data[other_ix].substr) { /* Take into account the "other" substring. */ - /* XXXX May be hopelessly wrong for UTF... */ - if (!other_last) - other_last = strpos; - if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) { - do_other_anchored: - { - char * const last = HOP3c(s, -start_shift, strbeg); - char *last1, *last2; - char * const saved_s = s; - SV* must; - - t = s - prog->check_offset_max; - if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!utf8_target - || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos)) - && t > strpos))) - NOOP; - else - t = strpos; - t = HOP3c(t, prog->anchored_offset, strend); - if (t < other_last) /* These positions already checked */ - t = other_last; - last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); - if (last < last1) - last1 = last; - /* XXXX It is not documented what units *_offsets are in. - We assume bytes, but this is clearly wrong. - Meaning this code needs to be carefully reviewed for errors. - dmq. - */ - - /* On end-of-str: see comment below. */ - must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr; - if (must == &PL_sv_undef) { - s = (char*)NULL; - DEBUG_r(must = prog->anchored_utf8); /* for debug */ - } - else - s = fbm_instr( - (unsigned char*)t, - HOP3(HOP3(last1, prog->anchored_offset, strend) - + SvCUR(must), -(SvTAIL(must)!=0), strbeg), - must, - multiline ? FBMrf_MULTILINE : 0 - ); - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(must), RE_SV_DUMPLEN(must), 30); - PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s", - (s ? "Found" : "Contradicts"), - quoted, RE_SV_TAIL(must)); - }); - - - if (!s) { - if (last1 >= last2) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", giving up...\n")); - goto fail_finish; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", trying floating at offset %ld...\n", - (long)(HOP3c(saved_s, 1, strend) - i_strpos))); - other_last = HOP3c(last1, prog->anchored_offset+1, strend); - s = HOP3c(last, 1, strend); - goto restart; - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", - (long)(s - i_strpos))); - t = HOP3c(s, -prog->anchored_offset, strbeg); - other_last = HOP3c(s, 1, strend); - s = saved_s; - if (t == strpos) - goto try_at_start; - goto try_at_offset; - } - } - } - else { /* Take into account the floating substring. */ - char *last, *last1; - char * const saved_s = s; - SV* must; - - t = HOP3c(s, -start_shift, strbeg); - last1 = last = - HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg); - if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset) - last = HOP3c(t, prog->float_max_offset, strend); - s = HOP3c(t, prog->float_min_offset, strend); - if (s < other_last) - s = other_last; - /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ - must = utf8_target ? prog->float_utf8 : prog->float_substr; - /* fbm_instr() takes into account exact value of end-of-str - if the check is SvTAIL(ed). Since false positives are OK, - and end-of-str is not later than strend we are OK. */ - if (must == &PL_sv_undef) { - s = (char*)NULL; - DEBUG_r(must = prog->float_utf8); /* for debug message */ - } - else - s = fbm_instr((unsigned char*)s, - (unsigned char*)last + SvCUR(must) - - (SvTAIL(must)!=0), - must, multiline ? FBMrf_MULTILINE : 0); - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(must), RE_SV_DUMPLEN(must), 30); - PerlIO_printf(Perl_debug_log, "%s floating substr %s%s", - (s ? "Found" : "Contradicts"), - quoted, RE_SV_TAIL(must)); - }); - if (!s) { - if (last1 == last) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", giving up...\n")); - goto fail_finish; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", trying anchored starting at offset %ld...\n", - (long)(saved_s + 1 - i_strpos))); - other_last = last; - s = HOP3c(t, 1, strend); - goto restart; - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", - (long)(s - i_strpos))); - other_last = s; /* Fix this later. --Hugo */ - s = saved_s; - if (t == strpos) - goto try_at_start; - goto try_at_offset; - } - } + char *last, *last1; + char *s; + SV* must; + struct reg_substr_datum *other; + + do_other_substr: + other = &prog->substrs->data[other_ix]; + + /* if "other" is anchored: + * we've previously found a floating substr starting at check_at. + * This means that the regex origin must lie somewhere + * between min (rx_origin): HOP3(check_at, -check_offset_max) + * and max: HOP3(check_at, -check_offset_min) + * (except that min will be >= strpos) + * So the fixed substr must lie somewhere between + * HOP3(min, anchored_offset) + * HOP3(max, anchored_offset) + SvCUR(substr) + */ + + /* if "other" is floating + * Calculate last1, the absolute latest point where the + * floating substr could start in the string, ignoring any + * constraints from the earlier fixed match. It is calculated + * as follows: + * + * strend - prog->minlen (in chars) is the absolute latest + * position within the string where the origin of the regex + * could appear. The latest start point for the floating + * substr is float_min_offset(*) on from the start of the + * regex. last1 simply combines thee two offsets. + * + * (*) You might think the latest start point should be + * float_max_offset from the regex origin, and technically + * you'd be correct. However, consider + * /a\d{2,4}bcd\w/ + * Here, float min, max are 3,5 and minlen is 7. + * This can match either + * /a\d\dbcd\w/ + * /a\d\d\dbcd\w/ + * /a\d\d\d\dbcd\w/ + * In the first case, the regex matches minlen chars; in the + * second, minlen+1, in the third, minlen+2. + * In the first case, the floating offset is 3 (which equals + * float_min), in the second, 4, and in the third, 5 (which + * equals float_max). In all cases, the floating string bcd + * can never start more than 4 chars from the end of the + * string, which equals minlen - float_min. As the substring + * starts to match more than float_min from the start of the + * regex, it makes the regex match more than minlen chars, + * and the two cancel each other out. So we can always use + * float_min - minlen, rather than float_max - minlen for the + * latest position in the string. + * + * Note that -minlen + float_min_offset is equivalent (AFAIKT) + * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift + */ + + assert(prog->minlen >= other->min_offset); + last1 = HOP3c(strend, + other->min_offset - prog->minlen, strbeg); + + if (other_ix) {/* i.e. if (other-is-float) */ + /* last is the latest point where the floating substr could + * start, *given* any constraints from the earlier fixed + * match. This constraint is that the floating string starts + * <= float_max_offset chars from the regex origin (rx_origin). + * If this value is less than last1, use it instead. + */ + assert(rx_origin <= last1); + last = + /* this condition handles the offset==infinity case, and + * is a short-cut otherwise. Although it's comparing a + * byte offset to a char length, it does so in a safe way, + * since 1 char always occupies 1 or more bytes, + * so if a string range is (last1 - rx_origin) bytes, + * it will be less than or equal to (last1 - rx_origin) + * chars; meaning it errs towards doing the accurate HOP3 + * rather than just using last1 as a short-cut */ + (last1 - rx_origin) < other->max_offset + ? last1 + : (char*)HOP3lim(rx_origin, other->max_offset, last1); + } + else { + assert(strpos + start_shift <= check_at); + last = HOP4c(check_at, other->min_offset - start_shift, + strbeg, strend); + } + + s = HOP3c(rx_origin, other->min_offset, strend); + if (s < other_last) /* These positions already checked */ + s = other_last; + + must = utf8_target ? other->utf8_substr : other->substr; + assert(SvPOK(must)); + s = fbm_instr( + (unsigned char*)s, + (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0), + must, + multiline ? FBMrf_MULTILINE : 0 + ); + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s", + s ? "Found" : "Contradicts", + other_ix ? "floating" : "anchored", + quoted, RE_SV_TAIL(must)); + }); + + + if (!s) { + /* last1 is latest possible substr location. If we didn't + * find it before there, we never will */ + if (last >= last1) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + + /* try to find the check substr again at a later + * position. Maybe next time we'll find the "other" substr + * in range too */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", trying %s at offset %ld...\n", + (other_ix ? "floating" : "anchored"), + (long)(HOP3c(check_at, 1, strend) - strpos))); + + other_last = HOP3c(last, 1, strend) /* highest failure */; + rx_origin = + other_ix /* i.e. if other-is-float */ + ? HOP3c(rx_origin, 1, strend) + : HOP4c(last, 1 - other->min_offset, strbeg, strend); + goto restart; + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + + if (other_ix) { /* if (other-is-float) */ + /* other_last is set to s, not s+1, since its possible for + * a floating substr to fail first time, then succeed + * second time at the same floating position; e.g.: + * "-AB--AABZ" =~ /\wAB\d*Z/ + * The first time round, anchored and float match at + * "-(AB)--AAB(Z)" then fail on the initial \w character + * class. Second time round, they match at "-AB--A(AB)(Z)". + */ + other_last = s; + } + else { + rx_origin = HOP3c(s, -other->min_offset, strbeg); + other_last = HOP3c(s, 1, strend); + } + } + } + else { + DEBUG_OPTIMISE_MORE_r( + PerlIO_printf(Perl_debug_log, + " Check-only match: offset min:%"IVdf" max:%"IVdf + " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf + " strend-strpos:%"IVdf"\n", + (IV)prog->check_offset_min, + (IV)prog->check_offset_max, + (IV)(check_at-strpos), + (IV)(rx_origin-strpos), + (IV)(rx_origin-check_at), + (IV)(strend-strpos) + ) + ); } - - t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos); - - DEBUG_OPTIMISE_MORE_r( - PerlIO_printf(Perl_debug_log, - "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n", - (IV)prog->check_offset_min, - (IV)prog->check_offset_max, - (IV)(s-strpos), - (IV)(t-strpos), - (IV)(t-s), - (IV)(strend-strpos) - ) - ); + postprocess_substr_matches: - if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!utf8_target - || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos))) - && t > strpos))) - { - /* Fixed substring is found far enough so that the match - cannot start at strpos. */ - try_at_offset: - if (ml_anch && t[-1] != '\n') { - /* Eventually fbm_*() should handle this, but often - anchored_offset is not 0, so this check will not be wasted. */ - /* XXXX In the code below we prefer to look for "^" even in - presence of anchored substrings. And we search even - beyond the found float position. These pessimizations - are historical artefacts only. */ - find_anchor: - while (t < strend - prog->minlen) { - if (*t == '\n') { - if (t < check_at - prog->check_offset_min) { - if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { - /* Since we moved from the found position, - we definitely contradict the found anchored - substr. Due to the above check we do not - contradict "check" substr. - Thus we can arrive here only if check substr - is float. Redo checking for "other"=="fixed". - */ - strpos = t + 1; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); - goto do_other_anchored; - } - /* We don't contradict the found floating substring. */ - /* XXXX Why not check for STCLASS? */ - s = t + 1; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(s - i_strpos))); - goto set_useful; - } - /* Position contradicts check-string */ - /* XXXX probably better to look for check-string - than for "\n", so one should lower the limit for t? */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos))); - other_last = strpos = s = t + 1; - goto restart; - } - t++; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", - PL_colors[0], PL_colors[1])); - goto fail_finish; - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n", - PL_colors[0], PL_colors[1])); - } - s = t; - set_useful: - ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + /* handle the extra constraint of /^.../m if present */ + + if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { + char *s; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for /^/m anchor")); + + /* we have failed the constraint of a \n before rx_origin. + * Find the next \n, if any, even if it's beyond the current + * anchored and/or floating substrings. Whether we should be + * scanning ahead for the next \n or the next substr is debatable. + * On the one hand you'd expect rare substrings to appear less + * often than \n's. On the other hand, searching for \n means + * we're effectively flipping been check_substr and "\n" on each + * iteration as the current "rarest" string candidate, which + * means for example that we'll quickly reject the whole string if + * hasn't got a \n, rather than trying every substr position + * first + */ + + s = HOP3c(strend, - prog->minlen, strpos); + if (s <= rx_origin || + ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Did not find /%s^%s/m...\n", + PL_colors[0], PL_colors[1])); + goto fail_finish; + } + + /* earliest possible origin is 1 char after the \n. + * (since *rx_origin == '\n', it's safe to ++ here rather than + * HOP(rx_origin, 1)) */ + rx_origin++; + + if (prog->substrs->check_ix == 0 /* check is anchored */ + || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) + { + /* Position contradicts check-string; either because + * check was anchored (and thus has no wiggle room), + * or check was float and rx_origin is above the float range */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + goto restart; + } + + /* if we get here, the check substr must have been float, + * is in range, and we may or may not have had an anchored + * "other" substr which still contradicts */ + assert(prog->substrs->check_ix); /* check is float */ + + if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { + /* whoops, the anchored "other" substr exists, so we still + * contradict. On the other hand, the float "check" substr + * didn't contradict, so just retry the anchored "other" + * substr */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos), + (long)(rx_origin - strpos + prog->anchored_offset))); + goto do_other_substr; + } + + /* success: we don't contradict the found floating substring + * (and there's no anchored substr). */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); } else { - /* The found string does not prohibit matching at strpos, - - no optimization of calling REx engine can be performed, - unless it was an MBOL and we are not after MBOL, - or a future STCLASS check will fail this. */ - try_at_start: - /* Even in this situation we may use MBOL flag if strpos is offset - wrt the start of the string. */ - if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n' - /* May be due to an implicit anchor of m{.*foo} */ - && !(prog->intflags & PREGf_IMPLICIT)) - { - t = strpos; - goto find_anchor; - } - DEBUG_EXECUTE_r( if (ml_anch) - PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n", - (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]); - ); - success_at_start: - if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */ - && (utf8_target ? ( - prog->check_utf8 /* Could be deleted already */ - && --BmUSEFUL(prog->check_utf8) < 0 - && (prog->check_utf8 == prog->float_utf8) - ) : ( - prog->check_substr /* Could be deleted already */ - && --BmUSEFUL(prog->check_substr) < 0 - && (prog->check_substr == prog->float_substr) - ))) - { - /* If flags & SOMETHING - do not do it many times on the same match */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); - /* XXX Does the destruction order has to change with utf8_target? */ - SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); - SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); - prog->check_substr = prog->check_utf8 = NULL; /* disable */ - prog->float_substr = prog->float_utf8 = NULL; /* clear */ - check = NULL; /* abort */ - s = strpos; - /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag - see http://bugs.activestate.com/show_bug.cgi?id=87173 */ - if (prog->intflags & PREGf_IMPLICIT) - prog->extflags &= ~RXf_ANCH_MBOL; - /* XXXX This is a remnant of the old implementation. It - looks wasteful, since now INTUIT can use many - other heuristics. */ - prog->extflags &= ~RXf_USE_INTUIT; - /* XXXX What other flags might need to be cleared in this branch? */ - } - else - s = strpos; + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " (multiline anchor test skipped)\n")); } - /* Last resort... */ - /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ - /* trie stclasses are too expensive to use here, we are better off to - leave it to regmatch itself */ + success_at_start: + + + /* if we have a starting character class, then test that extra constraint. + * (trie stclasses are too expensive to use here, we are better off to + * leave it to regmatch itself) */ + if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { - /* minlen == 0 is possible if regstclass is \b or \B, - and the fixed substr is ''$. - Since minlen is already taken into account, s+1 is before strend; - accidentally, minlen >= 1 guaranties no false positives at s + 1 - even for \b or \B. But (minlen? 1 : 0) below assumes that - regstclass does not come from lookahead... */ - /* If regstclass takes bytelength more than 1: If charlength==1, OK. - This leaves EXACTF-ish only, which are dealt with in find_byclass(). */ const U8* const str = (U8*)STRING(progi->regstclass); + + /* XXX this value could be pre-computed */ const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT - ? CHR_DIST(str+STR_LEN(progi->regstclass), str) + ? (reginfo->is_utf8_pat + ? utf8_distance(str + STR_LEN(progi->regstclass), str) + : STR_LEN(progi->regstclass)) : 1); char * endpos; + char *s; + /* latest pos that a matching float substr constrains rx start to */ + char *rx_max_float = NULL; + + /* if the current rx_origin is anchored, either by satisfying an + * anchored substring constraint, or a /^.../m constraint, then we + * can reject the current origin if the start class isn't found + * at the current position. If we have a float-only match, then + * rx_origin is constrained to a range; so look for the start class + * in that range. if neither, then look for the start class in the + * whole rest of the string */ + + /* XXX DAPM it's not clear what the minlen test is for, and why + * it's not used in the floating case. Nothing in the test suite + * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. + * Here are some old comments, which may or may not be correct: + * + * minlen == 0 is possible if regstclass is \b or \B, + * and the fixed substr is ''$. + * Since minlen is already taken into account, rx_origin+1 is + * before strend; accidentally, minlen >= 1 guaranties no false + * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : + * 0) below assumes that regstclass does not come from lookahead... + * If regstclass takes bytelength more than 1: If charlength==1, OK. + * This leaves EXACTF-ish only, which are dealt with in + * find_byclass(). + */ + if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) - endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend); - else if (prog->float_substr || prog->float_utf8) - endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend); + endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend); + else if (prog->float_substr || prog->float_utf8) { + rx_max_float = HOP3c(check_at, -start_shift, strbeg); + endpos= HOP3c(rx_max_float, cl_l, strend); + } else endpos= strend; - if (checked_upto < s) - checked_upto = s; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for class: start_shift: %"IVdf" check_at: %"IVdf + " rx_origin: %"IVdf" endpos: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), + (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); - t = s; - s = find_byclass(prog, progi->regstclass, checked_upto, endpos, + s = find_byclass(prog, progi->regstclass, rx_origin, endpos, reginfo); - if (s) { - checked_upto = s; - } else { -#ifdef DEBUGGING - const char *what = NULL; -#endif + if (!s) { if (endpos == strend) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Could not match STCLASS...\n") ); + " Could not match STCLASS...\n") ); goto fail; } DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "This position contradicts STCLASS...\n") ); - if ((prog->extflags & RXf_ANCH) && !ml_anch) + " This position contradicts STCLASS...\n") ); + if ((prog->intflags & PREGf_ANCH) && !ml_anch + && !(prog->intflags & PREGf_IMPLICIT)) goto fail; - checked_upto = HOPBACKc(endpos, start_shift); - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + /* Contradict one of substrings */ if (prog->anchored_substr || prog->anchored_utf8) { - if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) { - DEBUG_EXECUTE_r( what = "anchored" ); - hop_and_restart: - s = HOP3c(t, 1, strend); - if (s + start_shift + end_shift > strend) { - /* XXXX Should be taken into account earlier? */ - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Could not match STCLASS...\n") ); - goto fail; - } - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for %s substr starting at offset %ld...\n", - what, (long)(s + start_shift - i_strpos)) ); - goto restart; - } - /* Have both, check_string is floating */ - if (t + start_shift >= check_at) /* Contradicts floating=check */ - goto retry_floating_check; - /* Recheck anchored substring, but not floating... */ - s = check_at; - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for anchored substr starting at offset %ld...\n", - (long)(other_last - i_strpos)) ); - goto do_other_anchored; - } - /* Another way we could have checked stclass at the - current position only: */ - if (ml_anch) { - s = t = t + 1; - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for /%s^%s/m starting at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(t - i_strpos)) ); - goto try_at_offset; - } - if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ - goto fail; - /* Check is floating substring. */ - retry_floating_check: - t = check_at - start_shift; - DEBUG_EXECUTE_r( what = "floating" ); - goto hop_and_restart; + if (prog->substrs->check_ix == 1) { /* check is float */ + /* Have both, check_string is floating */ + assert(rx_origin + start_shift <= check_at); + if (rx_origin + start_shift != check_at) { + /* not at latest position float substr could match: + * Recheck anchored substring, but not floating. + * The condition above is in bytes rather than + * chars for efficiency. It's conservative, in + * that it errs on the side of doing 'goto + * do_other_substr', where a more accurate + * char-based calculation will be done */ + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for anchored substr starting at offset %ld...\n", + (long)(other_last - strpos)) ); + goto do_other_substr; + } + } + } + else { + /* float-only */ + + if (ml_anch) { + /* In the presence of ml_anch, we might be able to + * find another \n without breaking the current float + * constraint. */ + + /* strictly speaking this should be HOP3c(..., 1, ...), + * but since we goto a block of code that's going to + * search for the next \n if any, its safe here */ + rx_origin++; + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for /%s^%s/m starting at offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos)) ); + goto postprocess_substr_matches; + } + + /* strictly speaking this can never be true; but might + * be if we ever allow intuit without substrings */ + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + goto fail; + + rx_origin = rx_max_float; + } + + /* at this point, any matching substrings have been + * contradicted. Start again... */ + + rx_origin = HOP3c(rx_origin, 1, strend); + + /* uses bytes rather than char calculations for efficiency. + * It's conservative: it errs on the side of doing 'goto restart', + * where there is code that does a proper char-based test */ + if (rx_origin + start_shift + end_shift > strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); + goto fail; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for %s substr starting at offset %ld...\n", + (prog->substrs->check_ix ? "floating" : "anchored"), + (long)(rx_origin + start_shift - strpos)) ); + goto restart; } - if (t != s) { + + /* Success !!! */ + + if (rx_origin != s) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "By STCLASS: moving %ld --> %ld\n", - (long)(t - i_strpos), (long)(s - i_strpos)) + " By STCLASS: moving %ld --> %ld\n", + (long)(rx_origin - strpos), (long)(s - strpos)) ); } else { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "Does not contradict STCLASS...\n"); + " Does not contradict STCLASS...\n"); ); } } - giveup: - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n", - PL_colors[4], (check ? "Guessed" : "Giving up"), - PL_colors[5], (long)(s - i_strpos)) ); - return s; + + /* Decide whether using the substrings helped */ + + if (rx_origin != strpos) { + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n")); + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + } + else { + /* The found rx_origin position does not prohibit matching at + * strpos, so calling intuit didn't gain us anything. Decrement + * the BmUSEFUL() count on the check substring, and if we reach + * zero, free it. */ + if (!(prog->intflags & PREGf_NAUGHTY) + && (utf8_target ? ( + prog->check_utf8 /* Could be deleted already */ + && --BmUSEFUL(prog->check_utf8) < 0 + && (prog->check_utf8 == prog->float_utf8) + ) : ( + prog->check_substr /* Could be deleted already */ + && --BmUSEFUL(prog->check_substr) < 0 + && (prog->check_substr == prog->float_substr) + ))) + { + /* If flags & SOMETHING - do not do it many times on the same match */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n")); + /* XXX Does the destruction order has to change with utf8_target? */ + SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); + SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); + prog->check_substr = prog->check_utf8 = NULL; /* disable */ + prog->float_substr = prog->float_utf8 = NULL; /* clear */ + check = NULL; /* abort */ + /* XXXX This is a remnant of the old implementation. It + looks wasteful, since now INTUIT can use many + other heuristics. */ + prog->extflags &= ~RXf_USE_INTUIT; + } + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) ); + + return rx_origin; fail_finish: /* Substring not found */ if (prog->check_substr || prog->check_utf8) /* could be removed already */ @@ -1226,46 +1434,57 @@ Perl_re_intuit_start(pTHX_ return NULL; } + #define DECL_TRIE_TYPE(scan) \ - const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ trie_type = ((scan->flags == EXACT) \ ? (utf8_target ? trie_utf8 : trie_plain) \ - : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + : (scan->flags == EXACTFA) \ + ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ -STMT_START { \ +STMT_START { \ STRLEN skiplen; \ + U8 flags = FOLD_FLAGS_FULL; \ switch (trie_type) { \ + case trie_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALL THROUGH */ \ case trie_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ - uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \ + uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ len = UTF8SKIP(uc); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ + case trie_latin_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALL THROUGH */ \ case trie_latin_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ case trie_utf8: \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ break; \ case trie_plain: \ uvc = (UV)*uc; \ @@ -1336,28 +1555,28 @@ REXEC_FBC_SCAN( \ tmp = 1; \ ) -#define REXEC_FBC_TRYIT \ +#define REXEC_FBC_TRYIT \ if ((reginfo->intuit || regtry(reginfo, &s))) \ goto got_it #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ - if (utf8_target) { \ + if (utf8_target) { \ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ } \ else { \ REXEC_FBC_CLASS_SCAN(CoNd); \ } -#define DUMP_EXEC_POS(li,s,doutf8) \ +#define DUMP_EXEC_POS(li,s,doutf8) \ dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ startpos, doutf8) -#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ +#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ tmp = TEST_NON_UTF8(tmp); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1366,18 +1585,19 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ ); \ -#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ +#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ if (s == reginfo->strbeg) { \ tmp = '\n'; \ } \ else { \ U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \ + tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ + 0, UTF8_ALLOW_DEFAULT); \ } \ tmp = TeSt1_UtF8; \ - LOAD_UTF8_CHARCLASS_ALNUM(); \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! (TeSt2_UtF8)) { \ + if (tmp == ! (TeSt2_UtF8)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1412,9 +1632,9 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ * one, and compare it with the wordness of this one. If they differ, we have * a boundary. At the beginning of the string, pretend that the previous * character was a new-line */ -#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ +#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ if (utf8_target) { \ - UTF8_CODE \ + UTF8_CODE \ } \ else { /* Not utf8 */ \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ @@ -1429,7 +1649,7 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ ); \ } \ - if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ + if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ goto got_it; /* We know what class REx starts with. Try to find this position... */ @@ -1468,11 +1688,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* We know what class it must start with. */ switch (OP(c)) { case ANYOF: - case ANYOF_SYNTHETIC: - case ANYOF_WARN_SUPER: if (utf8_target) { REXEC_FBC_UTF8_CLASS_SCAN( - reginclass(prog, c, (U8*)s, utf8_target)); + reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); } else { REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); @@ -1487,6 +1705,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: if (is_utf8_pat || utf8_target) { utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; @@ -1496,10 +1717,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, folder = foldEQ_latin1; /* /a, except the sharp s one which */ goto do_exactf_non_utf8; /* isn't dealt with by these */ - case EXACTF: + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); if (utf8_target) { - - /* regcomp.c already folded this if pattern is in UTF-8 */ utf8_fold_flags = 0; goto do_exactf_utf8; } @@ -1508,8 +1728,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto do_exactf_non_utf8; case EXACTFL: - if (is_utf8_pat || utf8_target) { - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { + utf8_fold_flags = FOLDEQ_LOCALE; goto do_exactf_utf8; } fold_array = PL_fold_locale; @@ -1522,7 +1742,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } goto do_exactf_utf8; - case EXACTFU_TRICKYFOLD: case EXACTFU: if (is_utf8_pat || utf8_target) { utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -1555,7 +1774,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * characters, and there are only 2 availabe, we know without * trying that it will fail; so don't start a match past the * required minimum number from the far end */ - e = HOP3c(strend, -((I32)ln), s); + e = HOP3c(strend, -((SSize_t)ln), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1601,7 +1820,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * only 2 are left, it's guaranteed to fail, so don't start a * match that would require us to go beyond the end of the string */ - e = HOP3c(strend, -((I32)lnc), s); + e = HOP3c(strend, -((SSize_t)lnc), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1627,15 +1846,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; } case BOUNDL: - RXp_MATCH_TAINTED_on(prog); FBC_BOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case NBOUNDL: - RXp_MATCH_TAINTED_on(prog); FBC_NBOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case BOUND: @@ -1682,7 +1899,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* FALLTHROUGH */ case POSIXL: - RXp_MATCH_TAINTED_on(prog); REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); break; @@ -1746,7 +1962,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, classnum))) || (UTF8_IS_DOWNGRADEABLE_START(*s) && to_complement ^ cBOOL( - _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)), + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), classnum)))) { if (tmp && (reginfo->intuit || regtry(reginfo, &s))) @@ -1804,8 +2021,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = - _core_swash_init("utf8", swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } /* This is a copy of the loop above for swash classes, though using the @@ -2047,13 +2266,163 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, return s; } +/* set RX_SAVED_COPY, RX_SUBBEG etc. + * flags have same meanings as with regexec_flags() */ + +static void +S_reg_set_capture_string(pTHX_ REGEXP * const rx, + char *strbeg, + char *strend, + SV *sv, + U32 flags, + bool utf8_target) +{ + struct regexp *const prog = ReANY(rx); + + if (flags & REXEC_COPY_STR) { +#ifdef PERL_ANY_COW + if (SvCANCOW(sv)) { + if (DEBUG_C_TEST) { + PerlIO_printf(Perl_debug_log, + "Copy on write: regexp capture, type %d\n", + (int) SvTYPE(sv)); + } + /* Create a new COW SV to share the match string and store + * in saved_copy, unless the current COW SV in saved_copy + * is valid and suitable for our purpose */ + if (( prog->saved_copy + && SvIsCOW(prog->saved_copy) + && SvPOKp(prog->saved_copy) + && SvIsCOW(sv) + && SvPOKp(sv) + && SvPVX(sv) == SvPVX(prog->saved_copy))) + { + /* just reuse saved_copy SV */ + if (RXp_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RXp_MATCH_COPIED_off(prog); + } + } + else { + /* create new COW SV to share string */ + RX_MATCH_COPY_FREE(rx); + prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); + } + prog->subbeg = (char *)SvPVX_const(prog->saved_copy); + assert (SvPOKp(prog->saved_copy)); + prog->sublen = strend - strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + } else +#endif + { + SSize_t min = 0; + SSize_t max = strend - strbeg; + SSize_t sublen; + + if ( (flags & REXEC_COPY_SKIP_POST) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_RIGHT) + ) { /* don't copy $' part of string */ + U32 n = 0; + max = -1; + /* calculate the right-most part of the string covered + * by a capture. Due to look-ahead, this may be to + * the right of $&, so we have to scan all captures */ + while (n <= prog->lastparen) { + if (prog->offs[n].end > max) + max = prog->offs[n].end; + n++; + } + if (max == -1) + max = (PL_sawampersand & SAWAMPERSAND_LEFT) + ? prog->offs[0].start + : 0; + assert(max >= 0 && max <= strend - strbeg); + } + + if ( (flags & REXEC_COPY_SKIP_PRE) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_LEFT) + ) { /* don't copy $` part of string */ + U32 n = 0; + min = max; + /* calculate the left-most part of the string covered + * by a capture. Due to look-behind, this may be to + * the left of $&, so we have to scan all captures */ + while (min && n <= prog->lastparen) { + if ( prog->offs[n].start != -1 + && prog->offs[n].start < min) + { + min = prog->offs[n].start; + } + n++; + } + if ((PL_sawampersand & SAWAMPERSAND_RIGHT) + && min > prog->offs[0].end + ) + min = prog->offs[0].end; + + } + + assert(min >= 0 && min <= max && min <= strend - strbeg); + sublen = max - min; + + if (RX_MATCH_COPIED(rx)) { + if (sublen > prog->sublen) + prog->subbeg = + (char*)saferealloc(prog->subbeg, sublen+1); + } + else + prog->subbeg = (char*)safemalloc(sublen+1); + Copy(strbeg + min, prog->subbeg, sublen, char); + prog->subbeg[sublen] = '\0'; + prog->suboffset = min; + prog->sublen = sublen; + RX_MATCH_COPIED_on(rx); + } + prog->subcoffset = prog->suboffset; + if (prog->suboffset && utf8_target) { + /* Convert byte offset to chars. + * XXX ideally should only compute this if @-/@+ + * has been seen, a la PL_sawampersand ??? */ + + /* If there's a direct correspondence between the + * string which we're matching and the original SV, + * then we can use the utf8 len cache associated with + * the SV. In particular, it means that under //g, + * sv_pos_b2u() will use the previously cached + * position to speed up working out the new length of + * subcoffset, rather than counting from the start of + * the string each time. This stops + * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; + * from going quadratic */ + if (SvPOKp(sv) && SvPVX(sv) == strbeg) + prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, + SV_GMAGIC|SV_CONST_RETURN); + else + prog->subcoffset = utf8_length((U8*)strbeg, + (U8*)(strbeg+prog->suboffset)); + } + } + else { + RX_MATCH_COPY_FREE(rx); + prog->subbeg = strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + prog->sublen = strend - strbeg; + } +} + + + /* - regexec_flags - match a regexp against a string */ I32 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, - char *strbeg, I32 minend, SV *sv, void *data, U32 flags) + char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -2061,21 +2430,17 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* sv: SV being matched: only used for utf8 flag, pos() etc; string * itself is accessed via the pointers above */ /* data: May be used for some additional optimizations. - Currently its only used, with a U32 cast, for transmitting - the ganch offset when doing a /g match. This will change */ -/* nosave: For optimizations. */ + Currently unused. */ +/* flags: For optimizations. See REXEC_* in regexp.h */ { dVAR; struct regexp *const prog = ReANY(rx); char *s; regnode *c; - char *startpos = stringarg; - I32 minlen; /* must match at least this many chars */ - I32 dontbother = 0; /* how many characters not to try at end */ - I32 end_shift = 0; /* Same for the end. */ /* CC */ - I32 scream_pos = -1; /* Internal iterator of scream. */ - char *scream_olds = NULL; + char *startpos; + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ const bool utf8_target = cBOOL(DO_UTF8(sv)); I32 multiline; RXi_GET_DECL(prog,progi); @@ -2089,16 +2454,70 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, PERL_UNUSED_ARG(data); /* Be paranoid... */ - if (prog == NULL || startpos == NULL) { + if (prog == NULL || stringarg == NULL) { Perl_croak(aTHX_ "NULL regexp parameter"); return 0; } DEBUG_EXECUTE_r( - debug_start_match(rx, utf8_target, startpos, strend, + debug_start_match(rx, utf8_target, stringarg, strend, "Matching"); ); + startpos = stringarg; + + if (prog->intflags & PREGf_GPOS_SEEN) { + MAGIC *mg; + + /* set reginfo->ganch, the position where \G can match */ + + reginfo->ganch = + (flags & REXEC_IGNOREPOS) + ? stringarg /* use start pos rather than pos() */ + : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) + /* Defined pos(): */ + ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) + : strbeg; /* pos() not defined; use start of string */ + + DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, + "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg))); + + /* in the presence of \G, we may need to start looking earlier in + * the string than the suggested start point of stringarg: + * if prog->gofs is set, then that's a known, fixed minimum + * offset, such as + * /..\G/: gofs = 2 + * /ab|c\G/: gofs = 1 + * or if the minimum offset isn't known, then we have to go back + * to the start of the string, e.g. /w+\G/ + */ + + if (prog->intflags & PREGf_ANCH_GPOS) { + startpos = reginfo->ganch - prog->gofs; + if (startpos < + ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) + { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "fail: ganch-gofs before earliest possible start\n")); + return 0; + } + } + else if (prog->gofs) { + if (startpos - prog->gofs < strbeg) + startpos = strbeg; + else + startpos -= prog->gofs; + } + else if (prog->intflags & PREGf_GPOS_FLOAT) + startpos = strbeg; + } + + minlen = prog->minlen; + if ((startpos + minlen) > strend || startpos < strbeg) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "Regex match can't succeed, so not even tried\n")); + return 0; + } /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), * which will call destuctors to reset PL_regmatch_state, free higher @@ -2107,10 +2526,54 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, oldsave = PL_savestack_ix; + s = startpos; + + if ((prog->extflags & RXf_USE_INTUIT) + && !(flags & REXEC_CHECKED)) + { + s = re_intuit_start(rx, sv, strbeg, startpos, strend, + flags, NULL); + if (!s) + return 0; + + if (prog->extflags & RXf_CHECK_ALL) { + /* we can match based purely on the result of INTUIT. + * Set up captures etc just for $& and $-[0] + * (an intuit-only match wont have $1,$2,..) */ + assert(!prog->nparens); + + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (s < stringarg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + + /* match via INTUIT shouldn't have any captures. + * Let @-, @+, $^N know */ + prog->lastparen = prog->lastcloseparen = 0; + RX_MATCH_UTF8_set(rx, utf8_target); + prog->offs[0].start = s - strbeg; + prog->offs[0].end = utf8_target + ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + : s - strbeg + prog->minlenret; + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, strend, + sv, flags, utf8_target); + + return 1; + } + } + multiline = prog->extflags & RXf_PMf_MULTILINE; - minlen = prog->minlen; - if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { + if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too short [regexec_flags]...\n")); goto phooey; @@ -2133,7 +2596,20 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, reginfo->poscache_maxiter = 0; /* not yet started a countdown */ reginfo->strend = strend; /* see how far we have to get to not match where we matched before */ - reginfo->till = startpos+minend; + reginfo->till = stringarg + minend; + + if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { + /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after + S_cleanup_regmatch_info_aux has executed (registered by + SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies + magic belonging to this SV. + Not newSVsv, either, as it does not COW. + */ + assert(!IS_PADGV(sv)); + reginfo->sv = newSV(0); + SvSetSV_nosteal(reginfo->sv, sv); + SAVEFREESV(reginfo->sv); + } /* reserve next 2 or 3 slots in PL_regmatch_state: * slot N+0: may currently be in use: skip it @@ -2186,41 +2662,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } /* If there is a "must appear" string, look for it. */ - s = startpos; - if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */ - MAGIC *mg; - if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */ - reginfo->ganch = startpos + prog->gofs; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs)); - } else if (sv && (mg = mg_find_mglob(sv)) - && mg->mg_len >= 0) { - reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */ - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len)); - - if (prog->extflags & RXf_ANCH_GPOS) { - if (s > reginfo->ganch) - goto phooey; - s = reginfo->ganch - prog->gofs; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs)); - if (s < strbeg) - goto phooey; - } - } - else if (data) { - reginfo->ganch = strbeg + PTR2UV(data); - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data))); - - } else { /* pos() not defined */ - reginfo->ganch = strbeg; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS: reginfo->ganch = strbeg\n")); - } - } if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { /* We have to be careful. If the previous successful match was from this regex we don't want a subsequent partially @@ -2239,27 +2681,13 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, PTR2UV(prog->offs) )); } - if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) { - re_scream_pos_data d; - - d.scream_olds = &scream_olds; - d.scream_pos = &scream_pos; - s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d); - if (!s) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n")); - goto phooey; /* not present */ - } - } - - /* Simplest case: anchored match need be tried only once. */ /* [unless only anchor is BOL and multiline is set] */ - if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) { - if (s == startpos && regtry(reginfo, &startpos)) + if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { + if (s == startpos && regtry(reginfo, &s)) goto got_it; - else if (multiline || (prog->intflags & PREGf_IMPLICIT) - || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */ + else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */ { char *end; @@ -2333,14 +2761,15 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } /* end search for newline */ } /* end anchored/multiline check string search */ goto phooey; - } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) + } else if (prog->intflags & PREGf_ANCH_GPOS) { - /* the warning about reginfo->ganch being used without initialization - is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN - and we only enter this block when the same bit is set. */ - char *tmp_s = reginfo->ganch - prog->gofs; - - if (tmp_s >= strbeg && regtry(reginfo, &tmp_s)) + /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ + assert(prog->intflags & PREGf_GPOS_SEEN); + /* For anchored \G, the only position it can match from is + * (ganch-gofs); we already set startpos to this above; if intuit + * moved us on from there, we can't possibly succeed */ + assert(startpos == reginfo->ganch - prog->gofs); + if (s == startpos && regtry(reginfo, &s)) goto got_it; goto phooey; } @@ -2396,8 +2825,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, || ((prog->float_substr != NULL || prog->float_utf8 != NULL) && prog->float_max_offset < strend - s)) { SV *must; - I32 back_max; - I32 back_min; + SSize_t back_max; + SSize_t back_min; char *last; char *last1; /* Last position checked before */ #ifdef DEBUGGING @@ -2442,7 +2871,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, last = strend; } else { last = HOP3c(strend, /* Cannot start after this */ - -(I32)(CHR_SVLEN(must) + -(SSize_t)(CHR_SVLEN(must) - (SvTAIL(must) != 0) + back_min), strbeg); } if (s > reginfo->strbeg) @@ -2452,11 +2881,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* XXXX check_substr already used to find "s", can optimize if check_substr==must. */ - scream_pos = -1; - dontbother = end_shift; + dontbother = 0; strend = HOPc(strend, -dontbother); while ( (s <= last) && - (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), + (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), (unsigned char*)strend, must, multiline ? FBMrf_MULTILINE : 0)) ) { DEBUG_EXECUTE_r( did_match = 1 ); @@ -2509,7 +2937,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, c); + regprop(prog, prop, c, reginfo); { RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), s,strend-s,60); @@ -2646,6 +3074,18 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, goto phooey; got_it: + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (prog->offs[0].start < stringarg - strbeg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + DEBUG_BUFFERS_r( if (swap) PerlIO_printf(Perl_debug_log, @@ -2668,123 +3108,10 @@ got_it: RX_MATCH_UTF8_set(rx, utf8_target); /* make sure $`, $&, $', and $digit will work later */ - if ( !(flags & REXEC_NOT_FIRST) ) { - if (flags & REXEC_COPY_STR) { -#ifdef PERL_ANY_COW - if (SvCANCOW(sv)) { - if (DEBUG_C_TEST) { - PerlIO_printf(Perl_debug_log, - "Copy on write: regexp capture, type %d\n", - (int) SvTYPE(sv)); - } - RX_MATCH_COPY_FREE(rx); - prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); - prog->subbeg = (char *)SvPVX_const(prog->saved_copy); - assert (SvPOKp(prog->saved_copy)); - prog->sublen = reginfo->strend - strbeg; - prog->suboffset = 0; - prog->subcoffset = 0; - } else -#endif - { - I32 min = 0; - I32 max = reginfo->strend - strbeg; - I32 sublen; - - if ( (flags & REXEC_COPY_SKIP_POST) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ - && !(PL_sawampersand & SAWAMPERSAND_RIGHT) - ) { /* don't copy $' part of string */ - U32 n = 0; - max = -1; - /* calculate the right-most part of the string covered - * by a capture. Due to look-ahead, this may be to - * the right of $&, so we have to scan all captures */ - while (n <= prog->lastparen) { - if (prog->offs[n].end > max) - max = prog->offs[n].end; - n++; - } - if (max == -1) - max = (PL_sawampersand & SAWAMPERSAND_LEFT) - ? prog->offs[0].start - : 0; - assert(max >= 0 && max <= reginfo->strend - strbeg); - } - - if ( (flags & REXEC_COPY_SKIP_PRE) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ - && !(PL_sawampersand & SAWAMPERSAND_LEFT) - ) { /* don't copy $` part of string */ - U32 n = 0; - min = max; - /* calculate the left-most part of the string covered - * by a capture. Due to look-behind, this may be to - * the left of $&, so we have to scan all captures */ - while (min && n <= prog->lastparen) { - if ( prog->offs[n].start != -1 - && prog->offs[n].start < min) - { - min = prog->offs[n].start; - } - n++; - } - if ((PL_sawampersand & SAWAMPERSAND_RIGHT) - && min > prog->offs[0].end - ) - min = prog->offs[0].end; - - } - - assert(min >= 0 && min <= max - && min <= reginfo->strend - strbeg); - sublen = max - min; - - if (RX_MATCH_COPIED(rx)) { - if (sublen > prog->sublen) - prog->subbeg = - (char*)saferealloc(prog->subbeg, sublen+1); - } - else - prog->subbeg = (char*)safemalloc(sublen+1); - Copy(strbeg + min, prog->subbeg, sublen, char); - prog->subbeg[sublen] = '\0'; - prog->suboffset = min; - prog->sublen = sublen; - RX_MATCH_COPIED_on(rx); - } - prog->subcoffset = prog->suboffset; - if (prog->suboffset && utf8_target) { - /* Convert byte offset to chars. - * XXX ideally should only compute this if @-/@+ - * has been seen, a la PL_sawampersand ??? */ - - /* If there's a direct correspondence between the - * string which we're matching and the original SV, - * then we can use the utf8 len cache associated with - * the SV. In particular, it means that under //g, - * sv_pos_b2u() will use the previously cached - * position to speed up working out the new length of - * subcoffset, rather than counting from the start of - * the string each time. This stops - * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; - * from going quadratic */ - if (SvPOKp(sv) && SvPVX(sv) == strbeg) - sv_pos_b2u(sv, &(prog->subcoffset)); - else - prog->subcoffset = utf8_length((U8*)strbeg, - (U8*)(strbeg+prog->suboffset)); - } - } - else { - RX_MATCH_COPY_FREE(rx); - prog->subbeg = strbeg; - prog->suboffset = 0; - prog->subcoffset = 0; - /* use reginfo->strend, as strend may have been modified */ - prog->sublen = reginfo->strend - strbeg; - } - } + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, reginfo->strend, + sv, flags, utf8_target); return 1; @@ -2815,7 +3142,7 @@ phooey: /* Set which rex is pointed to by PL_reg_curpm, handling ref counting. * Do inc before dec, in case old and new rex are the same */ -#define SET_reg_curpm(Re2) \ +#define SET_reg_curpm(Re2) \ if (reginfo->info_aux_eval) { \ (void)ReREFCNT_inc(Re2); \ ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ @@ -2833,7 +3160,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) CHECKPOINT lastcp; REGEXP *const rx = reginfo->prog; regexp *const prog = ReANY(rx); - I32 result; + SSize_t result; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -3080,11 +3407,11 @@ regmatch(), slabs allocated since entry are freed. #define DEBUG_STATE_pp(pp) \ DEBUG_STATE_r({ \ - DUMP_EXEC_POS(locinput, scan, utf8_target); \ + DUMP_EXEC_POS(locinput, scan, utf8_target); \ PerlIO_printf(Perl_debug_log, \ " %*s"pp" %s%s%s%s%s\n", \ depth*2, "", \ - PL_reg_name[st->resume_state], \ + PL_reg_name[st->resume_state], \ ((st==yes_state||st==mark_state) ? "[" : ""), \ ((st==yes_state) ? "Y" : ""), \ ((st==mark_state) ? "M" : ""), \ @@ -3288,6 +3615,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, dVAR; U8 *pat = (U8*)STRING(text_node); + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; if (OP(text_node) == EXACT) { @@ -3307,136 +3635,193 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, c2 = c1 = valid_utf8_to_uvchr(pat, NULL); } } - else /* an EXACTFish node */ - if ((is_utf8_pat - && is_MULTI_CHAR_FOLD_utf8_safe(pat, - pat + STR_LEN(text_node))) - || (!is_utf8_pat - && is_MULTI_CHAR_FOLD_latin1_safe(pat, - pat + STR_LEN(text_node)))) - { - /* Multi-character folds require more context to sort out. Also - * PL_utf8_foldclosures used below doesn't handle them, so have to be - * handled outside this routine */ - use_chrtest_void = TRUE; - } - else { /* an EXACTFish node which doesn't begin with a multi-char fold */ - c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; - if (c1 > 256) { - /* Load the folds hash, if not already done */ - SV** listp; - if (! PL_utf8_foldclosures) { - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - - /* Force loading this by folding an above-Latin1 char */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); - assert(PL_utf8_tofold); /* Verify that worked */ + else { /* an EXACTFish node */ + U8 *pat_end = pat + STR_LEN(text_node); + + /* An EXACTFL node has at least some characters unfolded, because what + * they match is not known until now. So, now is the time to fold + * the first few of them, as many as are needed to determine 'c1' and + * 'c2' later in the routine. If the pattern isn't UTF-8, we only need + * to fold if in a UTF-8 locale, and then only the Sharp S; everything + * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we + * need to fold as many characters as a single character can fold to, + * so that later we can check if the first ones are such a multi-char + * fold. But, in such a pattern only locale-problematic characters + * aren't folded, so we can skip this completely if the first character + * in the node isn't one of the tricky ones */ + if (OP(text_node) == EXACTFL) { + + if (! is_utf8_pat) { + if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) + { + folded[0] = folded[1] = 's'; + pat = folded; + pat_end = folded + 2; } - PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); } + else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { + U8 *s = pat; + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD_LC(*s); + s++; + } + else { + STRLEN len; + _to_utf8_fold_flags(s, + d, + &len, + FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); + d += len; + s += UTF8SKIP(s); + } + } - /* The fold closures data structure is a hash with the keys being - * the UTF-8 of every character that is folded to, like 'k', and - * the values each an array of all code points that fold to its - * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are - * not included */ - if ((! (listp = hv_fetch(PL_utf8_foldclosures, - (char *) pat, - UTF8SKIP(pat), - FALSE)))) - { - /* Not found in the hash, therefore there are no folds - * containing it, so there is only a single character that - * could match */ - c2 = c1; + pat = folded; + pat_end = d; } - else { /* Does participate in folds */ - AV* list = (AV*) *listp; - if (av_len(list) != 1) { + } - /* If there aren't exactly two folds to this, it is outside - * the scope of this function */ - use_chrtest_void = TRUE; - } - else { /* There are two. Get them */ - SV** c_p = av_fetch(list, 0, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) + || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) + { + /* Multi-character folds require more context to sort out. Also + * PL_utf8_foldclosures used below doesn't handle them, so have to + * be handled outside this routine */ + use_chrtest_void = TRUE; + } + else { /* an EXACTFish node which doesn't begin with a multi-char fold */ + c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; + if (c1 > 256) { + /* Load the folds hash, if not already done */ + SV** listp; + if (! PL_utf8_foldclosures) { + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* Force loading this by folding an above-Latin1 char */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ } - c1 = SvUV(*c_p); + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); + } + + /* The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((! (listp = hv_fetch(PL_utf8_foldclosures, + (char *) pat, + UTF8SKIP(pat), + FALSE)))) + { + /* Not found in the hash, therefore there are no folds + * containing it, so there is only a single character that + * could match */ + c2 = c1; + } + else { /* Does participate in folds */ + AV* list = (AV*) *listp; + if (av_tindex(list) != 1) { - c_p = av_fetch(list, 1, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + /* If there aren't exactly two folds to this, it is + * outside the scope of this function */ + use_chrtest_void = TRUE; } - c2 = SvUV(*c_p); - - /* Folds that cross the 255/256 boundary are forbidden if - * EXACTFL, or EXACTFA and one is ASCIII. Since the - * pattern character is above 256, and its only other match - * is below 256, the only legal match will be to itself. - * We have thrown away the original, so have to compute - * which is the one above 255 */ - if ((c1 < 256) != (c2 < 256)) { - if (OP(text_node) == EXACTFL - || (OP(text_node) == EXACTFA - && (isASCII(c1) || isASCII(c2)))) - { - if (c1 < 256) { - c1 = c2; - } - else { - c2 = c1; + else { /* There are two. Get them */ + SV** c_p = av_fetch(list, 0, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c1 = SvUV(*c_p); + + c_p = av_fetch(list, 1, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c2 = SvUV(*c_p); + + /* Folds that cross the 255/256 boundary are forbidden + * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and + * one is ASCIII. Since the pattern character is above + * 256, and its only other match is below 256, the only + * legal match will be to itself. We have thrown away + * the original, so have to compute which is the one + * above 255 */ + if ((c1 < 256) != (c2 < 256)) { + if ((OP(text_node) == EXACTFL + && ! IN_UTF8_CTYPE_LOCALE) + || ((OP(text_node) == EXACTFA + || OP(text_node) == EXACTFA_NO_TRIE) + && (isASCII(c1) || isASCII(c2)))) + { + if (c1 < 256) { + c1 = c2; + } + else { + c2 = c1; + } } } } } } - } - else /* Here, c1 is < 255 */ - if (utf8_target - && HAS_NONLATIN1_FOLD_CLOSURE(c1) - && OP(text_node) != EXACTFL - && (OP(text_node) != EXACTFA || ! isASCII(c1))) - { - /* Here, there could be something above Latin1 in the target which - * folds to this character in the pattern. All such cases except - * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters - * involved in their folds, so are outside the scope of this - * function */ - if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { - c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; - } - else { - use_chrtest_void = TRUE; + else /* Here, c1 is < 255 */ + if (utf8_target + && HAS_NONLATIN1_FOLD_CLOSURE(c1) + && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) + && ((OP(text_node) != EXACTFA + && OP(text_node) != EXACTFA_NO_TRIE) + || ! isASCII(c1))) + { + /* Here, there could be something above Latin1 in the target + * which folds to this character in the pattern. All such + * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more + * than two characters involved in their folds, so are outside + * the scope of this function */ + if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + } + else { + use_chrtest_void = TRUE; + } } - } - else { /* Here nothing above Latin1 can fold to the pattern character */ - switch (OP(text_node)) { + else { /* Here nothing above Latin1 can fold to the pattern + character */ + switch (OP(text_node)) { - case EXACTFL: /* /l rules */ - c2 = PL_fold_locale[c1]; - break; + case EXACTFL: /* /l rules */ + c2 = PL_fold_locale[c1]; + break; - case EXACTF: - if (! utf8_target) { /* /d rules */ - c2 = PL_fold[c1]; + case EXACTF: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + if (! utf8_target) { /* /d rules */ + c2 = PL_fold[c1]; + break; + } + /* FALLTHROUGH */ + /* /u rules for all these. This happens to work for + * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA_NO_TRIE: /* This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: + c2 = PL_fold_latin1[c1]; break; - } - /* FALLTHROUGH */ - /* /u rules for all these. This happens to work for - * EXACTFA as nothing in Latin1 folds to ASCII */ - case EXACTFA: - case EXACTFU_TRICKYFOLD: - case EXACTFU_SS: - case EXACTFU: - c2 = PL_fold_latin1[c1]; - break; - default: - Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); - assert(0); /* NOTREACHED */ + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); + assert(0); /* NOTREACHED */ + } } } } @@ -3478,7 +3863,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } /* returns -1 on failure, $+[0] on success */ -STATIC I32 +STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { #if PERL_VERSION < 9 && !defined(PERL_CORE) @@ -3496,7 +3881,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) regnode *scan; regnode *next; U32 n = 0; /* general value; init to avoid compiler warning */ - I32 ln = 0; /* len or last; init to avoid compiler warning */ + SSize_t ln = 0; /* len or last; init to avoid compiler warning */ char *locinput = startpos; char *pushinput; /* where to continue after a PUSH */ I32 nextchr; /* is always set to UCHARAT(locinput) */ @@ -3524,7 +3909,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) during a successful match */ U32 lastopen = 0; /* last open we saw */ bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; - SV* const oreplsv = GvSV(PL_replgv); + SV* const oreplsv = GvSVn(PL_replgv); /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop * iteration, and are not preserved or restored by state pushes/pops @@ -3554,6 +3939,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) GET_RE_DEBUG_FLAGS_DECL; #endif + /* protect against undef(*^R) */ + SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ multicall_oldcatch = 0; multicall_cv = NULL; @@ -3579,7 +3967,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SV * const prop = sv_newmortal(); regnode *rnext=regnext(scan); DUMP_EXEC_POS( locinput, scan, utf8_target ); - regprop(rex, prop, scan); + regprop(rex, prop, scan, reginfo); PerlIO_printf(Perl_debug_log, "%3"IVdf":%*s%s(%"IVdf")\n", @@ -3601,7 +3989,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); switch (state_num) { - case BOL: /* /^../ */ + case BOL: /* /^../ */ + case SBOL: /* /^../s */ if (locinput == reginfo->strbeg) break; sayNO; @@ -3614,11 +4003,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } sayNO; - case SBOL: /* /^../s */ - if (locinput == reginfo->strbeg) - break; - sayNO; - case GPOS: /* \G */ if (locinput == reginfo->ganch) break; @@ -3636,16 +4020,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO_SILENT; assert(0); /*NOTREACHED*/ - case EOL: /* /..$/ */ - goto seol; - case MEOL: /* /..$/m */ if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; break; + case EOL: /* /..$/ */ + /* FALL THROUGH */ case SEOL: /* /..$/s */ - seol: if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; if (reginfo->strend - locinput > 1) @@ -3943,7 +4325,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (chars) { if (utf8_target) { - uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, uniflags); uc += len; } @@ -3956,7 +4338,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (foldlen) { if (!--chars) break; - uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, uniflags); uscan += len; foldlen -= len; @@ -4047,7 +4429,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) l++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { sayNO; } l += 2; @@ -4070,7 +4453,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) { + if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { sayNO; } s += 2; @@ -4100,27 +4484,31 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const char * s; U32 fold_utf8_flags; - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; - fold_utf8_flags = FOLDEQ_UTF8_LOCALE; + fold_utf8_flags = FOLDEQ_LOCALE; goto do_exactf; case EXACTFU_SS: /* /\x{df}/iu */ - case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */ case EXACTFU: /* /abc/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; goto do_exactf; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: /* /abc/iaa */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; - case EXACTF: /* /abc/i */ + case EXACTF: /* /abc/i This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); folder = foldEQ; fold_array = PL_fold; fold_utf8_flags = 0; @@ -4129,7 +4517,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) { + if (utf8_target + || is_utf8_pat + || state_num == EXACTFU_SS + || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) + { /* Either target or the pattern are utf8, or has the issue where * the fold lengths may differ. */ const char * const l = locinput; @@ -4164,8 +4556,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * have to set the FLAGS fields of these */ case BOUNDL: /* /\b/l */ case NBOUNDL: /* /\B/l */ - RX_MATCH_TAINTED_on(reginfo->prog); - /* FALL THROUGH */ case BOUND: /* /\b/ */ case BOUNDU: /* /\b/u */ case BOUNDA: /* /\b/a */ @@ -4183,7 +4573,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 * const r = reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); - ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); + ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, + 0, uniflags); } if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { ln = isWORDCHAR_uni(ln); @@ -4196,7 +4587,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } else { - ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln)); + ln = isWORDCHAR_LC_uvchr(ln); n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); } } @@ -4245,11 +4636,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; case ANYOF: /* /[abc]/ */ - case ANYOF_WARN_SUPER: if (NEXTCHR_IS_EOS) sayNO; if (utf8_target) { - if (!reginclass(rex, scan, (U8*)locinput, utf8_target)) + if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + utf8_target)) sayNO; locinput += UTF8SKIP(locinput); } @@ -4271,10 +4662,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (NEXTCHR_IS_EOS) sayNO; - /* The locale hasn't influenced the outcome before this, so defer - * tainting until now */ - RX_MATCH_TAINTED_on(reginfo->prog); - /* Use isFOO_lc() for characters within Latin1. (Note that * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else * wouldn't be invariant) */ @@ -4285,7 +4672,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), - (U8) TWO_BYTE_UTF8_TO_UNI(nextchr, + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)))))) { sayNO; @@ -4366,9 +4753,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr, + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)), - FLAGS(scan))))) + FLAGS(scan))))) { sayNO; } @@ -4384,8 +4771,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", - swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } if (! (to_complement ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], @@ -4647,11 +5035,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 *fold_array; UV utf8_fold_flags; - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; type = REFFL; - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + utf8_fold_flags = FOLDEQ_LOCALE; goto do_nref; case NREFFA: /* /\g{name}/iaa */ @@ -4692,10 +5079,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto do_nref_ref_common; case REFFL: /* /\1/il */ - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + utf8_fold_flags = FOLDEQ_LOCALE; goto do_ref; case REFFA: /* /\1/iaa */ @@ -4735,8 +5121,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = reginfo->strbeg + ln; if (type != REF /* REF can do byte comparison */ - && (utf8_target || type == REFFU)) - { /* XXX handle REFFL better */ + && (utf8_target || type == REFFU || type == REFFL)) + { char * limit = reginfo->strend; /* This call case insensitively compares the entire buffer @@ -4810,7 +5196,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) startpoint = rei->program+1; ST.close_paren = 0; } + + /* Save all the positions seen so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + + /* and then jump to the code we share with EVAL */ goto eval_recurse_doit; + assert(0); /* NOTREACHED */ case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ @@ -4922,8 +5315,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) rex->offs[0].end = locinput - reginfo->strbeg; if (reginfo->info_aux_eval->pos_magic) - reginfo->info_aux_eval->pos_magic->mg_len - = locinput - reginfo->strbeg; + MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, + reginfo->sv, reginfo->strbeg, + locinput - reginfo->strbeg); if (sv_yes_mark) { SV *sv_mrk = get_sv("REGMARK", 1); @@ -4961,20 +5355,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) else { /* /(??{}) */ /* if its overloaded, let the regex compiler handle * it; otherwise extract regex, or stringify */ + if (SvGMAGICAL(ret)) + ret = sv_mortalcopy(ret); if (!SvAMAGIC(ret)) { SV *sv = ret; if (SvROK(sv)) sv = SvRV(sv); if (SvTYPE(sv) == SVt_REGEXP) re_sv = (REGEXP*) sv; - else if (SvSMAGICAL(sv)) { - MAGIC *mg = mg_find(sv, PERL_MAGIC_qr); + else if (SvSMAGICAL(ret)) { + MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); if (mg) re_sv = (REGEXP *) mg->mg_obj; } - /* force any magic, undef warnings here */ - if (!re_sv) { + /* force any undef warnings here */ + if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { ret = sv_mortalcopy(ret); (void) SvPV_force_nolen(ret); } @@ -5028,17 +5424,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) pm_flags); if (!(SvFLAGS(ret) - & (SVs_TEMP | SVs_PADTMP | SVf_READONLY - | SVs_GMG))) { + & (SVs_TEMP | SVs_GMG | SVf_ROK)) + && (!SvPADTMP(ret) || SvREADONLY(ret))) { /* This isn't a first class regexp. Instead, it's caching a regexp onto an existing, Perl visible scalar. */ sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); } - /* safe to do now that any $1 etc has been - * interpolated into the new pattern string and - * compiled */ - S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); } SAVEFREESV(re_sv); re = ReANY(re_sv); @@ -5048,6 +5440,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) re->sublen = rex->sublen; re->suboffset = rex->suboffset; re->subcoffset = rex->subcoffset; + re->lastparen = 0; + re->lastcloseparen = 0; rei = RXi_GET(re); DEBUG_EXECUTE_r( debug_start_match(re_sv, utf8_target, locinput, @@ -5055,18 +5449,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) ); startpoint = rei->program + 1; ST.close_paren = 0; /* only used for GOSUB */ + /* Save all the seen positions so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + /* and set maxopenparen to 0, since we are starting a "fresh" match */ + maxopenparen = 0; + /* run the pattern returned from (??{...}) */ - eval_recurse_doit: /* Share code with GOSUB below this line */ - /* run the pattern returned from (??{...}) */ - - /* Save *all* the positions. */ - ST.cp = regcppush(rex, 0, maxopenparen); - REGCP_SET(ST.lastcp); - - re->lastparen = 0; - re->lastcloseparen = 0; - - maxopenparen = 0; + eval_recurse_doit: /* Share code with GOSUB below this line + * At this point we expect the stack context to be + * set up correctly */ /* invalidate the S-L poscache. We're now executing a * different set of WHILEM ops (and their associated @@ -5078,6 +5470,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * pattern again */ reginfo->poscache_maxiter = 0; + /* the new regexp might have a different is_utf8_pat than we do */ is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); ST.prev_rex = rex_sv; @@ -5102,7 +5495,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SET_reg_curpm(rex_sv); rex = ReANY(rex_sv); rexi = RXi_GET(rex); - regcpblow(ST.cp); + { + /* preserve $^R across LEAVE's. See Bug 121070. */ + SV *save_sv= GvSV(PL_replgv); + SvREFCNT_inc(save_sv); + regcpblow(ST.cp); /* LEAVE in disguise */ + sv_setsv(GvSV(PL_replgv), save_sv); + SvREFCNT_dec(save_sv); + } cur_eval = ST.prev_eval; cur_curlyx = ST.prev_curlyx; @@ -5452,10 +5852,10 @@ NULL if (reginfo->poscache_iter-- == 0) { /* initialise cache */ - const I32 size = (reginfo->poscache_maxiter + 7)/8; + const SSize_t size = (reginfo->poscache_maxiter + 7)/8; regmatch_info_aux *const aux = reginfo->info_aux; if (aux->poscache) { - if ((I32)reginfo->poscache_size < size) { + if ((SSize_t)reginfo->poscache_size < size) { Renew(aux->poscache, size, char); reginfo->poscache_size = size; } @@ -5473,7 +5873,7 @@ NULL if (reginfo->poscache_iter < 0) { /* have we already failed at this position? */ - I32 offset, mask; + SSize_t offset, mask; reginfo->poscache_iter = -1; /* stop eventual underflow */ offset = (scan->flags & 0xf) - 1 @@ -5798,7 +6198,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n", + "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", (int)(REPORT_CODE_OFF+(depth*2)),"", valid_utf8_to_uvchr((U8 *) locinput, NULL), valid_utf8_to_uvchr(ST.c1_utf8, NULL), @@ -5812,7 +6212,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n", + "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", (int)(REPORT_CODE_OFF+(depth*2)),"", (int) nextchr, ST.c1, ST.c2) ); @@ -6551,6 +6951,10 @@ yes: * When popping the save stack, all these locals would be undone; * bypass this by setting the outermost saved $^R to the latest * value */ + /* I dont know if this is needed or works properly now. + * see code related to PL_replgv elsewhere in this file. + * Yves + */ if (oreplsv != GvSV(PL_replgv)) sv_setsv(oreplsv, GvSV(PL_replgv)); } @@ -6753,7 +7157,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple loop */ - c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); while (scan < loceol && UCHARAT(scan) == c) { scan++; } @@ -6780,21 +7184,23 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: - utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; case EXACTFL: - RXp_MATCH_TAINTED_on(prog); - utf8_flags = FOLDEQ_UTF8_LOCALE; + utf8_flags = FOLDEQ_LOCALE; goto do_exactf; - case EXACTF: - utf8_flags = 0; - goto do_exactf; + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + utf8_flags = 0; + goto do_exactf; case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -6858,11 +7264,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, break; } case ANYOF: - case ANYOF_WARN_SUPER: if (utf8_target) { while (hardcount < max && scan < loceol - && reginclass(prog, p, (U8*)scan, utf8_target)) + && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; @@ -6880,7 +7285,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* FALLTHROUGH */ case POSIXL: - RXp_MATCH_TAINTED_on(prog); if (! utf8_target) { while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan))) @@ -6980,8 +7384,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan, - *(scan + 1)), + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), classnum)))) { break; @@ -7063,8 +7467,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = _core_swash_init( - "utf8", swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + "utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } while (hardcount < max && scan < loceol @@ -7132,7 +7538,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, GET_RE_DEBUG_FLAGS_DECL; DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, p); + regprop(prog, prop, p, reginfo); PerlIO_printf(Perl_debug_log, "%*s %s can match %"IVdf" times out of %"IVdf"...\n", REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); @@ -7158,31 +7564,39 @@ Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, *altsvp = NULL; } - return newSVsv(core_regclass_swash(prog, node, doinit, listsvp)); + return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL)); } -#endif -STATIC SV * -S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp) +SV * +Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, + const regnode* node, + bool doinit, + SV** listsvp, + SV** only_utf8_locale_ptr) { - /* Returns the swash for the input 'node' in the regex 'prog'. - * If is true, will attempt to create the swash if not already + /* For internal core use only. + * Returns the swash for the input 'node' in the regex 'prog'. + * If is 'true', will attempt to create the swash if not already * done. - * If is non-null, will return the swash initialization string in - * it. + * If is non-null, will return the printable contents of the + * swash. This can be used to get debugging information even before the + * swash exists, by calling this function with 'doinit' set to false, in + * which case the components that will be used to eventually create the + * swash are returned (in a printable form). * Tied intimately to how regcomp.c sets up the data structure */ dVAR; SV *sw = NULL; - SV *si = NULL; + SV *si = NULL; /* Input swash initialization string */ SV* invlist = NULL; RXi_GET_DECL(prog,progi); const struct reg_data * const data = prog ? progi->data : NULL; - PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH; + PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; - assert(ANYOF_NONBITMAP(node)); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); if (data && data->count) { const U32 n = ARG(node); @@ -7195,25 +7609,38 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit si = *ary; /* ary[0] = the string to initialize the swash with */ - /* Elements 2 and 3 are either both present or both absent. [2] is - * any inversion list generated at compile time; [3] indicates if + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if * that inversion list has any user-defined properties in it. */ - if (av_len(av) >= 2) { - invlist = ary[2]; - if (SvUV(ary[3])) { - swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + if (av_tindex(av) >= 2) { + if (only_utf8_locale_ptr + && ary[2] + && ary[2] != &PL_sv_undef) + { + *only_utf8_locale_ptr = ary[2]; + } + else { + *only_utf8_locale_ptr = NULL; + } + + if (av_tindex(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } + } + else { + invlist = NULL; } - } - else { - invlist = NULL; } /* Element [1] is reserved for the set-up swash. If already there, * return it; if not, create it and store it there */ - if (SvROK(ary[1])) { + if (ary[1] && SvROK(ary[1])) { sw = ary[1]; } - else if (si && doinit) { + else if (doinit && ((si && si != &PL_sv_undef) + || (invlist && invlist != &PL_sv_undef))) { sw = _core_swash_init("utf8", /* the utf8 package */ "", /* nameless */ @@ -7227,16 +7654,18 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit } } + /* If requested, return a printable version of what this swash matches */ if (listsvp) { SV* matches_string = newSVpvn("", 0); - /* Use the swash, if any, which has to have incorporated into it all - * possibilities */ + /* The swash should be used, if possible, to get the data, as it + * contains the resolved data. But this function can be called at + * compile-time, before everything gets resolved, in which case we + * return the currently best available information, which is the string + * that will eventually be used to do that resolving, 'si' */ if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) && (si && si != &PL_sv_undef)) { - - /* If no swash, use the input initialization string, if available */ sv_catsv(matches_string, si); } @@ -7250,12 +7679,14 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit return sw; } +#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ /* - reginclass - determine if a character falls into a character class n is the ANYOF regnode p is the target string + p_end points to one byte beyond the end of the target string utf8_target tells whether p is in UTF-8. Returns true if matched; false otherwise. @@ -7267,7 +7698,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit */ STATIC bool -S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target) +S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) { dVAR; const char flags = ANYOF_FLAGS(n); @@ -7280,7 +7711,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_IS_INVARIANT() works even if not in UTF-8 */ if (! UTF8_IS_INVARIANT(c) && utf8_target) { STRLEN c_len = 0; - c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len, + c = utf8n_to_uvchr(p, p_end - p, &c_len, (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for @@ -7293,21 +7724,19 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const if (c < 256) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; - else if (flags & ANYOF_NON_UTF8_LATIN1_ALL + else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL && ! utf8_target && ! isASCII(c)) { match = TRUE; } - else if (flags & ANYOF_LOCALE) { - RXp_MATCH_TAINTED_on(prog); - - if ((flags & ANYOF_LOC_FOLD) - && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) - { - match = TRUE; - } - else if (ANYOF_CLASS_TEST_ANY_SET(n)) { + else if (flags & ANYOF_LOCALE_FLAGS) { + if (flags & ANYOF_LOC_FOLD) { + if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { + match = TRUE; + } + } + if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { /* The data structure is arranged so bits 0, 2, 4, ... are set * if the class includes the Posix character class given by @@ -7341,8 +7770,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const int count = 0; int to_complement = 0; + while (count < ANYOF_MAX) { - if (ANYOF_CLASS_TEST(n, count) + if (ANYOF_POSIXL_TEST(n, count) && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) { match = TRUE; @@ -7355,27 +7785,22 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } } + /* If the bitmap didn't (or couldn't) match, and something outside the - * bitmap could match, try that. Locale nodes specify completely the - * behavior of code points in the bit map (otherwise, a utf8 target would - * cause them to be treated as Unicode and not locale), except in - * the very unlikely event when this node is a synthetic start class, which - * could be a combination of locale and non-locale nodes. So allow locale - * to match for the synthetic start class, which will give a false - * positive that will be resolved when the match is done again as not part - * of the synthetic start class */ + * bitmap could match, try that. */ if (!match) { - if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { + if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { match = TRUE; /* Everything above 255 matches */ } - else if (ANYOF_NONBITMAP(n) - && ((flags & ANYOF_NONBITMAP_NON_UTF8) - || (utf8_target - && (c >=256 - || (! (flags & ANYOF_LOCALE)) - || OP(n) == ANYOF_SYNTHETIC)))) - { - SV * const sw = core_regclass_swash(prog, n, TRUE, 0); + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && (flags & ANYOF_UTF8)) + || ((flags & ANYOF_LOC_FOLD) + && IN_UTF8_CTYPE_LOCALE + && ARG(n) != ANYOF_NONBITMAP_EMPTY)) + { + SV* only_utf8_locale = NULL; + SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale); if (sw) { U8 * utf8_p; if (utf8_target) { @@ -7392,23 +7817,32 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const /* If we allocated a string above, free it */ if (! utf8_target) Safefree(utf8_p); } + if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { + match = _invlist_contains_cp(only_utf8_locale, c); + } } if (UNICODE_IS_SUPER(c) - && OP(n) == ANYOF_WARN_SUPER + && (flags & ANYOF_WARN_SUPER) && ckWARN_d(WARN_NON_UNICODE)) { Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), - "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c); + "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c); } } +#if ANYOF_INVERT != 1 + /* Depending on compiler optimization cBOOL takes time, so if don't have to + * use it, don't */ +# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, +#endif + /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ - return cBOOL(flags & ANYOF_INVERT) ^ match; + return (flags & ANYOF_INVERT) ^ match; } STATIC U8 * -S_reghop3(U8 *s, I32 off, const U8* lim) +S_reghop3(U8 *s, SSize_t off, const U8* lim) { /* return the position 'off' UTF-8 characters away from 's', forward if * 'off' >= 0, backwards if negative. But don't go outside of position @@ -7437,13 +7871,8 @@ S_reghop3(U8 *s, I32 off, const U8* lim) return s; } -#ifdef XXX_dmq -/* there are a bunch of places where we use two reghop3's that should - be replaced with this routine. but since thats not done yet - we ifdef it out - dmq -*/ STATIC U8 * -S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) +S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) { dVAR; @@ -7467,10 +7896,12 @@ S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) } return s; } -#endif + +/* like reghop3, but returns NULL on overrun, rather than returning last + * char pos */ STATIC U8 * -S_reghopmaybe3(U8* s, I32 off, const U8* lim) +S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) { dVAR; @@ -7538,6 +7969,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) } eval_state->pos_magic = mg; eval_state->pos = mg->mg_len; + eval_state->pos_flags = mg->mg_flags; } else eval_state->pos_magic = NULL; @@ -7554,7 +7986,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) /* this regexp is also owned by the new PL_reg_curpm, which will try to free it. */ av_push(PL_regex_padav, repointer); - PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav); + PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); PL_regex_pad = AvARRAY(PL_regex_padav); } #endif @@ -7612,7 +8044,12 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg) RXp_MATCH_COPIED_on(rex); } if (eval_state->pos_magic) + { eval_state->pos_magic->mg_len = eval_state->pos; + eval_state->pos_magic->mg_flags = + (eval_state->pos_magic->mg_flags & ~MGf_BYTES) + | (eval_state->pos_flags & MGf_BYTES); + } PL_curpm = eval_state->curpm; } diff --git a/src/5019002/regcomp.c b/src/5021000/regcomp.c similarity index 75% rename from src/5019002/regcomp.c rename to src/5021000/regcomp.c index 184f6e6..2e6d5e2 100644 --- a/src/5019002/regcomp.c +++ b/src/5021000/regcomp.c @@ -81,7 +81,7 @@ #define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" -extern const struct regexp_engine my_reg_engine; +EXTERN_C const struct regexp_engine my_reg_engine; #else # include "regcomp.h" #endif @@ -91,51 +91,46 @@ extern const struct regexp_engine my_reg_engine; #include "inline_invlist.c" #include "unicode_constants.h" -#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) -#ifdef op -#undef op -#endif /* op */ - -#ifdef MSDOS -# if defined(BUGGY_MSC6) - /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ -# pragma optimize("a",off) - /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ -# pragma optimize("w",on ) -# endif /* BUGGY_MSC6 */ -#endif /* MSDOS */ - #ifndef STATIC #define STATIC static #endif -typedef struct RExC_state_t { +struct RExC_state_t { U32 flags; /* RXf_* are we folding, multilining? */ U32 pm_flags; /* PMf_* stuff from the calling PMOP */ char *precomp; /* uncompiled string. */ REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ - regexp_internal *rxi; /* internal data for regexp object pprivate field */ + regexp_internal *rxi; /* internal data for regexp object + pprivate field */ char *start; /* Start of input for compile */ char *end; /* End of input for compile */ char *parse; /* Input-scan pointer. */ - I32 whilem_seen; /* number of WHILEM in this expr */ + SSize_t whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ - regnode *emit_bound; /* First regnode outside of the allocated space */ + regnode *emit_bound; /* First regnode outside of the + allocated space */ regnode *emit; /* Code-emit pointer; if = &emit_dummy, implies compiling, so don't emit */ - regnode emit_dummy; /* placeholder for emit to point to */ + regnode_ssc emit_dummy; /* placeholder for emit to point to; + large enough for the largest + non-EXACTish node, so can use it as + scratch in pass1 */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ U32 seen; - I32 size; /* Code size. */ - I32 npar; /* Capture buffer count, (OPEN). */ - I32 cpar; /* Capture buffer count, (CLOSE). */ - I32 nestroot; /* root parens we are in - used by accept */ + SSize_t size; /* Code size. */ + I32 npar; /* Capture buffer count, (OPEN) plus + one. ("par" 0 is the whole + pattern)*/ + I32 nestroot; /* root parens we are in - used by + accept */ I32 extralen; I32 seen_zerolen; regnode **open_parens; /* pointers to open parens */ @@ -152,15 +147,20 @@ typedef struct RExC_state_t { regnode **recurse; /* Recurse regops */ I32 recurse_count; /* Number of recurse regops */ + U8 *study_chunk_recursed; /* bitmap of which parens we have moved + through */ + U32 study_chunk_recursed_bytes; /* bytes in bitmap */ I32 in_lookbehind; I32 contains_locale; + I32 contains_i; I32 override_recoding; I32 in_multi_char_class; struct reg_code_block *code_blocks; /* positions of literal (?{}) within pattern */ int num_code_blocks; /* size of code_blocks[] */ int code_index; /* next code_blocks[] slot */ -#if ADD_TO_REGEXEC + SSize_t maxlen; /* mininum possible number of chars in string to match */ +#ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif @@ -173,7 +173,7 @@ typedef struct RExC_state_t { #define RExC_lastnum (pRExC_state->lastnum) #define RExC_paren_name_list (pRExC_state->paren_name_list) #endif -} RExC_state_t; +}; #define RExC_flags (pRExC_state->flags) #define RExC_pm_flags (pRExC_state->pm_flags) @@ -186,7 +186,8 @@ typedef struct RExC_state_t { #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) #ifdef RE_TRACK_PATTERN_OFFSETS -#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the + others */ #endif #define RExC_emit (pRExC_state->emit) #define RExC_emit_dummy (pRExC_state->emit_dummy) @@ -196,6 +197,7 @@ typedef struct RExC_state_t { #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) #define RExC_npar (pRExC_state->npar) #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) @@ -209,8 +211,12 @@ typedef struct RExC_state_t { #define RExC_paren_names (pRExC_state->paren_names) #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) +#define RExC_study_chunk_recursed_bytes \ + (pRExC_state->study_chunk_recursed_bytes) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) #define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_contains_i (pRExC_state->contains_i) #define RExC_override_recoding (pRExC_state->override_recoding) #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) @@ -219,9 +225,6 @@ typedef struct RExC_state_t { #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s, FALSE))) -#ifdef SPSTART -#undef SPSTART /* dratted cpp namespace... */ -#endif /* * Flags to be passed up and down. */ @@ -267,6 +270,11 @@ typedef struct RExC_state_t { #define namedclass_to_classnum(class) ((int) ((class) / 2)) #define classnum_to_namedclass(classnum) ((classnum) * 2) +#define _invlist_union_complement_2nd(a, b, output) \ + _invlist_union_maybe_complement_2nd(a, b, TRUE, output) +#define _invlist_intersection_complement_2nd(a, b, output) \ + _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) + /* About scan_data_t. During optimisation we recurse through the regexp program performing @@ -301,7 +309,7 @@ typedef struct RExC_state_t { - max_offset Only used for floating strings. This is the rightmost point that - the string can appear at. If set to I32 max it indicates that the + the string can appear at. If set to SSize_t_MAX it indicates that the string can occur infinitely far to the right. - minlenp @@ -345,28 +353,46 @@ typedef struct RExC_state_t { typedef struct scan_data_t { /*I32 len_min; unused */ /*I32 len_delta; unused */ - I32 pos_min; - I32 pos_delta; + SSize_t pos_min; + SSize_t pos_delta; SV *last_found; - I32 last_end; /* min value, <0 unless valid. */ - I32 last_start_min; - I32 last_start_max; + SSize_t last_end; /* min value, <0 unless valid. */ + SSize_t last_start_min; + SSize_t last_start_max; SV **longest; /* Either &l_fixed, or &l_float. */ SV *longest_fixed; /* longest fixed string found in pattern */ - I32 offset_fixed; /* offset where it starts */ - I32 *minlen_fixed; /* pointer to the minlen relevant to the string */ + SSize_t offset_fixed; /* offset where it starts */ + SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ I32 lookbehind_fixed; /* is the position of the string modfied by LB */ SV *longest_float; /* longest floating string found in pattern */ - I32 offset_float_min; /* earliest point in string it can appear */ - I32 offset_float_max; /* latest point in string it can appear */ - I32 *minlen_float; /* pointer to the minlen relevant to the string */ - I32 lookbehind_float; /* is the position of the string modified by LB */ + SSize_t offset_float_min; /* earliest point in string it can appear */ + SSize_t offset_float_max; /* latest point in string it can appear */ + SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ + SSize_t lookbehind_float; /* is the pos of the string modified by LB */ I32 flags; I32 whilem_c; - I32 *last_closep; - struct regnode_charclass_class *start_class; + SSize_t *last_closep; + regnode_ssc *start_class; } scan_data_t; +/* The below is perhaps overboard, but this allows us to save a test at the + * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' + * and 'a' differ by a single bit; the same with the upper and lower case of + * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; + * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and + * then inverts it to form a mask, with just a single 0, in the bit position + * where the upper- and lowercase differ. XXX There are about 40 other + * instances in the Perl core where this micro-optimization could be used. + * Should decide if maintenance cost is worse, before changing those + * + * Returns a boolean as to whether or not 'v' is either a lowercase or + * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a + * compile-time constant, the generated code is better than some optimizing + * compilers figure out, amounting to a mask and test. The results are + * meaningless if 'c' is not one of [A-Za-z] */ +#define isARG2_lower_or_UPPER_ARG1(c, v) \ + (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) + /* * Forward declarations for pregcomp()'s friends. */ @@ -380,13 +406,8 @@ static const scan_data_t zero_scan_data = #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) -#ifdef NO_UNARY_PLUS -# define SF_FIX_SHIFT_EOL (0+2) -# define SF_FL_SHIFT_EOL (0+4) -#else -# define SF_FIX_SHIFT_EOL (+2) -# define SF_FL_SHIFT_EOL (+4) -#endif +#define SF_FIX_SHIFT_EOL (+2) +#define SF_FL_SHIFT_EOL (+4) #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) @@ -411,15 +432,25 @@ static const scan_data_t zero_scan_data = /* The enums for all these are ordered so things work out correctly */ #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) -#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ + == REGEX_DEPENDS_CHARSET) #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) -#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) -#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) -#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) -#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ + >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) +/* For programs that want to be strictly Unicode compatible by dying if any + * attempt is made to match a non-Unicode code point against a Unicode + * property. */ +#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) + #define OOB_NAMEDCLASS -1 /* There is no code point that is out-of-bounds, so this is problematic. But @@ -442,7 +473,12 @@ static const scan_data_t zero_scan_data = #define MARKER1 "<-- HERE" /* marker as it appears in the description */ #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ -#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" +#define REPORT_LOCATION " in regex; marked by " MARKER1 \ + " in m/%"UTF8f MARKER2 "%"UTF8f"/" + +#define REPORT_LOCATION_ARGS(offset) \ + UTF8fARG(UTF, offset, RExC_precomp), \ + UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) /* * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given @@ -464,12 +500,12 @@ static const scan_data_t zero_scan_data = } STMT_END #define FAIL(msg) _FAIL( \ - Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ - msg, (int)len, RExC_precomp, ellipses)) + Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ + msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) #define FAIL2(msg,arg) _FAIL( \ - Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ - arg, (int)len, RExC_precomp, ellipses)) + Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ + arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) /* * Simple_vFAIL -- like FAIL, but marks the current location in the scan @@ -477,7 +513,7 @@ static const scan_data_t zero_scan_data = #define Simple_vFAIL(m) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -494,8 +530,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL2(m,a1) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -513,8 +549,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL3(m, a1, a2) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -531,8 +567,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vFAIL4(m,a1,a2,a3) STMT_START { \ @@ -541,80 +577,90 @@ static const scan_data_t zero_scan_data = Simple_vFAIL4(m, a1, a2, a3); \ } STMT_END +/* A specialized version of vFAIL2 that works with UTF8f */ +#define vFAIL2utf8f(m, a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + /* m is not necessarily a "literal string", in this macro */ #define reg_warn_non_literal_string(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNreg(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN_dep(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNregdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN2reg_d(loc,m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ m REPORT_LOCATION, \ - a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN2reg(loc, m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN3(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN3reg(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ } STMT_END @@ -649,7 +695,8 @@ static const scan_data_t zero_scan_data = MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ __LINE__, (int)(node), (int)(byte))); \ if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", \ + (int)(node)); \ } else { \ RExC_offsets[2*(node)-1] = (byte); \ } \ @@ -665,7 +712,8 @@ static const scan_data_t zero_scan_data = MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ __LINE__, (int)(node), (int)(len))); \ if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \ + Perl_croak(aTHX_ "value of node is %d in Length macro", \ + (int)(node)); \ } else { \ RExC_offsets[2*(node)] = (len); \ } \ @@ -691,6 +739,49 @@ static const scan_data_t zero_scan_data = #define EXPERIMENTAL_INPLACESCAN #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ +#define DEBUG_RExC_seen() \ + DEBUG_OPTIMISE_MORE_r({ \ + PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ + \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ + \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ + \ + if (RExC_seen & REG_GPOS_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ + \ + if (RExC_seen & REG_CANY_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ + \ + if (RExC_seen & REG_RECURSE_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ + \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ + \ + if (RExC_seen & REG_VERBARG_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ + \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ + \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ + \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ + \ + if (RExC_seen & REG_GOSTART_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ + PerlIO_printf(Perl_debug_log,"\n"); \ + }); + #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ PerlIO_printf(Perl_debug_log, \ @@ -730,7 +821,8 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ floating substrings if needed. */ STATIC void -S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf) +S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, + SSize_t *minlenp, int is_inf) { const STRLEN l = CHR_SVLEN(data->last_found); const STRLEN old_l = CHR_SVLEN(*data->longest); @@ -754,9 +846,12 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min data->offset_float_min = l ? data->last_start_min : data->pos_min; data->offset_float_max = (l ? data->last_start_max - : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta)); - if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX) - data->offset_float_max = I32_MAX; + : (data->pos_delta == SSize_t_MAX + ? SSize_t_MAX + : data->pos_min + data->pos_delta)); + if (is_inf + || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) + data->offset_float_max = SSize_t_MAX; if (data->flags & SF_BEFORE_EOL) data->flags |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); @@ -780,299 +875,592 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min DEBUG_STUDYDATA("commit: ",data,0); } -/* These macros set, clear and test whether the synthetic start class ('ssc', - * given by the parameter) matches an empty string (EOS). This uses the - * 'next_off' field in the node, to save a bit in the flags field. The ssc - * stands alone, so there is never a next_off, so this field is otherwise - * unused. The EOS information is used only for compilation, but theoretically - * it could be passed on to the execution code. This could be used to store - * more than one bit of information, but only this one is currently used. */ -#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END -#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END -#define TEST_SSC_EOS(node) cBOOL((node)->next_off) - -/* Can match anything (initialization) */ +/* An SSC is just a regnode_charclass_posix with an extra field: the inversion + * list that describes which code points it matches */ + +STATIC void +S_ssc_anything(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to match an empty string or any code point */ + + PERL_ARGS_ASSERT_SSC_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ + _append_range_to_invlist(ssc->invlist, 0, UV_MAX); + ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ +} + +STATIC int +S_ssc_is_anything(pTHX_ const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' can match the empty string and any code + * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys + * us anything: if the function returns TRUE, 'ssc' hasn't been restricted + * in any way, so there's no point in using it */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { + return FALSE; + } + + /* See if the list consists solely of the range 0 - Infinity */ + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (ret) { + return TRUE; + } + + /* If e.g., both \w and \W are set, matches everything */ + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { + return TRUE; + } + } + } + + return FALSE; +} + STATIC void -S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) { - PERL_ARGS_ASSERT_CL_ANYTHING; + /* Initializes the SSC 'ssc'. This includes setting it to match an empty + * string, any code point, or any posix class under locale */ + + PERL_ARGS_ASSERT_SSC_INIT; - ANYOF_BITMAP_SETALL(cl); - cl->flags = ANYOF_UNICODE_ALL; - SET_SSC_EOS(cl); + Zero(ssc, 1, regnode_ssc); + set_ANYOF_SYNTHETIC(ssc); + ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); + ssc_anything(ssc); /* If any portion of the regex is to operate under locale rules, * initialization includes it. The reason this isn't done for all regexes * is that the optimizer was written under the assumption that locale was * all-or-nothing. Given the complexity and lack of documentation in the - * optimizer, and that there are inadequate test cases for locale, so many + * optimizer, and that there are inadequate test cases for locale, many * parts of it may not work properly, it is safest to avoid locale unless * necessary. */ if (RExC_contains_locale) { - ANYOF_CLASS_SETALL(cl); /* /l uses class */ - cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD; + ANYOF_POSIXL_SETALL(ssc); } else { - ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */ + ANYOF_POSIXL_ZERO(ssc); } } -/* Can match anything (initialization) */ STATIC int -S_cl_is_anything(const struct regnode_charclass_class *cl) +S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, + const regnode_ssc *ssc) { - int value; + /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only + * to the list of code points matched, and locale posix classes; hence does + * not check its flags) */ - PERL_ARGS_ASSERT_CL_IS_ANYTHING; + UV start, end; + bool ret; - for (value = 0; value < ANYOF_MAX; value += 2) - if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) - return 1; - if (!(cl->flags & ANYOF_UNICODE_ALL)) - return 0; - if (!ANYOF_BITMAP_TESTALLSET((const void*)cl)) - return 0; - return 1; + PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (! ret) { + return FALSE; + } + + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { + return FALSE; + } + + return TRUE; } -/* Can match anything (initialization) */ -STATIC void -S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +STATIC SV* +S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, + const regnode_charclass* const node) { - PERL_ARGS_ASSERT_CL_INIT; + /* Returns a mortal inversion list defining which code points are matched + * by 'node', which is of type ANYOF. Handles complementing the result if + * appropriate. If some code points aren't knowable at this time, the + * returned list must, and will, contain every code point that is a + * possibility. */ + + SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; + unsigned int i; + const U32 n = ARG(node); + bool new_node_has_latin1 = FALSE; + + PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; + + /* Look at the data structure created by S_set_ANYOF_arg() */ + if (n != ANYOF_NONBITMAP_EMPTY) { + SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + assert(RExC_rxi->data->what[n] == 's'); + + if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ + invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]))); + } + else if (ary[0] && ary[0] != &PL_sv_undef) { + + /* Here, no compile-time swash, and there are things that won't be + * known until runtime -- we have to assume it could be anything */ + return _add_range_to_invlist(invlist, 0, UV_MAX); + } + else if (ary[3] && ary[3] != &PL_sv_undef) { + + /* Here no compile-time swash, and no run-time only data. Use the + * node's inversion list */ + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; + } + } + + /* An ANYOF node contains a bitmap for the first 256 code points, and an + * inversion list for the others, but if there are code points that should + * match only conditionally on the target string being UTF-8, those are + * placed in the inversion list, and not the bitmap. Since there are + * circumstances under which they could match, they are included in the + * SSC. But if the ANYOF node is to be inverted, we have to exclude them + * here, so that when we invert below, the end result actually does include + * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here + * before we add the unconditionally matched code points */ + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_intersection_complement_2nd(invlist, + PL_UpperLatin1, + &invlist); + } + + /* Add in the points from the bit map */ + for (i = 0; i < 256; i++) { + if (ANYOF_BITMAP_TEST(node, i)) { + invlist = add_cp_to_invlist(invlist, i); + new_node_has_latin1 = TRUE; + } + } + + /* If this can match all upper Latin1 code points, have to add them + * as well */ + if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + _invlist_union(invlist, PL_UpperLatin1, &invlist); + } + + /* Similarly for these */ + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + } - Zero(cl, 1, struct regnode_charclass_class); - cl->type = ANYOF; - cl_anything(pRExC_state, cl); - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_invert(invlist); + } + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + + /* Under /li, any 0-255 could fold to any other 0-255, depending on the + * locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { + _invlist_union_maybe_complement_2nd(invlist, + only_utf8_locale_invlist, + ANYOF_FLAGS(node) & ANYOF_INVERT, + &invlist); + } + + return invlist; } /* These two functions currently do the exact same thing */ -#define cl_init_zero S_cl_init +#define ssc_init_zero ssc_init + +#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) +#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) + +/* 'AND' a given class with another one. Can create false positives. 'ssc' + * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ -/* 'AND' a given class with another one. Can create false positives. 'cl' - * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if - * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void -S_cl_and(struct regnode_charclass_class *cl, - const struct regnode_charclass_class *and_with) +S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *and_with) { - PERL_ARGS_ASSERT_CL_AND; + /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either + * another SSC or a regular ANYOF class. Can create false positives. */ + + SV* anded_cp_list; + U8 anded_flags; + + PERL_ARGS_ASSERT_SSC_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(and_with)) { + anded_cp_list = ((regnode_ssc *)and_with)->invlist; + anded_flags = ANYOF_FLAGS(and_with); + + /* XXX This is a kludge around what appears to be deficiencies in the + * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, + * there are paths through the optimizer where it doesn't get weeded + * out when it should. And if we don't make some extra provision for + * it like the code just below, it doesn't get added when it should. + * This solution is to add it only when AND'ing, which is here, and + * only when what is being AND'ed is the pristine, original node + * matching anything. Thus it is like adding it to ssc_anything() but + * only when the result is to be AND'ed. Probably the same solution + * could be adopted for the same problem we have with /l matching, + * which is solved differently in S_ssc_init(), and that would lead to + * fewer false positives than that solution has. But if this solution + * creates bugs, the consequences are only that a warning isn't raised + * that should be; while the consequences for having /l bugs is + * incorrect matches */ + if (ssc_is_anything((regnode_ssc *)and_with)) { + anded_flags |= ANYOF_WARN_SUPER; + } + } + else { + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } - assert(PL_regkind[and_with->type] == ANYOF); + ANYOF_FLAGS(ssc) &= anded_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'and-with'; P2, its posix classes. + * 'and_with' may be inverted. When not inverted, we have the situation of + * computing: + * (C1 | P1) & (C2 | P2) + * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) + * <= ((C1 & C2) | P1 | P2) + * Alternatively, the last few steps could be: + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) + * <= (C1 | C2 | (P1 & P2)) + * We favor the second approach if either P1 or P2 is non-empty. This is + * because these components are a barrier to doing optimizations, as what + * they match cannot be known until the moment of matching as they are + * dependent on the current locale, 'AND"ing them likely will reduce or + * eliminate them. + * But we can do better if we know that C1,P1 are in their initial state (a + * frequent occurrence), each matching everything: + * () & (C2 | P2) = C2 | P2 + * Similarly, if C2,P2 are in their initial state (again a frequent + * occurrence), the result is a no-op + * (C1 | P1) & () = C1 | P1 + * + * Inverted, we have + * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) + * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) + * <= (C1 & ~C2) | (P1 & ~P2) + * */ - /* I (khw) am not sure all these restrictions are necessary XXX */ - if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) - && !(ANYOF_CLASS_TEST_ANY_SET(cl)) - && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(and_with->flags & ANYOF_LOC_FOLD) - && !(cl->flags & ANYOF_LOC_FOLD)) { - int i; + if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(and_with)) + { + unsigned int i; - if (and_with->flags & ANYOF_INVERT) - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] &= ~and_with->bitmap[i]; - else - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] &= and_with->bitmap[i]; - } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ - - if (and_with->flags & ANYOF_INVERT) { - - /* Here, the and'ed node is inverted. Get the AND of the flags that - * aren't affected by the inversion. Those that are affected are - * handled individually below */ - U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS; - cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS); - cl->flags |= affected_flags; - - /* We currently don't know how to deal with things that aren't in the - * bitmap, but we know that the intersection is no greater than what - * is already in cl, so let there be false positives that get sorted - * out after the synthetic start class succeeds, and the node is - * matched for real. */ - - /* The inversion of these two flags indicate that the resulting - * intersection doesn't have them */ - if (and_with->flags & ANYOF_UNICODE_ALL) { - cl->flags &= ~ANYOF_UNICODE_ALL; - } - if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) { - cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; - } - } - else { /* and'd node is not inverted */ - U8 outside_bitmap_but_not_utf8; /* Temp variable */ - - if (! ANYOF_NONBITMAP(and_with)) { - - /* Here 'and_with' doesn't match anything outside the bitmap - * (except possibly ANYOF_UNICODE_ALL), which means the - * intersection can't either, except for ANYOF_UNICODE_ALL, in - * which case we don't know what the intersection is, but it's no - * greater than what cl already has, so can just leave it alone, - * with possible false positives */ - if (! (and_with->flags & ANYOF_UNICODE_ALL)) { - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); - cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8; - } - } - else if (! ANYOF_NONBITMAP(cl)) { - - /* Here, 'and_with' does match something outside the bitmap, and cl - * doesn't have a list of things to match outside the bitmap. If - * cl can match all code points above 255, the intersection will - * be those above-255 code points that 'and_with' matches. If cl - * can't match all Unicode code points, it means that it can't - * match anything outside the bitmap (since the 'if' that got us - * into this block tested for that), so we leave the bitmap empty. - */ - if (cl->flags & ANYOF_UNICODE_ALL) { - ARG_SET(cl, ARG(and_with)); + ssc_intersection(ssc, + anded_cp_list, + FALSE /* Has already been inverted */ + ); - /* and_with's ARG may match things that don't require UTF8. - * And now cl's will too, in spite of this being an 'and'. See - * the comments below about the kludge */ - cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8; + /* If either P1 or P2 is empty, the intersection will be also; can skip + * the loop */ + if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { + ANYOF_POSIXL_ZERO(ssc); + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + + /* Note that the Posix class component P from 'and_with' actually + * looks like: + * P = Pa | Pb | ... | Pn + * where each component is one posix class, such as in [\w\s]. + * Thus + * ~P = ~(Pa | Pb | ... | Pn) + * = ~Pa & ~Pb & ... & ~Pn + * <= ~Pa | ~Pb | ... | ~Pn + * The last is something we can easily calculate, but unfortunately + * is likely to have many false positives. We could do better + * in some (but certainly not all) instances if two classes in + * P have known relationships. For example + * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: + * So + * :lower: & :print: = :lower: + * And similarly for classes that must be disjoint. For example, + * since \s and \w can have no elements in common based on rules in + * the POSIX standard, + * \w & ^\S = nothing + * Unfortunately, some vendor locales do not meet the Posix + * standard, in particular almost everything by Microsoft. + * The loop below just changes e.g., \w into \W and vice versa */ + + regnode_charclass_posixl temp; + int add = 1; /* To calculate the index of the complement */ + + ANYOF_POSIXL_ZERO(&temp); + for (i = 0; i < ANYOF_MAX; i++) { + assert(i % 2 != 0 + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); + + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { + ANYOF_POSIXL_SET(&temp, i + add); + } + add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ + } + ANYOF_POSIXL_AND(&temp, ssc); + + } /* else ssc already has no posixes */ + } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC + in its initial state */ + else if (! is_ANYOF_SYNTHETIC(and_with) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) + { + /* But if 'ssc' is in its initial state, the result is just 'and_with'; + * copy it over 'ssc' */ + if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { + if (is_ANYOF_SYNTHETIC(and_with)) { + StructCopy(and_with, ssc, regnode_ssc); + } + else { + ssc->invlist = anded_cp_list; + ANYOF_POSIXL_ZERO(ssc); + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); + } } } - else { - /* Here, both 'and_with' and cl match something outside the - * bitmap. Currently we do not do the intersection, so just match - * whatever cl had at the beginning. */ - } - - - /* Take the intersection of the two sets of flags. However, the - * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a - * kludge around the fact that this flag is not treated like the others - * which are initialized in cl_anything(). The way the optimizer works - * is that the synthetic start class (SSC) is initialized to match - * anything, and then the first time a real node is encountered, its - * values are AND'd with the SSC's with the result being the values of - * the real node. However, there are paths through the optimizer where - * the AND never gets called, so those initialized bits are set - * inappropriately, which is not usually a big deal, as they just cause - * false positives in the SSC, which will just mean a probably - * imperceptible slow down in execution. However this bit has a - * higher false positive consequence in that it can cause utf8.pm, - * utf8_heavy.pl ... to be loaded when not necessary, which is a much - * bigger slowdown and also causes significant extra memory to be used. - * In order to prevent this, the code now takes a different tack. The - * bit isn't set unless some part of the regular expression needs it, - * but once set it won't get cleared. This means that these extra - * modules won't get loaded unless there was some path through the - * pattern that would have required them anyway, and so any false - * positives that occur by not ANDing them out when they could be - * aren't as severe as they would be if we treated this bit like all - * the others */ - outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags) - & ANYOF_NONBITMAP_NON_UTF8; - cl->flags &= and_with->flags; - cl->flags |= outside_bitmap_but_not_utf8; + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + { + /* One or the other of P1, P2 is non-empty. */ + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } + ssc_union(ssc, anded_cp_list, FALSE); + } + else { /* P1 = P2 = empty */ + ssc_intersection(ssc, anded_cp_list, FALSE); + } } } -/* 'OR' a given class with another one. Can create false positives. 'cl' - * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if - * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void -S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) +S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *or_with) { - PERL_ARGS_ASSERT_CL_OR; - - if (or_with->flags & ANYOF_INVERT) { - - /* Here, the or'd node is to be inverted. This means we take the - * complement of everything not in the bitmap, but currently we don't - * know what that is, so give up and match anything */ - if (ANYOF_NONBITMAP(or_with)) { - cl_anything(pRExC_state, cl); - } - /* We do not use - * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) - * <= (B1 | !B2) | (CL1 | !CL2) - * which is wasteful if CL2 is small, but we ignore CL2: - * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 - * XXXX Can we handle case-fold? Unclear: - * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = - * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) - */ - else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(or_with->flags & ANYOF_LOC_FOLD) - && !(cl->flags & ANYOF_LOC_FOLD) ) { - int i; - - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] |= ~or_with->bitmap[i]; - } /* XXXX: logic is complicated otherwise */ - else { - cl_anything(pRExC_state, cl); - } + /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either + * another SSC or a regular ANYOF class. Can create false positives if + * 'or_with' is to be inverted. */ - /* And, we can just take the union of the flags that aren't affected - * by the inversion */ - cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS; + SV* ored_cp_list; + U8 ored_flags; - /* For the remaining flags: - ANYOF_UNICODE_ALL and inverted means to not match anything above - 255, which means that the union with cl should just be - what cl has in it, so can ignore this flag - ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord - is 127-255 to match them, but then invert that, so the - union with cl should just be what cl has in it, so can - ignore this flag - */ - } else { /* 'or_with' is not inverted */ - /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ - if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && (!(or_with->flags & ANYOF_LOC_FOLD) - || (cl->flags & ANYOF_LOC_FOLD)) ) { - int i; + PERL_ARGS_ASSERT_SSC_OR; - /* OR char bitmap and class bitmap separately */ - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] |= or_with->bitmap[i]; - if (or_with->flags & ANYOF_CLASS) { - ANYOF_CLASS_OR(or_with, cl); - } - } - else { /* XXXX: logic is complicated, leave it along for a moment. */ - cl_anything(pRExC_state, cl); - } + assert(is_ANYOF_SYNTHETIC(ssc)); - if (ANYOF_NONBITMAP(or_with)) { + /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(or_with)) { + ored_cp_list = ((regnode_ssc*) or_with)->invlist; + ored_flags = ANYOF_FLAGS(or_with); + } + else { + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); + ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + } - /* Use the added node's outside-the-bit-map match if there isn't a - * conflict. If there is a conflict (both nodes match something - * outside the bitmap, but what they match outside is not the same - * pointer, and hence not easily compared until XXX we extend - * inversion lists this far), give up and allow the start class to - * match everything outside the bitmap. If that stuff is all above - * 255, can just set UNICODE_ALL, otherwise caould be anything. */ - if (! ANYOF_NONBITMAP(cl)) { - ARG_SET(cl, ARG(or_with)); - } - else if (ARG(cl) != ARG(or_with)) { + ANYOF_FLAGS(ssc) |= ored_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'or-with'; P2, its posix classes. + * 'or_with' may be inverted. When not inverted, we have the simple + * situation of computing: + * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) + * If P1|P2 yields a situation with both a class and its complement are + * set, like having both \w and \W, this matches all code points, and we + * can delete these from the P component of the ssc going forward. XXX We + * might be able to delete all the P components, but I (khw) am not certain + * about this, and it is better to be safe. + * + * Inverted, we have + * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) + * <= (C1 | P1) | ~C2 + * <= (C1 | ~C2) | P1 + * (which results in actually simpler code than the non-inverted case) + * */ - if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) { - cl_anything(pRExC_state, cl); - } - else { - cl->flags |= ANYOF_UNICODE_ALL; + if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(or_with)) + { + /* We ignore P2, leaving P1 going forward */ + } /* else Not inverted */ + else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + unsigned int i; + for (i = 0; i < ANYOF_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) + { + ssc_match_all_cp(ssc); + ANYOF_POSIXL_CLEAR(ssc, i); + ANYOF_POSIXL_CLEAR(ssc, i+1); } } } + } + + ssc_union(ssc, + ored_cp_list, + FALSE /* Already has been inverted */ + ); +} + +PERL_STATIC_INLINE void +S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_UNION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_union_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_intersection(pTHX_ regnode_ssc *ssc, + SV* const invlist, + const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_INTERSECTION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_intersection_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) +{ + PERL_ARGS_ASSERT_SSC_ADD_RANGE; + + assert(is_ANYOF_SYNTHETIC(ssc)); - /* Take the union */ - cl->flags |= or_with->flags; + ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); +} + +PERL_STATIC_INLINE void +S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) +{ + /* AND just the single code point 'cp' into the SSC 'ssc' */ + + SV* cp_list = _new_invlist(2); + + PERL_ARGS_ASSERT_SSC_CP_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + cp_list = add_cp_to_invlist(cp_list, cp); + ssc_intersection(ssc, cp_list, + FALSE /* Not inverted */ + ); + SvREFCNT_dec_NN(cp_list); +} + +PERL_STATIC_INLINE void +S_ssc_clear_locale(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to not match any locale things */ + + PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ANYOF_POSIXL_ZERO(ssc); + ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; +} + +STATIC void +S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* The inversion list in the SSC is marked mortal; now we need a more + * permanent copy, which is stored the same way that is done in a regular + * ANYOF node, with the first 256 code points in a bit map */ + + SV* invlist = invlist_clone(ssc->invlist); + + PERL_ARGS_ASSERT_SSC_FINALIZE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* The code in this file assumes that all but these flags aren't relevant + * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the + * time we reach here */ + assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + + populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); + + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); + + /* Make sure is clone-safe */ + ssc->invlist = NULL; + + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; } + + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); } #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) -#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 ) +#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ + ? (TRIE_LIST_CUR( idx ) - 1) \ + : 0 ) #ifdef DEBUGGING @@ -1135,10 +1523,12 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, for( state = 1 ; state < trie->statecount ; state++ ) { const U32 base = trie->states[ state ].trans.base; - PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); + PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", + (int)depth * 2 + 2,"", (UV)state); if ( trie->states[ state ].wordnum ) { - PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum ); + PerlIO_printf( Perl_debug_log, " W%4X", + trie->states[ state ].wordnum ); } else { PerlIO_printf( Perl_debug_log, "%6s", "" ); } @@ -1150,19 +1540,23 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, while( ( base + ofs < trie->uniquecharcount ) || ( base + ofs - trie->uniquecharcount < trie->lasttrans - && trie->trans[ base + ofs - trie->uniquecharcount ].check != state)) + && trie->trans[ base + ofs - trie->uniquecharcount ].check + != state)) ofs++; PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { - if ( ( base + ofs >= trie->uniquecharcount ) && - ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && - trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + if ( ( base + ofs >= trie->uniquecharcount ) + && ( base + ofs - trie->uniquecharcount + < trie->lasttrans ) + && trie->trans[ base + ofs + - trie->uniquecharcount ].check == state ) { PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, - (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next ); + (UV)trie->trans[ base + ofs + - trie->uniquecharcount ].next ); } else { PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); } @@ -1173,7 +1567,8 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, } PerlIO_printf( Perl_debug_log, "\n" ); } - PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, ""); + PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", + (int)depth*2, ""); for (word=1; word <= trie->wordcount; word++) { PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", (int)word, (int)(trie->wordinfo[word].prev), @@ -1217,14 +1612,16 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, ); } for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { - SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); + SV ** const tmp = av_fetch( revcharmap, + TRIE_LIST_ITEM(state,charid).forid, 0); if ( tmp ) { PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), + colwidth, PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + | PERL_PV_ESCAPE_FIRSTCHAR ) , TRIE_LIST_ITEM(state,charid).forid, (UV)TRIE_LIST_ITEM(state,charid).newstate @@ -1300,9 +1697,11 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); } if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { - PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); + PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", + (UV)trie->trans[ state ].check ); } else { - PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, + PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", + (UV)trie->trans[ state ].check, trie->states[ TRIE_NODENUM( state ) ].wordnum ); } } @@ -1421,7 +1820,7 @@ and would end up looking like: 8: EXACT (10) 10: END(0) - d = uvuni_to_utf8_flags(d, uv, 0); + d = uvchr_to_utf8_flags(d, uv, 0); is the recommended Unicode-aware way of saying @@ -1433,7 +1832,7 @@ is the recommended Unicode-aware way of saying if (UTF) { \ SV *zlopp = newSV(7); /* XXX: optimize me */ \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \ + unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ SvPOK_on(zlopp); \ SvUTF8_on(zlopp); \ @@ -1444,31 +1843,28 @@ is the recommended Unicode-aware way of saying } \ } STMT_END -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - /* if it is UTF then it is either already folded, or does not need folding */ \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \ - } \ - else if (folder == PL_fold_latin1) { \ - /* if we use this folder we have to obey unicode rules on latin-1 data */ \ - if ( foldlen > 0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - scan += len; \ - len = 0; \ - } else { \ - len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ - skiplen = UNISKIP(uvc); \ - foldlen -= skiplen; \ - scan = foldbuf + skiplen; \ - } \ - } else { \ - /* raw data, will be folded later if needed */ \ - uvc = (U32)*uc; \ - len = 1; \ - } \ +/* This gets the next character from the input, folding it if not already + * folded. */ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need \ + * folding */ \ + uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* This folder implies Unicode rules, which in the range expressible \ + * by not UTF is the lower case, with the two exceptions, one of \ + * which should have been taken care of before calling this */ \ + assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ + uvc = toLOWER_L1(*uc); \ + if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ + len = 1; \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ } STMT_END @@ -1511,7 +1907,8 @@ is the recommended Unicode-aware way of saying \ if ( noper_next < tail ) { \ if (!trie->jump) \ - trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ trie->jump[curword] = (U16)(noper_next - convert); \ if (!jumper) \ jumper = noper_next; \ @@ -1546,7 +1943,9 @@ is the recommended Unicode-aware way of saying #define MADE_EXACT_TRIE 4 STATIC I32 -S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) +S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + regnode *first, regnode *last, regnode *tail, + U32 word_count, U32 flags, U32 depth) { dVAR; /* first pass, loop through and scan words */ @@ -1554,7 +1953,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs HV *widecharmap = NULL; AV *revcharmap = newAV(); regnode *cur; - const U32 uniflags = UTF8_ALLOW_DEFAULT; STRLEN len = 0; UV uvc = 0; U16 curword = 0; @@ -1567,13 +1965,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 * folder = NULL; #ifdef DEBUGGING - const U32 data_slot = add_data( pRExC_state, 4, "tuuu" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu")); AV *trie_words = NULL; /* along with revcharmap, this only used during construction but both are * useful during debugging so we store them in the struct when debugging. */ #else - const U32 data_slot = add_data( pRExC_state, 2, "tu" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); STRLEN trie_charcount=0; #endif SV *re_trie_maxbuff; @@ -1588,10 +1986,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs case EXACT: break; case EXACTFA: case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; - case EXACTFL: folder = PL_fold_locale; break; default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); } @@ -1615,12 +2011,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); } DEBUG_TRIE_COMPILE_r({ - PerlIO_printf( Perl_debug_log, - "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", - (int)depth * 2 + 2, "", - REG_NODE_NUM(startbranch),REG_NODE_NUM(first), - REG_NODE_NUM(last), REG_NODE_NUM(tail), - (int)depth); + PerlIO_printf( Perl_debug_log, + "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + (int)depth * 2 + 2, "", + REG_NODE_NUM(startbranch),REG_NODE_NUM(first), + REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); }); /* Find the node we are going to overwrite */ @@ -1640,9 +2035,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs have unique chars. We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store Unicode characters. We use the - native representation of the character value as the key and IV's for the - coded index. + (trie->charmap) and we use a an HV* to store Unicode characters. We use + the native representation of the character value as the key and IV's for + the coded index. *TODO* If we keep track of how many times each character is used we can remap the columns so that the table compression later on is more @@ -1659,13 +2054,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regnode *noper = NEXTOPER( cur ); const U8 *uc = (U8*)STRING( noper ); const U8 *e = uc + STR_LEN( noper ); - STRLEN foldlen = 0; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - STRLEN skiplen = 0; - const U8 *scan = (U8*)NULL; + int foldlen = 0; U32 wordlen = 0; /* required init */ - STRLEN chars = 0; - bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ + STRLEN minchars = 0; + STRLEN maxchars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the + bitmap?*/ if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -1685,13 +2079,77 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regardless of encoding */ if (OP( noper ) == EXACTFU_SS) { /* false positives are ok, so just set this */ - TRIE_BITMAP_SET(trie,0xDF); + TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); } } - for ( ; uc < e ; uc += len ) { + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; - chars++; + + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; + } + else { + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because for + * non-UTF, the latin1_safe macro is smart enough to account + * for all the unfolded characters, and because for UTF, the + * string will already have been folded earlier in the + * compilation process */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); + } + } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } + } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ if ( uvc < 256 ) { if ( folder ) { U8 folded= folder[ (U8) uvc ]; @@ -1715,13 +2173,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !UTF ) { /* store first byte of utf8 representation of variant codepoints */ - if (! UNI_IS_INVARIANT(uvc)) { + if (! UVCHR_IS_INVARIANT(uvc)) { TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); } } set_bit = 0; /* We've done our bit :-) */ } } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + SV** svpp; if ( !widecharmap ) widecharmap = newHV(); @@ -1736,30 +2202,22 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs TRIE_STORE_REVCHAR(uvc); } } - } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ if( cur == first ) { - trie->minlen = chars; - trie->maxlen = chars; - } else if (chars < trie->minlen) { - trie->minlen = chars; - } else if (chars > trie->maxlen) { - trie->maxlen = chars; - } - if (OP( noper ) == EXACTFU_SS) { - /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/ - if (trie->minlen > 1) - trie->minlen= 1; + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; } - if (OP( noper ) == EXACTFU_TRICKYFOLD) { - /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" - * - We assume that any such sequence might match a 2 byte string */ - if (trie->minlen > 2 ) - trie->minlen= 2; - } - } /* end first pass */ DEBUG_TRIE_COMPILE_r( - PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + PerlIO_printf( Perl_debug_log, + "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", (int)depth * 2 + 2,"", ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, @@ -1791,7 +2249,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); prev_states[1] = 0; - if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) { + if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) + > SvIV(re_trie_maxbuff) ) + { /* Second Pass -- Array Of Lists Representation @@ -1823,11 +2283,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ U16 charid = 0; /* sanity init */ - U8 *scan = (U8*)NULL; /* sanity init */ - STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - STRLEN skiplen = 0; if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -1846,14 +2302,18 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( uvc < 256 ) { charid = trie->charmap[ uvc ]; } else { - SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + SV** const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); if ( !svpp ) { charid = 0; } else { charid=(U16)SvIV( *svpp ); } } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ if ( charid ) { U16 check; @@ -1863,8 +2323,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !trie->states[ state ].trans.list ) { TRIE_LIST_NEW( state ); } - for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { - if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { + for ( check = 1; + check <= TRIE_LIST_USED( state ); + check++ ) + { + if ( TRIE_LIST_ITEM( state, check ).forid + == charid ) + { newstate = TRIE_LIST_ITEM( state, check ).newstate; break; } @@ -1934,7 +2399,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->trans, transcount * sizeof(reg_trie_trans) ); - Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans ); + Zero( trie->trans + (transcount / 2), + transcount / 2, + reg_trie_trans ); } base = trie->uniquecharcount + tp - minid; if ( maxid == minid ) { @@ -1942,22 +2409,27 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs for ( ; zp < tp ; zp++ ) { if ( ! trie->trans[ zp ].next ) { base = trie->uniquecharcount + zp - minid; - trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ zp ].next = TRIE_LIST_ITEM( state, + 1).newstate; trie->trans[ zp ].check = state; set = 1; break; } } if ( !set ) { - trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ tp ].next = TRIE_LIST_ITEM( state, + 1).newstate; trie->trans[ tp ].check = state; tp++; zp = tp; } } else { for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { - const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid; - trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate; + const U32 tid = base + - trie->uniquecharcount + + TRIE_LIST_ITEM( state, idx ).forid; + trie->trans[ tid ].next = TRIE_LIST_ITEM( state, + idx ).newstate; trie->trans[ tid ].check = state; } tp += ( maxid - minid + 1 ); @@ -1977,26 +2449,26 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* Second Pass -- Flat Table Representation. - we dont use the 0 slot of either trans[] or states[] so we add 1 to each. - We know that we will need Charcount+1 trans at most to store the data - (one row per char at worst case) So we preallocate both structures - assuming worst case. + we dont use the 0 slot of either trans[] or states[] so we add 1 to + each. We know that we will need Charcount+1 trans at most to store + the data (one row per char at worst case) So we preallocate both + structures assuming worst case. We then construct the trie using only the .next slots of the entry structs. - We use the .check field of the first entry of the node temporarily to - make compression both faster and easier by keeping track of how many non - zero fields are in the node. + We use the .check field of the first entry of the node temporarily + to make compression both faster and easier by keeping track of how + many non zero fields are in the node. Since trans are numbered from 1 any 0 pointer in the table is a FAIL transition. - There are two terms at use here: state as a TRIE_NODEIDX() which is a - number representing the first entry of the node, and state as a - TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and - TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there - are 2 entrys per node. eg: + There are two terms at use here: state as a TRIE_NODEIDX() which is + a number representing the first entry of the node, and state as a + TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) + and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) + if there are 2 entrys per node. eg: A B A B 1. 2 4 1. 3 7 @@ -2004,9 +2476,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs 3. 0 0 5. 0 0 4. 0 0 7. 0 0 - The table is internally in the right hand, idx form. However as we also - have to deal with the states array which is indexed by nodenum we have to - use TRIE_NODENUM() to convert. + The table is internally in the right hand, idx form. However as we + also have to deal with the states array which is indexed by nodenum + we have to use TRIE_NODENUM() to convert. */ DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, @@ -2033,12 +2505,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U16 charid = 0; /* sanity init */ U32 accept_state = 0; /* sanity init */ - U8 *scan = (U8*)NULL; /* sanity init */ - STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ - STRLEN skiplen = 0; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -2057,7 +2525,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( uvc < 256 ) { charid = trie->charmap[ uvc ]; } else { - SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + SV* const * const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); charid = svpp ? (U16)SvIV(*svpp) : 0; } if ( charid ) { @@ -2073,7 +2544,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } else { Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ } } accept_state = TRIE_NODENUM( state ); @@ -2160,7 +2632,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U32 used = trie->trans[ stateidx ].check; trie->trans[ stateidx ].check = 0; - for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) { + for ( charid = 0; + used && charid < trie->uniquecharcount; + charid++ ) + { if ( flag || trie->trans[ stateidx + charid ].next ) { if ( trie->trans[ stateidx + charid ].next ) { if (o_used == 1) { @@ -2169,8 +2644,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs break; } } - trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ; - trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->states[ state ].trans.base + = zp + + trie->uniquecharcount + - charid ; + trie->trans[ zp ].next + = SAFE_TRIE_NODENUM( trie->trans[ stateidx + + charid ].next ); trie->trans[ zp ].check = state; if ( ++zp > pos ) pos = zp; break; @@ -2179,9 +2659,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( !flag ) { flag = 1; - trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ; + trie->states[ state ].trans.base + = pos + trie->uniquecharcount - charid ; } - trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->trans[ pos ].next + = SAFE_TRIE_NODENUM( + trie->trans[ stateidx + charid ].next ); trie->trans[ pos ].check = state; pos++; } @@ -2192,19 +2675,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->states, laststate * sizeof(reg_trie_state) ); DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf( Perl_debug_log, - "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", - (int)depth * 2 + 2,"", - (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), - (IV)next_alloc, - (IV)pos, - ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); + PerlIO_printf( Perl_debug_log, + "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + (int)depth * 2 + 2,"", + (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + + 1 ), + (IV)next_alloc, + (IV)pos, + ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); ); } /* end table compress */ } DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", + PerlIO_printf(Perl_debug_log, + "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", (int)depth * 2 + 2, "", (UV)trie->statecount, (UV)trie->lasttrans) @@ -2255,7 +2740,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs }); } DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", + PerlIO_printf(Perl_debug_log, + "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", (int)depth * 2 + 2, "", (UV)mjd_offset, (UV)mjd_nodelen) ); @@ -2499,22 +2985,27 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STATIC void S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) { -/* The Trie is constructed and compressed now so we can build a fail array if it's needed +/* The Trie is constructed and compressed now so we can build a fail array if + * it's needed - This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the - "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and + 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, + Ullman 1985/88 ISBN 0-201-10088-6 - We find the fail state for each state in the trie, this state is the longest proper - suffix of the current state's 'word' that is also a proper prefix of another word in our - trie. State 1 represents the word '' and is thus the default fail state. This allows - the DFA not to have to restart after its tried and failed a word at a given point, it - simply continues as though it had been matching the other word in the first place. + We find the fail state for each state in the trie, this state is the longest + proper suffix of the current state's 'word' that is also a proper prefix of + another word in our trie. State 1 represents the word '' and is thus the + default fail state. This allows the DFA not to have to restart after its + tried and failed a word at a given point, it simply continues as though it + had been matching the other word in the first place. Consider 'abcdgu'=~/abcdefg|cdgu/ - When we get to 'd' we are still matching the first word, we would encounter 'g' which would - fail, which would bring us to the state representing 'd' in the second word where we would - try 'g' and succeed, proceeding to match 'cdgu'. + When we get to 'd' we are still matching the first word, we would encounter + 'g' which would fail, which would bring us to the state representing 'd' in + the second word where we would try 'g' and succeed, proceeding to match + 'cdgu'. */ /* add a fail transition */ const U32 trie_offset = ARG(source); @@ -2529,7 +3020,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode U32 base = trie->states[ 1 ].trans.base; U32 *fail; reg_ac_data *aho; - const U32 data_slot = add_data( pRExC_state, 1, "T" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; @@ -2602,26 +3093,15 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode PerlIO_printf(Perl_debug_log, "\n"); }); Safefree(q); - /*RExC_seen |= REG_SEEN_TRIEDFA;*/ + /*RExC_seen |= REG_TRIEDFA_SEEN;*/ } -/* - * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2. - * These need to be revisited when a newer toolchain becomes available. - */ -#if defined(__sparc64__) && defined(__GNUC__) -# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) -# undef SPARC64_GCC_WORKAROUND -# define SPARC64_GCC_WORKAROUND 1 -# endif -#endif - #define DEBUG_PEEP(str,scan,depth) \ DEBUG_OPTIMISE_r({if (scan){ \ SV * const mysv=sv_newmortal(); \ regnode *Next = regnext(scan); \ - regprop(RExC_rx, mysv, scan); \ + regprop(RExC_rx, mysv, scan, NULL); \ PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \ (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ Next ? (REG_NODE_NUM(Next)) : 0 ); \ @@ -2640,49 +3120,58 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * * If a node is to match under /i (folded), the number of characters it matches * can be different than its character length if it contains a multi-character - * fold. *min_subtract is set to the total delta of the input nodes. + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. * - * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF - * and contains LATIN SMALL LETTER SHARP S + * And *unfolded_multi_char is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when whether the fold is valid or + * not won't be known until runtime; namely for EXACTF nodes that contain LATIN + * SMALL LETTER SHARP S, as only if the target string being matched against + * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose + * folding rules depend on the locale in force at runtime. (Multi-char folds + * whose components are all above the Latin1 range are not run-time locale + * dependent, and have already been folded by the time this function is + * called.) * * This is as good a place as any to discuss the design of handling these * multi-character fold sequences. It's been wrong in Perl for a very long * time. There are three code points in Unicode whose multi-character folds * were long ago discovered to mess things up. The previous designs for * dealing with these involved assigning a special node for them. This - * approach doesn't work, as evidenced by this example: + * approach doesn't always work, as evidenced by this example: * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches - * Both these fold to "sss", but if the pattern is parsed to create a node that + * Both sides fold to "sss", but if the pattern is parsed to create a node that * would match just the \xDF, it won't be able to handle the case where a * successful match would have to cross the node's boundary. The new approach * that hopefully generally solves the problem generates an EXACTFU_SS node - * that is "sss". + * that is "sss" in this case. * * It turns out that there are problems with all multi-character folds, and not - * just these three. Now the code is general, for all such cases, but the - * three still have some special handling. The approach taken is: + * just these three. Now the code is general, for all such cases. The + * approach taken is: * 1) This routine examines each EXACTFish node that could contain multi- - * character fold sequences. It returns in *min_subtract how much to - * subtract from the the actual length of the string to get a real minimum - * match length; it is 0 if there are no multi-char folds. This delta is - * used by the caller to adjust the min length of the match, and the delta - * between min and max, so that the optimizer doesn't reject these - * possibilities based on size constraints. - * 2) Certain of these sequences require special handling by the trie code, - * so, if found, this code changes the joined node type to special ops: - * EXACTFU_TRICKYFOLD and EXACTFU_SS. - * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS + * character folded sequences. Since a single character can fold into + * such a sequence, the minimum match length for this node is less than + * the number of characters in the node. This routine returns in + * *min_subtract how many characters to subtract from the the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. + * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS * is used for an EXACTFU node that contains at least one "ss" sequence in * it. For non-UTF-8 patterns and strings, this is the only case where * there is a possible fold length change. That means that a regular * EXACTFU node without UTF-8 involvement doesn't have to concern itself * with length changes, and so can be processed faster. regexec.c takes * advantage of this. Generally, an EXACTFish node that is in UTF-8 is - * pre-folded by regcomp.c. This saves effort in regex matching. - * However, the pre-folding isn't done for non-UTF8 patterns because the - * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things - * down by forcing the pattern into UTF8 unless necessary. Also what - * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold + * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't + * known until runtime). This saves effort in regex matching. However, + * the pre-folding isn't done for non-UTF8 patterns because the fold of + * the MICRO SIGN requires UTF-8, and we don't want to slow things down by + * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, + * again, EXACTFL) nodes fold to isn't known until runtime. The fold * possibilities for the non-UTF8 patterns are quite simple, except for * the sharp s. All the ones that don't involve a UTF-8 target string are * members of a fold-pair, and arrays are set up for all of them so that @@ -2690,45 +3179,63 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * this file makes sure that in EXACTFU nodes, the sharp s gets folded to * 'ss', even if the pattern isn't UTF-8. This avoids the issues * described in the next item. - * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the - * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a - * UTF-8 pattern.) An assumption that the optimizer part of regexec.c - * (probably unwittingly, in Perl_regexec_flags()) makes is that a - * character in the pattern corresponds to at most a single character in - * the target string. (And I do mean character, and not byte here, unlike - * other parts of the documentation that have never been updated to - * account for multibyte Unicode.) sharp s in EXACTF nodes can match the - * two character string 'ss'; in EXACTFA nodes it can match - * "\x{17F}\x{17F}". These violate the assumption, and they are the only - * instances where it is violated. I'm reluctant to try to change the - * assumption, as the code involved is impenetrable to me (khw), so - * instead the code here punts. This routine examines (when the pattern - * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a - * boolean indicating whether or not the node contains a sharp s. When it - * is true, the caller sets a flag that later causes the optimizer in this - * file to not set values for the floating and fixed string lengths, and - * thus avoids the optimizer code in regexec.c that makes the invalid + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes + * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL + * nodes, violate the assumption, and they are the only instances where it + * is violated. I'm reluctant to try to change the assumption, as the + * code involved is impenetrable to me (khw), so instead the code here + * punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid * assumption. Thus, there is no optimization based on string lengths for - * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s. - * (The reason the assumption is wrong only in these two cases is that all - * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all - * other folds to their expanded versions. We can't prefold sharp s to - * 'ss' in EXACTF nodes because we don't know at compile time if it - * actually matches 'ss' or not. It will match iff the target string is - * in UTF-8, unlike the EXACTFU nodes, where it always matches; and - * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8 - * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem; - * but in a non-UTF8 pattern, folding it to that above-Latin1 string would - * require the pattern to be forced into UTF-8, the overhead of which we - * want to avoid.) - */ - -#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFA where it never does. In an EXACTFA node in + * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) + * + * Similarly, the code that generates tries doesn't currently handle + * not-already-folded multi-char folds, and it looks like a pain to change + * that. Therefore, trie generation of EXACTFA nodes with the sharp s + * doesn't work. Instead, such an EXACTFA is turned into a new regnode, + * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people + * using /iaa matching will be doing so almost entirely with ASCII + * strings, so this should rarely be encountered in practice */ + +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) { +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, + UV *min_subtract, bool *unfolded_multi_char, + U32 flags,regnode *val, U32 depth) +{ /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; @@ -2774,8 +3281,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b const unsigned int oldl = STR_LEN(scan); regnode * const nnext = regnext(n); - /* XXX I (khw) kind of doubt that this works on platforms where - * U8_MAX is above 255 because of lots of other assumptions */ + /* XXX I (khw) kind of doubt that this works on platforms (should + * Perl ever run on one) where U8_MAX is above 255 because of lots + * of other assumptions */ /* Don't join if the sum can't fit into a single node */ if (oldl + STR_LEN(n) > U8_MAX) break; @@ -2810,7 +3318,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b } *min_subtract = 0; - *has_exactf_sharp_s = FALSE; + *unfolded_multi_char = FALSE; /* Here, all the adjacent mergeable EXACTish nodes have been merged. We * can now analyze for sequences of problematic code points. (Prior to @@ -2818,15 +3326,68 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ if (OP(scan) != EXACT) { - const U8 * const s0 = (U8*) STRING(scan); - const U8 * s = s0; - const U8 * const s_end = s0 + STR_LEN(scan); + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ /* One pass is made over the node's string looking for all the - * possibilities. to avoid some tests in the loop, there are two main + * possibilities. To avoid some tests in the loop, there are two main * cases, for UTF-8 patterns (which can't have EXACTF nodes) and * non-UTF-8 */ if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *unfolded_multi_char = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ /* Examine the string for a multi-character fold sequence. UTF-8 * patterns have all characters pre-folded by the time this code is @@ -2834,60 +3395,32 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b while (s < s_end - 1) /* Can stop 1 before the end, as minimum length sequence we are looking for is 2 */ { - int count = 0; + int count = 0; /* How many characters in a multi-char fold */ int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); if (! len) { /* Not a multi-char fold: get next char */ s += UTF8SKIP(s); continue; } - /* Nodes with 'ss' require special handling, except for EXACTFL - * and EXACTFA for which there is no multi-char fold to this */ + /* Nodes with 'ss' require special handling, except for + * EXACTFA-ish for which there is no multi-char fold to this */ if (len == 2 && *s == 's' && *(s+1) == 's' - && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + && OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE) { count = 2; - OP(scan) = EXACTFU_SS; - s += 2; - } - else if (len == 6 /* len is the same in both ASCII and EBCDIC - for these */ - && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8 - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8, - 6) - || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8 - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8, - 6))) - { - count = 3; - - /* These two folds require special handling by trie's, so - * change the node type to indicate this. If EXACTFA and - * EXACTFL were ever to be handled by trie's, this would - * have to be changed. If this node has already been - * changed to EXACTFU_SS in this loop, leave it as is. (I - * (khw) think it doesn't matter in regexec.c for UTF - * patterns, but no need to change it */ - if (OP(scan) == EXACTFU) { - OP(scan) = EXACTFU_TRICKYFOLD; + if (OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; } - s += 6; + s += 2; } else { /* Here is a generic multi-char fold. */ - const U8* multi_end = s + len; - - /* Count how many characters in it. In the case of /l and - * /aa, no folds which contain ASCII code points are - * allowed, so check for those, and skip if found. (In - * EXACTFL, no folds are allowed to any Latin1 code point, - * not just ASCII. But there aren't any of these - * currently, nor ever likely, so don't take the time to - * test for them. The code that generates the - * is_MULTI_foo() macros croaks should one actually get put - * into Unicode .) */ - if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + U8* multi_end = s + len; + + /* Count how many characters in it. In the case of /aa, no + * folds which contain ASCII code points are allowed, so + * check for those, and skip if found. */ + if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { count = utf8_length(s, multi_end); s = multi_end; } @@ -2907,70 +3440,78 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b /* The delta is how long the sequence is minus 1 (1 is how long * the character that folds to the sequence is) */ - *min_subtract += count - 1; + total_count_delta += count - 1; next_iteration: ; } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); } else if (OP(scan) == EXACTFA) { /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char * fold to the ASCII range (and there are no existing ones in the * upper latin1 range). But, as outlined in the comments preceding - * this function, we need to flag any occurrences of the sharp s */ + * this function, we need to flag any occurrences of the sharp s. + * This character forbids trie formation (because of added + * complexity) */ while (s < s_end) { if (*s == LATIN_SMALL_LETTER_SHARP_S) { - *has_exactf_sharp_s = TRUE; + OP(scan) = EXACTFA_NO_TRIE; + *unfolded_multi_char = TRUE; break; } s++; continue; } } - else if (OP(scan) != EXACTFL) { - - /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the - * multi-char folds that are all Latin1. (This code knows that - * there are no current multi-char folds possible with EXACTFL, - * relying on fold_grind.t to catch any errors if the very unlikely - * event happens that some get added in future Unicode versions.) - * As explained in the comments preceding this function, we look - * also for the sharp s in EXACTF nodes; it can be in the final - * position. Otherwise we can stop looking 1 byte earlier because - * have to find at least two characters for a multi-fold */ - const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; - - /* The below is perhaps overboard, but this allows us to save a - * test each time through the loop at the expense of a mask. This - * is because on both EBCDIC and ASCII machines, 'S' and 's' differ - * by a single bit. On ASCII they are 32 apart; on EBCDIC, they - * are 64. This uses an exclusive 'or' to find that bit and then - * inverts it to form a mask, with just a single 0, in the bit - * position where 'S' and 's' differ. */ - const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); - const U8 s_masked = 's' & S_or_s_mask; + else { + + /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; while (s < upper) { int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); if (! len) { /* Not a multi-char fold. */ - if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) { - *has_exactf_sharp_s = TRUE; + *unfolded_multi_char = TRUE; } s++; continue; } if (len == 2 - && ((*s & S_or_s_mask) == s_masked) - && ((*(s+1) & S_or_s_mask) == s_masked)) + && isARG2_lower_or_UPPER_ARG1('s', *s) + && isARG2_lower_or_UPPER_ARG1('s', *(s+1))) { /* EXACTF nodes need to know that the minimum length * changed so that a sharp s in the string can match this * ss in the pattern, but they remain EXACTF nodes, as they * won't match this unless the target string is is UTF-8, - * which we don't know until runtime */ - if (OP(scan) != EXACTF) { + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { OP(scan) = EXACTFU_SS; } } @@ -3004,7 +3545,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b #define INIT_AND_WITHP \ assert(!and_withp); \ - Newx(and_withp,1,struct regnode_charclass_class); \ + Newx(and_withp,1, regnode_ssc); \ SAVEFREEPV(and_withp) /* this is a chain of data about sub patterns we are processing that @@ -3015,20 +3556,19 @@ typedef struct scan_frame { regnode *last; /* last node to process in this frame */ regnode *next; /* next node to process when last is reached */ struct scan_frame *prev; /*previous frame*/ + U32 prev_recursed_depth; I32 stop; /* what stopparen do we use */ } scan_frame; -#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) - -STATIC I32 +STATIC SSize_t S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, - I32 *minlenp, I32 *deltap, + SSize_t *minlenp, SSize_t *deltap, regnode *last, scan_data_t *data, I32 stopparen, - U8* recursed, - struct regnode_charclass_class *and_withp, + U32 recursed_depth, + regnode_ssc *and_withp, U32 flags, U32 depth) /* scanp: Start here (read-write). */ /* deltap: Write maxlen-minlen here. */ @@ -3039,17 +3579,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { dVAR; - I32 min = 0; /* There must be at least this number of characters to match */ + /* There must be at least this number of characters to match */ + SSize_t min = 0; I32 pars = 0, code; regnode *scan = *scanp, *next; - I32 delta = 0; + SSize_t delta = 0; int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); int is_inf_internal = 0; /* The studied chunk is infinite */ I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; scan_data_t data_fake; SV *re_trie_maxbuff = NULL; regnode *first_non_open = scan; - I32 stopmin = I32_MAX; + SSize_t stopmin = SSize_t_MAX; scan_frame *frame = NULL; GET_RE_DEBUG_FLAGS_DECL; @@ -3058,7 +3599,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #ifdef DEBUGGING StructCopy(&zero_scan_data, &data_fake, scan_data_t); #endif - if ( depth == 0 ) { while (first_non_open && OP(first_non_open) == OPEN) first_non_open=regnext(first_non_open); @@ -3070,17 +3610,42 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, UV min_subtract = 0; /* How mmany chars to subtract from the minimum node length to get a real minimum (because the folded version may be shorter) */ - bool has_exactf_sharp_s = FALSE; + bool unfolded_multi_char = FALSE; /* Peephole optimizer: */ - DEBUG_STUDYDATA("Peep:", data,depth); - DEBUG_PEEP("Peep",scan,depth); + DEBUG_OPTIMISE_MORE_r( + { + PerlIO_printf(Perl_debug_log, + "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ", + ((int) depth*2), "", (long)stopparen, + (unsigned long)depth, (unsigned long)recursed_depth); + if (recursed_depth) { + U32 i; + U32 j; + for ( j = 0 ; j < recursed_depth ; j++ ) { + PerlIO_printf(Perl_debug_log,"["); + for ( i = 0 ; i < (U32)RExC_npar ; i++ ) + PerlIO_printf(Perl_debug_log,"%d", + PAREN_TEST(RExC_study_chunk_recursed + + (j * RExC_study_chunk_recursed_bytes), i) + ? 1 : 0 + ); + PerlIO_printf(Perl_debug_log,"]"); + } + } + PerlIO_printf(Perl_debug_log,"\n"); + } + ); + DEBUG_STUDYDATA("Peep:", data, depth); + DEBUG_PEEP("Peep", scan, depth); - /* Its not clear to khw or hv why this is done here, and not in the - * clauses that deal with EXACT nodes. khw's guess is that it's - * because of a previous design */ - JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); - /* Follow the next-chain of the current node and optimize + /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ + * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled + * by a different invocation of reg() -- Yves + */ + JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); + + /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ if (OP(scan) != CURLYX) { const int max = (reg_off_by_arg[OP(scan)] @@ -3111,24 +3676,29 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, || OP(scan) == IFTHEN) { next = regnext(scan); code = OP(scan); - /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ + /* demq: the op(next)==code check is to see if we have + * "branch-branch" AFAICT */ if (OP(next) == code || code == IFTHEN) { - /* NOTE - There is similar code to this block below for handling - TRIE nodes on a re-study. If you change stuff here check there - too. */ - I32 max1 = 0, min1 = I32_MAX, num = 0; - struct regnode_charclass_class accum; + /* NOTE - There is similar code to this block below for + * handling TRIE nodes on a re-study. If you change stuff here + * check there too. */ + SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; + regnode_ssc accum; regnode * const startbranch=scan; - if (flags & SCF_DO_SUBSTR) - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); + ssc_init_zero(pRExC_state, &accum); while (OP(scan) == code) { - I32 deltanext, minnext, f = 0, fake; - struct regnode_charclass_class this_class; + SSize_t deltanext, minnext, fake; + I32 f = 0; + regnode_ssc this_class; num++; data_fake.flags = 0; @@ -3145,7 +3715,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (code != BRANCH) scan = NEXTOPER(scan); if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } @@ -3153,14 +3723,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, f |= SCF_WHILEM_VISITED_POS; /* we suppose the run is continuous, last=next...*/ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - next, &data_fake, - stopparen, recursed, NULL, f,depth+1); + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f,depth+1); if (min1 > minnext) min1 = minnext; - if (deltanext == I32_MAX) { + if (deltanext == SSize_t_MAX) { is_inf = is_inf_internal = 1; - max1 = I32_MAX; + max1 = SSize_t_MAX; } else if (max1 < minnext + deltanext) max1 = minnext + deltanext; scan = next; @@ -3179,63 +3749,64 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); } if (code == IFTHEN && num < 2) /* Empty ELSE branch */ min1 = 0; if (flags & SCF_DO_SUBSTR) { data->pos_min += min1; - if (data->pos_delta >= I32_MAX - (max1 - min1)) - data->pos_delta = I32_MAX; + if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) + data->pos_delta = SSize_t_MAX; else data->pos_delta += max1 - min1; if (max1 != min1 || is_inf) data->longest = &(data->longest_float); } min += min1; - if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0) - delta = I32_MAX; + if (delta == SSize_t_MAX + || SSize_t_MAX - delta - (max1 - min1) < 0) + delta = SSize_t_MAX; else delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); if (min1) { - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - cl_and(data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, - struct regnode_charclass_class); + StructCopy(&accum, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); } } - if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) { + if (PERL_ENABLE_TRIE_OPTIMISATION && + OP( startbranch ) == BRANCH ) + { /* demq. - Assuming this was/is a branch we are dealing with: 'scan' now - points at the item that follows the branch sequence, whatever - it is. We now start at the beginning of the sequence and look - for subsequences of + Assuming this was/is a branch we are dealing with: 'scan' + now points at the item that follows the branch sequence, + whatever it is. We now start at the beginning of the + sequence and look for subsequences of BRANCH->EXACT=>x1 BRANCH->EXACT=>x2 tail - which would be constructed from a pattern like /A|LIST|OF|WORDS/ + which would be constructed from a pattern like + /A|LIST|OF|WORDS/ If we can find such a subsequence we need to turn the first element into a trie and then add the subsequent branch exact @@ -3243,7 +3814,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, We have two cases - 1. patterns where the whole set of branches can be converted. + 1. patterns where the whole set of branches can be + converted. 2. patterns where only a subset can be converted. @@ -3280,7 +3852,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, U32 count=0; #ifdef DEBUGGING - SV * const mysv = sv_newmortal(); /* for dumping */ + SV * const mysv = sv_newmortal(); /* for dumping */ #endif /* var tail is used because there may be a TAIL regop in the way. Ie, the exacts will point to the @@ -3297,11 +3869,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, tail ); + regprop(RExC_rx, mysv, tail, NULL); PerlIO_printf( Perl_debug_log, "%*s%s%s\n", - (int)depth * 2 + 2, "", - "Looking for TRIE'able sequences. Tail node is: ", - SvPV_nolen_const( mysv ) + (int)depth * 2 + 2, "", + "Looking for TRIE'able sequences. Tail node is: ", + SvPV_nolen_const( mysv ) ); }); @@ -3309,35 +3881,46 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, Step through the branches cur represents each branch, - noper is the first thing to be matched as part of that branch + noper is the first thing to be matched as part + of that branch noper_next is the regnext() of that node. - We normally handle a case like this /FOO[xyz]|BAR[pqr]/ - via a "jump trie" but we also support building with NOJUMPTRIE, - which restricts the trie logic to structures like /FOO|BAR/. - - If noper is a trieable nodetype then the branch is a possible optimization - target. If we are building under NOJUMPTRIE then we require that noper_next - is the same as scan (our current position in the regex program). - - Once we have two or more consecutive such branches we can create a - trie of the EXACT's contents and stitch it in place into the program. - - If the sequence represents all of the branches in the alternation we - replace the entire thing with a single TRIE node. - - Otherwise when it is a subsequence we need to stitch it in place and - replace only the relevant branches. This means the first branch has - to remain as it is used by the alternation logic, and its next pointer, - and needs to be repointed at the item on the branch chain following - the last branch we have optimized away. - - This could be either a BRANCH, in which case the subsequence is internal, - or it could be the item following the branch sequence in which case the - subsequence is at the end (which does not necessarily mean the first node - is the start of the alternation). - - TRIE_TYPE(X) is a define which maps the optype to a trietype. + We normally handle a case like this + /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also + support building with NOJUMPTRIE, which restricts + the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is + a possible optimization target. If we are building + under NOJUMPTRIE then we require that noper_next is + the same as scan (our current position in the regex + program). + + Once we have two or more consecutive such branches + we can create a trie of the EXACT's contents and + stitch it in place into the program. + + If the sequence represents all of the branches in + the alternation we replace the entire thing with a + single TRIE node. + + Otherwise when it is a subsequence we need to + stitch it in place and replace only the relevant + branches. This means the first branch has to remain + as it is used by the alternation logic, and its + next pointer, and needs to be repointed at the item + on the branch chain following the last branch we + have optimized away. + + This could be either a BRANCH, in which case the + subsequence is internal, or it could be the item + following the branch sequence in which case the + subsequence is at the end (which does not + necessarily mean the first node is the start of the + alternation). + + TRIE_TYPE(X) is a define which maps the optype to a + trietype. optype | trietype ----------------+----------- @@ -3345,14 +3928,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, EXACT | EXACT EXACTFU | EXACTFU EXACTFU_SS | EXACTFU - EXACTFU_TRICKYFOLD | EXACTFU - EXACTFA | 0 + EXACTFA | EXACTFA */ #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ ( EXACT == (X) ) ? EXACT : \ - ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ + ( EXACTFA == (X) ) ? EXACTFA : \ 0 ) /* dont use tail as the end marker for this traverse */ @@ -3367,16 +3950,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); - regprop(RExC_rx, mysv, noper); + regprop(RExC_rx, mysv, noper, NULL); PerlIO_printf( Perl_debug_log, " -> %s", SvPV_nolen_const(mysv)); if ( noper_next ) { - regprop(RExC_rx, mysv, noper_next ); + regprop(RExC_rx, mysv, noper_next, NULL); PerlIO_printf( Perl_debug_log,"\t=> %s\t", SvPV_nolen_const(mysv)); } @@ -3386,8 +3969,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ); }); - /* Is noper a trieable nodetype that can be merged with the - * current trie (if there is one)? */ + /* Is noper a trieable nodetype that can be merged + * with the current trie (if there is one)? */ if ( noper_trietype && ( @@ -3400,10 +3983,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif && count < U16_MAX) { - /* Handle mergable triable node - * Either we are the first node in a new trieable sequence, - * in which case we do some bookkeeping, otherwise we update - * the end pointer. */ + /* Handle mergable triable node Either we are + * the first node in a new trieable sequence, + * in which case we do some bookkeeping, + * otherwise we update the end pointer. */ if ( !first ) { first = cur; if ( noper_trietype == NOTHING ) { @@ -3416,8 +3999,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( noper_next_trietype ) { trietype = noper_next_trietype; } else if (noper_next_type) { - /* a NOTHING regop is 1 regop wide. We need at least two - * for a trie so we can't merge this in */ + /* a NOTHING regop is 1 regop wide. + * We need at least two for a trie + * so we can't merge this in */ first = NULL; } } else { @@ -3433,31 +4017,39 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle mergable triable node */ else { /* handle unmergable node - - * noper may either be a triable node which can not be tried - * together with the current trie, or a non triable node */ + * noper may either be a triable node which can + * not be tried together with the current trie, + * or a non triable node */ if ( last ) { - /* If last is set and trietype is not NOTHING then we have found - * at least two triable branch sequences in a row of a similar - * trietype so we can turn them into a trie. If/when we - * allow NOTHING to start a trie sequence this condition will be - * required, and it isn't expensive so we leave it in for now. */ + /* If last is set and trietype is not + * NOTHING then we have found at least two + * triable branch sequences in a row of a + * similar trietype so we can turn them + * into a trie. If/when we allow NOTHING to + * start a trie sequence this condition + * will be required, and it isn't expensive + * so we leave it in for now. */ if ( trietype && trietype != NOTHING ) make_trie( pRExC_state, - startbranch, first, cur, tail, count, - trietype, depth+1 ); - last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ + startbranch, first, cur, tail, + count, trietype, depth+1 ); + last = NULL; /* note: we clear/update + first, trietype etc below, + so we dont do it here */ } if ( noper_trietype #ifdef NOJUMPTRIE && noper_next == tail #endif ){ - /* noper is triable, so we can start a new trie sequence */ + /* noper is triable, so we can start a new + * trie sequence */ count = 1; first = cur; trietype = noper_trietype; } else if (first) { - /* if we already saw a first but the current node is not triable then we have + /* if we already saw a first but the + * current node is not triable then we have * to reset the first information. */ count = 0; first = NULL; @@ -3466,18 +4058,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle unmergable node */ } /* loop over branches */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, - "%*s- %s (%d) \n", (int)depth * 2 + 2, + "%*s- %s (%d) \n", + (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); if ( last && trietype ) { if ( trietype != NOTHING ) { - /* the last branch of the sequence was part of a trie, - * so we have to construct it here outside of the loop - */ - made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); + /* the last branch of the sequence was part of + * a trie, so we have to construct it here + * outside of the loop */ + made= make_trie( pRExC_state, startbranch, + first, scan, tail, count, + trietype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && startbranch == first) @@ -3487,20 +4082,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( startbranch == first && scan == tail ) { - RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; } } #endif } else { - /* at this point we know whatever we have is a NOTHING sequence/branch - * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING + /* at this point we know whatever we have is a + * NOTHING sequence/branch AND if 'startbranch' + * is 'first' then we can turn the whole thing + * into a NOTHING */ if ( startbranch == first ) { regnode *opt; - /* the entire thing is a NOTHING sequence, something like this: - * (?:|) So we can turn it into a plain NOTHING op. */ + /* the entire thing is a NOTHING sequence, + * something like this: (?:|) So we can + * turn it into a plain NOTHING op. */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d) \n", (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); @@ -3528,9 +4126,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 paren; regnode *start; regnode *end; + U32 my_recursed_depth= recursed_depth; if (OP(scan) != SUSPEND) { - /* set the pointer */ + /* set the pointer */ if (OP(scan) == GOSUB) { paren = ARG(scan); RExC_recurse[ARG2L(scan)] = scan; @@ -3541,21 +4140,33 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, start = RExC_rxi->program + 1; end = RExC_opend; } - if (!recursed) { - Newxz(recursed, (((RExC_npar)>>3) +1), U8); - SAVEFREEPV(recursed); - } - if (!PAREN_TEST(recursed,paren+1)) { - PAREN_SET(recursed,paren+1); + if (!recursed_depth + || + !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) + ) { + if (!recursed_depth) { + Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); + } else { + Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed_bytes, U8); + } + /* we havent recursed into this paren yet, so recurse into it */ + DEBUG_STUDYDATA("set:", data,depth); + PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); + my_recursed_depth= recursed_depth + 1; Newx(newframe,1,scan_frame); } else { + DEBUG_STUDYDATA("inf:", data,depth); + /* some form of infinite recursion, assume infinite length + * */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } } else { @@ -3572,17 +4183,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, newframe->last = last; newframe->stop = stopparen; newframe->prev = frame; + newframe->prev_recursed_depth = recursed_depth; + + DEBUG_STUDYDATA("frame-new:",data,depth); + DEBUG_PEEP("fnew", scan, depth); frame = newframe; scan = start; stopparen = paren; last = end; + depth = depth + 1; + recursed_depth= my_recursed_depth; continue; } } else if (OP(scan) == EXACT) { - I32 l = STR_LEN(scan); + SSize_t l = STR_LEN(scan); UV uc; if (UTF) { const U8 * const s = (U8*)STRING(scan); @@ -3598,7 +4215,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data->last_end == -1) { /* Update the start info. */ data->last_start_min = data->pos_min; data->last_start_max = is_inf - ? I32_MAX : data->pos_min + data->pos_delta; + ? SSize_t_MAX : data->pos_min + data->pos_delta; } sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); if (UTF) @@ -3609,83 +4226,47 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) mg->mg_len += utf8_length((U8*)STRING(scan), - (U8*)STRING(scan)+STR_LEN(scan)); + (U8*)STRING(scan)+STR_LEN(scan)); } data->last_end = data->pos_min + l; data->pos_min += l; /* As in the first entry. */ data->flags &= ~SF_BEFORE_EOL; } + + /* ANDing the code point leaves at most it, and not in locale, and + * can't match null string */ if (flags & SCF_DO_STCLASS_AND) { - /* Check whether it is compatible with what we know already! */ - int compat = 1; - - - /* If compatible, we or it in below. It is compatible if is - * in the bitmp and either 1) its bit or its fold is set, or 2) - * it's for a locale. Even if there isn't unicode semantics - * here, at runtime there may be because of matching against a - * utf8 string, so accept a possible false positive for - * latin1-range folds */ - if (uc >= 0x100 || - (!(data->start_class->flags & ANYOF_LOCALE) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && (!(data->start_class->flags & ANYOF_LOC_FOLD) - || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) - ) - { - compat = 0; - } - ANYOF_CLASS_ZERO(data->start_class); - ANYOF_BITMAP_ZERO(data->start_class); - if (compat) - ANYOF_BITMAP_SET(data->start_class, uc); - else if (uc >= 0x100) { - int i; - - /* Some Unicode code points fold to the Latin1 range; as - * XXX temporary code, instead of figuring out if this is - * one, just assume it is and set all the start class bits - * that could be some such above 255 code point's fold - * which will generate fals positives. As the code - * elsewhere that does compute the fold settles down, it - * can be extracted out and re-used here */ - for (i = 0; i < 256; i++){ - if (HAS_NONLATIN1_FOLD_CLOSURE(i)) { - ANYOF_BITMAP_SET(data->start_class, i); - } - } - } - CLEAR_SSC_EOS(data->start_class); - if (uc < 0x100) - data->start_class->flags &= ~ANYOF_UNICODE_ALL; + ssc_cp_and(data->start_class, uc); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ssc_clear_locale(data->start_class); } else if (flags & SCF_DO_STCLASS_OR) { - /* false positive possible if the class is case-folded */ - if (uc < 0x100) - ANYOF_BITMAP_SET(data->start_class, uc); - else - data->start_class->flags |= ANYOF_UNICODE_ALL; - CLEAR_SSC_EOS(data->start_class); - cl_and(data->start_class, and_withp); + ssc_add_cp(data->start_class, uc); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; } else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ - I32 l = STR_LEN(scan); + SSize_t l = STR_LEN(scan); UV uc = *((U8*)STRING(scan)); + SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 + separate code points */ /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) { assert(data); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } if (UTF) { const U8 * const s = (U8 *)STRING(scan); uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } - if (has_exactf_sharp_s) { - RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + if (unfolded_multi_char) { + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; } min += l - min_subtract; assert (min >= 0); @@ -3700,99 +4281,95 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - if (flags & SCF_DO_STCLASS_AND) { - /* Check whether it is compatible with what we know already! */ - int compat = 1; - if (uc >= 0x100 || - (!(data->start_class->flags & ANYOF_LOCALE) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) - { - compat = 0; - } - ANYOF_CLASS_ZERO(data->start_class); - ANYOF_BITMAP_ZERO(data->start_class); - if (compat) { - ANYOF_BITMAP_SET(data->start_class, uc); - CLEAR_SSC_EOS(data->start_class); - if (OP(scan) == EXACTFL) { - /* XXX This set is probably no longer necessary, and - * probably wrong as LOCALE now is on in the initial - * state */ - data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD; - } - else { - - /* Also set the other member of the fold pair. In case - * that unicode semantics is called for at runtime, use - * the full latin1 fold. (Can't do this for locale, - * because not known until runtime) */ - ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); - - /* All other (EXACTFL handled above) folds except under - * /iaa that include s, S, and sharp_s also may include - * the others */ - if (OP(scan) != EXACTFA) { - if (uc == 's' || uc == 'S') { - ANYOF_BITMAP_SET(data->start_class, + if (OP(scan) == EXACTFL) { + + /* We don't know what the folds are; it could be anything. XXX + * Actually, we only support UTF-8 encoding for code points + * above Latin1, so we could know what those folds are. */ + EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, + 0, + UV_MAX); + } + else { /* Non-locale EXACTFish */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (flags & SCF_DO_STCLASS_AND) { + ssc_clear_locale(data->start_class); + } + if (uc < 256) { /* We know what the Latin1 folds are ... */ + if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we + know if anything folds + with this */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, + PL_fold_latin1[uc]); + if (OP(scan) != EXACTFA) { /* The folds below aren't + legal under /iaa */ + if (isARG2_lower_or_UPPER_ARG1('s', uc)) { + EXACTF_invlist + = add_cp_to_invlist(EXACTF_invlist, LATIN_SMALL_LETTER_SHARP_S); } else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - ANYOF_BITMAP_SET(data->start_class, 's'); - ANYOF_BITMAP_SET(data->start_class, 'S'); + EXACTF_invlist + = add_cp_to_invlist(EXACTF_invlist, 's'); + EXACTF_invlist + = add_cp_to_invlist(EXACTF_invlist, 'S'); } } + + /* We also know if there are above-Latin1 code points + * that fold to this (none legal for ASCII and /iaa) */ + if ((! isASCII(uc) || OP(scan) != EXACTFA) + && HAS_NONLATIN1_FOLD_CLOSURE(uc)) + { + /* XXX We could know exactly what does fold to this + * if the reverse folds are loaded, as currently in + * S_regclass() */ + _invlist_union(EXACTF_invlist, + PL_AboveLatin1, + &EXACTF_invlist); + } } } - else if (uc >= 0x100) { - int i; - for (i = 0; i < 256; i++){ - if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) { - ANYOF_BITMAP_SET(data->start_class, i); - } + else { /* Non-locale, above Latin1. XXX We don't currently + know what participates in folds with this, so have + to assume anything could */ + + /* XXX We could know exactly what does fold to this if the + * reverse folds are loaded, as currently in S_regclass(). + * But we do know that under /iaa nothing in the ASCII + * range can participate */ + if (OP(scan) == EXACTFA) { + _invlist_union_complement_2nd(EXACTF_invlist, + PL_XPosix_ptrs[_CC_ASCII], + &EXACTF_invlist); + } + else { + EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, + 0, UV_MAX); } } } + if (flags & SCF_DO_STCLASS_AND) { + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); + } else if (flags & SCF_DO_STCLASS_OR) { - if (data->start_class->flags & ANYOF_LOC_FOLD) { - /* false positive possible if the class is case-folded. - Assume that the locale settings are the same... */ - if (uc < 0x100) { - ANYOF_BITMAP_SET(data->start_class, uc); - if (OP(scan) != EXACTFL) { - - /* And set the other member of the fold pair, but - * can't do that in locale because not known until - * run-time */ - ANYOF_BITMAP_SET(data->start_class, - PL_fold_latin1[uc]); - - /* All folds except under /iaa that include s, S, - * and sharp_s also may include the others */ - if (OP(scan) != EXACTFA) { - if (uc == 's' || uc == 'S') { - ANYOF_BITMAP_SET(data->start_class, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - ANYOF_BITMAP_SET(data->start_class, 's'); - ANYOF_BITMAP_SET(data->start_class, 'S'); - } - } - } - } - CLEAR_SSC_EOS(data->start_class); - } - cl_and(data->start_class, and_withp); + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); } else if (REGNODE_VARIES(OP(scan))) { - I32 mincount, maxcount, minnext, deltanext, fl = 0; - I32 f = flags, pos_before = 0; + SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; + I32 fl = 0, f = flags; regnode * const oscan = scan; - struct regnode_charclass_class this_class; - struct regnode_charclass_class *oclass = NULL; + regnode_ssc this_class; + regnode_ssc *oclass = NULL; I32 next_is_eval = 0; switch (PL_regkind[OP(scan)]) { @@ -3822,12 +4399,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan = NEXTOPER(scan); goto do_curly; } - is_inf = is_inf_internal = 1; - scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */ + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ data->longest = &(data->longest_float); } + is_inf = is_inf_internal = 1; + scan = regnext(scan); goto optimize_curly_tail; case CURLY: if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) @@ -3848,7 +4426,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, next_is_eval = (OP(scan) == EVAL); do_curly: if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */ + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ pos_before = data->pos_min; } if (data) { @@ -3858,7 +4438,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->flags |= SF_IS_INF; } if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); oclass = data->start_class; data->start_class = &this_class; f |= SCF_DO_STCLASS_AND; @@ -3878,35 +4458,35 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* This will finish on WHILEM, setting scan, or on NULL: */ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - last, data, stopparen, recursed, NULL, - (mincount == 0 - ? (f & ~SCF_DO_SUBSTR) : f),depth+1); + last, data, stopparen, recursed_depth, NULL, + (mincount == 0 + ? (f & ~SCF_DO_SUBSTR) + : f) + ,depth+1); if (flags & SCF_DO_STCLASS) data->start_class = oclass; if (mincount == 0 || minnext == 0) { if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); } else if (flags & SCF_DO_STCLASS_AND) { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&this_class, data->start_class, - struct regnode_charclass_class); + StructCopy(&this_class, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); - cl_and(data->start_class, and_withp); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); } else if (flags & SCF_DO_STCLASS_AND) - cl_and(data->start_class, &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); flags &= ~SCF_DO_STCLASS; } if (!scan) /* It was not CURLYX, but CURLY. */ @@ -3916,7 +4496,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && (next_is_eval || !(mincount == 0 && maxcount == 1)) && (minnext == 0) && (deltanext == 0) && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) - && maxcount <= REG_INFTY/3) /* Complement check for big count */ + && maxcount <= REG_INFTY/3) /* Complement check for big + count */ { /* Fatal warnings may leak the regexp without this: */ SAVEFREESV(RExC_rx_sv); @@ -3926,14 +4507,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } min += minnext * mincount; - is_inf_internal |= deltanext == I32_MAX - || (maxcount == REG_INFTY && minnext + deltanext > 0); + is_inf_internal |= deltanext == SSize_t_MAX + || (maxcount == REG_INFTY && minnext + deltanext > 0); is_inf |= is_inf_internal; - if (is_inf) - delta = I32_MAX; - else - delta += (minnext + deltanext) * maxcount - minnext * mincount; - + if (is_inf) { + delta = SSize_t_MAX; + } else { + delta += (minnext + deltanext) * maxcount + - minnext * mincount; + } /* Try powerful optimization CURLYX => CURLYN. */ if ( OP(oscan) == CURLYX && data && data->flags & SF_IN_PAR @@ -3984,7 +4566,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && !(data->flags & SF_HAS_EVAL) && !deltanext /* atom is fixed width */ && minnext != 0 /* CURLYM can't handle zero width */ - && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */ + + /* Nor characters whose fold at run-time may be + * multi-character */ + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) ) { /* XXXX How to optimize if data == 0? */ /* Optimize to a simpler form. */ @@ -4031,7 +4616,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif /* Optimize again: */ study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, - NULL, stopparen, recursed, NULL, 0,depth+1); + NULL, stopparen, recursed_depth, NULL, 0,depth+1); } else oscan->flags = 0; @@ -4056,43 +4641,32 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, pars++; if (flags & SCF_DO_SUBSTR) { SV *last_str = NULL; + STRLEN last_chrs = 0; int counted = mincount != 0; - if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ -#if defined(SPARC64_GCC_WORKAROUND) - I32 b = 0; - STRLEN l = 0; - const char *s = NULL; - I32 old = 0; - - if (pos_before >= data->last_start_min) - b = pos_before; - else - b = data->last_start_min; - - l = 0; - s = SvPV_const(data->last_found, l); - old = b - data->last_start_min; - -#else - I32 b = pos_before >= data->last_start_min + if (data->last_end > 0 && mincount != 0) { /* Ends with a + string. */ + SSize_t b = pos_before >= data->last_start_min ? pos_before : data->last_start_min; STRLEN l; const char * const s = SvPV_const(data->last_found, l); - I32 old = b - data->last_start_min; -#endif + SSize_t old = b - data->last_start_min; if (UTF) old = utf8_hop((U8*)s, old) - (U8*)s; l -= old; /* Get the added string: */ last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { + SvGROW(last_str, (mincount * l) + 1); repeatcpy(SvPVX(last_str) + l, - SvPVX_const(last_str), l, mincount - 1); + SvPVX_const(last_str), l, + mincount - 1); SvCUR_set(last_str, SvCUR(last_str) * mincount); /* Add additional parts. */ SvCUR_set(data->last_found, @@ -4104,34 +4678,41 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str) - l; + mg->mg_len += last_chrs * (mincount-1); } + last_chrs *= mincount; data->last_end += l * (mincount - 1); } } else { /* start offset must point into the last copy */ data->last_start_min += minnext * (mincount - 1); - data->last_start_max += is_inf ? I32_MAX + data->last_start_max += is_inf ? SSize_t_MAX : (maxcount - 1) * (minnext + data->pos_delta); } } /* It is counted once already... */ data->pos_min += minnext * (mincount - counted); #if 0 -PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n", - counted, deltanext, I32_MAX, minnext, maxcount, mincount); -if (deltanext != I32_MAX) -PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta); +PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf + " SSize_t_MAX=%"UVdf" minnext=%"UVdf + " maxcount=%"UVdf" mincount=%"UVdf"\n", + (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, + (UV)mincount); +if (deltanext != SSize_t_MAX) +PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", + (UV)(-counted * deltanext + (minnext + deltanext) * maxcount + - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); #endif - if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta) - data->pos_delta = I32_MAX; + if (deltanext == SSize_t_MAX + || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) + data->pos_delta = SSize_t_MAX; else data->pos_delta += - counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount; if (mincount != maxcount) { /* Cannot extend fixed substrings found inside the group. */ - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); if (mincount && last_str) { SV * const sv = data->last_found; MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? @@ -4141,12 +4722,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext mg->mg_len = -1; sv_setsv(sv, last_str); data->last_end = data->pos_min; - data->last_start_min = - data->pos_min - CHR_SVLEN(last_str); + data->last_start_min = data->pos_min - last_chrs; data->last_start_max = is_inf - ? I32_MAX - : data->pos_min + data->pos_delta - - CHR_SVLEN(last_str); + ? SSize_t_MAX + : data->pos_min + data->pos_delta - last_chrs; } data->longest = &(data->longest_float); } @@ -4161,164 +4740,212 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext NEXT_OFF(oscan) += NEXT_OFF(next); } continue; - default: /* REF, and CLUMP only? */ + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", + OP(scan)); +#endif + case REF: + case CLUMP: if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; - if (flags & SCF_DO_STCLASS_OR) - cl_anything(pRExC_state, data->start_class); + if (flags & SCF_DO_STCLASS_OR) { + if (OP(scan) == CLUMP) { + /* Actually is any start char, but very few code points + * aren't start characters */ + ssc_match_all_cp(data->start_class); + } + else { + ssc_anything(data->start_class); + } + } flags &= ~SCF_DO_STCLASS; break; } } else if (OP(scan) == LNBREAK) { if (flags & SCF_DO_STCLASS) { - int value = 0; - CLEAR_SSC_EOS(data->start_class); /* No match on empty */ if (flags & SCF_DO_STCLASS_AND) { - for (value = 0; value < 256; value++) - if (!is_VERTWS_cp(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + ssc_intersection(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } - else { - for (value = 0; value < 256; value++) - if (is_VERTWS_cp(value)) - ANYOF_BITMAP_SET(data->start_class, value); + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], + FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg for + * 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } - if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; } min++; delta++; /* Because of the 2 char string cr-lf */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += 1; data->pos_delta += 1; data->longest = &(data->longest_float); } } else if (REGNODE_SIMPLE(OP(scan))) { - int value = 0; if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min++; } min++; if (flags & SCF_DO_STCLASS) { - int loop_max = 256; - CLEAR_SSC_EOS(data->start_class); /* No match on empty */ + bool invert = 0; + SV* my_invlist = sv_2mortal(_new_invlist(0)); + U8 namedclass; + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; /* Some of the logic below assumes that switching locale on will only add false positives. */ - switch (PL_regkind[OP(scan)]) { - U8 classnum; + switch (OP(scan)) { - case SANY: default: #ifdef DEBUGGING - Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); + Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", + OP(scan)); #endif - do_default: + case CANY: + case SANY: if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_match_all_cp(data->start_class); break; + case REG_ANY: - if (OP(scan) == SANY) - goto do_default; - if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ - value = (ANYOF_BITMAP_TEST(data->start_class,'\n') - || ANYOF_CLASS_TEST_ANY_SET(data->start_class)); - cl_anything(pRExC_state, data->start_class); - } - if (flags & SCF_DO_STCLASS_AND || !value) - ANYOF_BITMAP_CLEAR(data->start_class,'\n'); + { + SV* REG_ANY_invlist = _new_invlist(2); + REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, + '\n'); + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert, hence all but \n + */ + ); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert */ + ); + ssc_clear_locale(data->start_class); + } + SvREFCNT_dec_NN(REG_ANY_invlist); + } break; + case ANYOF: if (flags & SCF_DO_STCLASS_AND) - cl_and(data->start_class, - (struct regnode_charclass_class*)scan); + ssc_and(pRExC_state, data->start_class, + (regnode_charclass *) scan); else - cl_or(pRExC_state, data->start_class, - (struct regnode_charclass_class*)scan); + ssc_or(pRExC_state, data->start_class, + (regnode_charclass *) scan); break; - case POSIXA: - loop_max = 128; + + case NPOSIXL: + invert = 1; /* FALL THROUGH */ + case POSIXL: - case POSIXD: - case POSIXU: - classnum = FLAGS(scan); + namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1); - for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); - } - } + bool was_there = cBOOL( + ANYOF_POSIXL_TEST(data->start_class, + namedclass)); + ANYOF_POSIXL_ZERO(data->start_class); + if (was_there) { /* Do an AND */ + ANYOF_POSIXL_SET(data->start_class, namedclass); } + /* No individual code points can now match */ + data->start_class->invlist + = sv_2mortal(_new_invlist(0)); } else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum)); - } - else { - - /* Even if under locale, set the bits for non-locale - * in case it isn't a true locale-node. This will - * create false positives if it truly is locale */ - for (value = 0; value < loop_max; value++) { - if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); - } + int complement = namedclass + ((invert) ? -1 : 1); + + assert(flags & SCF_DO_STCLASS_OR); + + /* If the complement of this class was already there, + * the result is that they match all code points, + * (\d + \D == everything). Remove the classes from + * future consideration. Locale is not relevant in + * this case */ + if (ANYOF_POSIXL_TEST(data->start_class, complement)) { + ssc_match_all_cp(data->start_class); + ANYOF_POSIXL_CLEAR(data->start_class, namedclass); + ANYOF_POSIXL_CLEAR(data->start_class, complement); } + else { /* The usual case; just add this class to the + existing set */ + ANYOF_POSIXL_SET(data->start_class, namedclass); } } break; - case NPOSIXA: - loop_max = 128; + + case NPOSIXA: /* For these, we always know the exact set of + what's matched */ + invert = 1; /* FALL THROUGH */ - case NPOSIXL: - case NPOSIXU: + case POSIXA: + if (FLAGS(scan) == _CC_ASCII) { + my_invlist = PL_XPosix_ptrs[_CC_ASCII]; + } + else { + _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)], + PL_XPosix_ptrs[_CC_ASCII], + &my_invlist); + } + goto join_posix; + case NPOSIXD: - classnum = FLAGS(scan); + case NPOSIXU: + invert = 1; + /* FALL THROUGH */ + case POSIXD: + case POSIXU: + my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); + + /* NPOSIXD matches all upper Latin1 code points unless the + * target string being matched is UTF-8, which is + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); + } + + join_posix: + if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum)); - for (value = 0; value < loop_max; value++) { - if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); - } - } - } + ssc_intersection(data->start_class, my_invlist, invert); + ssc_clear_locale(data->start_class); } else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1); - } - else { - - /* Even if under locale, set the bits for non-locale in - * case it isn't a true locale-node. This will create - * false positives if it truly is locale */ - for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); - } - } - if (PL_regkind[OP(scan)] == NPOSIXD) { - data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } + assert(flags & SCF_DO_STCLASS_OR); + ssc_union(data->start_class, my_invlist, invert); } - break; } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } @@ -4326,7 +4953,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->flags |= (OP(scan) == MEOL ? SF_BEFORE_MEOL : SF_BEFORE_SEOL); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } else if ( PL_regkind[OP(scan)] == BRANCHJ @@ -4345,11 +4972,12 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext DEBUG_STUDYDATA("OPFAIL",data,depth); /*DEBUG_PARSE_MSG("opfail");*/ - regprop(RExC_rx, mysv_val, upto); - PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", - SvPV_nolen_const(mysv_val), - (IV)REG_NODE_NUM(upto), - (IV)(upto - scan) + regprop(RExC_rx, mysv_val, upto, NULL); + PerlIO_printf(Perl_debug_log, + "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(upto), + (IV)(upto - scan) ); }); OP(scan) = OPFAIL; @@ -4366,9 +4994,9 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext In this case we can't do fixed string optimisation. */ - I32 deltanext, minnext, fake = 0; + SSize_t deltanext, minnext, fake = 0; regnode *nscan; - struct regnode_charclass_class intrnl; + regnode_ssc intrnl; int f = 0; data_fake.flags = 0; @@ -4381,7 +5009,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.pos_delta = delta; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + ssc_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4390,13 +5018,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, - last, &data_fake, stopparen, recursed, NULL, f, depth+1); + last, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1); if (scan->flags) { if (deltanext) { FAIL("Variable length lookbehind not implemented"); } else if (minnext > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); } scan->flags = (U8)minnext; } @@ -4415,14 +5045,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext * *** HACK *** for now just treat as "no information". * See [perl #56690]. */ - cl_init(pRExC_state, data->start_class); + ssc_init(pRExC_state, data->start_class); } else { /* AND before and after: combine and continue */ - const int was = TEST_SSC_EOS(data->start_class); - - cl_and(data->start_class, &intrnl); - if (was) - SET_SSC_EOS(data->start_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } } } @@ -4435,9 +5061,9 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext length of the pattern, something we won't know about until after the recurse. */ - I32 deltanext, fake = 0; + SSize_t deltanext, fake = 0; regnode *nscan; - struct regnode_charclass_class intrnl; + regnode_ssc intrnl; int f = 0; /* We use SAVEFREEPV so that when the full compile is finished perl will clean up the allocated @@ -4445,8 +5071,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext have to worry about freeing them when we know they wont be used, which would be a pain. */ - I32 *minnextp; - Newx( minnextp, 1, I32 ); + SSize_t *minnextp; + Newx( minnextp, 1, SSize_t ); SAVEFREEPV(minnextp); if (data) { @@ -4454,7 +5080,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext if ((flags & SCF_DO_SUBSTR) && data->last_found) { f |= SCF_DO_SUBSTR; if (scan->flags) - SCAN_COMMIT(pRExC_state, &data_fake,minlenp); + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); data_fake.last_found=newSVsv(data->last_found); } } @@ -4466,7 +5092,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.flags |= SF_IS_INF; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + ssc_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4475,14 +5101,17 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, - last, &data_fake, stopparen, recursed, NULL, f,depth+1); + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, + &deltanext, last, &data_fake, + stopparen, recursed_depth, NULL, + f,depth+1); if (scan->flags) { if (deltanext) { FAIL("Variable length lookbehind not implemented"); } else if (*minnextp > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); } scan->flags = (U8)*minnextp; } @@ -4490,11 +5119,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext *minnextp += min; if (f & SCF_DO_STCLASS_AND) { - const int was = TEST_SSC_EOS(data.start_class); - - cl_and(data->start_class, &intrnl); - if (was) - SET_SSC_EOS(data->start_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -4505,7 +5130,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { if (RExC_rx->minlen<*minnextp) RExC_rx->minlen=*minnextp; - SCAN_COMMIT(pRExC_state, &data_fake, minnextp); + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); SvREFCNT_dec_NN(data_fake.last_found); if ( data_fake.minlen_fixed != minlenp ) @@ -4549,7 +5174,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext } else if ( PL_regkind[OP(scan)] == ENDLIKE ) { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); flags &= ~SCF_DO_SUBSTR; } if (data && OP(scan)==ACCEPT) { @@ -4561,24 +5186,24 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } else if (OP(scan) == GPOS) { - if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) && + if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && !(delta || is_inf || (data && data->pos_delta))) { - if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR)) - RExC_rx->extflags |= RXf_ANCH_GPOS; - if (RExC_rx->gofs < (U32)min) + if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->intflags |= PREGf_ANCH_GPOS; + if (RExC_rx->gofs < (STRLEN)min) RExC_rx->gofs = min; } else { - RExC_rx->extflags |= RXf_GPOS_FLOAT; + RExC_rx->intflags |= PREGf_GPOS_FLOAT; RExC_rx->gofs = 0; } } @@ -4591,13 +5216,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext regnode *trie_node= scan; regnode *tail= regnext(scan); reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; - I32 max1 = 0, min1 = I32_MAX; - struct regnode_charclass_class accum; + SSize_t max1 = 0, min1 = SSize_t_MAX; + regnode_ssc accum; - if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ - SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); + ssc_init_zero(pRExC_state, &accum); if (!trie->jump) { min1= trie->minlen; @@ -4608,8 +5235,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext for ( word=1 ; word <= trie->wordcount ; word++) { - I32 deltanext=0, minnext=0, f = 0, fake; - struct regnode_charclass_class this_class; + SSize_t deltanext=0, minnext=0, f = 0, fake; + regnode_ssc this_class; data_fake.flags = 0; if (data) { @@ -4620,7 +5247,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.last_closep = &fake; data_fake.pos_delta = delta; if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } @@ -4632,22 +5259,21 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext nextbranch = trie_node + trie->jump[0]; scan= trie_node + trie->jump[word]; /* We go from the jump point to the branch that follows - it. Note this means we need the vestigal unused branches - even though they arent otherwise used. - */ + it. Note this means we need the vestigal unused + branches even though they arent otherwise used. */ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, (regnode *)nextbranch, &data_fake, - stopparen, recursed, NULL, f,depth+1); + stopparen, recursed_depth, NULL, f,depth+1); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) nextbranch= regnext((regnode*)nextbranch); - if (min1 > (I32)(minnext + trie->minlen)) + if (min1 > (SSize_t)(minnext + trie->minlen)) min1 = minnext + trie->minlen; - if (deltanext == I32_MAX) { + if (deltanext == SSize_t_MAX) { is_inf = is_inf_internal = 1; - max1 = I32_MAX; - } else if (max1 < (I32)(minnext + deltanext + trie->maxlen)) + max1 = SSize_t_MAX; + } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) max1 = minnext + deltanext + trie->maxlen; if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -4665,7 +5291,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); } } if (flags & SCF_DO_SUBSTR) { @@ -4677,28 +5303,25 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); if (min1) { - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - cl_and(data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, - struct regnode_charclass_class); + StructCopy(&accum, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); } } scan= tail; @@ -4713,14 +5336,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext delta += (trie->maxlen - trie->minlen); flags &= ~SCF_DO_STCLASS; /* xxx */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += trie->minlen; data->pos_delta += (trie->maxlen - trie->minlen); if (trie->maxlen != trie->minlen) data->longest = &(data->longest_float); } if (trie->jump) /* no more substrings -- for now /grr*/ - flags &= ~SCF_DO_SUBSTR; + flags &= ~SCF_DO_SUBSTR; } #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ @@ -4728,10 +5352,24 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext /* Else: zero-length, ignore. */ scan = regnext(scan); } + /* If we are exiting a recursion we can unset its recursed bit + * and allow ourselves to enter it again - no danger of an + * infinite loop there. + if (stopparen > -1 && recursed) { + DEBUG_STUDYDATA("unset:", data,depth); + PAREN_UNSET( recursed, stopparen); + } + */ if (frame) { + DEBUG_STUDYDATA("frame-end:",data,depth); + DEBUG_PEEP("fend", scan, depth); + /* restore previous context */ last = frame->last; scan = frame->next; stopparen = frame->stop; + recursed_depth = frame->prev_recursed_depth; + depth = depth - 1; + frame = frame->prev; goto fake_study_recurse; } @@ -4741,9 +5379,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext DEBUG_STUDYDATA("pre-fin:",data,depth); *scanp = scan; - *deltap = is_inf_internal ? I32_MAX : delta; + *deltap = is_inf_internal ? SSize_t_MAX : delta; + if (flags & SCF_DO_SUBSTR && is_inf) - data->pos_delta = I32_MAX - data->pos_min; + data->pos_delta = SSize_t_MAX - data->pos_min; if (is_par > (I32)U8_MAX) is_par = 0; if (is_par && pars==1 && data) { @@ -4755,17 +5394,25 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->flags &= ~SF_IN_PAR; } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); if (flags & SCF_TRIE_RESTUDY) data->flags |= SCF_TRIE_RESTUDY; DEBUG_STUDYDATA("post-fin:",data,depth); - return min < stopmin ? min : stopmin; + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { + RExC_maxlen = final_minlen + delta; + } + return final_minlen; + } + /* not-reached */ } STATIC U32 -S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) +S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) { U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; @@ -4848,7 +5495,7 @@ Perl_current_re_engine(pTHX) HV * const table = GvHV(PL_hintgv); SV **ptr; - if (!table) + if (!table || !(PL_hints & HINT_LOCALIZE_HH)) return &reh_regexp_engine; ptr = hv_fetchs(table, "regcomp", FALSE); if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) @@ -4929,12 +5576,11 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, Newx(dst, *plen_p * 2 + 1, U8); while (s < *plen_p) { - const UV uv = NATIVE_TO_ASCII(src[s]); - if (UNI_IS_INVARIANT(uv)) - dst[d] = (U8)UTF_TO_NATIVE(uv); + if (NATIVE_BYTE_IS_INVARIANT(src[s])) + dst[d] = src[s]; else { - dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv); - dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv); + dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); + dst[d] = UTF8_EIGHT_BIT_LO(src[s]); } if (n < num_code_blocks) { if (!do_end && pRExC_state->code_blocks[n].start == s) { @@ -4998,6 +5644,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, STRLEN orig_patlen = 0; bool code = 0; SV *msv = use_delim ? delim : *svp; + if (!msv) msv = &PL_sv_undef; /* if we've got a delimiter, we go round the loop twice for each * svp slot (except the last), using the delimiter the second @@ -5016,7 +5663,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, * The code in this block is based on S_pushav() */ AV *const av = (AV*)msv; - const I32 maxarg = AvFILL(av) + 1; + const SSize_t maxarg = AvFILL(av) + 1; SV **array; if (oplist) { @@ -5026,11 +5673,11 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, } if (SvRMAGICAL(av)) { - U32 i; + SSize_t i; Newx(array, maxarg, SV*); SAVEFREEPV(array); - for (i=0; i < (U32)maxarg; i++) { + for (i=0; i < maxarg; i++) { SV ** const svp = av_fetch(av, i, FALSE); array[i] = svp ? *svp : &PL_sv_undef; } @@ -5320,7 +5967,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, { Safefree(pRExC_state->code_blocks); /* use croak_sv ? */ - Perl_croak_nocontext("%s", SvPV_nolen_const(errsv)); + Perl_croak_nocontext("%"SVf, SVfARG(errsv)); } } assert(SvROK(qr_ref)); @@ -5411,20 +6058,24 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, STATIC bool -S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol) +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, + SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, + SSize_t lookbehind, SSize_t offset, SSize_t *minlen, + STRLEN longest_length, bool eol, bool meol) { /* This is the common code for setting up the floating and fixed length * string data extracted from Perl_re_op_compile() below. Returns a boolean * as to whether succeeded or not */ - I32 t,ml; + I32 t; + SSize_t ml; if (! (longest_length || (eol /* Can't have SEOL and MULTI */ && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) ) - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - || (RExC_seen & REG_SEEN_EXACTF_SHARP_S)) + /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ + || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) { return FALSE; } @@ -5442,7 +6093,7 @@ S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, S follow this item. We calculate it ahead of time as once the lookbehind offset is added in we lose the ability to correctly calculate it.*/ - ml = minlen ? *(minlen) : (I32)longest_length; + ml = minlen ? *(minlen) : (SSize_t)longest_length; *rx_end_shift = ml - offset - longest_length + (SvTAIL(sv_longest) != 0) + lookbehind; @@ -5511,7 +6162,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, char *exp; regnode *scan; I32 flags; - I32 minlen = 0; + SSize_t minlen = 0; U32 rx_flags; SV *pat; SV *code_blocksv = NULL; @@ -5545,61 +6196,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, * having to test them each time otherwise */ if (! PL_AboveLatin1) { PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); - PL_ASCII = _new_invlist_C_array(ASCII_invlist); PL_Latin1 = _new_invlist_C_array(Latin1_invlist); - - PL_L1Posix_ptrs[_CC_ALPHANUMERIC] - = _new_invlist_C_array(L1PosixAlnum_invlist); - PL_Posix_ptrs[_CC_ALPHANUMERIC] - = _new_invlist_C_array(PosixAlnum_invlist); - - PL_L1Posix_ptrs[_CC_ALPHA] - = _new_invlist_C_array(L1PosixAlpha_invlist); - PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist); - - PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist); - PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist); - - /* Cased is the same as Alpha in the ASCII range */ - PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist); - PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist); - - PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist); - PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist); - - PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); - PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); - - PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist); - PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist); - - PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist); - PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist); - - PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist); - PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist); - - PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist); - PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist); - - PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist); - PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist); - PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist); - PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist); - - PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist); - PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist); - - PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist); - - PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist); - PL_L1Posix_ptrs[_CC_WORDCHAR] - = _new_invlist_C_array(L1PosixWord_invlist); - - PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist); - PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist); - - PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist); + PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); + PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); + PL_HasMultiCharFold = + _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); } #endif @@ -5715,6 +6316,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); RExC_uni_semantics = 0; RExC_contains_locale = 0; + RExC_contains_i = 0; pRExC_state->runtime_code_qr = NULL; DEBUG_COMPILE_r({ @@ -5736,11 +6338,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); /* return old regex if pattern hasn't changed */ - /* XXX: note in the below we have to check the flags as well as the pattern. + /* XXX: note in the below we have to check the flags as well as the + * pattern. * - * Things get a touch tricky as we have to compare the utf8 flag independently - * from the compile flags. - */ + * Things get a touch tricky as we have to compare the utf8 flag + * independently from the compile flags. */ if ( old_re && !recompile @@ -5757,10 +6359,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, rx_flags = orig_rx_flags; - if (initial_charset == REGEX_LOCALE_CHARSET) { - RExC_contains_locale = 1; + if (rx_flags & PMf_FOLD) { + RExC_contains_i = 1; } - else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ @@ -5788,6 +6390,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_sawback = 0; RExC_seen = 0; + RExC_maxlen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; @@ -5802,7 +6405,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_npar = 1; RExC_nestroot = 0; RExC_size = 0L; - RExC_emit = &RExC_emit_dummy; + RExC_emit = (regnode *) &RExC_emit_dummy; RExC_whilem_seen = 0; RExC_open_parens = NULL; RExC_close_parens = NULL; @@ -5812,6 +6415,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_paren_name_list = NULL; #endif RExC_recurse = NULL; + RExC_study_chunk_recursed = NULL; + RExC_study_chunk_recursed_bytes= 0; RExC_recurse_count = 0; pRExC_state->code_index = 0; @@ -5890,7 +6495,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, FAIL("Regexp out of space"); #ifdef DEBUGGING /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ - Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char); + Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char); #else /* bulk initialize base fields with 0. */ Zero(ri, sizeof(regexp_internal), char); @@ -5917,14 +6523,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); - bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET); + bool has_charset = (get_regex_charset(r->extflags) + != REGEX_DEPENDS_CHARSET); /* The caret is output if there are any defaults: if not all the STD * flags are set, or if no character set specifier is needed */ bool has_default = (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) || ! has_charset); - bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) + == REG_RUN_ON_COMMENT_SEEN); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ @@ -5986,12 +6594,23 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ - if (RExC_seen & REG_SEEN_RECURSE) { + /* setup various meta data about recursion, this all requires + * RExC_npar to be correctly set, and a bit later on we clear it */ + if (RExC_seen & REG_RECURSE_SEEN) { Newxz(RExC_open_parens, RExC_npar,regnode *); SAVEFREEPV(RExC_open_parens); Newxz(RExC_close_parens,RExC_npar,regnode *); SAVEFREEPV(RExC_close_parens); } + if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { + /* Note, RExC_npar is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + + ((RExC_npar & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } /* Useful during FAIL. */ #ifdef RE_TRACK_PATTERN_OFFSETS @@ -6035,6 +6654,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, reStudy: r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; Zero(r->substrs, 1, struct reg_substr_data); + if (RExC_study_chunk_recursed) + Zero(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); #ifdef TRIE_STUDY_OPT if (!restudied) { @@ -6045,10 +6667,10 @@ reStudy: DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); RExC_state = copyRExC_state; - if (seen & REG_TOP_LEVEL_BRANCHES) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; else - RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; StructCopy(&zero_scan_data, &data, scan_data_t); } #else @@ -6068,12 +6690,13 @@ reStudy: /* testing for BRANCH here tells us whether there is "must appear" data in the pattern. If there is then we can use it for optimisations */ - if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */ - I32 fake; + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. + */ + SSize_t fake; STRLEN longest_float_length, longest_fixed_length; - struct regnode_charclass_class ch_class; /* pointed to by data */ + regnode_ssc ch_class; /* pointed to by data */ int stclass_flag; - I32 last_close = 0; /* pointed to by data */ + SSize_t last_close = 0; /* pointed to by data */ regnode *first= scan; regnode *first_next= regnext(first); /* @@ -6153,35 +6776,35 @@ reStudy: PL_regkind[OP(first)] == NBOUND) ri->regstclass = first; else if (PL_regkind[OP(first)] == BOL) { - r->extflags |= (OP(first) == MBOL - ? RXf_ANCH_MBOL + r->intflags |= (OP(first) == MBOL + ? PREGf_ANCH_MBOL : (OP(first) == SBOL - ? RXf_ANCH_SBOL - : RXf_ANCH_BOL)); + ? PREGf_ANCH_SBOL + : PREGf_ANCH_BOL)); first = NEXTOPER(first); goto again; } else if (OP(first) == GPOS) { - r->extflags |= RXf_ANCH_GPOS; + r->intflags |= PREGf_ANCH_GPOS; first = NEXTOPER(first); goto again; } else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks) + !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) { /* turn .* into ^.* with an implied $*=1 */ const int type = (OP(NEXTOPER(first)) == REG_ANY) - ? RXf_ANCH_MBOL - : RXf_ANCH_SBOL; - r->extflags |= type; - r->intflags |= PREGf_IMPLICIT; + ? PREGf_ANCH_MBOL + : PREGf_ANCH_SBOL; + r->intflags |= (type | PREGf_IMPLICIT); first = NEXTOPER(first); goto again; } - if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback) + if (sawplus && !sawminmod && !sawlookahead + && (!sawopen || !RExC_sawback) && !pRExC_state->num_code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -6223,15 +6846,17 @@ reStudy: SAVEFREESV(data.last_found); first = scan; if (!ri->regstclass) { - cl_init(pRExC_state, &ch_class); + ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; stclass_flag = SCF_DO_STCLASS_AND; } else /* XXXX Check for BOUND? */ stclass_flag = 0; data.last_closep = &last_close; - minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ - &data, -1, NULL, NULL, + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, + scan + RExC_size, /* Up to end */ + &data, -1, 0, NULL, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), 0); @@ -6243,9 +6868,11 @@ reStudy: if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen - && !(RExC_seen & REG_SEEN_VERBARG) - && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) + && !(RExC_seen & REG_VERBARG_SEEN) + && !(RExC_seen & REG_GPOS_SEEN) + ){ r->extflags |= RXf_CHECK_ALL; + } scan_commit(pRExC_state, &data,&minlen,0); longest_float_length = CHR_SVLEN(data.longest_float); @@ -6267,7 +6894,7 @@ reStudy: { r->float_min_offset = data.offset_float_min - data.lookbehind_float; r->float_max_offset = data.offset_float_max; - if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ + if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ r->float_max_offset -= data.lookbehind_float; SvREFCNT_inc_simple_void_NN(data.longest_float); } @@ -6305,49 +6932,53 @@ reStudy: if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag - && ! TEST_SSC_EOS(data.start_class) - && !cl_is_anything(data.start_class)) + && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && !ssc_is_anything(data.start_class)) { - const U32 n = add_data(pRExC_state, 1, "f"); - OP(data.start_class) = ANYOF_SYNTHETIC; + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); - Newx(RExC_rxi->data->data[n], 1, - struct regnode_charclass_class); + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); StructCopy(data.start_class, - (struct regnode_charclass_class*)RExC_rxi->data->data[n], - struct regnode_charclass_class); + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); + data.start_class = NULL; } - /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ + /* A temporary algorithm prefers floated substr to fixed one to dig + * more info. */ if (longest_fixed_length > longest_float_length) { + r->substrs->check_ix = 0; r->check_end_shift = r->anchored_end_shift; r->check_substr = r->anchored_substr; r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; - if (r->extflags & RXf_ANCH_SINGLE) - r->extflags |= RXf_NOSCAN; + if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) + r->intflags |= PREGf_NOSCAN; } else { + r->substrs->check_ix = 1; r->check_end_shift = r->float_end_shift; r->check_substr = r->float_substr; r->check_utf8 = r->float_utf8; r->check_offset_min = r->float_min_offset; r->check_offset_max = r->float_max_offset; } - /* XXXX Currently intuiting is not compatible with ANCH_GPOS. - This should be changed ASAP! */ - if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) { + if ((r->check_substr || r->check_utf8) ) { r->extflags |= RXf_USE_INTUIT; if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->extflags |= RXf_INTUIT_TAIL; } + r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) if ( (STRLEN)minlen < longest_float_length ) minlen= longest_float_length; @@ -6357,22 +6988,23 @@ reStudy: } else { /* Several toplevels. Best we can is to set minlen. */ - I32 fake; - struct regnode_charclass_class ch_class; - I32 last_close = 0; + SSize_t fake; + regnode_ssc ch_class; + SSize_t last_close = 0; DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); scan = ri->program + 1; - cl_init(pRExC_state, &ch_class); + ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; data.last_closep = &last_close; - - minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, - &data, -1, NULL, NULL, - SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS - |(restudied ? SCF_TRIE_DOING_RESTUDY : 0), + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, + &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, + SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied + ? SCF_TRIE_DOING_RESTUDY + : 0), 0); CHECK_RESTUDY_GOTO_butfirst(NOOP); @@ -6380,51 +7012,61 @@ reStudy: r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - if (! TEST_SSC_EOS(data.start_class) - && !cl_is_anything(data.start_class)) + if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && ! ssc_is_anything(data.start_class)) { - const U32 n = add_data(pRExC_state, 1, "f"); - OP(data.start_class) = ANYOF_SYNTHETIC; + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); - Newx(RExC_rxi->data->data[n], 1, - struct regnode_charclass_class); + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); StructCopy(data.start_class, - (struct regnode_charclass_class*)RExC_rxi->data->data[n], - struct regnode_charclass_class); + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); + data.start_class = NULL; } } + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + r->maxlen = REG_INFTY; + } + else { + r->maxlen = RExC_maxlen; + } + /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", - (IV)minlen, (IV)r->minlen); + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); }); r->minlenret = minlen; if (r->minlen < minlen) r->minlen = minlen; - if (RExC_seen & REG_SEEN_GPOS) - r->extflags |= RXf_GPOS_SEEN; - if (RExC_seen & REG_SEEN_LOOKBEHIND) - r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */ + if (RExC_seen & REG_GPOS_SEEN) + r->intflags |= PREGf_GPOS_SEEN; + if (RExC_seen & REG_LOOKBEHIND_SEEN) + r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the + lookbehind */ if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; - if (RExC_seen & REG_SEEN_CANY) - r->extflags |= RXf_CANY_SEEN; - if (RExC_seen & REG_SEEN_VERBARG) + if (RExC_seen & REG_CANY_SEEN) + r->intflags |= PREGf_CANY_SEEN; + if (RExC_seen & REG_VERBARG_SEEN) { r->intflags |= PREGf_VERBARG_SEEN; r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ } - if (RExC_seen & REG_SEEN_CUTGROUP) + if (RExC_seen & REG_CUTGROUP_SEEN) r->intflags |= PREGf_CUTGROUP_SEEN; if (pm_flags & PMf_USE_RE_EVAL) r->intflags |= PREGf_USE_RE_EVAL; @@ -6433,7 +7075,20 @@ reStudy: else RXp_PAREN_NAMES(r) = NULL; + /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED + * so it can be used in pp.c */ + if (r->intflags & PREGf_ANCH) + r->extflags |= RXf_IS_ANCHORED; + + { + /* this is used to identify "special" patterns that might result + * in Perl NOT calling the regex engine and instead doing the match "itself", + * particularly special cases in split//. By having the regex compiler + * do this pattern matching at a regop level (instead of by inspecting the pattern) + * we avoid weird issues with equivalent patterns resulting in different behavior, + * AND we allow non Perl engines to get the same optimizations by the setting the + * flags appropriately - Yves */ regnode *first = ri->program + 1; U8 fop = OP(first); regnode *next = NEXTOPER(first); @@ -6443,16 +7098,28 @@ reStudy: r->extflags |= RXf_NULL; else if (PL_regkind[fop] == BOL && nop == END) r->extflags |= RXf_START_ONLY; - else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END) + else if (fop == PLUS + && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE + && OP(regnext(first)) == END) r->extflags |= RXf_WHITE; - else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END ) + else if ( r->extflags & RXf_SPLIT + && fop == EXACT + && STR_LEN(first) == 1 + && *(STRING(first)) == ' ' + && OP(regnext(first)) == END ) r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); } + + if (RExC_contains_locale) { + RXp_EXTFLAGS(r) |= RXf_TAINTED; + } + #ifdef DEBUGGING if (RExC_paren_names) { - ri->name_list_idx = add_data( pRExC_state, 1, "a" ); - ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); + ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); + ri->data->data[ri->name_list_idx] + = (void*)SvREFCNT_inc(RExC_paren_name_list); } else #endif ri->name_list_idx = 0; @@ -6467,15 +7134,17 @@ reStudy: /* assume we don't need to swap parens around before we match */ DEBUG_DUMP_r({ + DEBUG_RExC_seen(); PerlIO_printf(Perl_debug_log,"Final program:\n"); regdump(r); }); #ifdef RE_TRACK_PATTERN_OFFSETS DEBUG_OFFSETS_r(if (ri->u.offsets) { - const U32 len = ri->u.offsets[0]; - U32 i; + const STRLEN len = ri->u.offsets[0]; + STRLEN i; GET_RE_DEBUG_FLAGS_DECL; - PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); + PerlIO_printf(Perl_debug_log, + "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", @@ -6535,7 +7204,8 @@ Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, else if (flags & RXapif_NEXTKEY) return reg_named_buff_nextkey(rx, flags); else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", + (int)flags); return NULL; } } @@ -6661,7 +7331,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) { SV *ret; AV *av; - I32 length; + SSize_t length; struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; @@ -6672,11 +7342,12 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) } else if (flags & RXapif_ONE) { ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); av = MUTABLE_AV(SvRV(ret)); - length = av_len(av); + length = av_tindex(av); SvREFCNT_dec_NN(ret); return newSViv(length + 1); } else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", + (int)flags); return NULL; } } @@ -6724,19 +7395,29 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, { struct regexp *const rx = ReANY(r); char *s = NULL; - I32 i = 0; - I32 s1, t1; + SSize_t i = 0; + SSize_t s1, t1; I32 n = paren; PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - if ( ( n == RX_BUFF_IDX_CARET_PREMATCH + if ( n == RX_BUFF_IDX_CARET_PREMATCH || n == RX_BUFF_IDX_CARET_FULLMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH - ) - && !(rx->extflags & RXf_PMf_KEEPCOPY) ) - goto ret_undef; + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } if (!rx->subbeg) goto ret_undef; @@ -6773,9 +7454,9 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, } assert(s >= rx->subbeg); - assert(rx->sublen >= (s - rx->subbeg) + i ); + assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); if (i >= 0) { -#if NO_TAINT_SUPPORT +#ifdef NO_TAINT_SUPPORT sv_setpvn(sv, s, i); #else const int oldtainted = TAINT_get; @@ -6783,7 +7464,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, sv_setpvn(sv, s, i); TAINT_set(oldtainted); #endif - if ( (rx->extflags & RXf_CANY_SEEN) + if ( (rx->intflags & PREGf_CANY_SEEN) ? (RXp_MATCH_UTF8(rx) && (!i || is_utf8_string((U8*)s, i))) : (RXp_MATCH_UTF8(rx)) ) @@ -6842,13 +7523,27 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + /* Some of this code was originally in C in F */ switch (paren) { case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - case RX_BUFF_IDX_PREMATCH: /* $` */ if (rx->offs[0].start != -1) { i = rx->offs[0].start; @@ -6861,8 +7556,6 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, return 0; case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; case RX_BUFF_IDX_POSTMATCH: /* $' */ if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; @@ -6874,13 +7567,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } return 0; - case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - - /* $& / ${^MATCH}, $1, $2, ... */ - default: + default: /* $& / ${^MATCH}, $1, $2, ... */ if (paren <= (I32)rx->nparens && (s1 = rx->offs[paren].start) != -1 && (t1 = rx->offs[paren].end) != -1) @@ -6937,7 +7624,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) PERL_ARGS_ASSERT_REG_SCAN_NAME; - if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + assert (RExC_parse <= RExC_end); + if (RExC_parse == RExC_end) NOOP; + else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { /* skip IDFIRST by using do...while */ if (UTF) do { @@ -6948,7 +7637,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) RExC_parse++; } while (isWORDCHAR(*RExC_parse)); } else { - RExC_parse++; /* so the <- from the vFAIL is after the offending character */ + RExC_parse++; /* so the <- from the vFAIL is after the offending + character */ vFAIL("Group name must start with a non-digit word character"); } if ( flags ) { @@ -7124,6 +7814,8 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) PERL_ARGS_ASSERT_INVLIST_SET_LEN; + assert(SvTYPE(invlist) == SVt_INVLIST); + SvCUR_set(invlist, (len == 0) ? 0 @@ -7139,6 +7831,8 @@ S_get_invlist_previous_index_addr(pTHX_ SV* invlist) PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; + assert(SvTYPE(invlist) == SVt_INVLIST); + return &(((XINVLIST*) SvANY(invlist))->prev_index); } @@ -7172,6 +7866,8 @@ S_invlist_max(pTHX_ SV* const invlist) PERL_ARGS_ASSERT_INVLIST_MAX; + assert(SvTYPE(invlist) == SVt_INVLIST); + /* Assumes worst case, in which the 0 element is not counted in the * inversion list, so subtracts 1 for that */ return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ @@ -7209,10 +7905,9 @@ Perl__new_invlist(pTHX_ IV initial_size) return new_list; } -#endif -STATIC SV* -S__new_invlist_C_array(pTHX_ const UV* const list) +SV* +Perl__new_invlist_C_array(pTHX_ const UV* const list) { /* Return a pointer to a newly constructed inversion list, initialized to * point to , which has to be in the exact correct inversion list @@ -7262,8 +7957,11 @@ S__new_invlist_C_array(pTHX_ const UV* const list) /* Initialize the iteration pointer. */ invlist_iterfinish(invlist); + SvREADONLY_on(invlist); + return invlist; } +#endif /* ifndef PERL_IN_XSUB_RE */ STATIC void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) @@ -7272,6 +7970,8 @@ S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) PERL_ARGS_ASSERT_INVLIST_EXTEND; + assert(SvTYPE(invlist) == SVt_INVLIST); + /* Add one to account for the zero element at the beginning which may not * be counted by the calling parameters */ SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); @@ -7282,15 +7982,16 @@ S_invlist_trim(pTHX_ SV* const invlist) { PERL_ARGS_ASSERT_INVLIST_TRIM; + assert(SvTYPE(invlist) == SVt_INVLIST); + /* Change the length of the inversion list to how many entries it currently * has */ SvPV_shrink_to_cur((SV *) invlist); } -#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) - STATIC void -S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) +S__append_range_to_invlist(pTHX_ SV* const invlist, + const UV start, const UV end) { /* Subject to change or removal. Append the range from 'start' to 'end' at * the end of the inversion list. The range must be above any existing @@ -7464,7 +8165,8 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) } void -Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch) +Perl__invlist_populate_swatch(pTHX_ SV* const invlist, + const UV start, const UV end, U8* swatch) { /* populates a swatch of a swash the same way swatch_get() does in utf8.c, * but is used when the swash has an inversion list. This makes this much @@ -7557,14 +8259,16 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV } void -Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output) +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** output) { /* Take the union of two inversion lists and point to it. *output * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. The first list, - * , may be NULL, in which case a copy of the second list is returned. - * If is TRUE, the union is taken of the complement - * (inversion) of instead of b itself. + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *output will be made correspondingly + * mortal. The first list, , may be NULL, in which case a copy of the + * second list is returned. If is TRUE, the union is taken + * of the complement (inversion) of instead of b itself. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -7605,9 +8309,13 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b /* If either one is empty, the union is the other one */ if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + bool make_temp = FALSE; /* Should we mortalize the result? */ + if (*output == a) { if (a != NULL) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } } if (*output != b) { @@ -7616,18 +8324,27 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b _invlist_invert(*output); } } /* else *output already = b; */ + + if (make_temp) { + sv_2mortal(*output); + } return; } else if ((len_b = _invlist_len(b)) == 0) { + bool make_temp = FALSE; if (*output == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } /* The complement of an empty list is a list that has everything in it, * so the union with includes everything too */ if (complement_b) { if (a == *output) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } *output = _new_invlist(1); _append_range_to_invlist(*output, 0, UV_MAX); @@ -7636,6 +8353,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b *output = invlist_clone(a); } /* else *output already = a; */ + + if (make_temp) { + sv_2mortal(*output); + } return; } @@ -7775,24 +8496,36 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const b } } - /* We may be removing a reference to one of the inputs */ + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ if (a == *output || b == *output) { assert(! invlist_is_iterating(*output)); - SvREFCNT_dec_NN(*output); + if ((SvTEMP(*output))) { + sv_2mortal(u); + } + else { + SvREFCNT_dec_NN(*output); + } } *output = u; + return; } void -Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i) +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** i) { /* Take the intersection of two inversion lists and point to it. *i * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. - * If is TRUE, the result will be the intersection of - * and the complement (or inversion) of instead of directly. + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *i will be made correspondingly mortal. + * The first list, , may be NULL, in which case an empty list is + * returned. If is TRUE, the result will be the + * intersection of and the complement (or inversion) of instead of + * directly. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -7828,8 +8561,9 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, assert(a != b); /* Special case if either one is empty */ - len_a = _invlist_len(a); + len_a = (a == NULL) ? 0 : _invlist_len(a); if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { + bool make_temp = FALSE; if (len_a != 0 && complement_b) { @@ -7838,25 +8572,39 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * must be every possible code point. Thus the intersection is * simply 'a'. */ if (*i != a) { - *i = invlist_clone(a); - if (*i == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } + + *i = invlist_clone(a); } /* else *i is already 'a' */ + + if (make_temp) { + sv_2mortal(*i); + } return; } /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The * intersection must be empty */ if (*i == a) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } else if (*i == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } *i = _new_invlist(0); + if (make_temp) { + sv_2mortal(*i); + } + return; } @@ -7961,7 +8709,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } /* The final length is what we've output so far plus what else is in the - * intersection. At most one of the subexpressions below will be non-zero */ + * intersection. At most one of the subexpressions below will be non-zero + * */ len_r = i_r; if (count >= 2) { len_r += (len_a - i_a) + (len_b - i_b); @@ -7986,13 +8735,21 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* We may be removing a reference to one of the inputs */ + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ if (a == *i || b == *i) { assert(! invlist_is_iterating(*i)); - SvREFCNT_dec_NN(*i); + if (SvTEMP(*i)) { + sv_2mortal(r); + } + else { + SvREFCNT_dec_NN(*i); + } } *i = r; + return; } @@ -8039,6 +8796,35 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) return invlist; } +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, + UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + _append_range_to_invlist(invlist, element0, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; +} + #endif PERL_STATIC_INLINE SV* @@ -8067,43 +8853,6 @@ Perl__invlist_invert(pTHX_ SV* const invlist) *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); } -void -Perl__invlist_invert_prop(pTHX_ SV* const invlist) -{ - /* Complement the input inversion list (which must be a Unicode property, - * all of which don't match above the Unicode maximum code point.) And - * Perl has chosen to not have the inversion match above that either. This - * adds a 0x110000 if the list didn't end with it, and removes it if it did - */ - - UV len; - UV* array; - - PERL_ARGS_ASSERT__INVLIST_INVERT_PROP; - - _invlist_invert(invlist); - - len = _invlist_len(invlist); - - if (len != 0) { /* If empty do nothing */ - array = invlist_array(invlist); - if (array[len - 1] != PERL_UNICODE_MAX + 1) { - /* Add 0x110000. First, grow if necessary */ - len++; - if (invlist_max(invlist) < len) { - invlist_extend(invlist, len); - array = invlist_array(invlist); - } - invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist)); - array[len - 1] = PERL_UNICODE_MAX + 1; - } - else { /* Remove the 0x110000 */ - invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist)); - } - } - - return; -} #endif PERL_STATIC_INLINE SV* @@ -8111,7 +8860,7 @@ S_invlist_clone(pTHX_ SV* const invlist) { /* Return a new inversion list that is a copy of the input one, which is - * unchanged */ + * unchanged. The new list will not be mortal even if the old one was. */ /* Need to allocate extra space to accommodate Perl's addition of a * trailing NUL to SvPV's, since it thinks they are always strings */ @@ -8136,6 +8885,8 @@ S_get_invlist_iter_addr(pTHX_ SV* invlist) PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; + assert(SvTYPE(invlist) == SVt_INVLIST); + return &(((XINVLIST*) SvANY(invlist))->iterator); } @@ -8269,42 +9020,59 @@ Perl__invlist_contents(pTHX_ SV* const invlist) } #endif -#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP +#ifndef PERL_IN_XSUB_RE void -Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header) +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, + const char * const indent, SV* const invlist) { - /* Dumps out the ranges in an inversion list. The string 'header' - * if present is output on a line before the first range */ + /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the + * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by + * the string 'indent'. The output looks like this: + [0] 0x000A .. 0x000D + [2] 0x0085 + [4] 0x2028 .. 0x2029 + [6] 0x3104 .. INFINITY + * This means that the first range of code points matched by the list are + * 0xA through 0xD; the second range contains only the single code point + * 0x85, etc. An inversion list is an array of UVs. Two array elements + * are used to define each range (except if the final range extends to + * infinity, only a single element is needed). The array index of the + * first element for the corresponding range is given in brackets. */ UV start, end; + STRLEN count = 0; PERL_ARGS_ASSERT__INVLIST_DUMP; - if (header && strlen(header)) { - PerlIO_printf(Perl_debug_log, "%s\n", header); - } if (invlist_is_iterating(invlist)) { - PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n"); + Perl_dump_indent(aTHX_ level, file, + "%sCan't dump inversion list because is in middle of iterating\n", + indent); return; } invlist_iterinit(invlist); while (invlist_iternext(invlist, &start, &end)) { if (end == UV_MAX) { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start); + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", + indent, (UV)count, start); } else if (end != start) { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", - start, end); + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", + indent, (UV)count, start, end); } else { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start); + Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", + indent, (UV)count, start); } + count += 2; } } #endif -#if 0 +#ifdef PERL_ARGS_ASSERT__INVLISTEQ bool S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) { @@ -8346,7 +9114,6 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -8375,7 +9142,7 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) /* End of inversion list object */ STATIC void -S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) +S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) { /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' * constructs, and updates RExC_flags with them. On input, RExC_parse @@ -8435,7 +9202,6 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) } cs = REGEX_LOCALE_CHARSET; has_charset_modifier = LOCALE_PAT_MOD; - RExC_contains_locale = 1; break; case UNICODE_PAT_MOD: if (has_charset_modifier) { @@ -8489,7 +9255,8 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); } else if (has_charset_modifier == *(RExC_parse - 1)) { - vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1)); + vFAIL2("Regexp modifier \"%c\" may not appear twice", + *(RExC_parse - 1)); } else { vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); @@ -8497,12 +9264,15 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) /*NOTREACHED*/ neg_modifier: RExC_parse++; - vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1)); + vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", + *(RExC_parse - 1)); /*NOTREACHED*/ case ONCE_PAT_MOD: /* 'o' */ case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { - const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; + const I32 wflagbit = *RExC_parse == 'o' + ? WASTED_O + : WASTED_G; if (! (wastedflags & wflagbit) ) { wastedflags |= wflagbit; /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ @@ -8555,13 +9325,17 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) RExC_flags |= posflags; RExC_flags &= ~negflags; set_regex_charset(&RExC_flags, cs); + if (RExC_flags & RXf_PMf_FOLD) { + RExC_contains_i = 1; + } return; /*NOTREACHED*/ default: fail_modifiers: - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", - RExC_parse-seqstart, seqstart); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); /*NOTREACHED*/ } @@ -8636,7 +9410,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) char *start_arg = NULL; unsigned char op = 0; int argok = 1; - int internal_argval = 0; /* internal_argval is only useful if !argok */ + int internal_argval = 0; /* internal_argval is only useful if + !argok */ if (has_intervening_patws && SIZE_ONLY) { ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated"); @@ -8699,14 +9474,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* [19:06] :: is then */ if ( memEQs(start_verb,verb_len,"THEN") ) { op = CUTGROUP; - RExC_seen |= REG_SEEN_CUTGROUP; + RExC_seen |= REG_CUTGROUP_SEEN; } break; } if ( ! op ) { - RExC_parse++; - vFAIL3("Unknown verb pattern '%.*s'", - verb_len, start_verb); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL2utf8f( + "Unknown verb pattern '%"UTF8f"'", + UTF8fARG(UTF, verb_len, start_verb)); } if ( argok ) { if ( start_arg && internal_argval ) { @@ -8719,8 +9495,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, op, internal_argval); if ( ! internal_argval && ! SIZE_ONLY ) { if (start_arg) { - SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); - ARG(ret) = add_data( pRExC_state, 1, "S" ); + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); RExC_rxi->data->data[ARG(ret)]=(void*)sv; ret->flags = 0; } else { @@ -8729,7 +9507,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } if (!internal_argval) - RExC_seen |= REG_SEEN_VERBARG; + RExC_seen |= REG_VERBARG_SEEN; } else if ( start_arg ) { vFAIL3("Verb pattern '%.*s' may not have an argument", verb_len, start_verb); @@ -8759,17 +9537,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) goto named_recursion; } else if (paren == '=') { /* (?P=...) named backref */ - /* this pretty much dupes the code for \k in regatom(), if - you change this make sure you change that */ + /* this pretty much dupes the code for \k in + * regatom(), if you change this make sure you change that + * */ char* name_start = RExC_parse; U32 num = 0; SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); if (RExC_parse == name_start || *RExC_parse != ')') + /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.3s... not terminated",parse_start); if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -8794,7 +9574,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL3("Sequence (%.*s...) not recognized", + RExC_parse-seqstart, seqstart); /*NOTREACHED*/ case '<': /* (?<...) */ if (*RExC_parse == '!') @@ -8808,15 +9590,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '\'': /* (?'...') */ name_start= RExC_parse; svname = reg_scan_name(pRExC_state, - SIZE_ONLY ? /* reverse test from the others */ - REG_RSN_RETURN_NAME : - REG_RSN_RETURN_NULL); - if (RExC_parse == name_start) { - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); - /*NOTREACHED*/ - } - if (*RExC_parse != paren) + SIZE_ONLY /* reverse test from the others */ + ? REG_RSN_RETURN_NAME + : REG_RSN_RETURN_NULL); + if (RExC_parse == name_start || *RExC_parse != paren) vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); if (SIZE_ONLY) { @@ -8856,20 +9633,24 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } if ( count ) { - pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); + pv = (I32*)SvGROW(sv_dat, + SvCUR(sv_dat) + sizeof(I32)+1); SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); pv[count] = RExC_npar; SvIV_set(sv_dat, SvIVX(sv_dat) + 1); } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); - sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); + sv_setpvn(sv_dat, (char *)&(RExC_npar), + sizeof(I32)); SvIOK_on(sv_dat); SvIV_set(sv_dat, 1); } #ifdef DEBUGGING - /* Yes this does cause a memory leak in debugging Perls */ - if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) + /* Yes this does cause a memory leak in debugging Perls + * */ + if (!av_store(RExC_paren_name_list, + RExC_npar, SvREFCNT_inc(svname))) SvREFCNT_dec_NN(svname); #endif @@ -8879,7 +9660,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) paren = 1; goto capturing_parens; } - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; case '=': /* (?=...) */ @@ -8923,6 +9704,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); + RExC_seen |= REG_GOSTART_SEEN; *flagp |= POSTPONED; nextchar(pRExC_state); return ret; @@ -8937,6 +9719,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } + if (RExC_parse == RExC_end || *RExC_parse != ')') + vFAIL("Sequence (?&... not terminated"); goto gen_recurse_regop; assert(0); /* NOT REACHED */ case '+': @@ -8996,11 +9780,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ARG2L_SET( ret, RExC_recurse_count++); RExC_emit++; DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); + "Recurse #%"UVuf" to %"IVdf"\n", + (UV)ARG(ret), (IV)ARG2L(ret))); } else { RExC_size++; } - RExC_seen |= REG_SEEN_RECURSE; + RExC_seen |= REG_RECURSE_SEEN; Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ @@ -9014,7 +9799,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) is_logical = 1; if (*RExC_parse != '{') { RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f( + "Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); /*NOTREACHED*/ } *flagp |= POSTPONED; @@ -9043,14 +9831,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { OP *o = cb->block; if (cb->src_regex) { - n = add_data(pRExC_state, 2, "rl"); + n = add_data(pRExC_state, STR_WITH_LEN("rl")); RExC_rxi->data->data[n] = (void*)SvREFCNT_inc((SV*)cb->src_regex); RExC_rxi->data->data[n+1] = (void*)o; } else { - n = add_data(pRExC_state, 1, - (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); + n = add_data(pRExC_state, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); RExC_rxi->data->data[n] = (void*)o; } } @@ -9111,7 +9899,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) (ch == '>' ? '<' : ch)); RExC_parse++; if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -9141,7 +9929,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV *sv_dat; RExC_parse++; sv_dat = reg_scan_name(pRExC_state, - SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + SIZE_ONLY + ? REG_RSN_RETURN_NULL + : REG_RSN_RETURN_DATA); parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } ret = reganode(pRExC_state,INSUBP,parno); @@ -9150,6 +9940,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { /* (?(1)...) */ char c; + char *tmp; parno = atoi(RExC_parse++); while (isDIGIT(*RExC_parse)) @@ -9157,8 +9948,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, GROUPP, parno); insert_if_check_paren: - if ((c = *nextchar(pRExC_state)) != ')') + if (*(tmp = nextchar(pRExC_state)) != ')') { + /* nextchar also skips comments, so undo its work + * and skip over the the next character. + */ + RExC_parse = tmp; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; vFAIL("Switch condition not recognized"); + } insert_if: REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); br = regbranch(pRExC_state, &flags, 1,depth+1); @@ -9170,14 +9967,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); } else - REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); + REGTAIL(pRExC_state, br, reganode(pRExC_state, + LONGJMP, 0)); c = *nextchar(pRExC_state); if (flags&HASWIDTH) *flagp |= HASWIDTH; if (c == '|') { if (is_define) vFAIL("(?(DEFINE)....) does not allow branches"); - lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ + + /* Fake one for optimizer. */ + lastbr = reganode(pRExC_state, IFTHEN, 0); + if (!regbranch(pRExC_state, &flags, 1,depth+1)) { if (flags & RESTART_UTF8) { *flagp = RESTART_UTF8; @@ -9209,7 +10010,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } else { - vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unknown switch condition (?(...))"); } } case '[': /* (?[ ... ]) */ @@ -9243,7 +10045,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY ){ if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE + if (RExC_seen & REG_RECURSE_SEEN && !RExC_open_parens[parno-1]) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, @@ -9300,7 +10102,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { ender = reganode(pRExC_state, LONGJMP,0); - REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ + + /* Append to the previous. */ + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); } if (SIZE_ONLY) RExC_extralen += 2; /* Account for LONGJMP. */ @@ -9332,7 +10136,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case 1: case 2: ender = reganode(pRExC_state, CLOSE, parno); - if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { + if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting close paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ender))); @@ -9364,8 +10168,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("lsbr"); - regprop(RExC_rx, mysv_val1, lastbr); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, lastbr, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(lastbr), @@ -9379,20 +10183,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (have_branch && !SIZE_ONLY) { char is_nothing= 1; if (depth==1) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; /* Hook the tails of the branches to the closing node. */ for (br = ret; br; br = regnext(br)) { const U8 op = PL_regkind[OP(br)]; if (op == BRANCH) { REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); - if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender) + if ( OP(NEXTOPER(br)) != NOTHING + || regnext(NEXTOPER(br)) != ender) is_nothing= 0; } else if (op == BRANCHJ) { REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); /* for now we always disable this optimisation * / - if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender) + if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING + || regnext(NEXTOPER(NEXTOPER(br))) != ender) */ is_nothing= 0; } @@ -9403,8 +10209,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("NADA"); - regprop(RExC_rx, mysv_val1, ret); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, ret, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(ret), @@ -9646,6 +10452,19 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, OPFAIL); return ret; } + else if (min == max + && RExC_parse < RExC_end + && (*RExC_parse == '?' || *RExC_parse == '+')) + { + if (SIZE_ONLY) { + ckWARN2reg(RExC_parse + 1, + "Useless use of greediness modifier '%c'", + *RExC_parse); + } + /* Absorb the modifier, so later code doesn't see nor use + * it */ + nextchar(pRExC_state); + } do_curly: if ((flags&SIMPLE)) { @@ -9687,6 +10506,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ARG1_SET(ret, (U16)min); ARG2_SET(ret, (U16)max); } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; goto nest_check; } @@ -9724,6 +10545,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, STAR, ret, depth+1); ret->flags = 0; RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '*') { min = 0; @@ -9733,6 +10555,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, PLUS, ret, depth+1); ret->flags = 0; RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '+') { min = 1; @@ -9745,10 +10568,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) nest_check: if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ - ckWARN3reg(RExC_parse, - "%.*s matches null string many times", - (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), - origparse); + ckWARN2reg(RExC_parse, + "%"UTF8f" matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); (void)ReREFCNT_inc(RExC_rx_sv); } @@ -9778,8 +10603,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } STATIC bool -S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, - const bool strict /* Apply stricter parsing rules? */ +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, + UV *valuep, I32 *flagp, U32 depth, bool in_char_class, + const bool strict /* Apply stricter parsing rules? */ ) { @@ -9845,7 +10671,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ /* The [^\n] meaning of \N ignores spaces and comments under the /x - * modifier. The other meaning does not */ + * modifier. The other meaning does not, so use a temporary until we find + * out which we are being called with */ p = (RExC_flags & RXf_PMf_EXTENDED) ? regwhite( pRExC_state, RExC_parse ) : RExC_parse; @@ -9855,17 +10682,18 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I if (*p != '{' || regcurly(p, FALSE)) { RExC_parse = p; if (! node_p) { - /* no bare \N in a charclass */ + /* no bare \N allowed in a charclass */ if (in_char_class) { vFAIL("\\N in a character class must be a named character: \\N{...}"); } return FALSE; } + RExC_parse--; /* Need to back off so nextchar() doesn't skip the + current char */ nextchar(pRExC_state); *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; - RExC_parse--; Set_Node_Length(*node_p, 1); /* MJD */ return TRUE; } @@ -9884,8 +10712,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ || ! (endbrace == RExC_parse /* nothing between the {} */ - || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */ - && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below + */ + && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) + */ { if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ vFAIL("\\N{NAME} must be resolved by the lexer"); @@ -10091,7 +10921,9 @@ S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) } PERL_STATIC_INLINE void -S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point) +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, + regnode *node, I32* flagp, STRLEN len, UV code_point, + bool downgradable) { /* This knows the details about sizing an EXACTish node, setting flags for * it (by setting <*flagp>, and potentially populating it with a single @@ -10106,48 +10938,111 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 * If is zero, the function assumes that the node is to contain only * the single character given by and calculates what * should be. In pass 1, it sizes the node appropriately. In pass 2, it - * additionally will populate the node's STRING with , if - * is 0. In both cases <*flagp> is appropriately set + * additionally will populate the node's STRING with or its + * fold if folding. + * + * In both cases <*flagp> is appropriately set * * It knows that under FOLD, the Latin Sharp S and UTF characters above * 255, must be folded (the former only when the rules indicate it can - * match 'ss') */ + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ bool len_passed_in = cBOOL(len != 0); U8 character[UTF8_MAXBYTES_CASE+1]; PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + /* Don't bother to check for downgrading in PASS1, as it doesn't make any + * sizing difference, and is extra work that is thrown away */ + if (downgradable && ! PASS2) { + downgradable = FALSE; + } + if (! len_passed_in) { if (UTF) { - if (FOLD && (! LOC || code_point > 255)) { - _to_uni_fold_flags(NATIVE_TO_UNI(code_point), + if (UNI_IS_INVARIANT(code_point)) { + if (LOC || ! FOLD) { /* /l defers folding until runtime */ + *character = (U8) code_point; + } + else { /* Here is /i and not /l (toFOLD() is defined on just + ASCII, which isn't the same thing as INVARIANT on + EBCDIC, but it works there, as the extra invariants + fold to themselves) */ + *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } + } + len = 1; + } + else if (FOLD && (! LOC + || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) + { /* Folding, and ok to do so now */ + UV folded = _to_uni_fold_flags( + code_point, character, &len, - FOLD_FLAGS_FULL | ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) ? FOLD_FLAGS_NOMIX_ASCII : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } + } + else if (code_point <= MAX_UTF8_TWO_BYTE) { + + /* Not folding this cp, and can output it directly */ + *character = UTF8_TWO_BYTE_HI(code_point); + *(character + 1) = UTF8_TWO_BYTE_LO(code_point); + len = 2; } else { uvchr_to_utf8( character, code_point); len = UTF8SKIP(character); } - } - else if (! FOLD - || code_point != LATIN_SMALL_LETTER_SHARP_S - || ASCII_FOLD_RESTRICTED - || ! AT_LEAST_UNI_SEMANTICS) - { + } /* Else pattern isn't UTF8. */ + else if (! FOLD) { *character = (U8) code_point; len = 1; - } - else { + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { *character = 's'; *(character + 1) = 's'; len = 2; } + else { + *character = LATIN_SMALL_LETTER_SHARP_S; + len = 1; + } } if (SIZE_ONLY) { @@ -10171,8 +11066,29 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 { *flagp |= SIMPLE; } + + /* The OP may not be well defined in PASS1 */ + if (PASS2 && OP(node) == EXACTFL) { + RExC_contains_locale = 1; + } +} + + +/* return atoi(p), unless it's too big to sensibly be a backref, + * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ + +static I32 +S_backref_value(char *p) +{ + char *q = p; + + for (;isDIGIT(*q); q++); /* calculate length of num */ + if (q - p == 0 || q - p > 9) + return I32_MAX; + return atoi(p); } + /* - regatom - the lowest level @@ -10330,7 +11246,8 @@ tryagain: *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", + (UV) flags); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; @@ -10378,7 +11295,7 @@ tryagain: goto finish_meta_pat; case 'G': ret = reg_node(pRExC_state, GPOS); - RExC_seen |= REG_SEEN_GPOS; + RExC_seen |= REG_GPOS_SEEN; *flagp |= SIMPLE; goto finish_meta_pat; case 'K': @@ -10389,7 +11306,7 @@ tryagain: * be necessary here to avoid cases of memory corruption, as * with: C<$_="x" x 80; s/x\K/y/> -- rgs */ - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -10403,7 +11320,7 @@ tryagain: goto finish_meta_pat; case 'C': ret = reg_node(pRExC_state, CANY); - RExC_seen |= REG_SEEN_CANY; + RExC_seen |= REG_CANY_SEEN; *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'X': @@ -10420,30 +11337,38 @@ tryagain: case 'b': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = BOUND + get_regex_charset(RExC_flags); if (op > BOUNDA) { /* /aa is same as /a */ op = BOUNDA; } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); } goto finish_meta_pat; case 'B': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = NBOUND + get_regex_charset(RExC_flags); if (op > NBOUNDA) { /* /aa is same as /a */ op = NBOUNDA; } + else if (op == NBOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); } goto finish_meta_pat; @@ -10487,6 +11412,9 @@ tryagain: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } + else if (op == POSIXL) { + RExC_contains_locale = 1; + } join_posix_op_known: @@ -10561,6 +11489,7 @@ tryagain: char ch= RExC_parse[1]; if (ch != '<' && ch != '\'' && ch != '{') { RExC_parse++; + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.2s... not terminated",parse_start); } else { /* this pretty much dupes the code for (?P=...) in reg(), if @@ -10571,10 +11500,11 @@ tryagain: SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; if (RExC_parse == name_start || *RExC_parse != ch) + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.3s... not terminated",parse_start); if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -10606,10 +11536,11 @@ tryagain: case '5': case '6': case '7': case '8': case '9': { I32 num; - bool isg = *RExC_parse == 'g'; - bool isrel = 0; bool hasbrace = 0; - if (isg) { + + if (*RExC_parse == 'g') { + bool isrel = 0; + RExC_parse++; if (*RExC_parse == '{') { RExC_parse++; @@ -10623,25 +11554,40 @@ tryagain: if (isrel) RExC_parse--; RExC_parse -= 2; goto parse_named_seq; - } } - num = atoi(RExC_parse); - if (isg && num == 0) { - if (*RExC_parse == '0') { + } + + num = S_backref_value(RExC_parse); + if (num == 0) vFAIL("Reference to invalid group 0"); + else if (num == I32_MAX) { + if (isDIGIT(*RExC_parse)) + vFAIL("Reference to nonexistent group"); + else + vFAIL("Unterminated \\g... pattern"); } - else { - vFAIL("Unterminated \\g... pattern"); + + if (isrel) { + num = RExC_npar - num; + if (num < 1) + vFAIL("Reference to nonexistent or unclosed group"); } } - if (isrel) { - num = RExC_npar - num; - if (num < 1) - vFAIL("Reference to nonexistent or unclosed group"); - } - if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9') - /* Probably a character specified in octal, e.g. \35 */ - goto defchar; else { + num = S_backref_value(RExC_parse); + /* bare \NNN might be backref or octal - if it is larger than or equal + * RExC_npar then it is assumed to be and octal escape. + * Note RExC_npar is +1 from the actual number of parens*/ + if (num == I32_MAX || (num > 9 && num >= RExC_npar + && *RExC_parse != '8' && *RExC_parse != '9')) + { + /* Probably a character specified in octal, e.g. \35 */ + goto defchar; + } + } + + /* at this point RExC_parse definitely points to a backref + * number */ + { #ifdef RE_TRACK_PATTERN_OFFSETS char * const parse_start = RExC_parse - 1; /* MJD */ #endif @@ -10705,25 +11651,35 @@ tryagain: defchar: { STRLEN len = 0; - UV ender; + UV ender = 0; char *p; char *s; #define MAX_NODE_STRING_SIZE 127 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; char *s0; U8 upper_parse = MAX_NODE_STRING_SIZE; - STRLEN foldlen; - U8 node_type; + U8 node_type = compute_EXACTish(pRExC_state); bool next_is_quantifier; char * oldp = NULL; + /* We can convert EXACTF nodes to EXACTFU if they contain only + * characters that match identically regardless of the target + * string's UTF8ness. The reason to do this is that EXACTF is not + * trie-able, EXACTFU is. + * + * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * contain only above-Latin1 characters (hence must be in UTF8), + * which don't participate in folds with Latin1-range characters, + * as the latter's folds aren't known until runtime. (We don't + * need to figure this out until pass 2) */ + bool maybe_exactfu = PASS2 + && (node_type == EXACTF || node_type == EXACTFL); + /* If a folding node contains only code points that don't * participate in folds, it can be changed into an EXACT node, * which allows the optimizer more things to look for */ bool maybe_exact; - ender = 0; - node_type = compute_EXACTish(pRExC_state); ret = reg_node(pRExC_state, node_type); /* In pass1, folded, we use a temporary buffer instead of the @@ -10734,10 +11690,9 @@ tryagain: reparse: - /* We do the EXACTFish to EXACT node only if folding, and not if in - * locale, as whether a character folds or not isn't known until - * runtime */ - maybe_exact = FOLD && ! LOC; + /* We do the EXACTFish to EXACT node only if folding. (And we + * don't need to figure this out until pass 2) */ + maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -10802,7 +11757,8 @@ tryagain: case 's': case 'S': /* space class */ case 'v': case 'V': /* VERTWS */ case 'w': case 'W': /* word class */ - case 'X': /* eXtended Unicode "combining character sequence" */ + case 'X': /* eXtended Unicode "combining + character sequence" */ case 'z': case 'Z': /* End of line/string assertion */ --p; goto loopdone; @@ -10850,7 +11806,7 @@ tryagain: p++; break; case 'a': - ender = ASCII_TO_NATIVE('\007'); + ender = '\a'; p++; break; case 'o': @@ -10913,24 +11869,30 @@ tryagain: } case 'c': p++; - ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); + ender = grok_bslash_c(*p++, SIZE_ONLY); break; case '8': case '9': /* must be a backreference */ --p; goto loopdone; case '1': case '2': case '3':case '4': case '5': case '6': case '7': - /* When we parse backslash escapes there is ambiguity between - * backreferences and octal escapes. Any escape from \1 - \9 is - * a backreference, any multi-digit escape which does not start with - * 0 and which when evaluated as decimal could refer to an already - * parsed capture buffer is a backslash. Anything else is octal. + /* When we parse backslash escapes there is ambiguity + * between backreferences and octal escapes. Any escape + * from \1 - \9 is a backreference, any multi-digit + * escape which does not start with 0 and which when + * evaluated as decimal could refer to an already + * parsed capture buffer is a backslash. Anything else + * is octal. * - * Note this implies that \118 could be interpreted as 118 OR as - * "\11" . "8" depending on whether there were 118 capture buffers - * defined already in the pattern. - */ - if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar ) + * Note this implies that \118 could be interpreted as + * 118 OR as "\11" . "8" depending on whether there + * were 118 capture buffers defined already in the + * pattern. */ + + /* NOTE, RExC_npar is 1 more than the actual number of + * parens we have seen so far, hence the < RExC_npar below. */ + + if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) { /* Not to be treated as an octal constant, go find backref */ --p; @@ -10987,7 +11949,7 @@ tryagain: if (! SIZE_ONLY && RExC_flags & RXf_PMf_EXTENDED && ckWARN_d(WARN_DEPRECATED) - && is_PATWS_non_low(p, UTF)) + && is_PATWS_non_low_safe(p, RExC_end, UTF)) { vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), "Escape literal pattern white space under /x"); @@ -11025,7 +11987,10 @@ tryagain: goto loopdone; } - if (! FOLD) { + if (! FOLD /* The simple case, just append the literal */ + || (LOC /* Also don't fold for tricky chars under /l */ + && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) + { if (UTF) { const STRLEN unilen = reguni(pRExC_state, ender, s); if (unilen > 0) { @@ -11043,81 +12008,100 @@ tryagain: else { REGC((char)ender, s++); } + + /* Can get here if folding only if is one of the /l + * characters whose fold depends on the locale. The + * occurrence of any of these indicate that we can't + * simplify things */ + if (FOLD) { + maybe_exact = FALSE; + maybe_exactfu = FALSE; + } } - else /* FOLD */ + else /* FOLD */ if (! ( UTF /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S))) { - *(s++) = (char) ender; - maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender); - } - else { /* UTF */ - - /* Prime the casefolded buffer. Locale rules, which apply - * only to code points < 256, aren't known until execution, - * so for them, just output the original character using - * utf8. If we start to fold non-UTF patterns, be sure to - * update join_exact() */ - if (LOC && ender < 256) { - if (UNI_IS_INVARIANT(ender)) { - *s = (U8) ender; - foldlen = 1; - } else { - *s = UTF8_TWO_BYTE_HI(ender); - *(s + 1) = UTF8_TWO_BYTE_LO(ender); - foldlen = 2; + /* Here, are folding and are not UTF-8 encoded; therefore + * the character must be in the range 0-255, and is not /l + * (Not /l because we already handled these under /l in + * is_PROBLEMATIC_LOCALE_FOLD_cp */ + if (IS_IN_SOME_FOLD_L1(ender)) { + maybe_exact = FALSE; + + /* See if the character's fold differs between /d and + * /u. This includes the multi-char fold SHARP S to + * 'ss' */ + if (maybe_exactfu + && (PL_fold[ender] != PL_fold_latin1[ender] + || ender == LATIN_SMALL_LETTER_SHARP_S + || (len > 0 + && isARG2_lower_or_UPPER_ARG1('s', ender) + && isARG2_lower_or_UPPER_ARG1('s', + *(s-1))))) + { + maybe_exactfu = FALSE; } } + + /* Even when folding, we store just the input character, as + * we have an array that finds its fold quickly */ + *(s++) = (char) ender; + } + else { /* FOLD and UTF */ + /* Unlike the non-fold case, we do actually have to + * calculate the results here in pass 1. This is for two + * reasons, the folded length may be longer than the + * unfolded, and we have to calculate how many EXACTish + * nodes it will take; and we may run out of room in a node + * in the middle of a potential multi-char fold, and have + * to back off accordingly. (Hence we can't use REGC for + * the simple case just below.) */ + + UV folded; + if (isASCII(ender)) { + folded = toFOLD(ender); + *(s)++ = (U8) folded; + } else { - UV folded = _to_uni_fold_flags( + STRLEN foldlen; + + folded = _to_uni_fold_flags( ender, (U8 *) s, &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) - ); + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + s += foldlen; - /* If this node only contains non-folding code points - * so far, see if this new one is also non-folding */ - if (maybe_exact) { - if (folded != ender) { - maybe_exact = FALSE; - } - else { - /* Here the fold is the original; we have - * to check further to see if anything - * folds to it */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", - "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = - _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } - if (_invlist_contains_cp(PL_utf8_foldable, + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len += foldlen - 1; + } + /* If this node only contains non-folding code points so + * far, see if this new one is also non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have to check + * further to see if anything folds to it */ + if (_invlist_contains_cp(PL_utf8_foldable, ender)) - { - maybe_exact = FALSE; - } + { + maybe_exact = FALSE; } } - ender = folded; } - s += foldlen; - - /* The loop increments each time, as all but this - * path (and one other) through it add a single byte to the - * EXACTish node. But this one has changed len to be the - * correct final value, so subtract one to cancel out the - * increment that follows */ - len += foldlen - 1; + ender = folded; } if (next_is_quantifier) { @@ -11166,9 +12150,8 @@ tryagain: if (! UTF) { - /* These two have no multi-char folds to non-UTF characters - */ - if (ASCII_FOLD_RESTRICTED || LOC) { + /* This has no multi-char folds to non-UTF characters */ + if (ASCII_FOLD_RESTRICTED) { goto loopdone; } @@ -11199,11 +12182,7 @@ tryagain: } } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - - /* No Latin1 characters participate in multi-char - * folds under /l */ - if (LOC - || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI( + if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( *s, *(s+1)))) { break; @@ -11241,6 +12220,15 @@ tryagain: * do any better */ if (len == 0) { len = full_len; + + /* If the node ends in an 's' we make sure it stays EXACTF, + * as if it turns into an EXACTFU, it could later get + * joined with another 's' that would then wrongly match + * the sharp s */ + if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) + { + maybe_exactfu = FALSE; + } } else { /* Here, the node does contain some characters that aren't @@ -11299,14 +12287,26 @@ tryagain: if (len == 0) { OP(ret) = NOTHING; } - else{ - - /* If 'maybe_exact' is still set here, means there are no - * code points in the node that participate in folds */ - if (FOLD && maybe_exact) { - OP(ret) = EXACT; + else { + if (FOLD) { + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds; + * similarly for 'maybe_exactfu' and code points that match + * differently depending on UTF8ness of the target string + * (for /u), or depending on locale for /l */ + if (maybe_exact) { + OP(ret) = EXACT; + } + else if (maybe_exactfu) { + OP(ret) = EXACTFU; + } } - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); } RExC_parse = p - 1; @@ -11345,7 +12345,7 @@ S_regwhite( RExC_state_t *pRExC_state, char *p ) } } while (p < e); if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; } else break; @@ -11359,7 +12359,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) /* Returns the next non-pattern-white space, non-comment character (the * latter only if 'recognize_comment is true) in the string p, which is * ended by RExC_end. If there is no line break ending a comment, - * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */ + * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */ const char *e = RExC_end; PERL_ARGS_ASSERT_REGPATWS; @@ -11379,7 +12379,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) } } while (p < e); if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; } else break; @@ -11387,6 +12387,72 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) return p; } +STATIC void +S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) +{ + /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It + * sets up the bitmap and any flags, removing those code points from the + * inversion list, setting it to NULL should it become completely empty */ + + PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; + assert(PL_regkind[OP(node)] == ANYOF); + + ANYOF_BITMAP_ZERO(node); + if (*invlist_ptr) { + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through *invlist_ptr */ + invlist_iterinit(*invlist_ptr); + while (invlist_iternext(*invlist_ptr, &start, &end)) { + UV high; + int i; + + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; + } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } + + /* Quit if are above what we should change */ + if (start > 255) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < 255) ? end : 255; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(node, i)) { + ANYOF_BITMAP_SET(node, i); + } + } + } + invlist_iterfinish(*invlist_ptr); + + /* Done with loop; remove any code points that are in the bitmap from + * *invlist_ptr; similarly for code points above latin1 if we have a + * flag to match all of them anyways */ + if (change_invlist) { + _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + } + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + } + + /* If have completely emptied it, remove it completely */ + if (_invlist_len(*invlist_ptr) == 0) { + SvREFCNT_dec_NN(*invlist_ptr); + *invlist_ptr = NULL; + } + } +} + /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. Character classes ([:foo:]) can also be negated ([:^foo:]). Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. @@ -11505,8 +12571,9 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) } if (namedclass == OOB_NAMEDCLASS) - Simple_vFAIL3("POSIX class [:%.*s:] unknown", - t - s - 1, s + 1); + vFAIL2utf8f( + "POSIX class [:%"UTF8f":] unknown", + UTF8fARG(UTF, t - s - 1, s + 1)); /* The #defines are structured so each complement is +1 to * the normal one */ @@ -11594,8 +12661,9 @@ S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) } STATIC regnode * -S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth, - char * const oregcomp_parse) +S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, + I32 *flagp, U32 depth, + char * const oregcomp_parse) { /* Handle the (?[...]) construct to do set operations */ @@ -11631,7 +12699,10 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__REGEX_SETS), "The regex_sets feature is experimental" REPORT_LOCATION, - (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse); + UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), + UTF8fARG(UTF, + RExC_end - RExC_start - (RExC_parse - RExC_precomp), + RExC_precomp + (RExC_parse - RExC_precomp))); while (RExC_parse < RExC_end) { SV* current = NULL; @@ -12085,7 +13156,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ -#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len) +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ + (SvCUR(listsv) != initial_listsv_len) STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, @@ -12135,8 +13207,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ SV* properties = NULL; /* Code points that match \p{} \P{} */ - SV* posixes = NULL; /* Code points that match classes like, [:word:], - extended beyond the Latin1 range */ + SV* posixes = NULL; /* Code points that match classes like [:word:], + extended beyond the Latin1 range. These have to + be kept separate from other code points for much + of this function because their handling is + different under /i, and for most classes under + /d as well */ + SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept + separate for a while from the non-complemented + versions because of complications with /d + matching */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ AV * multi_char_matches = NULL; /* Code points that fold to more than one @@ -12163,11 +13243,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * string is in UTF-8. (Because is under /d) */ SV* depends_list = NULL; - /* inversion list of code points this node matches. For much of the - * function, it includes only those that match regardless of the utf8ness - * of the target string */ + /* Inversion list of code points this node matches regardless of things + * like locale, folding, utf8ness of the target string */ SV* cp_list = NULL; + /* Like cp_list, but code points on this list need to be checked for things + * that fold to/from them under /i */ + SV* cp_foldable_list = NULL; + + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + #ifdef EBCDIC /* In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ @@ -12175,14 +13262,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, #endif bool invert = FALSE; /* Is this class to be complemented */ - /* Is there any thing like \W or [:^digit:] that matches above the legal - * Unicode range? */ - bool runtime_posix_matches_above_Unicode = FALSE; + bool warn_super = ALWAYS_WARN_SUPER; regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; - const I32 orig_size = RExC_size; + const SSize_t orig_size = RExC_size; + bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -12203,9 +13289,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ANYOF_FLAGS(ret) = 0; RExC_emit += ANYOF_SKIP; - if (LOC) { - ANYOF_FLAGS(ret) |= ANYOF_LOCALE; - } listsv = newSVpvs_flags("# comment\n", SVs_TEMP); initial_listsv_len = SvCUR(listsv); SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ @@ -12348,7 +13431,12 @@ parseit: char *e; /* We will handle any undefined properties ourselves */ - U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF; + U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF + /* And we actually would prefer to get + * the straight inversion list of the + * swash, since we will be accessing it + * anyway, to save a little time */ + |_CORE_SWASH_INIT_ACCEPT_INVLIST; if (RExC_parse >= RExC_end) vFAIL2("Empty \\%c{}", (U8)value); @@ -12371,6 +13459,7 @@ parseit: } if (!SIZE_ONLY) { SV* invlist; + char* formatted; char* name; if (UCHARAT(RExC_parse) == '^') { @@ -12391,14 +13480,14 @@ parseit: * will have its name be <__NAME_i>. The design is * discussed in commit * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ - Newx(name, n + sizeof("_i__\n"), char); - - sprintf(name, "%s%.*s%s\n", - (FOLD) ? "__" : "", - (int)n, - RExC_parse, - (FOLD) ? "_i" : "" - ); + formatted = Perl_form(aTHX_ + "%s%.*s%s\n", + (FOLD) ? "__" : "", + (int)n, + RExC_parse, + (FOLD) ? "_i" : "" + ); + name = savepvn(formatted, strlen(formatted)); /* Look up the property name, and get its swash and * inversion list, if the property is found */ @@ -12423,11 +13512,13 @@ parseit: * otherwise add it to the list for run-time look up */ if (ret_invlist) { RExC_parse = e + 1; - vFAIL3("Property '%.*s' is unknown", (int) n, name); + vFAIL2utf8f( + "Property '%"UTF8f"' is unknown", + UTF8fARG(UTF, n, name)); } - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", (value == 'p' ? '+' : '!'), - name); + UTF8fARG(UTF, n, name)); has_user_defined_property = TRUE; /* We don't know yet, so have to assume that the @@ -12436,7 +13527,7 @@ parseit: * would cause things in to match * inappropriately, except that any \p{}, including * this one forces Unicode semantics, which means there - * is */ + * is no */ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; } else { @@ -12444,9 +13535,23 @@ parseit: /* Here, did get the swash and its inversion list. If * the swash is from a user-defined property, then this * whole character class should be regarded as such */ - has_user_defined_property = - (swash_init_flags - & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY); + if (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) + { + has_user_defined_property = TRUE; + } + else if + /* We warn on matching an above-Unicode code point + * if the match would return true, except don't + * warn for \p{All}, which has exactly one element + * = 0 */ + (_invlist_contains_cp(invlist, 0x110000) + && (! (_invlist_len(invlist) == 1 + && *invlist_array(invlist) == 0))) + { + warn_super = TRUE; + } + /* Invert if asking for the complement */ if (value == 'P') { @@ -12480,7 +13585,7 @@ parseit: case 'f': value = '\f'; break; case 'b': value = '\b'; break; case 'e': value = ASCII_TO_NATIVE('\033');break; - case 'a': value = ASCII_TO_NATIVE('\007');break; + case 'a': value = '\a'; break; case 'o': RExC_parse--; /* function expects to be pointed at the 'o' */ { @@ -12520,7 +13625,7 @@ parseit: goto recode_encoding; break; case 'c': - value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY); + value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': @@ -12592,31 +13697,8 @@ parseit: /* Here, we have the current token in 'value' */ - /* What matches in a locale is not known until runtime. This includes - * what the Posix classes (like \w, [:space:]) match. Room must be - * reserved (one time per class) to store such classes, either if Perl - * is compiled so that locale nodes always should have this space, or - * if there is such class info to be stored. The space will contain a - * bit for each named class that is to be matched against. This isn't - * needed for \p{} and pseudo-classes, as they are not affected by - * locale, and hence are dealt with separately */ - if (LOC - && ! need_class - && (ANYOF_LOCALE == ANYOF_CLASS - || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX))) - { - need_class = 1; - if (SIZE_ONLY) { - RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP; - } - else { - RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP; - ANYOF_CLASS_ZERO(ret); - } - ANYOF_FLAGS(ret) |= ANYOF_CLASS; - } - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + U8 classnum; /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a * literal, as is the character that began the false range, i.e. @@ -12627,16 +13709,19 @@ parseit: ? RExC_parse - rangebegin : 0; if (strict) { - vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin); + vFAIL2utf8f( + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); } else { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ - ckWARN4reg(RExC_parse, - "False [] range \"%*.*s\"", - w, w, rangebegin); + ckWARN2reg(RExC_parse, + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); (void)ReREFCNT_inc(RExC_rx_sv); cp_list = add_cp_to_invlist(cp_list, '-'); - cp_list = add_cp_to_invlist(cp_list, prevvalue); + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, + prevvalue); } } @@ -12644,16 +13729,89 @@ parseit: element_count += 2; /* So counts for three values */ } - if (! SIZE_ONLY) { - U8 classnum = namedclass_to_classnum(namedclass); - if (namedclass >= ANYOF_MAX) { /* If a special class */ + classnum = namedclass_to_classnum(namedclass); + + if (LOC && namedclass < ANYOF_POSIXL_MAX +#ifndef HAS_ISASCII + && classnum != _CC_ASCII +#endif + ) { + /* What the Posix classes (like \w, [:space:]) match in locale + * isn't knowable under locale until actual match time. Room + * must be reserved (one time per outer bracketed class) to + * store such classes. The space will contain a bit for each + * named class that is to be matched against. This isn't + * needed for \p{} and pseudo-classes, as they are not affected + * by locale, and hence are dealt with separately */ + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_POSIXL_ZERO(ret); + } + + /* See if it already matches the complement of this POSIX + * class */ + if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) + && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) + ? -1 + : 1))) + { + posixl_matches_all = TRUE; + break; /* No need to continue. Since it matches both + e.g., \w and \W, it matches everything, and the + bracketed class can be optimized into qr/./s */ + } + + /* Add this class to those that should be checked at runtime */ + ANYOF_POSIXL_SET(ret, namedclass); + + /* The above-Latin1 characters are not subject to locale rules. + * Just add them, in the second pass, to the + * unconditionally-matched list */ + if (! SIZE_ONLY) { + SV* scratch_list = NULL; + + /* Get the list of the above-Latin1 code points this + * matches */ + _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, + PL_XPosix_ptrs[classnum], + + /* Odd numbers are complements, like + * NDIGIT, NASCII, ... */ + namedclass % 2 != 0, + &scratch_list); + /* Checking if 'cp_list' is NULL first saves an extra + * clone. Its reference count will be decremented at the + * next union, etc, or if this is the only instance, at the + * end of the routine */ + if (! cp_list) { + cp_list = scratch_list; + } + else { + _invlist_union(cp_list, scratch_list, &cp_list); + SvREFCNT_dec_NN(scratch_list); + } + continue; /* Go get next character */ + } + } + else if (! SIZE_ONLY) { + + /* Here, not in pass1 (in that pass we skip calculating the + * contents of this class), and is /l, or is a POSIX class for + * which /l doesn't matter (or is a Unicode property, which is + * skipped here). */ + if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ - /* Here, should be \h, \H, \v, or \V. Neither /d nor - * /l make a difference in what these match. There - * would be problems if these characters had folds - * other than themselves, as cp_list is subject to - * folding. */ + /* Here, should be \h, \H, \v, or \V. None of /d, /i + * nor /l make a difference in what these match, + * therefore we just add what they match to cp_list. */ if (classnum != _CC_VERTSPACE) { assert( namedclass == ANYOF_HORIZWS || namedclass == ANYOF_NHORIZWS); @@ -12666,265 +13824,46 @@ parseit: _invlist_union_maybe_complement_2nd( cp_list, PL_XPosix_ptrs[classnum], - cBOOL(namedclass % 2), /* Complement if odd + namedclass % 2 != 0, /* Complement if odd (NHORIZWS, NVERTWS) */ &cp_list); } } - else if (classnum == _CC_ASCII) { -#ifdef HAS_ISASCII - if (LOC) { - ANYOF_CLASS_SET(ret, namedclass); - } - else -#endif /* Not isascii(); just use the hard-coded definition for it */ - _invlist_union_maybe_complement_2nd( - posixes, - PL_ASCII, - cBOOL(namedclass % 2), /* Complement if odd - (NASCII) */ - &posixes); + else { /* Garden variety class. If is NASCII, NDIGIT, ... + complement and use nposixes */ + SV** posixes_ptr = namedclass % 2 == 0 + ? &posixes + : &nposixes; + SV** source_ptr = &PL_XPosix_ptrs[classnum]; + _invlist_union_maybe_complement_2nd( + *posixes_ptr, + *source_ptr, + namedclass % 2 != 0, + posixes_ptr); } - else { /* Garden variety class */ - - /* The ascii range inversion list */ - SV* ascii_source = PL_Posix_ptrs[classnum]; - - /* The full Latin1 range inversion list */ - SV* l1_source = PL_L1Posix_ptrs[classnum]; - - /* This code is structured into two major clauses. The - * first is for classes whose complete definitions may not - * already be known. It not, the Latin1 definition - * (guaranteed to already known) is used plus code is - * generated to load the rest at run-time (only if needed). - * If the complete definition is known, it drops down to - * the second clause, where the complete definition is - * known */ - - if (classnum < _FIRST_NON_SWASH_CC) { + continue; /* Go get next character */ + } + } /* end of namedclass \blah */ - /* Here, the class has a swash, which may or not - * already be loaded */ + /* Here, we have a single value. If 'range' is set, it is the ending + * of a range--check its validity. Later, we will handle each + * individual code point in the range. If 'range' isn't set, this + * could be the beginning of a range, so check for that by looking + * ahead to see if the next real character to be processed is the range + * indicator--the minus sign */ - /* The name of the property to use to match the full - * eXtended Unicode range swash for this character - * class */ - const char *Xname = swash_property_names[classnum]; - - /* If returning the inversion list, we can't defer - * getting this until runtime */ - if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) { - PL_utf8_swash_ptrs[classnum] = - _core_swash_init("utf8", Xname, &PL_sv_undef, - 1, /* binary */ - 0, /* not tr/// */ - NULL, /* No inversion list */ - NULL /* No flags */ - ); - assert(PL_utf8_swash_ptrs[classnum]); - } - if ( ! PL_utf8_swash_ptrs[classnum]) { - if (namedclass % 2 == 0) { /* A non-complemented - class */ - /* If not /a matching, there are code points we - * don't know at compile time. Arrange for the - * unknown matches to be loaded at run-time, if - * needed */ - if (! AT_LEAST_ASCII_RESTRICTED) { - Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n", - Xname); - } - if (LOC) { /* Under locale, set run-time - lookup */ - ANYOF_CLASS_SET(ret, namedclass); - } - else { - /* Add the current class's code points to - * the running total */ - _invlist_union(posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : l1_source, - &posixes); - } - } - else { /* A complemented class */ - if (AT_LEAST_ASCII_RESTRICTED) { - /* Under /a should match everything above - * ASCII, plus the complement of the set's - * ASCII matches */ - _invlist_union_complement_2nd(posixes, - ascii_source, - &posixes); - } - else { - /* Arrange for the unknown matches to be - * loaded at run-time, if needed */ - Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n", - Xname); - runtime_posix_matches_above_Unicode = TRUE; - if (LOC) { - ANYOF_CLASS_SET(ret, namedclass); - } - else { - - /* We want to match everything in - * Latin1, except those things that - * l1_source matches */ - SV* scratch_list = NULL; - _invlist_subtract(PL_Latin1, l1_source, - &scratch_list); - - /* Add the list from this class to the - * running total */ - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, - scratch_list, - &posixes); - SvREFCNT_dec_NN(scratch_list); - } - if (DEPENDS_SEMANTICS) { - ANYOF_FLAGS(ret) - |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - } - } - goto namedclass_done; - } - - /* Here, there is a swash loaded for the class. If no - * inversion list for it yet, get it */ - if (! PL_XPosix_ptrs[classnum]) { - PL_XPosix_ptrs[classnum] - = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]); - } - } - - /* Here there is an inversion list already loaded for the - * entire class */ - - if (namedclass % 2 == 0) { /* A non-complemented class, - like ANYOF_PUNCT */ - if (! LOC) { - /* For non-locale, just add it to any existing list - * */ - _invlist_union(posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : PL_XPosix_ptrs[classnum], - &posixes); - } - else { /* Locale */ - SV* scratch_list = NULL; - - /* For above Latin1 code points, we use the full - * Unicode range */ - _invlist_intersection(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - &scratch_list); - /* And set the output to it, adding instead if - * there already is an output. Checking if - * 'posixes' is NULL first saves an extra clone. - * Its reference count will be decremented at the - * next union, etc, or if this is the only - * instance, at the end of the routine */ - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } - -#ifndef HAS_ISBLANK - if (namedclass != ANYOF_BLANK) { -#endif - /* Set this class in the node for runtime - * matching */ - ANYOF_CLASS_SET(ret, namedclass); -#ifndef HAS_ISBLANK - } - else { - /* No isblank(), use the hard-coded ASCII-range - * blanks, adding them to the running total. */ - - _invlist_union(posixes, ascii_source, &posixes); - } -#endif - } - } - else { /* A complemented class, like ANYOF_NPUNCT */ - if (! LOC) { - _invlist_union_complement_2nd( - posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : PL_XPosix_ptrs[classnum], - &posixes); - /* Under /d, everything in the upper half of the - * Latin1 range matches this complement */ - if (DEPENDS_SEMANTICS) { - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - else { /* Locale */ - SV* scratch_list = NULL; - _invlist_subtract(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - &scratch_list); - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } -#ifndef HAS_ISBLANK - if (namedclass != ANYOF_NBLANK) { -#endif - ANYOF_CLASS_SET(ret, namedclass); -#ifndef HAS_ISBLANK - } - else { - /* Get the list of all code points in Latin1 - * that are not ASCII blanks, and add them to - * the running total */ - _invlist_subtract(PL_Latin1, ascii_source, - &scratch_list); - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } -#endif - } - } - } - namedclass_done: - continue; /* Go get next character */ - } - } /* end of namedclass \blah */ - - /* Here, we have a single value. If 'range' is set, it is the ending - * of a range--check its validity. Later, we will handle each - * individual code point in the range. If 'range' isn't set, this - * could be the beginning of a range, so check for that by looking - * ahead to see if the next real character to be processed is the range - * indicator--the minus sign */ - - if (skip_white) { - RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); - } + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */); + } if (range) { if (prevvalue > value) /* b-a */ { const int w = RExC_parse - rangebegin; - Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); + vFAIL2utf8f( + "Invalid [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); range = 0; /* not a valid range */ } } @@ -13005,11 +13944,9 @@ parseit: value, foldbuf, &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) + FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED + ? FOLD_FLAGS_NOMIX_ASCII + : 0) ); /* Here, should be the first character of the @@ -13077,7 +14014,8 @@ parseit: /* Deal with this element of the class */ if (! SIZE_ONLY) { #ifndef EBCDIC - cp_list = _add_range_to_invlist(cp_list, prevvalue, value); + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); #else SV* this_range = _new_invlist(1); _append_range_to_invlist(this_range, prevvalue, value); @@ -13091,13 +14029,18 @@ parseit: * included. literal_endpoint==2 means both ends of the range used * a literal character, not \x{foo} */ if (literal_endpoint == 2 - && (prevvalue >= 'a' && value <= 'z') - || (prevvalue >= 'A' && value <= 'Z')) + && ((prevvalue >= 'a' && value <= 'z') + || (prevvalue >= 'A' && value <= 'Z'))) { - _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA], + _invlist_intersection(this_range, PL_ASCII, + &this_range); + + /* Since this above only contains ascii, the intersection of it + * with anything will still yield only ascii */ + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], &this_range); } - _invlist_union(cp_list, this_range, &cp_list); + _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); literal_endpoint = 0; #endif } @@ -13127,7 +14070,7 @@ parseit: #endif /* Look at the longest folds first */ - for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) { + for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { if (av_exists(multi_char_matches, cp_count)) { AV** this_array_ptr; @@ -13182,15 +14125,29 @@ parseit: return ret; } - /* If the character class contains only a single element, it may be - * optimizable into another node type which is smaller and runs faster. - * Check if this is the case for this class */ - if (element_count == 1 && ! ret_invlist) { + /* Here, we've gone through the entire class and dealt with multi-char + * folds. We are now in a position that we can do some checks to see if we + * can optimize this ANYOF node into a simpler one, even in Pass 1. + * Currently we only do two checks: + * 1) is in the unlikely event that the user has specified both, eg. \w and + * \W under /l, then the class matches everything. (This optimization + * is done only to make the optimizer code run later work.) + * 2) if the character class contains only a single element (including a + * single range), we see if there is an equivalent node for it. + * Other checks are possible */ + if (! ret_invlist /* Can't optimize if returning the constructed + inversion list */ + && (UNLIKELY(posixl_matches_all) || element_count == 1)) + { U8 op = END; U8 arg = 0; - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or - [:digit:] or \p{foo} */ + if (UNLIKELY(posixl_matches_all)) { + op = SANY; + } + else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like + \w or [:digit:] or \p{foo} + */ /* All named classes are mapped into POSIXish nodes, with its FLAG * argument giving which class it is */ @@ -13246,14 +14203,6 @@ parseit: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } -#ifndef HAS_ISBLANK - if (op == POSIXL - && (namedclass == ANYOF_BLANK - || namedclass == ANYOF_NBLANK)) - { - op = POSIXA; - } -#endif join_posix: /* The odd numbered ones are the complements of the @@ -13308,13 +14257,16 @@ parseit: /* To get locale nodes to not use the full ANYOF size would * require moving the code above that writes the portions * of it that aren't in other nodes to after this point. - * e.g. ANYOF_CLASS_SET */ + * e.g. ANYOF_POSIXL_SET */ RExC_size = orig_size; } } else { RExC_emit = (regnode *)orig_emit; if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } if (invert) { op += NPOSIXD - POSIXD; } @@ -13330,13 +14282,17 @@ parseit: *flagp |= HASWIDTH|SIMPLE; } else if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } RExC_parse = (char *) cur_parse; SvREFCNT_dec(posixes); + SvREFCNT_dec(nposixes); SvREFCNT_dec(cp_list); + SvREFCNT_dec(cp_foldable_list); return ret; } } @@ -13347,238 +14303,260 @@ parseit: /* If folding, we calculate all characters that could fold to or from the * ones already on the list */ - if (FOLD && cp_list) { - UV start, end; /* End points of code point ranges */ + if (cp_foldable_list) { + if (FOLD) { + UV start, end; /* End points of code point ranges */ + + SV* fold_intersection = NULL; + SV** use_list; + + /* Our calculated list will be for Unicode rules. For locale + * matching, we have to keep a separate list that is consulted at + * runtime only when the locale indicates Unicode rules. For + * non-locale, we just use to the general list */ + if (LOC) { + use_list = &only_utf8_locale_list; + } + else { + use_list = &cp_list; + } - SV* fold_intersection = NULL; + /* Only the characters in this class that participate in folds need + * be checked. Get the intersection of this class and all the + * possible characters that are foldable. This can quickly narrow + * down a large class */ + _invlist_intersection(PL_utf8_foldable, cp_foldable_list, + &fold_intersection); - /* If the highest code point is within Latin1, we can use the - * compiled-in Alphas list, and not have to go out to disk. This - * yields two false positives, the masculine and feminine ordinal - * indicators, which are weeded out below using the - * IS_IN_SOME_FOLD_L1() macro */ - if (invlist_highest(cp_list) < 256) { - _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list, - &fold_intersection); - } - else { + /* The folds for all the Latin1 characters are hard-coded into this + * program, but we have to go out to disk to get the others. */ + if (invlist_highest(cp_foldable_list) >= 256) { + + /* This is a hash that for a particular fold gives all + * characters that are involved in it */ + if (! PL_utf8_foldclosures) { - /* Here, there are non-Latin1 code points, so we will have to go - * fetch the list of all the characters that participate in folds - */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } - - /* This is a hash that for a particular fold gives all characters - * that are involved in it */ - if (! PL_utf8_foldclosures) { - - /* If we were unable to find any folds, then we likely won't be - * able to find the closures. So just create an empty list. - * Folding will effectively be restricted to the non-Unicode - * rules hard-coded into Perl. (This case happens legitimately - * during compilation of Perl itself before the Unicode tables - * are generated) */ - if (_invlist_len(PL_utf8_foldable) == 0) { - PL_utf8_foldclosures = newHV(); - } - else { /* If the folds haven't been read in, call a fold function * to force that */ if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; + U8 dummy[UTF8_MAXBYTES_CASE+1]; /* This string is just a short named one above \xff */ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); assert(PL_utf8_tofold); /* Verify that worked */ } - PL_utf8_foldclosures = - _swash_inversion_hash(PL_utf8_tofold); + PL_utf8_foldclosures + = _swash_inversion_hash(PL_utf8_tofold); } } - /* Only the characters in this class that participate in folds need - * be checked. Get the intersection of this class and all the - * possible characters that are foldable. This can quickly narrow - * down a large class */ - _invlist_intersection(PL_utf8_foldable, cp_list, - &fold_intersection); - } - - /* Now look at the foldable characters in this class individually */ - invlist_iterinit(fold_intersection); - while (invlist_iternext(fold_intersection, &start, &end)) { - UV j; - - /* Locale folding for Latin1 characters is deferred until runtime */ - if (LOC && start < 256) { - start = 256; - } - - /* Look at every character in the range */ - for (j = start; j <= end; j++) { - - U8 foldbuf[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - SV** listp; - - if (j < 256) { - - /* We have the latin1 folding rules hard-coded here so that - * an innocent-looking character class, like /[ks]/i won't - * have to go out to disk to find the possible matches. - * XXX It would be better to generate these via regen, in - * case a new version of the Unicode standard adds new - * mappings, though that is not really likely, and may be - * caught by the default: case of the switch below. */ - - if (IS_IN_SOME_FOLD_L1(j)) { - - /* ASCII is always matched; non-ASCII is matched only - * under Unicode rules */ - if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) { - cp_list = - add_cp_to_invlist(cp_list, PL_fold_latin1[j]); - } - else { - depends_list = - add_cp_to_invlist(depends_list, PL_fold_latin1[j]); + /* Now look at the foldable characters in this class individually */ + invlist_iterinit(fold_intersection); + while (invlist_iternext(fold_intersection, &start, &end)) { + UV j; + + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + SV** listp; + + if (j < 256) { + + /* We have the latin1 folding rules hard-coded here so + * that an innocent-looking character class, like + * /[ks]/i won't have to go out to disk to find the + * possible matches. XXX It would be better to + * generate these via regen, in case a new version of + * the Unicode standard adds new mappings, though that + * is not really likely, and may be caught by the + * default: case of the switch below. */ + + if (IS_IN_SOME_FOLD_L1(j)) { + + /* ASCII is always matched; non-ASCII is matched + * only under Unicode rules (which could happen + * under /l if the locale is a UTF-8 one */ + if (isASCII(j) || ! DEPENDS_SEMANTICS) { + *use_list = add_cp_to_invlist(*use_list, + PL_fold_latin1[j]); + } + else { + depends_list = + add_cp_to_invlist(depends_list, + PL_fold_latin1[j]); + } } - } - if (HAS_NONLATIN1_FOLD_CLOSURE(j) - && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) - { - /* Certain Latin1 characters have matches outside - * Latin1. To get here, is one of those - * characters. None of these matches is valid for - * ASCII characters under /aa, which is why the 'if' - * just above excludes those. These matches only - * happen when the target string is utf8. The code - * below adds the single fold closures for to the - * inversion list. */ - switch (j) { - case 'k': - case 'K': - cp_list = - add_cp_to_invlist(cp_list, KELVIN_SIGN); - break; - case 's': - case 'S': - cp_list = add_cp_to_invlist(cp_list, + if (HAS_NONLATIN1_FOLD_CLOSURE(j) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + { + /* Certain Latin1 characters have matches outside + * Latin1. To get here, is one of those + * characters. None of these matches is valid for + * ASCII characters under /aa, which is why the 'if' + * just above excludes those. These matches only + * happen when the target string is utf8. The code + * below adds the single fold closures for to the + * inversion list. */ + + switch (j) { + case 'k': + case 'K': + *use_list = + add_cp_to_invlist(*use_list, KELVIN_SIGN); + break; + case 's': + case 'S': + *use_list = add_cp_to_invlist(*use_list, LATIN_SMALL_LETTER_LONG_S); - break; - case MICRO_SIGN: - cp_list = add_cp_to_invlist(cp_list, + break; + case MICRO_SIGN: + *use_list = add_cp_to_invlist(*use_list, GREEK_CAPITAL_LETTER_MU); - cp_list = add_cp_to_invlist(cp_list, - GREEK_SMALL_LETTER_MU); - break; - case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: - case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - cp_list = - add_cp_to_invlist(cp_list, ANGSTROM_SIGN); - break; - case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - cp_list = add_cp_to_invlist(cp_list, + *use_list = add_cp_to_invlist(*use_list, + GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *use_list = + add_cp_to_invlist(*use_list, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *use_list = add_cp_to_invlist(*use_list, LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); - break; - case LATIN_SMALL_LETTER_SHARP_S: - cp_list = add_cp_to_invlist(cp_list, + break; + case LATIN_SMALL_LETTER_SHARP_S: + *use_list = add_cp_to_invlist(*use_list, LATIN_CAPITAL_LETTER_SHARP_S); - break; - case 'F': case 'f': - case 'I': case 'i': - case 'L': case 'l': - case 'T': case 't': - case 'A': case 'a': - case 'H': case 'h': - case 'J': case 'j': - case 'N': case 'n': - case 'W': case 'w': - case 'Y': case 'y': - /* These all are targets of multi-character - * folds from code points that require UTF8 to - * express, so they can't match unless the - * target string is in UTF-8, so no action here - * is necessary, as regexec.c properly handles - * the general case for UTF-8 matching and - * multi-char folds */ - break; - default: - /* Use deprecated warning to increase the - * chances of this being output */ - ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); - break; + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character + * folds from code points that require UTF8 + * to express, so they can't match unless + * the target string is in UTF-8, so no + * action here is necessary, as regexec.c + * properly handles the general case for + * UTF-8 matching and multi-char folds */ + break; + default: + /* Use deprecated warning to increase the + * chances of this being output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); + break; + } } + continue; } - continue; - } - /* Here is an above Latin1 character. We don't have the rules - * hard-coded for it. First, get its fold. This is the simple - * fold, as the multi-character folds have been handled earlier - * and separated out */ - _to_uni_fold_flags(j, foldbuf, &foldlen, - ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); - - /* Single character fold of above Latin1. Add everything in - * its fold closure to the list that this node should match. - * The fold closures data structure is a hash with the keys - * being the UTF-8 of every character that is folded to, like - * 'k', and the values each an array of all code points that - * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. - * Multi-character folds are not included */ - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) foldbuf, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_len(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c = SvUV(*c_p); - - /* /aa doesn't allow folds between ASCII and non-; /l - * doesn't allow them between above and below 256 */ - if ((ASCII_FOLD_RESTRICTED - && (isASCII(c) != isASCII(j))) - || (LOC && c < 256)) { - continue; - } + /* Here is an above Latin1 character. We don't have the + * rules hard-coded for it. First, get its fold. This is + * the simple fold, as the multi-character folds have been + * handled earlier and separated out */ + _to_uni_fold_flags(j, foldbuf, &foldlen, + (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0); + + /* Single character fold of above Latin1. Add everything in + * its fold closure to the list that this node should match. + * The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c = SvUV(*c_p); - /* Folds involving non-ascii Latin1 characters - * under /d are added to a separate list */ - if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) - { - cp_list = add_cp_to_invlist(cp_list, c); - } - else { - depends_list = add_cp_to_invlist(depends_list, c); + /* /aa doesn't allow folds between ASCII and non- */ + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j)))) + { + continue; + } + + /* Folds under /l which cross the 255/256 boundary + * are added to a separate list. (These are valid + * only when the locale is UTF-8.) */ + if (c < 256 && LOC) { + *use_list = add_cp_to_invlist(*use_list, c); + continue; + } + + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + /* Similarly folds involving non-ascii Latin1 + * characters under /d are added to their list */ + depends_list = add_cp_to_invlist(depends_list, + c); + } } } } } + SvREFCNT_dec_NN(fold_intersection); } - SvREFCNT_dec_NN(fold_intersection); + + /* Now that we have finished adding all the folds, there is no reason + * to keep the foldable list separate */ + _invlist_union(cp_list, cp_foldable_list, &cp_list); + SvREFCNT_dec_NN(cp_foldable_list); } /* And combine the result (if any) with any inversion list from posix * classes. The lists are kept separate up to now because we don't want to * fold the classes (folding of those is automatically handled by the swash * fetching code) */ - if (posixes) { + if (posixes || nposixes) { + if (posixes && AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, nothing above ASCII matches these */ + _invlist_intersection(posixes, + PL_XPosix_ptrs[_CC_ASCII], + &posixes); + } + if (nposixes) { + if (DEPENDS_SEMANTICS) { + /* Under /d, everything in the upper half of the Latin1 range + * matches these complements */ + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; + } + else if (AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, everything above ASCII matches these + * complements */ + _invlist_union_complement_2nd(nposixes, + PL_XPosix_ptrs[_CC_ASCII], + &nposixes); + } + if (posixes) { + _invlist_union(posixes, nposixes, &posixes); + SvREFCNT_dec_NN(nposixes); + } + else { + posixes = nposixes; + } + } if (! DEPENDS_SEMANTICS) { if (cp_list) { _invlist_union(cp_list, posixes, &cp_list); @@ -13592,10 +14570,8 @@ parseit: /* Under /d, we put into a separate list the Latin1 things that * match only when the target string is utf8 */ SV* nonascii_but_latin1_properties = NULL; - _invlist_intersection(posixes, PL_Latin1, + _invlist_intersection(posixes, PL_UpperLatin1, &nonascii_but_latin1_properties); - _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII, - &nonascii_but_latin1_properties); _invlist_subtract(posixes, nonascii_but_latin1_properties, &posixes); if (cp_list) { @@ -13629,7 +14605,6 @@ parseit: * , because having a Unicode property forces Unicode * semantics */ if (properties) { - bool warn_super = ! has_user_defined_property; if (cp_list) { /* If it matters to the final outcome, see if a non-property @@ -13640,14 +14615,8 @@ parseit: * are using above-Unicode code points indicates they should know * the issues involved */ if (warn_super) { - bool non_prop_matches_above_Unicode = - runtime_posix_matches_above_Unicode - | (invlist_highest(cp_list) > PERL_UNICODE_MAX); - if (invert) { - non_prop_matches_above_Unicode = - ! non_prop_matches_above_Unicode; - } - warn_super = ! non_prop_matches_above_Unicode; + warn_super = ! (invert + ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); } _invlist_union(properties, cp_list, &cp_list); @@ -13658,7 +14627,7 @@ parseit: } if (warn_super) { - OP(ret) = ANYOF_WARN_SUPER; + ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; } } @@ -13671,12 +14640,32 @@ parseit: * shouldn't. Therefore we can't invert folded locale now, as it won't be * folded until runtime */ + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching). We know to set the flag if we have a non-NULL list for UTF-8 + * locales, or the class matches at least one 0-255 range code point */ + if (LOC && FOLD) { + if (only_utf8_locale_list) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + else if (cp_list) { /* Look to see if there a 0-255 code point is in + the list */ + UV start, end; + invlist_iterinit(cp_list); + if (invlist_iternext(cp_list, &start, &end) && start < 256) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + invlist_iterfinish(cp_list); + } + } + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known * at compile time. Besides not inverting folded locale now, we can't * invert if there are things such as \w, which aren't known until runtime * */ if (invert - && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS))) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) && ! depends_list && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { @@ -13706,15 +14695,6 @@ parseit: return orig_emit; } - /* If we didn't do folding, it's because some information isn't available - * until runtime; set the run-time fold flag for these. (We don't have to - * worry about properties folding, as that is taken care of by the swash - * fetching) */ - if (FOLD && LOC) - { - ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; - } - /* Some character classes are equivalent to other nodes. Such nodes take * up less room and generally fewer operations to execute than ANYOF nodes. * Above, we checked for and optimized into some such equivalents for @@ -13733,8 +14713,13 @@ parseit: if (cp_list && ! invert && ! depends_list - && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS) - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + + /* We don't optimize if we are supposed to make sure all non-Unicode + * code points raise a warning, as only ANYOF nodes have this check. + * */ + && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) { UV start, end; U8 op = END; /* The optimzation node-type */ @@ -13758,7 +14743,7 @@ parseit: && (start < 256 || UTF)) { /* Here, the list contains a single code point. Can optimize - * into an EXACT node */ + * into an EXACTish node */ value = start; @@ -13788,12 +14773,6 @@ parseit: } } else { - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } if (_invlist_contains_cp(PL_utf8_foldable, value)) { op = EXACT; } @@ -13833,7 +14812,9 @@ parseit: RExC_parse = (char *)cur_parse; if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } SvREFCNT_dec_NN(cp_list); @@ -13846,55 +14827,8 @@ parseit: * for things that belong in the bitmap, put them there, and delete from * . While we are at it, see if everything above 255 is in the * list, and if so, set a flag to speed up execution */ - ANYOF_BITMAP_ZERO(ret); - if (cp_list) { - - /* This gets set if we actually need to modify things */ - bool change_invlist = FALSE; - UV start, end; - - /* Start looking through */ - invlist_iterinit(cp_list); - while (invlist_iternext(cp_list, &start, &end)) { - UV high; - int i; - - if (end == UV_MAX && start <= 256) { - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; - } - - /* Quit if are above what we should change */ - if (start > 255) { - break; - } - - change_invlist = TRUE; - - /* Set all the bits in the range, up to the max that we are doing */ - high = (end < 255) ? end : 255; - for (i = start; i <= (int) high; i++) { - if (! ANYOF_BITMAP_TEST(ret, i)) { - ANYOF_BITMAP_SET(ret, i); - prevvalue = value; - value = i; - } - } - } - invlist_iterfinish(cp_list); - - /* Done with loop; remove any code points that are in the bitmap from - * */ - if (change_invlist) { - _invlist_subtract(cp_list, PL_Latin1, &cp_list); - } - - /* If have completely emptied it, remove it completely */ - if (_invlist_len(cp_list) == 0) { - SvREFCNT_dec_NN(cp_list); - cp_list = NULL; - } - } + populate_ANYOF_from_invlist(ret, &cp_list); if (invert) { ANYOF_FLAGS(ret) |= ANYOF_INVERT; @@ -13911,6 +14845,7 @@ parseit: else { cp_list = depends_list; } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; } /* If there is a swash and more than one element, we can't use the swash in @@ -13920,56 +14855,104 @@ parseit: swash = NULL; } - if (! cp_list - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - { - ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); + set_ANYOF_arg(pRExC_state, ret, cp_list, + (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv : NULL, + only_utf8_locale_list, + swash, has_user_defined_property); + + *flagp |= HASWIDTH|SIMPLE; + + if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + + return ret; +} + +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + +STATIC void +S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, + regnode* const node, + SV* const cp_list, + SV* const runtime_defns, + SV* const only_utf8_locale_list, + SV* const swash, + const bool has_user_defined_property) +{ + /* Sets the arg field of an ANYOF-type node 'node', using information about + * the node passed-in. If there is nothing outside the node's bitmap, the + * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to + * the count returned by add_data(), having allocated and stored an array, + * av, that that count references, as follows: + * av[0] stores the character class description in its textual form. + * This is used later (regexec.c:Perl_regclass_swash()) to + * initialize the appropriate swash, and is also useful for dumping + * the regnode. This is set to &PL_sv_undef if the textual + * description is not needed at run-time (as happens if the other + * elements completely define the class) + * av[1] if &PL_sv_undef, is a placeholder to later contain the swash + * computed from av[0]. But if no further computation need be done, + * the swash is stored here now (and av[0] is &PL_sv_undef). + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead + * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. + * (Otherwise everything needed is already in av[0] and av[1]) + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ + + UV n; + + PERL_ARGS_ASSERT_SET_ANYOF_ARG; + + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); + ARG_SET(node, ANYOF_NONBITMAP_EMPTY); } else { - /* av[0] stores the character class description in its textual form: - * used later (regexec.c:Perl_regclass_swash()) to initialize the - * appropriate swash, and is also useful for dumping the regnode. - * av[1] if NULL, is a placeholder to later contain the swash computed - * from av[0]. But if no further computation need be done, the - * swash is stored there now. - * av[2] stores the cp_list inversion list for use in addition or - * instead of av[0]; used only if av[1] is NULL - * av[3] is set if any component of the class is from a user-defined - * property; used only if av[1] is NULL */ AV * const av = newAV(); SV *rv; - av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - ? SvREFCNT_inc(listsv) : &PL_sv_undef); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + av_store(av, 0, (runtime_defns) + ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); if (swash) { av_store(av, 1, swash); SvREFCNT_dec_NN(cp_list); } else { - av_store(av, 1, NULL); + av_store(av, 1, &PL_sv_undef); if (cp_list) { - av_store(av, 2, cp_list); - av_store(av, 3, newSVuv(has_user_defined_property)); + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); } } + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + rv = newRV_noinc(MUTABLE_SV(av)); - n = add_data(pRExC_state, 1, "s"); + n = add_data(pRExC_state, STR_WITH_LEN("s")); RExC_rxi->data->data[n] = (void*)rv; - ARG_SET(ret, n); + ARG_SET(node, n); } - - *flagp |= HASWIDTH|SIMPLE; - return ret; } -#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* reg_skipcomment() Absorbs an /x style # comments from the input stream. Returns true if there is more text remaining in the stream. - Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment + Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment terminates the pattern without including a newline. Note its the callers responsibility to ensure that we are @@ -13992,7 +14975,7 @@ S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) if (!ended) { /* we ran off the end of the pattern without ending the comment, so we have to add an \n when wrapping */ - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; return 0; } else return 1; @@ -14074,7 +15057,8 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", "reg_node", __LINE__, PL_reg_name[op], (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] @@ -14131,7 +15115,8 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reganode", __LINE__, PL_reg_name[op], @@ -14150,7 +15135,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) /* - reguni - emit (if appropriate) a Unicode character */ -STATIC STRLEN +PERL_STATIC_INLINE STRLEN S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) { dVAR; @@ -14211,7 +15196,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) StructCopy(--src, --dst, regnode); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD 20010112 */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", "reg_insert", __LINE__, PL_reg_name[op], @@ -14230,7 +15216,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) place = opnd; /* Op node, where operand used to be. */ #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reginsert", __LINE__, PL_reg_name[op], @@ -14255,7 +15242,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) */ /* TODO: All three parms should be const */ STATIC void -S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) { dVAR; regnode *scan; @@ -14276,7 +15264,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tail" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), @@ -14314,7 +15302,8 @@ to control which is which. /* TODO: All four parms should be const */ STATIC U8 -S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) { dVAR; regnode *scan; @@ -14337,8 +15326,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN if (PL_regkind[OP(scan)] == EXACT) { - bool has_exactf_sharp_s; /* Unexamined in this routine */ - if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1)) + bool unfolded_multi_char; /* Unexamined in this routine */ + if (join_exact(pRExC_state, scan, &min, + &unfolded_multi_char, 1, val, depth+1)) return EXACT; } #endif @@ -14346,10 +15336,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, switch (OP(scan)) { case EXACT: case EXACTF: + case EXACTFA_NO_TRIE: case EXACTFA: case EXACTFU: case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); @@ -14364,7 +15354,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), @@ -14377,8 +15367,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, DEBUG_PARSE_r({ SV * const mysv_val=sv_newmortal(); DEBUG_PARSE_MSG(""); - regprop(RExC_rx, mysv_val, val); - PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n", + regprop(RExC_rx, mysv_val, val, NULL); + PerlIO_printf(Perl_debug_log, + "~ attach to %s (%"IVdf") offset to %"IVdf"\n", SvPV_nolen_const(mysv_val), (IV)REG_NODE_NUM(val), (IV)(val - scan) @@ -14406,7 +15397,9 @@ S_regdump_intflags(pTHX_ const char *lead, const U32 flags) int bit; int set=0; - for (bit=0; bit<32; bit++) { + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bitcheck_substr == r->float_substr && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored")); - if (r->extflags & RXf_NOSCAN) + if (r->intflags & PREGf_NOSCAN) PerlIO_printf(Perl_debug_log, " noscan"); if (r->extflags & RXf_CHECK_ALL) PerlIO_printf(Perl_debug_log, " isall"); @@ -14528,22 +15523,22 @@ Perl_regdump(pTHX_ const regexp *r) PerlIO_printf(Perl_debug_log, ") "); if (ri->regstclass) { - regprop(r, sv, ri->regstclass); + regprop(r, sv, ri->regstclass, NULL); PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); } - if (r->extflags & RXf_ANCH) { + if (r->intflags & PREGf_ANCH) { PerlIO_printf(Perl_debug_log, "anchored"); - if (r->extflags & RXf_ANCH_BOL) + if (r->intflags & PREGf_ANCH_BOL) PerlIO_printf(Perl_debug_log, "(BOL)"); - if (r->extflags & RXf_ANCH_MBOL) + if (r->intflags & PREGf_ANCH_MBOL) PerlIO_printf(Perl_debug_log, "(MBOL)"); - if (r->extflags & RXf_ANCH_SBOL) + if (r->intflags & PREGf_ANCH_SBOL) PerlIO_printf(Perl_debug_log, "(SBOL)"); - if (r->extflags & RXf_ANCH_GPOS) + if (r->intflags & PREGf_ANCH_GPOS) PerlIO_printf(Perl_debug_log, "(GPOS)"); PerlIO_putc(Perl_debug_log, ' '); } - if (r->extflags & RXf_GPOS_SEEN) + if (r->intflags & PREGf_GPOS_SEEN) PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); if (r->intflags & PREGf_SKIP) PerlIO_printf(Perl_debug_log, "plus "); @@ -14565,21 +15560,11 @@ Perl_regdump(pTHX_ const regexp *r) } /* -- regprop - printable representation of opcode +- regprop - printable representation of opcode, with run time support */ -#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ -STMT_START { \ - if (do_sep) { \ - Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ - if (flags & ANYOF_INVERT) \ - /*make sure the invert info is in each */ \ - sv_catpvs(sv, "^"); \ - do_sep = 0; \ - } \ -} STMT_END void -Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) { #ifdef DEBUGGING dVAR; @@ -14595,10 +15580,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) || _CC_VERTSPACE != 16 #error Need to adjust order of anyofs[] #endif - "[\\w]", - "[\\W]", - "[\\d]", - "[\\D]", + "\\w", + "\\W", + "\\d", + "\\D", "[:alpha:]", "[:^alpha:]", "[:lower:]", @@ -14615,8 +15600,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^graph:]", "[:cased:]", "[:^cased:]", - "[\\s]", - "[\\S]", + "\\s", + "\\S", "[:blank:]", "[:^blank:]", "[:xdigit:]", @@ -14627,8 +15612,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^cntrl:]", "[:ascii:]", "[:^ascii:]", - "[\\v]", - "[\\V]" + "\\v", + "\\V" }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -14640,7 +15625,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ /* It would be nice to FAIL() here, but this may be called from regexec.c, and it would be hard to supply pRExC_state. */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(o), (int)REGNODE_MAX); sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[OP(o)]; @@ -14671,38 +15657,22 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); DEBUG_TRIE_COMPILE_r( - Perl_sv_catpvf(aTHX_ sv, - "", - (UV)trie->startstate, - (IV)trie->statecount-1, /* -1 because of the unused 0 element */ - (UV)trie->wordcount, - (UV)trie->minlen, - (UV)trie->maxlen, - (UV)TRIE_CHARCOUNT(trie), - (UV)trie->uniquecharcount - ) + Perl_sv_catpvf(aTHX_ sv, + "", + (UV)trie->startstate, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ); ); if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { - int i; - int rangestart = -1; - U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); sv_catpvs(sv, "["); - for (i = 0; i <= 256; i++) { - if (i < 256 && BITMAP_TEST(bitmap,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - rangestart = -1; - } - } + (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) + ? ANYOF_BITMAP(o) + : TRIE_BITMAP(trie)); sv_catpvs(sv, "]"); } @@ -14713,7 +15683,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); - else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { + else if (k == REF || k == OPEN || k == CLOSE + || k == GROUPP || OP(o)==ACCEPT) + { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { if ( k != REF || (OP(o) < NREF)) { @@ -14737,21 +15709,36 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } } } + if ( k == REF && reginfo) { + U32 n = ARG(o); /* which paren pair */ + I32 ln = prog->offs[n].start; + if (prog->lastparen < n || ln == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } } else if (k == GOSUB) - Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */ + /* Paren and offset */ + Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); else if (k == VERB) { if (!o->flags) Perl_sv_catpvf(aTHX_ sv, ":%"SVf, SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) - Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + /* 2: embedded, otherwise 1 */ + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { - int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; - if (flags & ANYOF_LOCALE) + if (flags & ANYOF_LOCALE_FLAGS) sv_catpvs(sv, "{loc}"); if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); @@ -14760,86 +15747,49 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) sv_catpvs(sv, "^"); /* output what the standard cp 0-255 bitmap matches */ - for (i = 0; i <= 256; i++) { - if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - do_sep = 1; - rangestart = -1; - } - } + do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); - EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - /* output any special charclass tests (used entirely under use locale) */ - if (ANYOF_CLASS_TEST_ANY_SET(o)) - for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) - if (ANYOF_CLASS_TEST(o,i)) { + /* output any special charclass tests (used entirely under use + * locale) * */ + if (ANYOF_POSIXL_TEST_ANY_SET(o)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(o,i)) { sv_catpv(sv, anyofs[i]); do_sep = 1; } - - EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - - if (flags & ANYOF_NON_UTF8_LATIN1_ALL) { - sv_catpvs(sv, "{non-utf8-latin1-all}"); + } } - /* output information about the unicode matching */ - if (flags & ANYOF_UNICODE_ALL) - sv_catpvs(sv, "{unicode_all}"); - else if (ANYOF_NONBITMAP(o)) - sv_catpvs(sv, "{unicode}"); - if (flags & ANYOF_NONBITMAP_NON_UTF8) - sv_catpvs(sv, "{outside bitmap}"); - - if (ANYOF_NONBITMAP(o)) { - SV *lv; /* Set if there is something outside the bit map */ - SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL); - bool byte_output = FALSE; /* If something in the bitmap has been - output */ - - if (lv && lv != &PL_sv_undef) { - if (sw) { - U8 s[UTF8_MAXBYTES_CASE+1]; - - for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */ - uvchr_to_utf8(s, i); + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) + { + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + if (flags & ANYOF_INVERT) + /*make sure the invert info is in each */ + sv_catpvs(sv, "^"); + } - if (i < 256 - && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate - things already - output as part - of the bitmap */ - && swash_fetch(sw, s, TRUE)) - { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - byte_output = TRUE; - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) { - put_byte(sv, rangestart); - } - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i-1); - } - rangestart = -1; - } - } - } + if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { + sv_catpvs(sv, "{non-utf8-latin1-all}"); + } - { + /* output information about the unicode matching */ + if (flags & ANYOF_ABOVE_LATIN1_ALL) + sv_catpvs(sv, "{unicode_all}"); + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { + SV *lv; /* Set if there is something outside the bit map. */ + bool byte_output = FALSE; /* If something in the bitmap has + been output */ + SV *only_utf8_locale; + + /* Get the stuff that wasn't in the bitmap */ + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); + if (lv && lv != &PL_sv_undef) { char *s = savesvpv(lv); char * const origs = s; @@ -14849,6 +15799,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (*s == '\n') { const char * const t = ++s; + if (flags & ANYOF_NONBITMAP_NON_UTF8) { + sv_catpvs(sv, "{outside bitmap}"); + } + else { + sv_catpvs(sv, "{utf8}"); + } + if (byte_output) { sv_catpvs(sv, " "); } @@ -14880,8 +15837,29 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) out_dump: Safefree(origs); + SvREFCNT_dec_NN(lv); + } + + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } + } + invlist_iterfinish(only_utf8_locale); } - SvREFCNT_dec_NN(lv); } } @@ -14889,11 +15867,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; - if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { - Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + if (index < C_ARRAY_LENGTH(anyofs)) { + if (*anyofs[index] != '[') { + sv_catpv(sv, "["); + } + sv_catpv(sv, anyofs[index]); + if (*anyofs[index] != '[') { + sv_catpv(sv, "]"); + } } else { - sv_catpv(sv, anyofs[index]); + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); } } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) @@ -14903,9 +15887,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) PERL_UNUSED_ARG(sv); PERL_UNUSED_ARG(o); PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); #endif /* DEBUGGING */ } + + SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ @@ -15174,7 +16161,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) } break; default: - Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]); + Perl_croak(aTHX_ "panic: regfree data code '%c'", + ri->data->what[n]); } } Safefree(ri->data->what); @@ -15277,7 +16265,6 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) so we need to copy it locally. */ RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); ret->mother_re = NULL; - ret->gofs = 0; } #endif /* PERL_IN_XSUB_RE */ @@ -15308,7 +16295,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) len = ProgLen(ri); - Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), + char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); reti->num_code_blocks = ri->num_code_blocks; @@ -15350,9 +16338,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) break; case 'f': /* This is cheating. */ - Newx(d->data[i], 1, struct regnode_charclass_class); - StructCopy(ri->data->data[i], d->data[i], - struct regnode_charclass_class); + Newx(d->data[i], 1, regnode_ssc); + StructCopy(ri->data->data[i], d->data[i], regnode_ssc); reti->regstclass = (regnode*)d->data[i]; break; case 'T': @@ -15372,7 +16359,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) d->data[i] = ri->data->data[i]; break; default: - Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]); + Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", + ri->data->what[i]); } } @@ -15412,7 +16400,8 @@ Perl_regnext(pTHX_ regnode *p) return(NULL); if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX); + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(p), (int)REGNODE_MAX); } offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); @@ -15424,7 +16413,7 @@ Perl_regnext(pTHX_ regnode *p) #endif STATIC void -S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) +S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) { va_list args; STRLEN l1 = strlen(pat1); @@ -15443,20 +16432,15 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) Copy(pat2, buf + l1, l2 , char); buf[l1 + l2] = '\n'; buf[l1 + l2 + 1] = '\0'; -#ifdef I_STDARG - /* ANSI variant takes additional second argument */ va_start(args, pat2); -#else - va_start(args); -#endif msv = vmess(buf, &args); va_end(args); message = SvPV_const(msv,l1); if (l1 > 512) l1 = 512; Copy(message, buf, l1 , char); - buf[l1-1] = '\0'; /* Overwrite \n */ - Perl_croak(aTHX_ "%s", buf); + /* l1-1 to avoid \n */ + Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); } /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ @@ -15474,7 +16458,8 @@ Perl_save_re_context(pTHX) U32 i; for (i = 1; i <= RX_NPARENS(rx); i++) { char digits[TYPE_CHARS(long)]; - const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); + const STRLEN len = my_snprintf(digits, sizeof(digits), + "%lu", (long)i); GV *const *const gvp = (GV**)hv_fetch(PL_defstash, digits, len, 0); @@ -15496,25 +16481,17 @@ S_put_byte(pTHX_ SV *sv, int c) { PERL_ARGS_ASSERT_PUT_BYTE; - /* Our definition of isPRINT() ignores locales, so only bytes that are - not part of UTF-8 are considered printable. I assume that the same - holds for UTF-EBCDIC. - Also, code point 255 is not printable in either (it's E0 in EBCDIC, - which Wikipedia says: - - EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all - ones (binary 1111 1111, hexadecimal FF). It is similar, but not - identical, to the ASCII delete (DEL) or rubout control character. ... - it is typically mapped to hexadecimal code 9F, in order to provide a - unique character mapping in both directions) - - So the old condition can be simplified to !isPRINT(c) */ if (!isPRINT(c)) { - if (c < 256) { - Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c); - } - else { - Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + switch (c) { + case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; + case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; + case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; + case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; + case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; + + default: + Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + break; } } else { @@ -15525,14 +16502,93 @@ S_put_byte(pTHX_ SV *sv, int c) } } +STATIC void +S_put_range(pTHX_ SV *sv, UV start, UV end) +{ + + /* Appends to 'sv' a displayable version of the range of code points from + * 'start' to 'end' */ + + assert(start <= end); + + PERL_ARGS_ASSERT_PUT_RANGE; + + if (end - start < 3) { /* Individual chars in short ranges */ + for (; start <= end; start++) + put_byte(sv, start); + } + else if ( end > 255 + || ! isALPHANUMERIC(start) + || ! isALPHANUMERIC(end) + || isDIGIT(start) != isDIGIT(end) + || isUPPER(start) != isUPPER(end) + || isLOWER(start) != isLOWER(end) + + /* This final test should get optimized out except on EBCDIC + * platforms, where it causes ranges that cross discontinuities + * like i/j to be shown as hex instead of the misleading, + * e.g. H-K (since that range includes more than H, I, J, K). + * */ + || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start)) + { + Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", + start, + (end < 256) ? end : 255); + } + else { /* Here, the ends of the range are both digits, or both uppercase, + or both lowercase; and there's no discontinuity in the range + (which could happen on EBCDIC platforms) */ + put_byte(sv, start); + sv_catpvs(sv, "-"); + put_byte(sv, end); + } +} + +STATIC bool +S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually + * output anything */ + + int i; + bool has_output_anything = FALSE; + + PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; + + for (i = 0; i < 256; i++) { + if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) { + + /* The character at index i should be output. Find the next + * character that should NOT be output */ + int j; + for (j = i + 1; j <= 256; j++) { + if (! BITMAP_TEST((U8 *) bitmap, j)) { + break; + } + } + + /* Everything between them is a single range that should be output + * */ + put_range(sv, i, j - 1); + has_output_anything = TRUE; + i = j; + } + } + + return has_output_anything; +} #define CLEAR_OPTSTART \ - if (optstart) STMT_START { \ - DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ - optstart=NULL; \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ + " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ } STMT_END -#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); +#define DUMPUNTIL(b,e) \ + CLEAR_OPTSTART; \ + node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); STATIC const regnode * S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, @@ -15574,14 +16630,15 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else CLEAR_OPTSTART; - regprop(r, sv, node); + regprop(r, sv, node, NULL); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); if (OP(node) != OPTIMIZED) { if (next == NULL) /* Next ptr. */ PerlIO_printf(Perl_debug_log, " (0)"); - else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) + else if (PL_regkind[(U8)op] == BRANCH + && PL_regkind[OP(next)] != BRANCH ) PerlIO_printf(Perl_debug_log, " (FAIL)"); else PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); @@ -15614,7 +16671,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const reg_trie_data * const trie = (reg_trie_data*)ri->data->data[optrie]; #ifdef DEBUGGING - AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); + AV *const trie_words + = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); #endif const regnode *nextbranch= NULL; I32 word_idx; @@ -15624,18 +16682,22 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, PerlIO_printf(Perl_debug_log, "%*s%s ", (int)(2*(indent+3)), "", - elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, - PL_colors[0], PL_colors[1], - (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELLIPSES | - PERL_PV_PRETTY_LTGT + elem_ptr + ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), + SvCUR(*elem_ptr), 60, + PL_colors[0], PL_colors[1], + (SvUTF8(*elem_ptr) + ? PERL_PV_ESCAPE_UNI + : 0) + | PERL_PV_PRETTY_ELLIPSES + | PERL_PV_PRETTY_LTGT ) - : "???" + : "???" ); if (trie->jump) { U16 dist= trie->jump[word_idx+1]; PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", - (UV)((dist ? this_trie + dist : next) - start)); + (UV)((dist ? this_trie + dist : next) - start)); if (dist) { if (!nextbranch) nextbranch= this_trie + trie->jump[0]; @@ -15665,8 +16727,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else if (PL_regkind[(U8)op] == ANYOF) { /* arglen 1 + class block */ - node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS) - ? ANYOF_CLASS_SKIP : ANYOF_SKIP); + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); node = NEXTOPER(node); } else if (PL_regkind[(U8)op] == EXACT) { diff --git a/src/5019002/regexec.c b/src/5021000/regexec.c similarity index 78% rename from src/5019002/regexec.c rename to src/5021000/regexec.c index fadebe1..5bf42fe 100644 --- a/src/5019002/regexec.c +++ b/src/5021000/regexec.c @@ -37,16 +37,6 @@ #include "re_top.h" #endif -/* At least one required character in the target string is expressible only in - * UTF-8. */ -static const char* const non_utf8_target_but_utf8_required - = "Can't match, because target string needs to be in UTF-8\n"; - -#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ - goto target; \ -} STMT_END - /* * pregcomp and pregexec -- regsub and regerror are not used in perl * @@ -94,16 +84,28 @@ static const char* const non_utf8_target_but_utf8_required #include "inline_invlist.c" #include "unicode_constants.h" +#ifdef DEBUGGING +/* At least one required character in the target string is expressible only in + * UTF-8. */ +static const char* const non_utf8_target_but_utf8_required + = "Can't match, because target string needs to be in UTF-8\n"; +#endif + +#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ + goto target; \ +} STMT_END + #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #ifndef STATIC #define STATIC static #endif -/* Valid for non-utf8 strings: avoids the reginclass +/* Valid only for non-utf8 strings: avoids the reginclass * call if there are no complications: i.e., if everything matchable is * straight forward in the bitmap */ -#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \ : ANYOF_BITMAP_TEST(p,*(c))) /* @@ -118,6 +120,7 @@ static const char* const non_utf8_target_but_utf8_required ? reghop3((U8*)pos, off, \ (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ : (U8*)(pos + off)) + #define HOPBACKc(pos, off) \ (char*)(reginfo->is_utf8_target \ ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \ @@ -128,6 +131,24 @@ static const char* const non_utf8_target_but_utf8_required #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) +/* lim must be +ve. Returns NULL on overshoot */ +#define HOPMAYBE3(pos,off,lim) \ + (reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ + : ((U8*)pos + off <= lim) \ + ? (U8*)pos + off \ + : NULL) + +/* like HOP3, but limits the result to <= lim even for the non-utf8 case. + * off must be >=0; args should be vars rather than expressions */ +#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ + ? reghop3((U8*)(pos), off, (U8*)(lim)) \ + : (U8*)((pos + off) > lim ? lim : (pos + off))) + +#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ + ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ + : (U8*)(pos + off)) +#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ #define NEXTCHR_IS_EOS (nextchr < 0) @@ -140,11 +161,11 @@ static const char* const non_utf8_target_but_utf8_required SET_nextchr -#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \ +#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \ if (!swash_ptr) { \ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \ swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \ - 1, 0, NULL, &flags); \ + 1, 0, invlist, &flags); \ assert(swash_ptr); \ } \ } STMT_END @@ -153,28 +174,33 @@ static const char* const non_utf8_target_but_utf8_required #ifdef DEBUGGING # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ property_name, \ + invlist, \ utf8_char_in_property) \ - LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \ assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); #else # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ property_name, \ + invlist, \ utf8_char_in_property) \ - LOAD_UTF8_CHARCLASS(swash_ptr, property_name) + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) #endif #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ PL_utf8_swash_ptrs[_CC_WORDCHAR], \ - swash_property_names[_CC_WORDCHAR], \ - GREEK_SMALL_LETTER_IOTA_UTF8) + "", \ + PL_XPosix_ptrs[_CC_WORDCHAR], \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ STMT_START { \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ "_X_regular_begin", \ - GREEK_SMALL_LETTER_IOTA_UTF8); \ + NULL, \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ "_X_extend", \ + NULL, \ COMBINING_GRAVE_ACCENT_UTF8); \ } STMT_END @@ -191,14 +217,14 @@ static const char* const non_utf8_target_but_utf8_required * although it may be done at run time beause of the REF possibility - more * investigation required. -- demerphq */ -#define JUMPABLE(rn) ( \ - OP(rn) == OPEN || \ +#define JUMPABLE(rn) ( \ + OP(rn) == OPEN || \ (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ - OP(rn) == EVAL || \ - OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ - OP(rn) == PLUS || OP(rn) == MINMOD || \ - OP(rn) == KEEPS || \ - (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ + OP(rn) == EVAL || \ + OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ + OP(rn) == PLUS || OP(rn) == MINMOD || \ + OP(rn) == KEEPS || \ + (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ ) #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) @@ -208,13 +234,13 @@ static const char* const non_utf8_target_but_utf8_required /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -224,7 +250,7 @@ static const char* const non_utf8_target_but_utf8_required Search for mandatory following text node; for lookahead, the text must follow but for lookbehind (rn->flags != 0) we skip to the next step. */ -#define FIND_NEXT_IMPT(rn) STMT_START { \ +#define FIND_NEXT_IMPT(rn) STMT_START { \ while (JUMPABLE(rn)) { \ const OPCODE type = OP(rn); \ if (type == SUSPEND || PL_regkind[type] == CURLY) \ @@ -275,8 +301,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) PERL_ARGS_ASSERT_REGCPPUSH; if (paren_elems_to_push < 0) - Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0", - paren_elems_to_push); + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i", + paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS); if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf @@ -297,8 +323,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) ); for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ - SSPUSHINT(rex->offs[p].end); - SSPUSHINT(rex->offs[p].start); + SSPUSHIV(rex->offs[p].end); + SSPUSHIV(rex->offs[p].start); SSPUSHINT(rex->offs[p].start_tmp); DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", @@ -370,10 +396,10 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) ); paren = *maxopenparen_p; for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { - I32 tmps; + SSize_t tmps; rex->offs[paren].start_tmp = SSPOPINT; - rex->offs[paren].start = SSPOPINT; - tmps = SSPOPINT; + rex->offs[paren].start = SSPOPIV; + tmps = SSPOPIV; if (paren <= rex->lastparen) rex->offs[paren].end = tmps; DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, @@ -485,7 +511,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) } else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { return isFOO_lc(classnum, - TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1))); + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); } if (classnum < _FIRST_NON_SWASH_CC) { @@ -493,8 +519,11 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) /* Initialize the swash unless done already */ if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", - swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags); + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) @@ -527,7 +556,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) */ I32 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, - char *strbeg, I32 minend, SV *screamer, U32 nosave) + char *strbeg, SSize_t minend, SV *screamer, U32 nosave) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -544,68 +573,70 @@ Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, } #endif -/* - * Need to implement the following flags for reg_anch: - * - * USE_INTUIT_NOML - Useful to call re_intuit_start() first - * USE_INTUIT_ML - * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer - * INTUIT_AUTORITATIVE_ML - * INTUIT_ONCE_NOML - Intuit can match in one location only. - * INTUIT_ONCE_ML - * - * Another flag for this function: SECOND_TIME (so that float substrs - * with giant delta may be not rechecked). - */ - -/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ - -/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend. - Otherwise, only SvCUR(sv) is used to get strbeg. */ - -/* XXXX We assume that strpos is strbeg unless sv. */ -/* XXXX Some places assume that there is a fixed substring. - An update may be needed if optimizer marks as "INTUITable" - RExen without fixed substrings. Similarly, it is assumed that - lengths of all the strings are no more than minlen, thus they - cannot come from lookahead. - (Or minlen should take into account lookahead.) - NOTE: Some of this comment is not correct. minlen does now take account - of lookahead/behind. Further research is required. -- demerphq -*/ - -/* A failure to find a constant substring means that there is no need to make - an expensive call to REx engine, thus we celebrate a failure. Similarly, - finding a substring too deep into the string means that fewer calls to - regtry() should be needed. - - REx compiler's optimizer found 4 possible hints: - a) Anchored substring; - b) Fixed substring; - c) Whether we are anchored (beginning-of-line or \G); - d) First node (of those at offset 0) which may distinguish positions; - We use a)b)d) and multiline-part of c), and try to find a position in the - string which does not contradict any of them. - */ - -/* Most of decisions we do here should have been done at compile time. - The nodes of the REx which we used for the search should have been - deleted from the finite automaton. */ - -/* args: - * rx: the regex to match against - * sv: the SV being matched: only used for utf8 flag; the string - * itself is accessed via the pointers below. Note that on - * something like an overloaded SV, SvPOK(sv) may be false - * and the string pointers may point to something unrelated to - * the SV itself. - * strbeg: real beginning of string - * strpos: the point in the string at which to begin matching - * strend: pointer to the byte following the last char of the string - * flags currently unused; set to 0 - * data: currently unused; set to NULL +/* re_intuit_start(): + * + * Based on some optimiser hints, try to find the earliest position in the + * string where the regex could match. + * + * rx: the regex to match against + * sv: the SV being matched: only used for utf8 flag; the string + * itself is accessed via the pointers below. Note that on + * something like an overloaded SV, SvPOK(sv) may be false + * and the string pointers may point to something unrelated to + * the SV itself. + * strbeg: real beginning of string + * strpos: the point in the string at which to begin matching + * strend: pointer to the byte following the last char of the string + * flags currently unused; set to 0 + * data: currently unused; set to NULL + * + * The basic idea of re_intuit_start() is to use some known information + * about the pattern, namely: + * + * a) the longest known anchored substring (i.e. one that's at a + * constant offset from the beginning of the pattern; but not + * necessarily at a fixed offset from the beginning of the + * string); + * b) the longest floating substring (i.e. one that's not at a constant + * offset from the beginning of the pattern); + * c) Whether the pattern is anchored to the string; either + * an absolute anchor: /^../, or anchored to \n: /^.../m, + * or anchored to pos(): /\G/; + * d) A start class: a real or synthetic character class which + * represents which characters are legal at the start of the pattern; + * + * to either quickly reject the match, or to find the earliest position + * within the string at which the pattern might match, thus avoiding + * running the full NFA engine at those earlier locations, only to + * eventually fail and retry further along. + * + * Returns NULL if the pattern can't match, or returns the address within + * the string which is the earliest place the match could occur. + * + * The longest of the anchored and floating substrings is called 'check' + * and is checked first. The other is called 'other' and is checked + * second. The 'other' substring may not be present. For example, + * + * /(abc|xyz)ABC\d{0,3}DEFG/ + * + * will have + * + * check substr (float) = "DEFG", offset 6..9 chars + * other substr (anchored) = "ABC", offset 3..3 chars + * stclass = [ax] + * + * Be aware that during the course of this function, sometimes 'anchored' + * refers to a substring being anchored relative to the start of the + * pattern, and sometimes to the pattern itself being anchored relative to + * the string. For example: + * + * /\dabc/: "abc" is anchored to the pattern; + * /^\dabc/: "abc" is anchored to the pattern and the string; + * /\d+abc/: "abc" is anchored to neither the pattern nor the string; + * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, + * but the pattern is anchored to the string. */ char * @@ -620,34 +651,60 @@ Perl_re_intuit_start(pTHX_ { dVAR; struct regexp *const prog = ReANY(rx); - I32 start_shift = 0; + SSize_t start_shift = prog->check_offset_min; /* Should be nonnegative! */ - I32 end_shift = 0; - char *s; + SSize_t end_shift = 0; + /* current lowest pos in string where the regex can start matching */ + char *rx_origin = strpos; SV *check; - char *t; const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ - I32 ml_anch; - char *other_last = NULL; /* other substr checked before this */ + U8 other_ix = 1 - prog->substrs->check_ix; + bool ml_anch = 0; + char *other_last = strpos;/* latest pos 'other' substr already checked to */ char *check_at = NULL; /* check substr found at this pos */ - char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/ const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; RXi_GET_DECL(prog,progi); regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ regmatch_info *const reginfo = ®info_buf; -#ifdef DEBUGGING - const char * const i_strpos = strpos; -#endif GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_INTUIT_START; PERL_UNUSED_ARG(flags); PERL_UNUSED_ARG(data); - /* CHR_DIST() would be more correct here but it makes things slow. */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: trying to determine minimum start position...\n")); + + /* for now, assume that all substr offsets are positive. If at some point + * in the future someone wants to do clever things with look-behind and + * -ve offsets, they'll need to fix up any code in this function + * which uses these offsets. See the thread beginning + * <20140113145929.GF27210@iabyn.com> + */ + assert(prog->substrs->data[0].min_offset >= 0); + assert(prog->substrs->data[0].max_offset >= 0); + assert(prog->substrs->data[1].min_offset >= 0); + assert(prog->substrs->data[1].max_offset >= 0); + assert(prog->substrs->data[2].min_offset >= 0); + assert(prog->substrs->data[2].max_offset >= 0); + + /* for now, assume that if both present, that the floating substring + * doesn't start before the anchored substring. + * If you break this assumption (e.g. doing better optimisations + * with lookahead/behind), then you'll need to audit the code in this + * function carefully first + */ + assert( + ! ( (prog->anchored_utf8 || prog->anchored_substr) + && (prog->float_utf8 || prog->float_substr)) + || (prog->float_min_offset >= prog->anchored_offset)); + + /* byte rather than char calculation for efficiency. It fails + * to quickly reject some cases that can't match, but will reject + * them later after doing full char arithmetic */ if (prog->minlen > strend - strpos) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "String too short... [re_intuit_start]\n")); + " String too short...\n")); goto fail; } @@ -672,551 +729,702 @@ Perl_re_intuit_start(pTHX_ } check = prog->check_substr; } - if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */ - ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE) - || ( (prog->extflags & RXf_ANCH_BOL) - && !multiline ) ); /* Check after \n? */ - - if (!ml_anch) { - if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */ - && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ - && (strpos != strbeg)) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); - goto fail; + + /* dump the various substring data */ + DEBUG_OPTIMISE_MORE_r({ + int i; + for (i=0; i<=2; i++) { + SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr + : prog->substrs->data[i].substr); + if (!sv) + continue; + + PerlIO_printf(Perl_debug_log, + " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf + " useful=%"IVdf" utf8=%d [%s]\n", + i, + (IV)prog->substrs->data[i].min_offset, + (IV)prog->substrs->data[i].max_offset, + (IV)prog->substrs->data[i].end_shift, + BmUSEFUL(sv), + utf8_target ? 1 : 0, + SvPEEK(sv)); } - if (prog->check_offset_min == prog->check_offset_max - && !(prog->extflags & RXf_CANY_SEEN) - && ! multiline) /* /m can cause \n's to match that aren't - accounted for in the string max length. - See [perl #115242] */ - { - /* Substring at constant offset from beg-of-str... */ - I32 slen; + }); - s = HOP3c(strpos, prog->check_offset_min, strend); + if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ - if (SvTAIL(check)) { - slen = SvCUR(check); /* >= 1 */ + /* ml_anch: check after \n? + * + * A note about IMPLICIT: on an un-anchored pattern beginning + * with /.*.../, these flags will have been added by the + * compiler: + * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL + * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL + */ + ml_anch = (prog->intflags & PREGf_ANCH_MBOL) + && !(prog->intflags & PREGf_IMPLICIT); - if ( strend - s > slen || strend - s < slen - 1 - || (strend - s == slen && strend[-1] != '\n')) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); - goto fail_finish; + if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { + /* we are only allowed to match at BOS or \G */ + + /* trivially reject if there's a BOS anchor and we're not at BOS. + * + * Note that we don't try to do a similar quick reject for + * \G, since generally the caller will have calculated strpos + * based on pos() and gofs, so the string is already correctly + * anchored by definition; and handling the exceptions would + * be too fiddly (e.g. REXEC_IGNOREPOS). + */ + if ( strpos != strbeg + && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Not at start...\n")); + goto fail; + } + + /* in the presence of an anchor, the anchored (relative to the + * start of the regex) substr must also be anchored relative + * to strpos. So quickly reject if substr isn't found there. + * This works for \G too, because the caller will already have + * subtracted gofs from pos, and gofs is the offset from the + * \G to the start of the regex. For example, in /.abc\Gdef/, + * where substr="abcdef", pos()=3, gofs=4, offset_min=1: + * caller will have set strpos=pos()-4; we look for the substr + * at position pos()-4+1, which lines up with the "a" */ + + if (prog->check_offset_min == prog->check_offset_max + && !(prog->intflags & PREGf_CANY_SEEN)) + { + /* Substring at constant offset from beg-of-str... */ + SSize_t slen = SvCUR(check); + char *s = HOP3c(strpos, prog->check_offset_min, strend); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Looking for check substr at fixed offset %"IVdf"...\n", + (IV)prog->check_offset_min)); + + if (SvTAIL(check)) { + /* In this case, the regex is anchored at the end too. + * Unless it's a multiline match, the lengths must match + * exactly, give or take a \n. NB: slen >= 1 since + * the last char of check is \n */ + if (!multiline + && ( strend - s > slen + || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n'))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too long...\n")); + goto fail_finish; + } + /* Now should match s[0..slen-2] */ + slen--; } - /* Now should match s[0..slen-2] */ - slen--; if (slen && (*SvPVX_const(check) != *s - || (slen > 1 - && memNE(SvPVX_const(check), s, slen)))) { - report_neq: - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); + || (slen > 1 && memNE(SvPVX_const(check), s, slen)))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String not equal...\n")); goto fail_finish; } + + check_at = s; + goto success_at_start; } - else if (*SvPVX_const(check) != *s - || ((slen = SvCUR(check)) > 1 - && memNE(SvPVX_const(check), s, slen))) - goto report_neq; - check_at = s; - goto success_at_start; - } - } - /* Match is anchored, but substr is not anchored wrt beg-of-str. */ - s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->check_end_shift; - - if (!ml_anch) { - const I32 end = prog->check_offset_max + CHR_SVLEN(check) - - (SvTAIL(check) != 0); - const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; - - if (end_shift < eshift) - end_shift = eshift; } } - else { /* Can match at random position */ - ml_anch = 0; - s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->check_end_shift; - /* end shift should be non negative here */ - } + end_shift = prog->check_end_shift; -#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */ +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", (IV)end_shift, RX_PRECOMP(prog)); #endif restart: - /* Find a possible match in the region s..strend by looking for - the "check" substring in the region corrected by start/end_shift. */ + + /* This is the (re)entry point of the main loop in this function. + * The goal of this loop is to: + * 1) find the "check" substring in the region rx_origin..strend + * (adjusted by start_shift / end_shift). If not found, reject + * immediately. + * 2) If it exists, look for the "other" substr too if defined; for + * example, if the check substr maps to the anchored substr, then + * check the floating substr, and vice-versa. If not found, go + * back to (1) with rx_origin suitably incremented. + * 3) If we find an rx_origin position that doesn't contradict + * either of the substrings, then check the possible additional + * constraints on rx_origin of /^.../m or a known start class. + * If these fail, then depending on which constraints fail, jump + * back to here, or to various other re-entry points further along + * that skip some of the first steps. + * 4) If we pass all those tests, update the BmUSEFUL() count on the + * substring. If the start position was determined to be at the + * beginning of the string - so, not rejected, but not optimised, + * since we have to run regmatch from position 0 - decrement the + * BmUSEFUL() count. Otherwise increment it. + */ + + + /* first, look for the 'check' substring */ { - I32 srch_start_shift = start_shift; - I32 srch_end_shift = end_shift; U8* start_point; U8* end_point; - if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { - srch_end_shift -= ((strbeg - s) - srch_start_shift); - srch_start_shift = strbeg - s; - } - DEBUG_OPTIMISE_MORE_r({ - PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n", - (IV)prog->check_offset_min, - (IV)srch_start_shift, - (IV)srch_end_shift, - (IV)prog->check_end_shift); - }); - if (prog->extflags & RXf_CANY_SEEN) { - start_point= (U8*)(s + srch_start_shift); - end_point= (U8*)(strend - srch_end_shift); + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, + " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf + " Start shift: %"IVdf" End shift %"IVdf + " Real end Shift: %"IVdf"\n", + (IV)(rx_origin - strpos), + (IV)prog->check_offset_min, + (IV)start_shift, + (IV)end_shift, + (IV)prog->check_end_shift); + }); + + if (prog->intflags & PREGf_CANY_SEEN) { + start_point= (U8*)(rx_origin + start_shift); + end_point= (U8*)(strend - end_shift); + if (start_point > end_point) + goto fail_finish; } else { - start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend); - end_point= HOP3(strend, -srch_end_shift, strbeg); + end_point = HOP3(strend, -end_shift, strbeg); + start_point = HOPMAYBE3(rx_origin, start_shift, end_point); + if (!start_point) + goto fail_finish; } + + + /* If the regex is absolutely anchored to either the start of the + * string (BOL,SBOL) or to pos() (ANCH_GPOS), then + * check_offset_max represents an upper bound on the string where + * the substr could start. For the ANCH_GPOS case, we assume that + * the caller of intuit will have already set strpos to + * pos()-gofs, so in this case strpos + offset_max will still be + * an upper bound on the substr. + */ + if (!ml_anch + && prog->intflags & PREGf_ANCH + && prog->check_offset_max != SSize_t_MAX) + { + SSize_t len = SvCUR(check) - !!SvTAIL(check); + const char * const anchor = + (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); + + /* do a bytes rather than chars comparison. It's conservative; + * so it skips doing the HOP if the result can't possibly end + * up earlier than the old value of end_point. + */ + if ((char*)end_point - anchor > prog->check_offset_max) { + end_point = HOP3lim((U8*)anchor, + prog->check_offset_max, + end_point -len) + + len; + } + } + DEBUG_OPTIMISE_MORE_r({ - PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n", + PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n", (int)(end_point - start_point), (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), start_point); }); - s = fbm_instr( start_point, end_point, + check_at = fbm_instr( start_point, end_point, check, multiline ? FBMrf_MULTILINE : 0); - } - /* Update the count-of-usability, remove useless subpatterns, - unshift s. */ - - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(check), RE_SV_DUMPLEN(check), 30); - PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s", - (s ? "Found" : "Did not find"), - (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) - ? "anchored" : "floating"), - quoted, - RE_SV_TAIL(check), - (s ? " at offset " : "...\n") ); - }); - if (!s) - goto fail_finish; - /* Finish the diagnostic message */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); + /* Update the count-of-usability, remove useless subpatterns, + unshift s. */ - /* XXX dmq: first branch is for positive lookbehind... - Our check string is offset from the beginning of the pattern. - So we need to do any stclass tests offset forward from that - point. I think. :-( - */ + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(check), RE_SV_DUMPLEN(check), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s", + (check_at ? "Found" : "Did not find"), + (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) + ? "anchored" : "floating"), + quoted, + RE_SV_TAIL(check), + (check_at ? " at offset " : "...\n") ); + }); + if (!check_at) + goto fail_finish; + /* Finish the diagnostic message */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) ); + /* set rx_origin to the minimum position where the regex could start + * matching, given the constraint of the just-matched check substring. + * But don't set it lower than previously. + */ - check_at=s; + if (check_at - rx_origin > prog->check_offset_max) + rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); + } - /* Got a candidate. Check MBOL anchoring, and the *other* substr. - Start with the other substr. - XXXX no SCREAM optimization yet - and a very coarse implementation - XXXX /ttx+/ results in anchored="ttx", floating="x". floating will - *always* match. Probably should be marked during compile... - Probably it is right to do no SCREAM here... - */ + /* now look for the 'other' substring if defined */ - if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8) - : (prog->float_substr && prog->anchored_substr)) + if (utf8_target ? prog->substrs->data[other_ix].utf8_substr + : prog->substrs->data[other_ix].substr) { /* Take into account the "other" substring. */ - /* XXXX May be hopelessly wrong for UTF... */ - if (!other_last) - other_last = strpos; - if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) { - do_other_anchored: - { - char * const last = HOP3c(s, -start_shift, strbeg); - char *last1, *last2; - char * const saved_s = s; - SV* must; - - t = s - prog->check_offset_max; - if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!utf8_target - || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos)) - && t > strpos))) - NOOP; - else - t = strpos; - t = HOP3c(t, prog->anchored_offset, strend); - if (t < other_last) /* These positions already checked */ - t = other_last; - last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); - if (last < last1) - last1 = last; - /* XXXX It is not documented what units *_offsets are in. - We assume bytes, but this is clearly wrong. - Meaning this code needs to be carefully reviewed for errors. - dmq. - */ + char *last, *last1; + char *s; + SV* must; + struct reg_substr_datum *other; + + do_other_substr: + other = &prog->substrs->data[other_ix]; + + /* if "other" is anchored: + * we've previously found a floating substr starting at check_at. + * This means that the regex origin must lie somewhere + * between min (rx_origin): HOP3(check_at, -check_offset_max) + * and max: HOP3(check_at, -check_offset_min) + * (except that min will be >= strpos) + * So the fixed substr must lie somewhere between + * HOP3(min, anchored_offset) + * HOP3(max, anchored_offset) + SvCUR(substr) + */ - /* On end-of-str: see comment below. */ - must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr; - if (must == &PL_sv_undef) { - s = (char*)NULL; - DEBUG_r(must = prog->anchored_utf8); /* for debug */ - } - else - s = fbm_instr( - (unsigned char*)t, - HOP3(HOP3(last1, prog->anchored_offset, strend) - + SvCUR(must), -(SvTAIL(must)!=0), strbeg), - must, - multiline ? FBMrf_MULTILINE : 0 - ); - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(must), RE_SV_DUMPLEN(must), 30); - PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s", - (s ? "Found" : "Contradicts"), - quoted, RE_SV_TAIL(must)); - }); + /* if "other" is floating + * Calculate last1, the absolute latest point where the + * floating substr could start in the string, ignoring any + * constraints from the earlier fixed match. It is calculated + * as follows: + * + * strend - prog->minlen (in chars) is the absolute latest + * position within the string where the origin of the regex + * could appear. The latest start point for the floating + * substr is float_min_offset(*) on from the start of the + * regex. last1 simply combines thee two offsets. + * + * (*) You might think the latest start point should be + * float_max_offset from the regex origin, and technically + * you'd be correct. However, consider + * /a\d{2,4}bcd\w/ + * Here, float min, max are 3,5 and minlen is 7. + * This can match either + * /a\d\dbcd\w/ + * /a\d\d\dbcd\w/ + * /a\d\d\d\dbcd\w/ + * In the first case, the regex matches minlen chars; in the + * second, minlen+1, in the third, minlen+2. + * In the first case, the floating offset is 3 (which equals + * float_min), in the second, 4, and in the third, 5 (which + * equals float_max). In all cases, the floating string bcd + * can never start more than 4 chars from the end of the + * string, which equals minlen - float_min. As the substring + * starts to match more than float_min from the start of the + * regex, it makes the regex match more than minlen chars, + * and the two cancel each other out. So we can always use + * float_min - minlen, rather than float_max - minlen for the + * latest position in the string. + * + * Note that -minlen + float_min_offset is equivalent (AFAIKT) + * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift + */ + assert(prog->minlen >= other->min_offset); + last1 = HOP3c(strend, + other->min_offset - prog->minlen, strbeg); - if (!s) { - if (last1 >= last2) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", giving up...\n")); - goto fail_finish; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", trying floating at offset %ld...\n", - (long)(HOP3c(saved_s, 1, strend) - i_strpos))); - other_last = HOP3c(last1, prog->anchored_offset+1, strend); - s = HOP3c(last, 1, strend); - goto restart; - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", - (long)(s - i_strpos))); - t = HOP3c(s, -prog->anchored_offset, strbeg); - other_last = HOP3c(s, 1, strend); - s = saved_s; - if (t == strpos) - goto try_at_start; - goto try_at_offset; - } - } + if (other_ix) {/* i.e. if (other-is-float) */ + /* last is the latest point where the floating substr could + * start, *given* any constraints from the earlier fixed + * match. This constraint is that the floating string starts + * <= float_max_offset chars from the regex origin (rx_origin). + * If this value is less than last1, use it instead. + */ + assert(rx_origin <= last1); + last = + /* this condition handles the offset==infinity case, and + * is a short-cut otherwise. Although it's comparing a + * byte offset to a char length, it does so in a safe way, + * since 1 char always occupies 1 or more bytes, + * so if a string range is (last1 - rx_origin) bytes, + * it will be less than or equal to (last1 - rx_origin) + * chars; meaning it errs towards doing the accurate HOP3 + * rather than just using last1 as a short-cut */ + (last1 - rx_origin) < other->max_offset + ? last1 + : (char*)HOP3lim(rx_origin, other->max_offset, last1); } - else { /* Take into account the floating substring. */ - char *last, *last1; - char * const saved_s = s; - SV* must; - - t = HOP3c(s, -start_shift, strbeg); - last1 = last = - HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg); - if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset) - last = HOP3c(t, prog->float_max_offset, strend); - s = HOP3c(t, prog->float_min_offset, strend); - if (s < other_last) - s = other_last; - /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ - must = utf8_target ? prog->float_utf8 : prog->float_substr; - /* fbm_instr() takes into account exact value of end-of-str - if the check is SvTAIL(ed). Since false positives are OK, - and end-of-str is not later than strend we are OK. */ - if (must == &PL_sv_undef) { - s = (char*)NULL; - DEBUG_r(must = prog->float_utf8); /* for debug message */ - } - else - s = fbm_instr((unsigned char*)s, - (unsigned char*)last + SvCUR(must) - - (SvTAIL(must)!=0), - must, multiline ? FBMrf_MULTILINE : 0); - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(must), RE_SV_DUMPLEN(must), 30); - PerlIO_printf(Perl_debug_log, "%s floating substr %s%s", - (s ? "Found" : "Contradicts"), - quoted, RE_SV_TAIL(must)); - }); - if (!s) { - if (last1 == last) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", giving up...\n")); - goto fail_finish; - } + else { + assert(strpos + start_shift <= check_at); + last = HOP4c(check_at, other->min_offset - start_shift, + strbeg, strend); + } + + s = HOP3c(rx_origin, other->min_offset, strend); + if (s < other_last) /* These positions already checked */ + s = other_last; + + must = utf8_target ? other->utf8_substr : other->substr; + assert(SvPOK(must)); + s = fbm_instr( + (unsigned char*)s, + (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0), + must, + multiline ? FBMrf_MULTILINE : 0 + ); + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s", + s ? "Found" : "Contradicts", + other_ix ? "floating" : "anchored", + quoted, RE_SV_TAIL(must)); + }); + + + if (!s) { + /* last1 is latest possible substr location. If we didn't + * find it before there, we never will */ + if (last >= last1) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", trying anchored starting at offset %ld...\n", - (long)(saved_s + 1 - i_strpos))); - other_last = last; - s = HOP3c(t, 1, strend); - goto restart; + ", giving up...\n")); + goto fail_finish; + } + + /* try to find the check substr again at a later + * position. Maybe next time we'll find the "other" substr + * in range too */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", trying %s at offset %ld...\n", + (other_ix ? "floating" : "anchored"), + (long)(HOP3c(check_at, 1, strend) - strpos))); + + other_last = HOP3c(last, 1, strend) /* highest failure */; + rx_origin = + other_ix /* i.e. if other-is-float */ + ? HOP3c(rx_origin, 1, strend) + : HOP4c(last, 1 - other->min_offset, strbeg, strend); + goto restart; + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + + if (other_ix) { /* if (other-is-float) */ + /* other_last is set to s, not s+1, since its possible for + * a floating substr to fail first time, then succeed + * second time at the same floating position; e.g.: + * "-AB--AABZ" =~ /\wAB\d*Z/ + * The first time round, anchored and float match at + * "-(AB)--AAB(Z)" then fail on the initial \w character + * class. Second time round, they match at "-AB--A(AB)(Z)". + */ + other_last = s; } else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", - (long)(s - i_strpos))); - other_last = s; /* Fix this later. --Hugo */ - s = saved_s; - if (t == strpos) - goto try_at_start; - goto try_at_offset; + rx_origin = HOP3c(s, -other->min_offset, strbeg); + other_last = HOP3c(s, 1, strend); } } } + else { + DEBUG_OPTIMISE_MORE_r( + PerlIO_printf(Perl_debug_log, + " Check-only match: offset min:%"IVdf" max:%"IVdf + " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf + " strend-strpos:%"IVdf"\n", + (IV)prog->check_offset_min, + (IV)prog->check_offset_max, + (IV)(check_at-strpos), + (IV)(rx_origin-strpos), + (IV)(rx_origin-check_at), + (IV)(strend-strpos) + ) + ); + } + postprocess_substr_matches: - t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos); + /* handle the extra constraint of /^.../m if present */ - DEBUG_OPTIMISE_MORE_r( - PerlIO_printf(Perl_debug_log, - "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n", - (IV)prog->check_offset_min, - (IV)prog->check_offset_max, - (IV)(s-strpos), - (IV)(t-strpos), - (IV)(t-s), - (IV)(strend-strpos) - ) - ); + if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { + char *s; - if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!utf8_target - || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos))) - && t > strpos))) - { - /* Fixed substring is found far enough so that the match - cannot start at strpos. */ - try_at_offset: - if (ml_anch && t[-1] != '\n') { - /* Eventually fbm_*() should handle this, but often - anchored_offset is not 0, so this check will not be wasted. */ - /* XXXX In the code below we prefer to look for "^" even in - presence of anchored substrings. And we search even - beyond the found float position. These pessimizations - are historical artefacts only. */ - find_anchor: - while (t < strend - prog->minlen) { - if (*t == '\n') { - if (t < check_at - prog->check_offset_min) { - if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { - /* Since we moved from the found position, - we definitely contradict the found anchored - substr. Due to the above check we do not - contradict "check" substr. - Thus we can arrive here only if check substr - is float. Redo checking for "other"=="fixed". - */ - strpos = t + 1; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); - goto do_other_anchored; - } - /* We don't contradict the found floating substring. */ - /* XXXX Why not check for STCLASS? */ - s = t + 1; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(s - i_strpos))); - goto set_useful; - } - /* Position contradicts check-string */ - /* XXXX probably better to look for check-string - than for "\n", so one should lower the limit for t? */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos))); - other_last = strpos = s = t + 1; - goto restart; - } - t++; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", - PL_colors[0], PL_colors[1])); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for /^/m anchor")); + + /* we have failed the constraint of a \n before rx_origin. + * Find the next \n, if any, even if it's beyond the current + * anchored and/or floating substrings. Whether we should be + * scanning ahead for the next \n or the next substr is debatable. + * On the one hand you'd expect rare substrings to appear less + * often than \n's. On the other hand, searching for \n means + * we're effectively flipping been check_substr and "\n" on each + * iteration as the current "rarest" string candidate, which + * means for example that we'll quickly reject the whole string if + * hasn't got a \n, rather than trying every substr position + * first + */ + + s = HOP3c(strend, - prog->minlen, strpos); + if (s <= rx_origin || + ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Did not find /%s^%s/m...\n", + PL_colors[0], PL_colors[1])); goto fail_finish; } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n", - PL_colors[0], PL_colors[1])); - } - s = t; - set_useful: - ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ - } - else { - /* The found string does not prohibit matching at strpos, - - no optimization of calling REx engine can be performed, - unless it was an MBOL and we are not after MBOL, - or a future STCLASS check will fail this. */ - try_at_start: - /* Even in this situation we may use MBOL flag if strpos is offset - wrt the start of the string. */ - if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n' - /* May be due to an implicit anchor of m{.*foo} */ - && !(prog->intflags & PREGf_IMPLICIT)) + + /* earliest possible origin is 1 char after the \n. + * (since *rx_origin == '\n', it's safe to ++ here rather than + * HOP(rx_origin, 1)) */ + rx_origin++; + + if (prog->substrs->check_ix == 0 /* check is anchored */ + || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) { - t = strpos; - goto find_anchor; + /* Position contradicts check-string; either because + * check was anchored (and thus has no wiggle room), + * or check was float and rx_origin is above the float range */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + goto restart; } - DEBUG_EXECUTE_r( if (ml_anch) - PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n", - (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]); - ); - success_at_start: - if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */ - && (utf8_target ? ( - prog->check_utf8 /* Could be deleted already */ - && --BmUSEFUL(prog->check_utf8) < 0 - && (prog->check_utf8 == prog->float_utf8) - ) : ( - prog->check_substr /* Could be deleted already */ - && --BmUSEFUL(prog->check_substr) < 0 - && (prog->check_substr == prog->float_substr) - ))) - { - /* If flags & SOMETHING - do not do it many times on the same match */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); - /* XXX Does the destruction order has to change with utf8_target? */ - SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); - SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); - prog->check_substr = prog->check_utf8 = NULL; /* disable */ - prog->float_substr = prog->float_utf8 = NULL; /* clear */ - check = NULL; /* abort */ - s = strpos; - /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag - see http://bugs.activestate.com/show_bug.cgi?id=87173 */ - if (prog->intflags & PREGf_IMPLICIT) - prog->extflags &= ~RXf_ANCH_MBOL; - /* XXXX This is a remnant of the old implementation. It - looks wasteful, since now INTUIT can use many - other heuristics. */ - prog->extflags &= ~RXf_USE_INTUIT; - /* XXXX What other flags might need to be cleared in this branch? */ + + /* if we get here, the check substr must have been float, + * is in range, and we may or may not have had an anchored + * "other" substr which still contradicts */ + assert(prog->substrs->check_ix); /* check is float */ + + if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { + /* whoops, the anchored "other" substr exists, so we still + * contradict. On the other hand, the float "check" substr + * didn't contradict, so just retry the anchored "other" + * substr */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos), + (long)(rx_origin - strpos + prog->anchored_offset))); + goto do_other_substr; } - else - s = strpos; + + /* success: we don't contradict the found floating substring + * (and there's no anchored substr). */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " (multiline anchor test skipped)\n")); } - /* Last resort... */ - /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ - /* trie stclasses are too expensive to use here, we are better off to - leave it to regmatch itself */ + success_at_start: + + + /* if we have a starting character class, then test that extra constraint. + * (trie stclasses are too expensive to use here, we are better off to + * leave it to regmatch itself) */ + if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { - /* minlen == 0 is possible if regstclass is \b or \B, - and the fixed substr is ''$. - Since minlen is already taken into account, s+1 is before strend; - accidentally, minlen >= 1 guaranties no false positives at s + 1 - even for \b or \B. But (minlen? 1 : 0) below assumes that - regstclass does not come from lookahead... */ - /* If regstclass takes bytelength more than 1: If charlength==1, OK. - This leaves EXACTF-ish only, which are dealt with in find_byclass(). */ const U8* const str = (U8*)STRING(progi->regstclass); + + /* XXX this value could be pre-computed */ const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT - ? CHR_DIST(str+STR_LEN(progi->regstclass), str) + ? (reginfo->is_utf8_pat + ? utf8_distance(str + STR_LEN(progi->regstclass), str) + : STR_LEN(progi->regstclass)) : 1); char * endpos; + char *s; + /* latest pos that a matching float substr constrains rx start to */ + char *rx_max_float = NULL; + + /* if the current rx_origin is anchored, either by satisfying an + * anchored substring constraint, or a /^.../m constraint, then we + * can reject the current origin if the start class isn't found + * at the current position. If we have a float-only match, then + * rx_origin is constrained to a range; so look for the start class + * in that range. if neither, then look for the start class in the + * whole rest of the string */ + + /* XXX DAPM it's not clear what the minlen test is for, and why + * it's not used in the floating case. Nothing in the test suite + * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. + * Here are some old comments, which may or may not be correct: + * + * minlen == 0 is possible if regstclass is \b or \B, + * and the fixed substr is ''$. + * Since minlen is already taken into account, rx_origin+1 is + * before strend; accidentally, minlen >= 1 guaranties no false + * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : + * 0) below assumes that regstclass does not come from lookahead... + * If regstclass takes bytelength more than 1: If charlength==1, OK. + * This leaves EXACTF-ish only, which are dealt with in + * find_byclass(). + */ + if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) - endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend); - else if (prog->float_substr || prog->float_utf8) - endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend); + endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend); + else if (prog->float_substr || prog->float_utf8) { + rx_max_float = HOP3c(check_at, -start_shift, strbeg); + endpos= HOP3c(rx_max_float, cl_l, strend); + } else endpos= strend; - if (checked_upto < s) - checked_upto = s; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for class: start_shift: %"IVdf" check_at: %"IVdf + " rx_origin: %"IVdf" endpos: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), + (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); - t = s; - s = find_byclass(prog, progi->regstclass, checked_upto, endpos, + s = find_byclass(prog, progi->regstclass, rx_origin, endpos, reginfo); - if (s) { - checked_upto = s; - } else { -#ifdef DEBUGGING - const char *what = NULL; -#endif + if (!s) { if (endpos == strend) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Could not match STCLASS...\n") ); + " Could not match STCLASS...\n") ); goto fail; } DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "This position contradicts STCLASS...\n") ); - if ((prog->extflags & RXf_ANCH) && !ml_anch) + " This position contradicts STCLASS...\n") ); + if ((prog->intflags & PREGf_ANCH) && !ml_anch + && !(prog->intflags & PREGf_IMPLICIT)) goto fail; - checked_upto = HOPBACKc(endpos, start_shift); - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + /* Contradict one of substrings */ if (prog->anchored_substr || prog->anchored_utf8) { - if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) { - DEBUG_EXECUTE_r( what = "anchored" ); - hop_and_restart: - s = HOP3c(t, 1, strend); - if (s + start_shift + end_shift > strend) { - /* XXXX Should be taken into account earlier? */ + if (prog->substrs->check_ix == 1) { /* check is float */ + /* Have both, check_string is floating */ + assert(rx_origin + start_shift <= check_at); + if (rx_origin + start_shift != check_at) { + /* not at latest position float substr could match: + * Recheck anchored substring, but not floating. + * The condition above is in bytes rather than + * chars for efficiency. It's conservative, in + * that it errs on the side of doing 'goto + * do_other_substr', where a more accurate + * char-based calculation will be done */ DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Could not match STCLASS...\n") ); - goto fail; + " Looking for anchored substr starting at offset %ld...\n", + (long)(other_last - strpos)) ); + goto do_other_substr; } - if (!check) - goto giveup; + } + } + else { + /* float-only */ + + if (ml_anch) { + /* In the presence of ml_anch, we might be able to + * find another \n without breaking the current float + * constraint. */ + + /* strictly speaking this should be HOP3c(..., 1, ...), + * but since we goto a block of code that's going to + * search for the next \n if any, its safe here */ + rx_origin++; DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for %s substr starting at offset %ld...\n", - what, (long)(s + start_shift - i_strpos)) ); - goto restart; + " Looking for /%s^%s/m starting at offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos)) ); + goto postprocess_substr_matches; } - /* Have both, check_string is floating */ - if (t + start_shift >= check_at) /* Contradicts floating=check */ - goto retry_floating_check; - /* Recheck anchored substring, but not floating... */ - s = check_at; - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for anchored substr starting at offset %ld...\n", - (long)(other_last - i_strpos)) ); - goto do_other_anchored; - } - /* Another way we could have checked stclass at the - current position only: */ - if (ml_anch) { - s = t = t + 1; - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for /%s^%s/m starting at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(t - i_strpos)) ); - goto try_at_offset; + + /* strictly speaking this can never be true; but might + * be if we ever allow intuit without substrings */ + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + goto fail; + + rx_origin = rx_max_float; } - if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ + + /* at this point, any matching substrings have been + * contradicted. Start again... */ + + rx_origin = HOP3c(rx_origin, 1, strend); + + /* uses bytes rather than char calculations for efficiency. + * It's conservative: it errs on the side of doing 'goto restart', + * where there is code that does a proper char-based test */ + if (rx_origin + start_shift + end_shift > strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); goto fail; - /* Check is floating substring. */ - retry_floating_check: - t = check_at - start_shift; - DEBUG_EXECUTE_r( what = "floating" ); - goto hop_and_restart; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for %s substr starting at offset %ld...\n", + (prog->substrs->check_ix ? "floating" : "anchored"), + (long)(rx_origin + start_shift - strpos)) ); + goto restart; } - if (t != s) { + + /* Success !!! */ + + if (rx_origin != s) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "By STCLASS: moving %ld --> %ld\n", - (long)(t - i_strpos), (long)(s - i_strpos)) + " By STCLASS: moving %ld --> %ld\n", + (long)(rx_origin - strpos), (long)(s - strpos)) ); } else { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "Does not contradict STCLASS...\n"); + " Does not contradict STCLASS...\n"); ); } } - giveup: - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n", - PL_colors[4], (check ? "Guessed" : "Giving up"), - PL_colors[5], (long)(s - i_strpos)) ); - return s; + + /* Decide whether using the substrings helped */ + + if (rx_origin != strpos) { + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n")); + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + } + else { + /* The found rx_origin position does not prohibit matching at + * strpos, so calling intuit didn't gain us anything. Decrement + * the BmUSEFUL() count on the check substring, and if we reach + * zero, free it. */ + if (!(prog->intflags & PREGf_NAUGHTY) + && (utf8_target ? ( + prog->check_utf8 /* Could be deleted already */ + && --BmUSEFUL(prog->check_utf8) < 0 + && (prog->check_utf8 == prog->float_utf8) + ) : ( + prog->check_substr /* Could be deleted already */ + && --BmUSEFUL(prog->check_substr) < 0 + && (prog->check_substr == prog->float_substr) + ))) + { + /* If flags & SOMETHING - do not do it many times on the same match */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n")); + /* XXX Does the destruction order has to change with utf8_target? */ + SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); + SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); + prog->check_substr = prog->check_utf8 = NULL; /* disable */ + prog->float_substr = prog->float_utf8 = NULL; /* clear */ + check = NULL; /* abort */ + /* XXXX This is a remnant of the old implementation. It + looks wasteful, since now INTUIT can use many + other heuristics. */ + prog->extflags &= ~RXf_USE_INTUIT; + } + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) ); + + return rx_origin; fail_finish: /* Substring not found */ if (prog->check_substr || prog->check_utf8) /* could be removed already */ @@ -1227,46 +1435,57 @@ Perl_re_intuit_start(pTHX_ return NULL; } + #define DECL_TRIE_TYPE(scan) \ - const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ trie_type = ((scan->flags == EXACT) \ ? (utf8_target ? trie_utf8 : trie_plain) \ - : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + : (scan->flags == EXACTFA) \ + ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ -STMT_START { \ +STMT_START { \ STRLEN skiplen; \ + U8 flags = FOLD_FLAGS_FULL; \ switch (trie_type) { \ + case trie_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALL THROUGH */ \ case trie_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ - uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \ + uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ len = UTF8SKIP(uc); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ + case trie_latin_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALL THROUGH */ \ case trie_latin_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ case trie_utf8: \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ break; \ case trie_plain: \ uvc = (UV)*uc; \ @@ -1337,28 +1556,28 @@ REXEC_FBC_SCAN( \ tmp = 1; \ ) -#define REXEC_FBC_TRYIT \ +#define REXEC_FBC_TRYIT \ if ((reginfo->intuit || regtry(reginfo, &s))) \ goto got_it #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ - if (utf8_target) { \ + if (utf8_target) { \ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ } \ else { \ REXEC_FBC_CLASS_SCAN(CoNd); \ } -#define DUMP_EXEC_POS(li,s,doutf8) \ +#define DUMP_EXEC_POS(li,s,doutf8) \ dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ startpos, doutf8) -#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ +#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ tmp = TEST_NON_UTF8(tmp); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1367,18 +1586,19 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ ); \ -#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ +#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ if (s == reginfo->strbeg) { \ tmp = '\n'; \ } \ else { \ U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \ + tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ + 0, UTF8_ALLOW_DEFAULT); \ } \ tmp = TeSt1_UtF8; \ - LOAD_UTF8_CHARCLASS_ALNUM(); \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! (TeSt2_UtF8)) { \ + if (tmp == ! (TeSt2_UtF8)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1413,9 +1633,9 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ * one, and compare it with the wordness of this one. If they differ, we have * a boundary. At the beginning of the string, pretend that the previous * character was a new-line */ -#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ +#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ if (utf8_target) { \ - UTF8_CODE \ + UTF8_CODE \ } \ else { /* Not utf8 */ \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ @@ -1430,7 +1650,7 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ ); \ } \ - if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ + if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ goto got_it; /* We know what class REx starts with. Try to find this position... */ @@ -1469,11 +1689,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* We know what class it must start with. */ switch (OP(c)) { case ANYOF: - case ANYOF_SYNTHETIC: - case ANYOF_WARN_SUPER: if (utf8_target) { REXEC_FBC_UTF8_CLASS_SCAN( - reginclass(prog, c, (U8*)s, utf8_target)); + reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); } else { REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); @@ -1488,6 +1706,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: if (is_utf8_pat || utf8_target) { utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; @@ -1497,10 +1718,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, folder = foldEQ_latin1; /* /a, except the sharp s one which */ goto do_exactf_non_utf8; /* isn't dealt with by these */ - case EXACTF: + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); if (utf8_target) { - - /* regcomp.c already folded this if pattern is in UTF-8 */ utf8_fold_flags = 0; goto do_exactf_utf8; } @@ -1509,8 +1729,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto do_exactf_non_utf8; case EXACTFL: - if (is_utf8_pat || utf8_target) { - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { + utf8_fold_flags = FOLDEQ_LOCALE; goto do_exactf_utf8; } fold_array = PL_fold_locale; @@ -1523,7 +1743,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } goto do_exactf_utf8; - case EXACTFU_TRICKYFOLD: case EXACTFU: if (is_utf8_pat || utf8_target) { utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -1556,7 +1775,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * characters, and there are only 2 availabe, we know without * trying that it will fail; so don't start a match past the * required minimum number from the far end */ - e = HOP3c(strend, -((I32)ln), s); + e = HOP3c(strend, -((SSize_t)ln), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1602,7 +1821,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * only 2 are left, it's guaranteed to fail, so don't start a * match that would require us to go beyond the end of the string */ - e = HOP3c(strend, -((I32)lnc), s); + e = HOP3c(strend, -((SSize_t)lnc), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1628,15 +1847,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; } case BOUNDL: - RXp_MATCH_TAINTED_on(prog); FBC_BOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case NBOUNDL: - RXp_MATCH_TAINTED_on(prog); FBC_NBOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case BOUND: @@ -1683,7 +1900,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* FALLTHROUGH */ case POSIXL: - RXp_MATCH_TAINTED_on(prog); REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); break; @@ -1747,7 +1963,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, classnum))) || (UTF8_IS_DOWNGRADEABLE_START(*s) && to_complement ^ cBOOL( - _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)), + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), classnum)))) { if (tmp && (reginfo->intuit || regtry(reginfo, &s))) @@ -1805,8 +2022,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = - _core_swash_init("utf8", swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } /* This is a copy of the loop above for swash classes, though using the @@ -2048,13 +2267,163 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, return s; } +/* set RX_SAVED_COPY, RX_SUBBEG etc. + * flags have same meanings as with regexec_flags() */ + +static void +S_reg_set_capture_string(pTHX_ REGEXP * const rx, + char *strbeg, + char *strend, + SV *sv, + U32 flags, + bool utf8_target) +{ + struct regexp *const prog = ReANY(rx); + + if (flags & REXEC_COPY_STR) { +#ifdef PERL_ANY_COW + if (SvCANCOW(sv)) { + if (DEBUG_C_TEST) { + PerlIO_printf(Perl_debug_log, + "Copy on write: regexp capture, type %d\n", + (int) SvTYPE(sv)); + } + /* Create a new COW SV to share the match string and store + * in saved_copy, unless the current COW SV in saved_copy + * is valid and suitable for our purpose */ + if (( prog->saved_copy + && SvIsCOW(prog->saved_copy) + && SvPOKp(prog->saved_copy) + && SvIsCOW(sv) + && SvPOKp(sv) + && SvPVX(sv) == SvPVX(prog->saved_copy))) + { + /* just reuse saved_copy SV */ + if (RXp_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RXp_MATCH_COPIED_off(prog); + } + } + else { + /* create new COW SV to share string */ + RX_MATCH_COPY_FREE(rx); + prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); + } + prog->subbeg = (char *)SvPVX_const(prog->saved_copy); + assert (SvPOKp(prog->saved_copy)); + prog->sublen = strend - strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + } else +#endif + { + SSize_t min = 0; + SSize_t max = strend - strbeg; + SSize_t sublen; + + if ( (flags & REXEC_COPY_SKIP_POST) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_RIGHT) + ) { /* don't copy $' part of string */ + U32 n = 0; + max = -1; + /* calculate the right-most part of the string covered + * by a capture. Due to look-ahead, this may be to + * the right of $&, so we have to scan all captures */ + while (n <= prog->lastparen) { + if (prog->offs[n].end > max) + max = prog->offs[n].end; + n++; + } + if (max == -1) + max = (PL_sawampersand & SAWAMPERSAND_LEFT) + ? prog->offs[0].start + : 0; + assert(max >= 0 && max <= strend - strbeg); + } + + if ( (flags & REXEC_COPY_SKIP_PRE) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_LEFT) + ) { /* don't copy $` part of string */ + U32 n = 0; + min = max; + /* calculate the left-most part of the string covered + * by a capture. Due to look-behind, this may be to + * the left of $&, so we have to scan all captures */ + while (min && n <= prog->lastparen) { + if ( prog->offs[n].start != -1 + && prog->offs[n].start < min) + { + min = prog->offs[n].start; + } + n++; + } + if ((PL_sawampersand & SAWAMPERSAND_RIGHT) + && min > prog->offs[0].end + ) + min = prog->offs[0].end; + + } + + assert(min >= 0 && min <= max && min <= strend - strbeg); + sublen = max - min; + + if (RX_MATCH_COPIED(rx)) { + if (sublen > prog->sublen) + prog->subbeg = + (char*)saferealloc(prog->subbeg, sublen+1); + } + else + prog->subbeg = (char*)safemalloc(sublen+1); + Copy(strbeg + min, prog->subbeg, sublen, char); + prog->subbeg[sublen] = '\0'; + prog->suboffset = min; + prog->sublen = sublen; + RX_MATCH_COPIED_on(rx); + } + prog->subcoffset = prog->suboffset; + if (prog->suboffset && utf8_target) { + /* Convert byte offset to chars. + * XXX ideally should only compute this if @-/@+ + * has been seen, a la PL_sawampersand ??? */ + + /* If there's a direct correspondence between the + * string which we're matching and the original SV, + * then we can use the utf8 len cache associated with + * the SV. In particular, it means that under //g, + * sv_pos_b2u() will use the previously cached + * position to speed up working out the new length of + * subcoffset, rather than counting from the start of + * the string each time. This stops + * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; + * from going quadratic */ + if (SvPOKp(sv) && SvPVX(sv) == strbeg) + prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, + SV_GMAGIC|SV_CONST_RETURN); + else + prog->subcoffset = utf8_length((U8*)strbeg, + (U8*)(strbeg+prog->suboffset)); + } + } + else { + RX_MATCH_COPY_FREE(rx); + prog->subbeg = strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + prog->sublen = strend - strbeg; + } +} + + + /* - regexec_flags - match a regexp against a string */ I32 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, - char *strbeg, I32 minend, SV *sv, void *data, U32 flags) + char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -2062,21 +2431,17 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* sv: SV being matched: only used for utf8 flag, pos() etc; string * itself is accessed via the pointers above */ /* data: May be used for some additional optimizations. - Currently its only used, with a U32 cast, for transmitting - the ganch offset when doing a /g match. This will change */ -/* nosave: For optimizations. */ + Currently unused. */ +/* flags: For optimizations. See REXEC_* in regexp.h */ { dVAR; struct regexp *const prog = ReANY(rx); char *s; regnode *c; - char *startpos = stringarg; - I32 minlen; /* must match at least this many chars */ - I32 dontbother = 0; /* how many characters not to try at end */ - I32 end_shift = 0; /* Same for the end. */ /* CC */ - I32 scream_pos = -1; /* Internal iterator of scream. */ - char *scream_olds = NULL; + char *startpos; + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ const bool utf8_target = cBOOL(DO_UTF8(sv)); I32 multiline; RXi_GET_DECL(prog,progi); @@ -2090,16 +2455,70 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, PERL_UNUSED_ARG(data); /* Be paranoid... */ - if (prog == NULL || startpos == NULL) { + if (prog == NULL || stringarg == NULL) { Perl_croak(aTHX_ "NULL regexp parameter"); return 0; } DEBUG_EXECUTE_r( - debug_start_match(rx, utf8_target, startpos, strend, + debug_start_match(rx, utf8_target, stringarg, strend, "Matching"); ); + startpos = stringarg; + + if (prog->intflags & PREGf_GPOS_SEEN) { + MAGIC *mg; + + /* set reginfo->ganch, the position where \G can match */ + + reginfo->ganch = + (flags & REXEC_IGNOREPOS) + ? stringarg /* use start pos rather than pos() */ + : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) + /* Defined pos(): */ + ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) + : strbeg; /* pos() not defined; use start of string */ + + DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, + "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg))); + + /* in the presence of \G, we may need to start looking earlier in + * the string than the suggested start point of stringarg: + * if prog->gofs is set, then that's a known, fixed minimum + * offset, such as + * /..\G/: gofs = 2 + * /ab|c\G/: gofs = 1 + * or if the minimum offset isn't known, then we have to go back + * to the start of the string, e.g. /w+\G/ + */ + + if (prog->intflags & PREGf_ANCH_GPOS) { + startpos = reginfo->ganch - prog->gofs; + if (startpos < + ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) + { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "fail: ganch-gofs before earliest possible start\n")); + return 0; + } + } + else if (prog->gofs) { + if (startpos - prog->gofs < strbeg) + startpos = strbeg; + else + startpos -= prog->gofs; + } + else if (prog->intflags & PREGf_GPOS_FLOAT) + startpos = strbeg; + } + + minlen = prog->minlen; + if ((startpos + minlen) > strend || startpos < strbeg) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "Regex match can't succeed, so not even tried\n")); + return 0; + } /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), * which will call destuctors to reset PL_regmatch_state, free higher @@ -2108,10 +2527,54 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, oldsave = PL_savestack_ix; + s = startpos; + + if ((prog->extflags & RXf_USE_INTUIT) + && !(flags & REXEC_CHECKED)) + { + s = re_intuit_start(rx, sv, strbeg, startpos, strend, + flags, NULL); + if (!s) + return 0; + + if (prog->extflags & RXf_CHECK_ALL) { + /* we can match based purely on the result of INTUIT. + * Set up captures etc just for $& and $-[0] + * (an intuit-only match wont have $1,$2,..) */ + assert(!prog->nparens); + + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (s < stringarg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + + /* match via INTUIT shouldn't have any captures. + * Let @-, @+, $^N know */ + prog->lastparen = prog->lastcloseparen = 0; + RX_MATCH_UTF8_set(rx, utf8_target); + prog->offs[0].start = s - strbeg; + prog->offs[0].end = utf8_target + ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + : s - strbeg + prog->minlenret; + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, strend, + sv, flags, utf8_target); + + return 1; + } + } + multiline = prog->extflags & RXf_PMf_MULTILINE; - minlen = prog->minlen; - if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { + if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too short [regexec_flags]...\n")); goto phooey; @@ -2134,7 +2597,20 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, reginfo->poscache_maxiter = 0; /* not yet started a countdown */ reginfo->strend = strend; /* see how far we have to get to not match where we matched before */ - reginfo->till = startpos+minend; + reginfo->till = stringarg + minend; + + if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { + /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after + S_cleanup_regmatch_info_aux has executed (registered by + SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies + magic belonging to this SV. + Not newSVsv, either, as it does not COW. + */ + assert(!IS_PADGV(sv)); + reginfo->sv = newSV(0); + SvSetSV_nosteal(reginfo->sv, sv); + SAVEFREESV(reginfo->sv); + } /* reserve next 2 or 3 slots in PL_regmatch_state: * slot N+0: may currently be in use: skip it @@ -2187,41 +2663,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } /* If there is a "must appear" string, look for it. */ - s = startpos; - if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */ - MAGIC *mg; - if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */ - reginfo->ganch = startpos + prog->gofs; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs)); - } else if (sv && (mg = mg_find_mglob(sv)) - && mg->mg_len >= 0) { - reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */ - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len)); - - if (prog->extflags & RXf_ANCH_GPOS) { - if (s > reginfo->ganch) - goto phooey; - s = reginfo->ganch - prog->gofs; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs)); - if (s < strbeg) - goto phooey; - } - } - else if (data) { - reginfo->ganch = strbeg + PTR2UV(data); - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data))); - - } else { /* pos() not defined */ - reginfo->ganch = strbeg; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS: reginfo->ganch = strbeg\n")); - } - } if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { /* We have to be careful. If the previous successful match was from this regex we don't want a subsequent partially @@ -2240,27 +2682,13 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, PTR2UV(prog->offs) )); } - if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) { - re_scream_pos_data d; - - d.scream_olds = &scream_olds; - d.scream_pos = &scream_pos; - s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d); - if (!s) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n")); - goto phooey; /* not present */ - } - } - - /* Simplest case: anchored match need be tried only once. */ /* [unless only anchor is BOL and multiline is set] */ - if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) { - if (s == startpos && regtry(reginfo, &startpos)) + if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { + if (s == startpos && regtry(reginfo, &s)) goto got_it; - else if (multiline || (prog->intflags & PREGf_IMPLICIT) - || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */ + else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */ { char *end; @@ -2334,14 +2762,15 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } /* end search for newline */ } /* end anchored/multiline check string search */ goto phooey; - } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) + } else if (prog->intflags & PREGf_ANCH_GPOS) { - /* the warning about reginfo->ganch being used without initialization - is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN - and we only enter this block when the same bit is set. */ - char *tmp_s = reginfo->ganch - prog->gofs; - - if (tmp_s >= strbeg && regtry(reginfo, &tmp_s)) + /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ + assert(prog->intflags & PREGf_GPOS_SEEN); + /* For anchored \G, the only position it can match from is + * (ganch-gofs); we already set startpos to this above; if intuit + * moved us on from there, we can't possibly succeed */ + assert(startpos == reginfo->ganch - prog->gofs); + if (s == startpos && regtry(reginfo, &s)) goto got_it; goto phooey; } @@ -2397,8 +2826,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, || ((prog->float_substr != NULL || prog->float_utf8 != NULL) && prog->float_max_offset < strend - s)) { SV *must; - I32 back_max; - I32 back_min; + SSize_t back_max; + SSize_t back_min; char *last; char *last1; /* Last position checked before */ #ifdef DEBUGGING @@ -2443,7 +2872,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, last = strend; } else { last = HOP3c(strend, /* Cannot start after this */ - -(I32)(CHR_SVLEN(must) + -(SSize_t)(CHR_SVLEN(must) - (SvTAIL(must) != 0) + back_min), strbeg); } if (s > reginfo->strbeg) @@ -2453,11 +2882,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* XXXX check_substr already used to find "s", can optimize if check_substr==must. */ - scream_pos = -1; - dontbother = end_shift; + dontbother = 0; strend = HOPc(strend, -dontbother); while ( (s <= last) && - (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), + (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), (unsigned char*)strend, must, multiline ? FBMrf_MULTILINE : 0)) ) { DEBUG_EXECUTE_r( did_match = 1 ); @@ -2510,7 +2938,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, c); + regprop(prog, prop, c, reginfo); { RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), s,strend-s,60); @@ -2647,6 +3075,18 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, goto phooey; got_it: + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (prog->offs[0].start < stringarg - strbeg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + DEBUG_BUFFERS_r( if (swap) PerlIO_printf(Perl_debug_log, @@ -2669,123 +3109,10 @@ got_it: RX_MATCH_UTF8_set(rx, utf8_target); /* make sure $`, $&, $', and $digit will work later */ - if ( !(flags & REXEC_NOT_FIRST) ) { - if (flags & REXEC_COPY_STR) { -#ifdef PERL_ANY_COW - if (SvCANCOW(sv)) { - if (DEBUG_C_TEST) { - PerlIO_printf(Perl_debug_log, - "Copy on write: regexp capture, type %d\n", - (int) SvTYPE(sv)); - } - RX_MATCH_COPY_FREE(rx); - prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); - prog->subbeg = (char *)SvPVX_const(prog->saved_copy); - assert (SvPOKp(prog->saved_copy)); - prog->sublen = reginfo->strend - strbeg; - prog->suboffset = 0; - prog->subcoffset = 0; - } else -#endif - { - I32 min = 0; - I32 max = reginfo->strend - strbeg; - I32 sublen; - - if ( (flags & REXEC_COPY_SKIP_POST) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ - && !(PL_sawampersand & SAWAMPERSAND_RIGHT) - ) { /* don't copy $' part of string */ - U32 n = 0; - max = -1; - /* calculate the right-most part of the string covered - * by a capture. Due to look-ahead, this may be to - * the right of $&, so we have to scan all captures */ - while (n <= prog->lastparen) { - if (prog->offs[n].end > max) - max = prog->offs[n].end; - n++; - } - if (max == -1) - max = (PL_sawampersand & SAWAMPERSAND_LEFT) - ? prog->offs[0].start - : 0; - assert(max >= 0 && max <= reginfo->strend - strbeg); - } - - if ( (flags & REXEC_COPY_SKIP_PRE) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ - && !(PL_sawampersand & SAWAMPERSAND_LEFT) - ) { /* don't copy $` part of string */ - U32 n = 0; - min = max; - /* calculate the left-most part of the string covered - * by a capture. Due to look-behind, this may be to - * the left of $&, so we have to scan all captures */ - while (min && n <= prog->lastparen) { - if ( prog->offs[n].start != -1 - && prog->offs[n].start < min) - { - min = prog->offs[n].start; - } - n++; - } - if ((PL_sawampersand & SAWAMPERSAND_RIGHT) - && min > prog->offs[0].end - ) - min = prog->offs[0].end; - - } - - assert(min >= 0 && min <= max - && min <= reginfo->strend - strbeg); - sublen = max - min; - - if (RX_MATCH_COPIED(rx)) { - if (sublen > prog->sublen) - prog->subbeg = - (char*)saferealloc(prog->subbeg, sublen+1); - } - else - prog->subbeg = (char*)safemalloc(sublen+1); - Copy(strbeg + min, prog->subbeg, sublen, char); - prog->subbeg[sublen] = '\0'; - prog->suboffset = min; - prog->sublen = sublen; - RX_MATCH_COPIED_on(rx); - } - prog->subcoffset = prog->suboffset; - if (prog->suboffset && utf8_target) { - /* Convert byte offset to chars. - * XXX ideally should only compute this if @-/@+ - * has been seen, a la PL_sawampersand ??? */ - - /* If there's a direct correspondence between the - * string which we're matching and the original SV, - * then we can use the utf8 len cache associated with - * the SV. In particular, it means that under //g, - * sv_pos_b2u() will use the previously cached - * position to speed up working out the new length of - * subcoffset, rather than counting from the start of - * the string each time. This stops - * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; - * from going quadratic */ - if (SvPOKp(sv) && SvPVX(sv) == strbeg) - sv_pos_b2u(sv, &(prog->subcoffset)); - else - prog->subcoffset = utf8_length((U8*)strbeg, - (U8*)(strbeg+prog->suboffset)); - } - } - else { - RX_MATCH_COPY_FREE(rx); - prog->subbeg = strbeg; - prog->suboffset = 0; - prog->subcoffset = 0; - /* use reginfo->strend, as strend may have been modified */ - prog->sublen = reginfo->strend - strbeg; - } - } + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, reginfo->strend, + sv, flags, utf8_target); return 1; @@ -2816,7 +3143,7 @@ phooey: /* Set which rex is pointed to by PL_reg_curpm, handling ref counting. * Do inc before dec, in case old and new rex are the same */ -#define SET_reg_curpm(Re2) \ +#define SET_reg_curpm(Re2) \ if (reginfo->info_aux_eval) { \ (void)ReREFCNT_inc(Re2); \ ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ @@ -2834,7 +3161,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) CHECKPOINT lastcp; REGEXP *const rx = reginfo->prog; regexp *const prog = ReANY(rx); - I32 result; + SSize_t result; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -3081,11 +3408,11 @@ regmatch(), slabs allocated since entry are freed. #define DEBUG_STATE_pp(pp) \ DEBUG_STATE_r({ \ - DUMP_EXEC_POS(locinput, scan, utf8_target); \ + DUMP_EXEC_POS(locinput, scan, utf8_target); \ PerlIO_printf(Perl_debug_log, \ " %*s"pp" %s%s%s%s%s\n", \ depth*2, "", \ - PL_reg_name[st->resume_state], \ + PL_reg_name[st->resume_state], \ ((st==yes_state||st==mark_state) ? "[" : ""), \ ((st==yes_state) ? "Y" : ""), \ ((st==mark_state) ? "M" : ""), \ @@ -3289,6 +3616,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, dVAR; U8 *pat = (U8*)STRING(text_node); + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; if (OP(text_node) == EXACT) { @@ -3308,136 +3636,193 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, c2 = c1 = valid_utf8_to_uvchr(pat, NULL); } } - else /* an EXACTFish node */ - if ((is_utf8_pat - && is_MULTI_CHAR_FOLD_utf8_safe(pat, - pat + STR_LEN(text_node))) - || (!is_utf8_pat - && is_MULTI_CHAR_FOLD_latin1_safe(pat, - pat + STR_LEN(text_node)))) - { - /* Multi-character folds require more context to sort out. Also - * PL_utf8_foldclosures used below doesn't handle them, so have to be - * handled outside this routine */ - use_chrtest_void = TRUE; - } - else { /* an EXACTFish node which doesn't begin with a multi-char fold */ - c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; - if (c1 > 256) { - /* Load the folds hash, if not already done */ - SV** listp; - if (! PL_utf8_foldclosures) { - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - - /* Force loading this by folding an above-Latin1 char */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); - assert(PL_utf8_tofold); /* Verify that worked */ + else { /* an EXACTFish node */ + U8 *pat_end = pat + STR_LEN(text_node); + + /* An EXACTFL node has at least some characters unfolded, because what + * they match is not known until now. So, now is the time to fold + * the first few of them, as many as are needed to determine 'c1' and + * 'c2' later in the routine. If the pattern isn't UTF-8, we only need + * to fold if in a UTF-8 locale, and then only the Sharp S; everything + * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we + * need to fold as many characters as a single character can fold to, + * so that later we can check if the first ones are such a multi-char + * fold. But, in such a pattern only locale-problematic characters + * aren't folded, so we can skip this completely if the first character + * in the node isn't one of the tricky ones */ + if (OP(text_node) == EXACTFL) { + + if (! is_utf8_pat) { + if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) + { + folded[0] = folded[1] = 's'; + pat = folded; + pat_end = folded + 2; } - PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); - } - - /* The fold closures data structure is a hash with the keys being - * the UTF-8 of every character that is folded to, like 'k', and - * the values each an array of all code points that fold to its - * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are - * not included */ - if ((! (listp = hv_fetch(PL_utf8_foldclosures, - (char *) pat, - UTF8SKIP(pat), - FALSE)))) - { - /* Not found in the hash, therefore there are no folds - * containing it, so there is only a single character that - * could match */ - c2 = c1; } - else { /* Does participate in folds */ - AV* list = (AV*) *listp; - if (av_len(list) != 1) { + else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { + U8 *s = pat; + U8 *d = folded; + int i; - /* If there aren't exactly two folds to this, it is outside - * the scope of this function */ - use_chrtest_void = TRUE; + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD_LC(*s); + s++; + } + else { + STRLEN len; + _to_utf8_fold_flags(s, + d, + &len, + FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); + d += len; + s += UTF8SKIP(s); + } } - else { /* There are two. Get them */ - SV** c_p = av_fetch(list, 0, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + + pat = folded; + pat_end = d; + } + } + + if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) + || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) + { + /* Multi-character folds require more context to sort out. Also + * PL_utf8_foldclosures used below doesn't handle them, so have to + * be handled outside this routine */ + use_chrtest_void = TRUE; + } + else { /* an EXACTFish node which doesn't begin with a multi-char fold */ + c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; + if (c1 > 256) { + /* Load the folds hash, if not already done */ + SV** listp; + if (! PL_utf8_foldclosures) { + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* Force loading this by folding an above-Latin1 char */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ } - c1 = SvUV(*c_p); + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); + } - c_p = av_fetch(list, 1, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + /* The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((! (listp = hv_fetch(PL_utf8_foldclosures, + (char *) pat, + UTF8SKIP(pat), + FALSE)))) + { + /* Not found in the hash, therefore there are no folds + * containing it, so there is only a single character that + * could match */ + c2 = c1; + } + else { /* Does participate in folds */ + AV* list = (AV*) *listp; + if (av_tindex(list) != 1) { + + /* If there aren't exactly two folds to this, it is + * outside the scope of this function */ + use_chrtest_void = TRUE; } - c2 = SvUV(*c_p); - - /* Folds that cross the 255/256 boundary are forbidden if - * EXACTFL, or EXACTFA and one is ASCIII. Since the - * pattern character is above 256, and its only other match - * is below 256, the only legal match will be to itself. - * We have thrown away the original, so have to compute - * which is the one above 255 */ - if ((c1 < 256) != (c2 < 256)) { - if (OP(text_node) == EXACTFL - || (OP(text_node) == EXACTFA - && (isASCII(c1) || isASCII(c2)))) - { - if (c1 < 256) { - c1 = c2; - } - else { - c2 = c1; + else { /* There are two. Get them */ + SV** c_p = av_fetch(list, 0, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c1 = SvUV(*c_p); + + c_p = av_fetch(list, 1, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c2 = SvUV(*c_p); + + /* Folds that cross the 255/256 boundary are forbidden + * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and + * one is ASCIII. Since the pattern character is above + * 256, and its only other match is below 256, the only + * legal match will be to itself. We have thrown away + * the original, so have to compute which is the one + * above 255 */ + if ((c1 < 256) != (c2 < 256)) { + if ((OP(text_node) == EXACTFL + && ! IN_UTF8_CTYPE_LOCALE) + || ((OP(text_node) == EXACTFA + || OP(text_node) == EXACTFA_NO_TRIE) + && (isASCII(c1) || isASCII(c2)))) + { + if (c1 < 256) { + c1 = c2; + } + else { + c2 = c1; + } } } } } } - } - else /* Here, c1 is < 255 */ - if (utf8_target - && HAS_NONLATIN1_FOLD_CLOSURE(c1) - && OP(text_node) != EXACTFL - && (OP(text_node) != EXACTFA || ! isASCII(c1))) - { - /* Here, there could be something above Latin1 in the target which - * folds to this character in the pattern. All such cases except - * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters - * involved in their folds, so are outside the scope of this - * function */ - if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { - c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; - } - else { - use_chrtest_void = TRUE; + else /* Here, c1 is < 255 */ + if (utf8_target + && HAS_NONLATIN1_FOLD_CLOSURE(c1) + && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) + && ((OP(text_node) != EXACTFA + && OP(text_node) != EXACTFA_NO_TRIE) + || ! isASCII(c1))) + { + /* Here, there could be something above Latin1 in the target + * which folds to this character in the pattern. All such + * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more + * than two characters involved in their folds, so are outside + * the scope of this function */ + if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + } + else { + use_chrtest_void = TRUE; + } } - } - else { /* Here nothing above Latin1 can fold to the pattern character */ - switch (OP(text_node)) { + else { /* Here nothing above Latin1 can fold to the pattern + character */ + switch (OP(text_node)) { - case EXACTFL: /* /l rules */ - c2 = PL_fold_locale[c1]; - break; + case EXACTFL: /* /l rules */ + c2 = PL_fold_locale[c1]; + break; - case EXACTF: - if (! utf8_target) { /* /d rules */ - c2 = PL_fold[c1]; + case EXACTF: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + if (! utf8_target) { /* /d rules */ + c2 = PL_fold[c1]; + break; + } + /* FALLTHROUGH */ + /* /u rules for all these. This happens to work for + * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA_NO_TRIE: /* This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: + c2 = PL_fold_latin1[c1]; break; - } - /* FALLTHROUGH */ - /* /u rules for all these. This happens to work for - * EXACTFA as nothing in Latin1 folds to ASCII */ - case EXACTFA: - case EXACTFU_TRICKYFOLD: - case EXACTFU_SS: - case EXACTFU: - c2 = PL_fold_latin1[c1]; - break; - default: - Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); - assert(0); /* NOTREACHED */ + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); + assert(0); /* NOTREACHED */ + } } } } @@ -3479,7 +3864,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } /* returns -1 on failure, $+[0] on success */ -STATIC I32 +STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { #if PERL_VERSION < 9 && !defined(PERL_CORE) @@ -3497,7 +3882,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) regnode *scan; regnode *next; U32 n = 0; /* general value; init to avoid compiler warning */ - I32 ln = 0; /* len or last; init to avoid compiler warning */ + SSize_t ln = 0; /* len or last; init to avoid compiler warning */ char *locinput = startpos; char *pushinput; /* where to continue after a PUSH */ I32 nextchr; /* is always set to UCHARAT(locinput) */ @@ -3525,7 +3910,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) during a successful match */ U32 lastopen = 0; /* last open we saw */ bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; - SV* const oreplsv = GvSV(PL_replgv); + SV* const oreplsv = GvSVn(PL_replgv); /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop * iteration, and are not preserved or restored by state pushes/pops @@ -3555,6 +3940,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) GET_RE_DEBUG_FLAGS_DECL; #endif + /* protect against undef(*^R) */ + SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ multicall_oldcatch = 0; multicall_cv = NULL; @@ -3580,7 +3968,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SV * const prop = sv_newmortal(); regnode *rnext=regnext(scan); DUMP_EXEC_POS( locinput, scan, utf8_target ); - regprop(rex, prop, scan); + regprop(rex, prop, scan, reginfo); PerlIO_printf(Perl_debug_log, "%3"IVdf":%*s%s(%"IVdf")\n", @@ -3603,7 +3991,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); switch (state_num) { - case BOL: /* /^../ */ + case BOL: /* /^../ */ + case SBOL: /* /^../s */ if (locinput == reginfo->strbeg) break; sayNO; @@ -3616,11 +4005,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } sayNO; - case SBOL: /* /^../s */ - if (locinput == reginfo->strbeg) - break; - sayNO; - case GPOS: /* \G */ if (locinput == reginfo->ganch) break; @@ -3638,16 +4022,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO_SILENT; assert(0); /*NOTREACHED*/ - case EOL: /* /..$/ */ - goto seol; - case MEOL: /* /..$/m */ if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; break; + case EOL: /* /..$/ */ + /* FALL THROUGH */ case SEOL: /* /..$/s */ - seol: if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; if (reginfo->strend - locinput > 1) @@ -3945,7 +4327,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (chars) { if (utf8_target) { - uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, uniflags); uc += len; } @@ -3958,7 +4340,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (foldlen) { if (!--chars) break; - uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, uniflags); uscan += len; foldlen -= len; @@ -4049,7 +4431,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) l++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { sayNO; } l += 2; @@ -4072,7 +4455,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) { + if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { sayNO; } s += 2; @@ -4102,27 +4486,31 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const char * s; U32 fold_utf8_flags; - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; - fold_utf8_flags = FOLDEQ_UTF8_LOCALE; + fold_utf8_flags = FOLDEQ_LOCALE; goto do_exactf; case EXACTFU_SS: /* /\x{df}/iu */ - case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */ case EXACTFU: /* /abc/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; goto do_exactf; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: /* /abc/iaa */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; - case EXACTF: /* /abc/i */ + case EXACTF: /* /abc/i This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); folder = foldEQ; fold_array = PL_fold; fold_utf8_flags = 0; @@ -4131,7 +4519,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) { + if (utf8_target + || is_utf8_pat + || state_num == EXACTFU_SS + || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) + { /* Either target or the pattern are utf8, or has the issue where * the fold lengths may differ. */ const char * const l = locinput; @@ -4166,8 +4558,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * have to set the FLAGS fields of these */ case BOUNDL: /* /\b/l */ case NBOUNDL: /* /\B/l */ - RX_MATCH_TAINTED_on(reginfo->prog); - /* FALL THROUGH */ case BOUND: /* /\b/ */ case BOUNDU: /* /\b/u */ case BOUNDA: /* /\b/a */ @@ -4185,7 +4575,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 * const r = reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); - ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); + ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, + 0, uniflags); } if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { ln = isWORDCHAR_uni(ln); @@ -4198,7 +4589,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } else { - ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln)); + ln = isWORDCHAR_LC_uvchr(ln); n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); } } @@ -4247,11 +4638,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; case ANYOF: /* /[abc]/ */ - case ANYOF_WARN_SUPER: if (NEXTCHR_IS_EOS) sayNO; if (utf8_target) { - if (!reginclass(rex, scan, (U8*)locinput, utf8_target)) + if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + utf8_target)) sayNO; locinput += UTF8SKIP(locinput); } @@ -4273,10 +4664,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (NEXTCHR_IS_EOS) sayNO; - /* The locale hasn't influenced the outcome before this, so defer - * tainting until now */ - RX_MATCH_TAINTED_on(reginfo->prog); - /* Use isFOO_lc() for characters within Latin1. (Note that * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else * wouldn't be invariant) */ @@ -4287,7 +4674,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), - (U8) TWO_BYTE_UTF8_TO_UNI(nextchr, + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)))))) { sayNO; @@ -4368,7 +4755,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr, + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)), FLAGS(scan))))) { @@ -4386,8 +4773,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", - swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } if (! (to_complement ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], @@ -4649,11 +5037,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 *fold_array; UV utf8_fold_flags; - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; type = REFFL; - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + utf8_fold_flags = FOLDEQ_LOCALE; goto do_nref; case NREFFA: /* /\g{name}/iaa */ @@ -4694,10 +5081,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto do_nref_ref_common; case REFFL: /* /\1/il */ - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + utf8_fold_flags = FOLDEQ_LOCALE; goto do_ref; case REFFA: /* /\1/iaa */ @@ -4737,8 +5123,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = reginfo->strbeg + ln; if (type != REF /* REF can do byte comparison */ - && (utf8_target || type == REFFU)) - { /* XXX handle REFFL better */ + && (utf8_target || type == REFFU || type == REFFL)) + { char * limit = reginfo->strend; /* This call case insensitively compares the entire buffer @@ -4812,7 +5198,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) startpoint = rei->program+1; ST.close_paren = 0; } + + /* Save all the positions seen so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + + /* and then jump to the code we share with EVAL */ goto eval_recurse_doit; + assert(0); /* NOTREACHED */ case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ @@ -4924,8 +5317,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) rex->offs[0].end = locinput - reginfo->strbeg; if (reginfo->info_aux_eval->pos_magic) - reginfo->info_aux_eval->pos_magic->mg_len - = locinput - reginfo->strbeg; + MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, + reginfo->sv, reginfo->strbeg, + locinput - reginfo->strbeg); if (sv_yes_mark) { SV *sv_mrk = get_sv("REGMARK", 1); @@ -4963,20 +5357,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) else { /* /(??{}) */ /* if its overloaded, let the regex compiler handle * it; otherwise extract regex, or stringify */ + if (SvGMAGICAL(ret)) + ret = sv_mortalcopy(ret); if (!SvAMAGIC(ret)) { SV *sv = ret; if (SvROK(sv)) sv = SvRV(sv); if (SvTYPE(sv) == SVt_REGEXP) re_sv = (REGEXP*) sv; - else if (SvSMAGICAL(sv)) { - MAGIC *mg = mg_find(sv, PERL_MAGIC_qr); + else if (SvSMAGICAL(ret)) { + MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); if (mg) re_sv = (REGEXP *) mg->mg_obj; } - /* force any magic, undef warnings here */ - if (!re_sv) { + /* force any undef warnings here */ + if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { ret = sv_mortalcopy(ret); (void) SvPV_force_nolen(ret); } @@ -5030,17 +5426,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) pm_flags); if (!(SvFLAGS(ret) - & (SVs_TEMP | SVs_PADTMP | SVf_READONLY - | SVs_GMG))) { + & (SVs_TEMP | SVs_GMG | SVf_ROK)) + && (!SvPADTMP(ret) || SvREADONLY(ret))) { /* This isn't a first class regexp. Instead, it's caching a regexp onto an existing, Perl visible scalar. */ sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); } - /* safe to do now that any $1 etc has been - * interpolated into the new pattern string and - * compiled */ - S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); } SAVEFREESV(re_sv); re = ReANY(re_sv); @@ -5050,6 +5442,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) re->sublen = rex->sublen; re->suboffset = rex->suboffset; re->subcoffset = rex->subcoffset; + re->lastparen = 0; + re->lastcloseparen = 0; rei = RXi_GET(re); DEBUG_EXECUTE_r( debug_start_match(re_sv, utf8_target, locinput, @@ -5057,18 +5451,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) ); startpoint = rei->program + 1; ST.close_paren = 0; /* only used for GOSUB */ - - eval_recurse_doit: /* Share code with GOSUB below this line */ - /* run the pattern returned from (??{...}) */ - - /* Save *all* the positions. */ + /* Save all the seen positions so far. */ ST.cp = regcppush(rex, 0, maxopenparen); REGCP_SET(ST.lastcp); - - re->lastparen = 0; - re->lastcloseparen = 0; - + /* and set maxopenparen to 0, since we are starting a "fresh" match */ maxopenparen = 0; + /* run the pattern returned from (??{...}) */ + + eval_recurse_doit: /* Share code with GOSUB below this line + * At this point we expect the stack context to be + * set up correctly */ /* invalidate the S-L poscache. We're now executing a * different set of WHILEM ops (and their associated @@ -5080,6 +5472,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * pattern again */ reginfo->poscache_maxiter = 0; + /* the new regexp might have a different is_utf8_pat than we do */ is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); ST.prev_rex = rex_sv; @@ -5104,7 +5497,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SET_reg_curpm(rex_sv); rex = ReANY(rex_sv); rexi = RXi_GET(rex); - regcpblow(ST.cp); + { + /* preserve $^R across LEAVE's. See Bug 121070. */ + SV *save_sv= GvSV(PL_replgv); + SvREFCNT_inc(save_sv); + regcpblow(ST.cp); /* LEAVE in disguise */ + sv_setsv(GvSV(PL_replgv), save_sv); + SvREFCNT_dec(save_sv); + } cur_eval = ST.prev_eval; cur_curlyx = ST.prev_curlyx; @@ -5454,10 +5854,10 @@ NULL if (reginfo->poscache_iter-- == 0) { /* initialise cache */ - const I32 size = (reginfo->poscache_maxiter + 7)/8; + const SSize_t size = (reginfo->poscache_maxiter + 7)/8; regmatch_info_aux *const aux = reginfo->info_aux; if (aux->poscache) { - if ((I32)reginfo->poscache_size < size) { + if ((SSize_t)reginfo->poscache_size < size) { Renew(aux->poscache, size, char); reginfo->poscache_size = size; } @@ -5475,7 +5875,7 @@ NULL if (reginfo->poscache_iter < 0) { /* have we already failed at this position? */ - I32 offset, mask; + SSize_t offset, mask; reginfo->poscache_iter = -1; /* stop eventual underflow */ offset = (scan->flags & 0xf) - 1 @@ -5800,7 +6200,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n", + "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", (int)(REPORT_CODE_OFF+(depth*2)),"", valid_utf8_to_uvchr((U8 *) locinput, NULL), valid_utf8_to_uvchr(ST.c1_utf8, NULL), @@ -5814,7 +6214,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n", + "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", (int)(REPORT_CODE_OFF+(depth*2)),"", (int) nextchr, ST.c1, ST.c2) ); @@ -6553,6 +6953,10 @@ yes: * When popping the save stack, all these locals would be undone; * bypass this by setting the outermost saved $^R to the latest * value */ + /* I dont know if this is needed or works properly now. + * see code related to PL_replgv elsewhere in this file. + * Yves + */ if (oreplsv != GvSV(PL_replgv)) sv_setsv(oreplsv, GvSV(PL_replgv)); } @@ -6755,7 +7159,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple loop */ - c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); while (scan < loceol && UCHARAT(scan) == c) { scan++; } @@ -6782,21 +7186,23 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; case EXACTFL: - RXp_MATCH_TAINTED_on(prog); - utf8_flags = FOLDEQ_UTF8_LOCALE; + utf8_flags = FOLDEQ_LOCALE; goto do_exactf; - case EXACTF: - utf8_flags = 0; - goto do_exactf; + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + utf8_flags = 0; + goto do_exactf; case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -6860,11 +7266,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, break; } case ANYOF: - case ANYOF_WARN_SUPER: if (utf8_target) { while (hardcount < max && scan < loceol - && reginclass(prog, p, (U8*)scan, utf8_target)) + && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; @@ -6882,7 +7287,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* FALLTHROUGH */ case POSIXL: - RXp_MATCH_TAINTED_on(prog); if (! utf8_target) { while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan))) @@ -6982,8 +7386,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan, - *(scan + 1)), + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), classnum)))) { break; @@ -7065,8 +7469,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = _core_swash_init( - "utf8", swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + "utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } while (hardcount < max && scan < loceol @@ -7134,7 +7540,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, GET_RE_DEBUG_FLAGS_DECL; DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, p); + regprop(prog, prop, p, reginfo); PerlIO_printf(Perl_debug_log, "%*s %s can match %"IVdf" times out of %"IVdf"...\n", REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); @@ -7160,31 +7566,39 @@ Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, *altsvp = NULL; } - return newSVsv(core_regclass_swash(prog, node, doinit, listsvp)); + return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL)); } -#endif -STATIC SV * -S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp) +SV * +Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, + const regnode* node, + bool doinit, + SV** listsvp, + SV** only_utf8_locale_ptr) { - /* Returns the swash for the input 'node' in the regex 'prog'. - * If is true, will attempt to create the swash if not already + /* For internal core use only. + * Returns the swash for the input 'node' in the regex 'prog'. + * If is 'true', will attempt to create the swash if not already * done. - * If is non-null, will return the swash initialization string in - * it. + * If is non-null, will return the printable contents of the + * swash. This can be used to get debugging information even before the + * swash exists, by calling this function with 'doinit' set to false, in + * which case the components that will be used to eventually create the + * swash are returned (in a printable form). * Tied intimately to how regcomp.c sets up the data structure */ dVAR; SV *sw = NULL; - SV *si = NULL; + SV *si = NULL; /* Input swash initialization string */ SV* invlist = NULL; RXi_GET_DECL(prog,progi); const struct reg_data * const data = prog ? progi->data : NULL; - PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH; + PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; - assert(ANYOF_NONBITMAP(node)); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); if (data && data->count) { const U32 n = ARG(node); @@ -7197,25 +7611,38 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit si = *ary; /* ary[0] = the string to initialize the swash with */ - /* Elements 2 and 3 are either both present or both absent. [2] is - * any inversion list generated at compile time; [3] indicates if + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if * that inversion list has any user-defined properties in it. */ - if (av_len(av) >= 2) { - invlist = ary[2]; - if (SvUV(ary[3])) { - swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + if (av_tindex(av) >= 2) { + if (only_utf8_locale_ptr + && ary[2] + && ary[2] != &PL_sv_undef) + { + *only_utf8_locale_ptr = ary[2]; + } + else { + *only_utf8_locale_ptr = NULL; + } + + if (av_tindex(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } + } + else { + invlist = NULL; } - } - else { - invlist = NULL; } /* Element [1] is reserved for the set-up swash. If already there, * return it; if not, create it and store it there */ - if (SvROK(ary[1])) { + if (ary[1] && SvROK(ary[1])) { sw = ary[1]; } - else if (si && doinit) { + else if (doinit && ((si && si != &PL_sv_undef) + || (invlist && invlist != &PL_sv_undef))) { sw = _core_swash_init("utf8", /* the utf8 package */ "", /* nameless */ @@ -7229,16 +7656,18 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit } } + /* If requested, return a printable version of what this swash matches */ if (listsvp) { SV* matches_string = newSVpvn("", 0); - /* Use the swash, if any, which has to have incorporated into it all - * possibilities */ + /* The swash should be used, if possible, to get the data, as it + * contains the resolved data. But this function can be called at + * compile-time, before everything gets resolved, in which case we + * return the currently best available information, which is the string + * that will eventually be used to do that resolving, 'si' */ if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) && (si && si != &PL_sv_undef)) { - - /* If no swash, use the input initialization string, if available */ sv_catsv(matches_string, si); } @@ -7252,12 +7681,14 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit return sw; } +#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ /* - reginclass - determine if a character falls into a character class n is the ANYOF regnode p is the target string + p_end points to one byte beyond the end of the target string utf8_target tells whether p is in UTF-8. Returns true if matched; false otherwise. @@ -7269,7 +7700,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit */ STATIC bool -S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target) +S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) { dVAR; const char flags = ANYOF_FLAGS(n); @@ -7282,7 +7713,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_IS_INVARIANT() works even if not in UTF-8 */ if (! UTF8_IS_INVARIANT(c) && utf8_target) { STRLEN c_len = 0; - c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len, + c = utf8n_to_uvchr(p, p_end - p, &c_len, (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for @@ -7295,21 +7726,19 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const if (c < 256) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; - else if (flags & ANYOF_NON_UTF8_LATIN1_ALL + else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL && ! utf8_target && ! isASCII(c)) { match = TRUE; } - else if (flags & ANYOF_LOCALE) { - RXp_MATCH_TAINTED_on(prog); - - if ((flags & ANYOF_LOC_FOLD) - && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) - { - match = TRUE; + else if (flags & ANYOF_LOCALE_FLAGS) { + if (flags & ANYOF_LOC_FOLD) { + if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { + match = TRUE; + } } - else if (ANYOF_CLASS_TEST_ANY_SET(n)) { + if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { /* The data structure is arranged so bits 0, 2, 4, ... are set * if the class includes the Posix character class given by @@ -7343,8 +7772,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const int count = 0; int to_complement = 0; + while (count < ANYOF_MAX) { - if (ANYOF_CLASS_TEST(n, count) + if (ANYOF_POSIXL_TEST(n, count) && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) { match = TRUE; @@ -7357,27 +7787,22 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } } + /* If the bitmap didn't (or couldn't) match, and something outside the - * bitmap could match, try that. Locale nodes specify completely the - * behavior of code points in the bit map (otherwise, a utf8 target would - * cause them to be treated as Unicode and not locale), except in - * the very unlikely event when this node is a synthetic start class, which - * could be a combination of locale and non-locale nodes. So allow locale - * to match for the synthetic start class, which will give a false - * positive that will be resolved when the match is done again as not part - * of the synthetic start class */ + * bitmap could match, try that. */ if (!match) { - if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { + if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { match = TRUE; /* Everything above 255 matches */ } - else if (ANYOF_NONBITMAP(n) - && ((flags & ANYOF_NONBITMAP_NON_UTF8) - || (utf8_target - && (c >=256 - || (! (flags & ANYOF_LOCALE)) - || OP(n) == ANYOF_SYNTHETIC)))) + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && (flags & ANYOF_UTF8)) + || ((flags & ANYOF_LOC_FOLD) + && IN_UTF8_CTYPE_LOCALE + && ARG(n) != ANYOF_NONBITMAP_EMPTY)) { - SV * const sw = core_regclass_swash(prog, n, TRUE, 0); + SV* only_utf8_locale = NULL; + SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale); if (sw) { U8 * utf8_p; if (utf8_target) { @@ -7394,23 +7819,32 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const /* If we allocated a string above, free it */ if (! utf8_target) Safefree(utf8_p); } + if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { + match = _invlist_contains_cp(only_utf8_locale, c); + } } if (UNICODE_IS_SUPER(c) - && OP(n) == ANYOF_WARN_SUPER + && (flags & ANYOF_WARN_SUPER) && ckWARN_d(WARN_NON_UNICODE)) { Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), - "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c); + "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c); } } +#if ANYOF_INVERT != 1 + /* Depending on compiler optimization cBOOL takes time, so if don't have to + * use it, don't */ +# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, +#endif + /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ - return cBOOL(flags & ANYOF_INVERT) ^ match; + return (flags & ANYOF_INVERT) ^ match; } STATIC U8 * -S_reghop3(U8 *s, I32 off, const U8* lim) +S_reghop3(U8 *s, SSize_t off, const U8* lim) { /* return the position 'off' UTF-8 characters away from 's', forward if * 'off' >= 0, backwards if negative. But don't go outside of position @@ -7439,13 +7873,8 @@ S_reghop3(U8 *s, I32 off, const U8* lim) return s; } -#ifdef XXX_dmq -/* there are a bunch of places where we use two reghop3's that should - be replaced with this routine. but since thats not done yet - we ifdef it out - dmq -*/ STATIC U8 * -S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) +S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) { dVAR; @@ -7469,10 +7898,12 @@ S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) } return s; } -#endif + +/* like reghop3, but returns NULL on overrun, rather than returning last + * char pos */ STATIC U8 * -S_reghopmaybe3(U8* s, I32 off, const U8* lim) +S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) { dVAR; @@ -7540,6 +7971,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) } eval_state->pos_magic = mg; eval_state->pos = mg->mg_len; + eval_state->pos_flags = mg->mg_flags; } else eval_state->pos_magic = NULL; @@ -7556,7 +7988,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) /* this regexp is also owned by the new PL_reg_curpm, which will try to free it. */ av_push(PL_regex_padav, repointer); - PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav); + PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); PL_regex_pad = AvARRAY(PL_regex_padav); } #endif @@ -7614,7 +8046,12 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg) RXp_MATCH_COPIED_on(rex); } if (eval_state->pos_magic) + { eval_state->pos_magic->mg_len = eval_state->pos; + eval_state->pos_magic->mg_flags = + (eval_state->pos_magic->mg_flags & ~MGf_BYTES) + | (eval_state->pos_flags & MGf_BYTES); + } PL_curpm = eval_state->curpm; } diff --git a/src/5019002/dquote_static.c b/src/5021001/dquote_static.c similarity index 86% rename from src/5019002/dquote_static.c rename to src/5021001/dquote_static.c index d5241ca..3554d33 100644 --- a/src/5019002/dquote_static.c +++ b/src/5021001/dquote_static.c @@ -15,12 +15,9 @@ Pulled from regcomp.c. */ PERL_STATIC_INLINE I32 -S_regcurly(pTHX_ const char *s, - const bool rbrace_must_be_escaped /* Should the terminating '} be - preceded by a backslash? This - is an abnormal case */ - ) +S_regcurly(pTHX_ const char *s) { + PERL_UNUSED_CONTEXT; PERL_ARGS_ASSERT_REGCURLY; if (*s++ != '{') @@ -35,9 +32,7 @@ S_regcurly(pTHX_ const char *s, s++; } - return (rbrace_must_be_escaped) - ? *s == '\\' && *(s+1) == '}' - : *s == '}'; + return *s == '}'; } /* XXX Add documentation after final interface and behavior is decided */ @@ -46,44 +41,36 @@ S_regcurly(pTHX_ const char *s, */ STATIC char -S_grok_bslash_c(pTHX_ const char source, const bool utf8, const bool output_warning) +S_grok_bslash_c(pTHX_ const char source, const bool output_warning) { U8 result; - if (utf8) { - /* Trying to deprecate non-ASCII usages. This construct has never - * worked for a utf8 variant. So, even though are accepting non-ASCII - * Latin1 in 5.14, no need to make them work under utf8 */ - if (! isASCII(source)) { - Perl_croak(aTHX_ "Character following \"\\c\" must be ASCII"); - } + if (! isPRINT_A(source)) { + Perl_croak(aTHX_ "%s", + "Character following \"\\c\" must be printable ASCII"); } + else if (source == '{') { + assert(isPRINT_A(toCTRL('{'))); - result = toCTRL(source); - if (! isASCII(source)) { - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), - "Character following \"\\c\" must be ASCII"); + /* diag_listed_as: Use "%s" instead of "%s" */ + Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", toCTRL('{')); } - else if (! isCNTRL(result) && output_warning) { - if (source == '{') { - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), - "\"\\c{\" is deprecated and is more clearly written as \";\""); - } - else { - U8 clearer[3]; - U8 i = 0; - if (! isWORDCHAR(result)) { - clearer[i++] = '\\'; - } - clearer[i++] = result; - clearer[i++] = '\0'; - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "\"\\c%c\" is more clearly written simply as \"%s\"", - source, - clearer); + result = toCTRL(source); + if (output_warning && isPRINT_A(result)) { + U8 clearer[3]; + U8 i = 0; + if (! isWORDCHAR(result)) { + clearer[i++] = '\\'; } + clearer[i++] = result; + clearer[i++] = '\0'; + + Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), + "\"\\c%c\" is more clearly written simply as \"%s\"", + source, + clearer); } return result; diff --git a/src/5021001/inline_invlist.c b/src/5021001/inline_invlist.c new file mode 100644 index 0000000..6cdeff4 --- /dev/null +++ b/src/5021001/inline_invlist.c @@ -0,0 +1,66 @@ +/* inline_invlist.c + * + * Copyright (C) 2012 by Larry Wall and others + * + * You may distribute under the terms of either the GNU General Public + * License or the Artistic License, as specified in the README file. + */ + +#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) + +/* An element is in an inversion list iff its index is even numbered: 0, 2, 4, + * etc */ +#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) +#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) + +/* This converts to/from our UVs to what the SV code is expecting: bytes. */ +#define TO_INTERNAL_SIZE(x) ((x) * sizeof(UV)) +#define FROM_INTERNAL_SIZE(x) ((x)/ sizeof(UV)) + +PERL_STATIC_INLINE bool* +S_get_invlist_offset_addr(SV* invlist) +{ + /* Return the address of the field that says whether the inversion list is + * offset (it contains 1) or not (contains 0) */ + PERL_ARGS_ASSERT_GET_INVLIST_OFFSET_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->is_offset); +} + +PERL_STATIC_INLINE UV +S__invlist_len(SV* const invlist) +{ + /* Returns the current number of elements stored in the inversion list's + * array */ + + PERL_ARGS_ASSERT__INVLIST_LEN; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return (SvCUR(invlist) == 0) + ? 0 + : FROM_INTERNAL_SIZE(SvCUR(invlist)) - *get_invlist_offset_addr(invlist); +} + +PERL_STATIC_INLINE bool +S__invlist_contains_cp(SV* const invlist, const UV cp) +{ + /* Does contain code point as part of the set? */ + + IV index = _invlist_search(invlist, cp); + + PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP; + + return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index); +} + +# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGEXEC_C) + +/* These symbols are only needed later in regcomp.c */ +# undef TO_INTERNAL_SIZE +# undef FROM_INTERNAL_SIZE +# endif + +#endif diff --git a/src/5019002/orig/dquote_static.c b/src/5021001/orig/dquote_static.c similarity index 86% rename from src/5019002/orig/dquote_static.c rename to src/5021001/orig/dquote_static.c index da1b5b9..2fcb0fa 100644 --- a/src/5019002/orig/dquote_static.c +++ b/src/5021001/orig/dquote_static.c @@ -15,12 +15,9 @@ Pulled from regcomp.c. */ PERL_STATIC_INLINE I32 -S_regcurly(pTHX_ const char *s, - const bool rbrace_must_be_escaped /* Should the terminating '} be - preceded by a backslash? This - is an abnormal case */ - ) +S_regcurly(pTHX_ const char *s) { + PERL_UNUSED_CONTEXT; PERL_ARGS_ASSERT_REGCURLY; if (*s++ != '{') @@ -35,9 +32,7 @@ S_regcurly(pTHX_ const char *s, s++; } - return (rbrace_must_be_escaped) - ? *s == '\\' && *(s+1) == '}' - : *s == '}'; + return *s == '}'; } /* XXX Add documentation after final interface and behavior is decided */ @@ -46,44 +41,36 @@ S_regcurly(pTHX_ const char *s, */ STATIC char -S_grok_bslash_c(pTHX_ const char source, const bool utf8, const bool output_warning) +S_grok_bslash_c(pTHX_ const char source, const bool output_warning) { U8 result; - if (utf8) { - /* Trying to deprecate non-ASCII usages. This construct has never - * worked for a utf8 variant. So, even though are accepting non-ASCII - * Latin1 in 5.14, no need to make them work under utf8 */ - if (! isASCII(source)) { - Perl_croak(aTHX_ "Character following \"\\c\" must be ASCII"); - } + if (! isPRINT_A(source)) { + Perl_croak(aTHX_ "%s", + "Character following \"\\c\" must be printable ASCII"); } + else if (source == '{') { + assert(isPRINT_A(toCTRL('{'))); - result = toCTRL(source); - if (! isASCII(source)) { - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), - "Character following \"\\c\" must be ASCII"); + /* diag_listed_as: Use "%s" instead of "%s" */ + Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", toCTRL('{')); } - else if (! isCNTRL(result) && output_warning) { - if (source == '{') { - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX), - "\"\\c{\" is deprecated and is more clearly written as \";\""); - } - else { - U8 clearer[3]; - U8 i = 0; - if (! isWORDCHAR(result)) { - clearer[i++] = '\\'; - } - clearer[i++] = result; - clearer[i++] = '\0'; - - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "\"\\c%c\" is more clearly written simply as \"%s\"", - source, - clearer); - } + + result = toCTRL(source); + if (output_warning && isPRINT_A(result)) { + U8 clearer[3]; + U8 i = 0; + if (! isWORDCHAR(result)) { + clearer[i++] = '\\'; + } + clearer[i++] = result; + clearer[i++] = '\0'; + + Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), + "\"\\c%c\" is more clearly written simply as \"%s\"", + source, + clearer); } return result; diff --git a/src/5021001/orig/inline_invlist.c b/src/5021001/orig/inline_invlist.c new file mode 100644 index 0000000..1875a05 --- /dev/null +++ b/src/5021001/orig/inline_invlist.c @@ -0,0 +1,66 @@ +/* inline_invlist.c + * + * Copyright (C) 2012 by Larry Wall and others + * + * You may distribute under the terms of either the GNU General Public + * License or the Artistic License, as specified in the README file. + */ + +#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) + +/* An element is in an inversion list iff its index is even numbered: 0, 2, 4, + * etc */ +#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) +#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) + +/* This converts to/from our UVs to what the SV code is expecting: bytes. */ +#define TO_INTERNAL_SIZE(x) ((x) * sizeof(UV)) +#define FROM_INTERNAL_SIZE(x) ((x)/ sizeof(UV)) + +PERL_STATIC_INLINE bool* +S_get_invlist_offset_addr(SV* invlist) +{ + /* Return the address of the field that says whether the inversion list is + * offset (it contains 1) or not (contains 0) */ + PERL_ARGS_ASSERT_GET_INVLIST_OFFSET_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->is_offset); +} + +PERL_STATIC_INLINE UV +S__invlist_len(SV* const invlist) +{ + /* Returns the current number of elements stored in the inversion list's + * array */ + + PERL_ARGS_ASSERT__INVLIST_LEN; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return (SvCUR(invlist) == 0) + ? 0 + : FROM_INTERNAL_SIZE(SvCUR(invlist)) - *get_invlist_offset_addr(invlist); +} + +PERL_STATIC_INLINE bool +S__invlist_contains_cp(SV* const invlist, const UV cp) +{ + /* Does contain code point as part of the set? */ + + IV index = _invlist_search(invlist, cp); + + PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP; + + return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index); +} + +# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGEXEC_C) + +/* These symbols are only needed later in regcomp.c */ +# undef TO_INTERNAL_SIZE +# undef FROM_INTERNAL_SIZE +# endif + +#endif diff --git a/src/5019001/orig/regcomp.c b/src/5021001/orig/regcomp.c similarity index 70% rename from src/5019001/orig/regcomp.c rename to src/5021001/orig/regcomp.c index e660e5e..b7a7b35 100644 --- a/src/5019001/orig/regcomp.c +++ b/src/5021001/orig/regcomp.c @@ -81,7 +81,7 @@ #define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" -extern const struct regexp_engine my_reg_engine; +EXTERN_C const struct regexp_engine my_reg_engine; #else # include "regcomp.h" #endif @@ -91,51 +91,48 @@ extern const struct regexp_engine my_reg_engine; #include "inline_invlist.c" #include "unicode_constants.h" -#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) -#ifdef op -#undef op -#endif /* op */ - -#ifdef MSDOS -# if defined(BUGGY_MSC6) - /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ -# pragma optimize("a",off) - /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ -# pragma optimize("w",on ) -# endif /* BUGGY_MSC6 */ -#endif /* MSDOS */ - #ifndef STATIC #define STATIC static #endif -typedef struct RExC_state_t { +struct RExC_state_t { U32 flags; /* RXf_* are we folding, multilining? */ U32 pm_flags; /* PMf_* stuff from the calling PMOP */ char *precomp; /* uncompiled string. */ REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ - regexp_internal *rxi; /* internal data for regexp object pprivate field */ + regexp_internal *rxi; /* internal data for regexp object + pprivate field */ char *start; /* Start of input for compile */ char *end; /* End of input for compile */ char *parse; /* Input-scan pointer. */ - I32 whilem_seen; /* number of WHILEM in this expr */ + SSize_t whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ - regnode *emit_bound; /* First regnode outside of the allocated space */ + regnode *emit_bound; /* First regnode outside of the + allocated space */ regnode *emit; /* Code-emit pointer; if = &emit_dummy, implies compiling, so don't emit */ - regnode emit_dummy; /* placeholder for emit to point to */ + regnode_ssc emit_dummy; /* placeholder for emit to point to; + large enough for the largest + non-EXACTish node, so can use it as + scratch in pass1 */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ U32 seen; - I32 size; /* Code size. */ - I32 npar; /* Capture buffer count, (OPEN). */ - I32 cpar; /* Capture buffer count, (CLOSE). */ - I32 nestroot; /* root parens we are in - used by accept */ + SSize_t size; /* Code size. */ + I32 npar; /* Capture buffer count, (OPEN) plus + one. ("par" 0 is the whole + pattern)*/ + I32 nestroot; /* root parens we are in - used by + accept */ I32 extralen; I32 seen_zerolen; regnode **open_parens; /* pointers to open parens */ @@ -149,18 +146,23 @@ typedef struct RExC_state_t { rules, even if the pattern is not in utf8 */ HV *paren_names; /* Paren names */ - + regnode **recurse; /* Recurse regops */ I32 recurse_count; /* Number of recurse regops */ + U8 *study_chunk_recursed; /* bitmap of which parens we have moved + through */ + U32 study_chunk_recursed_bytes; /* bytes in bitmap */ I32 in_lookbehind; I32 contains_locale; + I32 contains_i; I32 override_recoding; I32 in_multi_char_class; struct reg_code_block *code_blocks; /* positions of literal (?{}) within pattern */ int num_code_blocks; /* size of code_blocks[] */ int code_index; /* next code_blocks[] slot */ -#if ADD_TO_REGEXEC + SSize_t maxlen; /* mininum possible number of chars in string to match */ +#ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif @@ -173,7 +175,7 @@ typedef struct RExC_state_t { #define RExC_lastnum (pRExC_state->lastnum) #define RExC_paren_name_list (pRExC_state->paren_name_list) #endif -} RExC_state_t; +}; #define RExC_flags (pRExC_state->flags) #define RExC_pm_flags (pRExC_state->pm_flags) @@ -186,7 +188,8 @@ typedef struct RExC_state_t { #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) #ifdef RE_TRACK_PATTERN_OFFSETS -#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the + others */ #endif #define RExC_emit (pRExC_state->emit) #define RExC_emit_dummy (pRExC_state->emit_dummy) @@ -196,6 +199,7 @@ typedef struct RExC_state_t { #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) #define RExC_npar (pRExC_state->npar) #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) @@ -209,19 +213,20 @@ typedef struct RExC_state_t { #define RExC_paren_names (pRExC_state->paren_names) #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) +#define RExC_study_chunk_recursed_bytes \ + (pRExC_state->study_chunk_recursed_bytes) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) #define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_contains_i (pRExC_state->contains_i) #define RExC_override_recoding (pRExC_state->override_recoding) #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ - ((*s) == '{' && regcurly(s, FALSE))) + ((*s) == '{' && regcurly(s))) -#ifdef SPSTART -#undef SPSTART /* dratted cpp namespace... */ -#endif /* * Flags to be passed up and down. */ @@ -267,106 +272,129 @@ typedef struct RExC_state_t { #define namedclass_to_classnum(class) ((int) ((class) / 2)) #define classnum_to_namedclass(classnum) ((classnum) * 2) +#define _invlist_union_complement_2nd(a, b, output) \ + _invlist_union_maybe_complement_2nd(a, b, TRUE, output) +#define _invlist_intersection_complement_2nd(a, b, output) \ + _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) + /* About scan_data_t. During optimisation we recurse through the regexp program performing various inplace (keyhole style) optimisations. In addition study_chunk and scan_commit populate this data structure with information about - what strings MUST appear in the pattern. We look for the longest + what strings MUST appear in the pattern. We look for the longest string that must appear at a fixed location, and we look for the longest string that may appear at a floating location. So for instance in the pattern: - + /FOO[xX]A.*B[xX]BAR/ - + Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating strings (because they follow a .* construct). study_chunk will identify both FOO and BAR as being the longest fixed and floating strings respectively. - + The strings can be composites, for instance - + /(f)(o)(o)/ - + will result in a composite fixed substring 'foo'. - + For each string some basic information is maintained: - + - offset or min_offset This is the position the string must appear at, or not before. It also implicitly (when combined with minlenp) tells us how many characters must match before the string we are searching for. Likewise when combined with minlenp and the length of the string it - tells us how many characters must appear after the string we have + tells us how many characters must appear after the string we have found. - + - max_offset Only used for floating strings. This is the rightmost point that - the string can appear at. If set to I32 max it indicates that the + the string can appear at. If set to SSize_t_MAX it indicates that the string can occur infinitely far to the right. - + - minlenp A pointer to the minimum number of characters of the pattern that the string was found inside. This is important as in the case of positive - lookahead or positive lookbehind we can have multiple patterns + lookahead or positive lookbehind we can have multiple patterns involved. Consider - + /(?=FOO).*F/ - + The minimum length of the pattern overall is 3, the minimum length of the lookahead part is 3, but the minimum length of the part that - will actually match is 1. So 'FOO's minimum length is 3, but the + will actually match is 1. So 'FOO's minimum length is 3, but the minimum length for the F is 1. This is important as the minimum length - is used to determine offsets in front of and behind the string being + is used to determine offsets in front of and behind the string being looked for. Since strings can be composites this is the length of the pattern at the time it was committed with a scan_commit. Note that the length is calculated by study_chunk, so that the minimum lengths - are not known until the full pattern has been compiled, thus the + are not known until the full pattern has been compiled, thus the pointer to the value. - + - lookbehind - + In the case of lookbehind the string being searched for can be - offset past the start point of the final matching string. + offset past the start point of the final matching string. If this value was just blithely removed from the min_offset it would invalidate some of the calculations for how many chars must match before or after (as they are derived from min_offset and minlen and - the length of the string being searched for). + the length of the string being searched for). When the final pattern is compiled and the data is moved from the scan_data_t structure into the regexp structure the information - about lookbehind is factored in, with the information that would - have been lost precalculated in the end_shift field for the + about lookbehind is factored in, with the information that would + have been lost precalculated in the end_shift field for the associated string. The fields pos_min and pos_delta are used to store the minimum offset - and the delta to the maximum offset at the current point in the pattern. + and the delta to the maximum offset at the current point in the pattern. */ typedef struct scan_data_t { /*I32 len_min; unused */ /*I32 len_delta; unused */ - I32 pos_min; - I32 pos_delta; + SSize_t pos_min; + SSize_t pos_delta; SV *last_found; - I32 last_end; /* min value, <0 unless valid. */ - I32 last_start_min; - I32 last_start_max; + SSize_t last_end; /* min value, <0 unless valid. */ + SSize_t last_start_min; + SSize_t last_start_max; SV **longest; /* Either &l_fixed, or &l_float. */ SV *longest_fixed; /* longest fixed string found in pattern */ - I32 offset_fixed; /* offset where it starts */ - I32 *minlen_fixed; /* pointer to the minlen relevant to the string */ + SSize_t offset_fixed; /* offset where it starts */ + SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ I32 lookbehind_fixed; /* is the position of the string modfied by LB */ SV *longest_float; /* longest floating string found in pattern */ - I32 offset_float_min; /* earliest point in string it can appear */ - I32 offset_float_max; /* latest point in string it can appear */ - I32 *minlen_float; /* pointer to the minlen relevant to the string */ - I32 lookbehind_float; /* is the position of the string modified by LB */ + SSize_t offset_float_min; /* earliest point in string it can appear */ + SSize_t offset_float_max; /* latest point in string it can appear */ + SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ + SSize_t lookbehind_float; /* is the pos of the string modified by LB */ I32 flags; I32 whilem_c; - I32 *last_closep; - struct regnode_charclass_class *start_class; + SSize_t *last_closep; + regnode_ssc *start_class; } scan_data_t; +/* The below is perhaps overboard, but this allows us to save a test at the + * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' + * and 'a' differ by a single bit; the same with the upper and lower case of + * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; + * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and + * then inverts it to form a mask, with just a single 0, in the bit position + * where the upper- and lowercase differ. XXX There are about 40 other + * instances in the Perl core where this micro-optimization could be used. + * Should decide if maintenance cost is worse, before changing those + * + * Returns a boolean as to whether or not 'v' is either a lowercase or + * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a + * compile-time constant, the generated code is better than some optimizing + * compilers figure out, amounting to a mask and test. The results are + * meaningless if 'c' is not one of [A-Za-z] */ +#define isARG2_lower_or_UPPER_ARG1(c, v) \ + (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) + /* * Forward declarations for pregcomp()'s friends. */ @@ -380,13 +408,8 @@ static const scan_data_t zero_scan_data = #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) -#ifdef NO_UNARY_PLUS -# define SF_FIX_SHIFT_EOL (0+2) -# define SF_FL_SHIFT_EOL (0+4) -#else -# define SF_FIX_SHIFT_EOL (+2) -# define SF_FL_SHIFT_EOL (+4) -#endif +#define SF_FIX_SHIFT_EOL (+2) +#define SF_FL_SHIFT_EOL (+4) #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) @@ -404,22 +427,32 @@ static const scan_data_t zero_scan_data = #define SCF_WHILEM_VISITED_POS 0x2000 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ -#define SCF_SEEN_ACCEPT 0x8000 +#define SCF_SEEN_ACCEPT 0x8000 #define SCF_TRIE_DOING_RESTUDY 0x10000 #define UTF cBOOL(RExC_utf8) /* The enums for all these are ordered so things work out correctly */ #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) -#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ + == REGEX_DEPENDS_CHARSET) #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) -#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) -#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) -#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) -#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ + >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) +/* For programs that want to be strictly Unicode compatible by dying if any + * attempt is made to match a non-Unicode code point against a Unicode + * property. */ +#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) + #define OOB_NAMEDCLASS -1 /* There is no code point that is out-of-bounds, so this is problematic. But @@ -442,7 +475,12 @@ static const scan_data_t zero_scan_data = #define MARKER1 "<-- HERE" /* marker as it appears in the description */ #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ -#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" +#define REPORT_LOCATION " in regex; marked by " MARKER1 \ + " in m/%"UTF8f MARKER2 "%"UTF8f"/" + +#define REPORT_LOCATION_ARGS(offset) \ + UTF8fARG(UTF, offset, RExC_precomp), \ + UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) /* * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given @@ -464,12 +502,12 @@ static const scan_data_t zero_scan_data = } STMT_END #define FAIL(msg) _FAIL( \ - Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ - msg, (int)len, RExC_precomp, ellipses)) + Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ + msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) #define FAIL2(msg,arg) _FAIL( \ - Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ - arg, (int)len, RExC_precomp, ellipses)) + Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ + arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) /* * Simple_vFAIL -- like FAIL, but marks the current location in the scan @@ -477,7 +515,7 @@ static const scan_data_t zero_scan_data = #define Simple_vFAIL(m) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -494,8 +532,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL2(m,a1) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -513,8 +551,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL3(m, a1, a2) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -531,8 +569,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vFAIL4(m,a1,a2,a3) STMT_START { \ @@ -541,80 +579,90 @@ static const scan_data_t zero_scan_data = Simple_vFAIL4(m, a1, a2, a3); \ } STMT_END +/* A specialized version of vFAIL2 that works with UTF8f */ +#define vFAIL2utf8f(m, a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + /* m is not necessarily a "literal string", in this macro */ #define reg_warn_non_literal_string(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNreg(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN_dep(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNregdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END -#define ckWARN2regdep(loc,m, a1) STMT_START { \ +#define ckWARN2reg_d(loc,m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ m REPORT_LOCATION, \ - a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN2reg(loc, m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN3(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN3reg(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ } STMT_END @@ -623,7 +671,7 @@ static const scan_data_t zero_scan_data = if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ } STMT_END -/* Macros for recording node offsets. 20001227 mjd@plover.com +/* Macros for recording node offsets. 20001227 mjd@plover.com * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in * element 2*n-1 of the array. Element #2n holds the byte length node #n. * Element 0 holds the number n. @@ -636,8 +684,8 @@ static const scan_data_t zero_scan_data = #define Set_Node_Length_To_R(node,len) #define Set_Node_Length(node,len) #define Set_Node_Cur_Length(node,start) -#define Node_Offset(n) -#define Node_Length(n) +#define Node_Offset(n) +#define Node_Length(n) #define Set_Node_Offset_Length(node,offset,len) #define ProgLen(ri) ri->u.proglen #define SetProgLen(ri,x) ri->u.proglen = x @@ -649,7 +697,8 @@ static const scan_data_t zero_scan_data = MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ __LINE__, (int)(node), (int)(byte))); \ if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", \ + (int)(node)); \ } else { \ RExC_offsets[2*(node)-1] = (byte); \ } \ @@ -665,7 +714,8 @@ static const scan_data_t zero_scan_data = MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ __LINE__, (int)(node), (int)(len))); \ if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \ + Perl_croak(aTHX_ "value of node is %d in Length macro", \ + (int)(node)); \ } else { \ RExC_offsets[2*(node)] = (len); \ } \ @@ -691,6 +741,49 @@ static const scan_data_t zero_scan_data = #define EXPERIMENTAL_INPLACESCAN #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ +#define DEBUG_RExC_seen() \ + DEBUG_OPTIMISE_MORE_r({ \ + PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ + \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ + \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ + \ + if (RExC_seen & REG_GPOS_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ + \ + if (RExC_seen & REG_CANY_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ + \ + if (RExC_seen & REG_RECURSE_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ + \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ + \ + if (RExC_seen & REG_VERBARG_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ + \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ + \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ + \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ + \ + if (RExC_seen & REG_GOSTART_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ + PerlIO_printf(Perl_debug_log,"\n"); \ + }); + #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ PerlIO_printf(Perl_debug_log, \ @@ -730,7 +823,8 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ floating substrings if needed. */ STATIC void -S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf) +S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, + SSize_t *minlenp, int is_inf) { const STRLEN l = CHR_SVLEN(data->last_found); const STRLEN old_l = CHR_SVLEN(*data->longest); @@ -754,9 +848,12 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min data->offset_float_min = l ? data->last_start_min : data->pos_min; data->offset_float_max = (l ? data->last_start_max - : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta)); - if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX) - data->offset_float_max = I32_MAX; + : (data->pos_delta == SSize_t_MAX + ? SSize_t_MAX + : data->pos_min + data->pos_delta)); + if (is_inf + || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) + data->offset_float_max = SSize_t_MAX; if (data->flags & SF_BEFORE_EOL) data->flags |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); @@ -780,299 +877,591 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min DEBUG_STUDYDATA("commit: ",data,0); } -/* These macros set, clear and test whether the synthetic start class ('ssc', - * given by the parameter) matches an empty string (EOS). This uses the - * 'next_off' field in the node, to save a bit in the flags field. The ssc - * stands alone, so there is never a next_off, so this field is otherwise - * unused. The EOS information is used only for compilation, but theoretically - * it could be passed on to the execution code. This could be used to store - * more than one bit of information, but only this one is currently used. */ -#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END -#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END -#define TEST_SSC_EOS(node) cBOOL((node)->next_off) - -/* Can match anything (initialization) */ +/* An SSC is just a regnode_charclass_posix with an extra field: the inversion + * list that describes which code points it matches */ + +STATIC void +S_ssc_anything(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to match an empty string or any code point */ + + PERL_ARGS_ASSERT_SSC_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ + _append_range_to_invlist(ssc->invlist, 0, UV_MAX); + ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ +} + +STATIC int +S_ssc_is_anything(const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' can match the empty string and any code + * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys + * us anything: if the function returns TRUE, 'ssc' hasn't been restricted + * in any way, so there's no point in using it */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { + return FALSE; + } + + /* See if the list consists solely of the range 0 - Infinity */ + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (ret) { + return TRUE; + } + + /* If e.g., both \w and \W are set, matches everything */ + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { + return TRUE; + } + } + } + + return FALSE; +} + STATIC void -S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) { - PERL_ARGS_ASSERT_CL_ANYTHING; + /* Initializes the SSC 'ssc'. This includes setting it to match an empty + * string, any code point, or any posix class under locale */ - ANYOF_BITMAP_SETALL(cl); - cl->flags = ANYOF_UNICODE_ALL; - SET_SSC_EOS(cl); + PERL_ARGS_ASSERT_SSC_INIT; + + Zero(ssc, 1, regnode_ssc); + set_ANYOF_SYNTHETIC(ssc); + ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); + ssc_anything(ssc); /* If any portion of the regex is to operate under locale rules, * initialization includes it. The reason this isn't done for all regexes * is that the optimizer was written under the assumption that locale was * all-or-nothing. Given the complexity and lack of documentation in the - * optimizer, and that there are inadequate test cases for locale, so many + * optimizer, and that there are inadequate test cases for locale, many * parts of it may not work properly, it is safest to avoid locale unless * necessary. */ if (RExC_contains_locale) { - ANYOF_CLASS_SETALL(cl); /* /l uses class */ - cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD; + ANYOF_POSIXL_SETALL(ssc); } else { - ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */ + ANYOF_POSIXL_ZERO(ssc); } } -/* Can match anything (initialization) */ STATIC int -S_cl_is_anything(const struct regnode_charclass_class *cl) +S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, + const regnode_ssc *ssc) { - int value; + /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only + * to the list of code points matched, and locale posix classes; hence does + * not check its flags) */ - PERL_ARGS_ASSERT_CL_IS_ANYTHING; + UV start, end; + bool ret; - for (value = 0; value < ANYOF_MAX; value += 2) - if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) - return 1; - if (!(cl->flags & ANYOF_UNICODE_ALL)) - return 0; - if (!ANYOF_BITMAP_TESTALLSET((const void*)cl)) - return 0; - return 1; + PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (! ret) { + return FALSE; + } + + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { + return FALSE; + } + + return TRUE; } -/* Can match anything (initialization) */ -STATIC void -S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +STATIC SV* +S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, + const regnode_charclass* const node) { - PERL_ARGS_ASSERT_CL_INIT; + /* Returns a mortal inversion list defining which code points are matched + * by 'node', which is of type ANYOF. Handles complementing the result if + * appropriate. If some code points aren't knowable at this time, the + * returned list must, and will, contain every code point that is a + * possibility. */ + + SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; + unsigned int i; + const U32 n = ARG(node); + bool new_node_has_latin1 = FALSE; + + PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; + + /* Look at the data structure created by S_set_ANYOF_arg() */ + if (n != ANYOF_NONBITMAP_EMPTY) { + SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + assert(RExC_rxi->data->what[n] == 's'); + + if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ + invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]))); + } + else if (ary[0] && ary[0] != &PL_sv_undef) { + + /* Here, no compile-time swash, and there are things that won't be + * known until runtime -- we have to assume it could be anything */ + return _add_range_to_invlist(invlist, 0, UV_MAX); + } + else if (ary[3] && ary[3] != &PL_sv_undef) { + + /* Here no compile-time swash, and no run-time only data. Use the + * node's inversion list */ + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; + } + } + + /* An ANYOF node contains a bitmap for the first 256 code points, and an + * inversion list for the others, but if there are code points that should + * match only conditionally on the target string being UTF-8, those are + * placed in the inversion list, and not the bitmap. Since there are + * circumstances under which they could match, they are included in the + * SSC. But if the ANYOF node is to be inverted, we have to exclude them + * here, so that when we invert below, the end result actually does include + * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here + * before we add the unconditionally matched code points */ + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_intersection_complement_2nd(invlist, + PL_UpperLatin1, + &invlist); + } + + /* Add in the points from the bit map */ + for (i = 0; i < 256; i++) { + if (ANYOF_BITMAP_TEST(node, i)) { + invlist = add_cp_to_invlist(invlist, i); + new_node_has_latin1 = TRUE; + } + } + + /* If this can match all upper Latin1 code points, have to add them + * as well */ + if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + _invlist_union(invlist, PL_UpperLatin1, &invlist); + } + + /* Similarly for these */ + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + } + + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_invert(invlist); + } + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + + /* Under /li, any 0-255 could fold to any other 0-255, depending on the + * locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { + _invlist_union_maybe_complement_2nd(invlist, + only_utf8_locale_invlist, + ANYOF_FLAGS(node) & ANYOF_INVERT, + &invlist); + } - Zero(cl, 1, struct regnode_charclass_class); - cl->type = ANYOF; - cl_anything(pRExC_state, cl); - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); + return invlist; } /* These two functions currently do the exact same thing */ -#define cl_init_zero S_cl_init +#define ssc_init_zero ssc_init + +#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) +#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) + +/* 'AND' a given class with another one. Can create false positives. 'ssc' + * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ -/* 'AND' a given class with another one. Can create false positives. 'cl' - * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if - * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void -S_cl_and(struct regnode_charclass_class *cl, - const struct regnode_charclass_class *and_with) +S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *and_with) { - PERL_ARGS_ASSERT_CL_AND; + /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either + * another SSC or a regular ANYOF class. Can create false positives. */ + + SV* anded_cp_list; + U8 anded_flags; + + PERL_ARGS_ASSERT_SSC_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(and_with)) { + anded_cp_list = ((regnode_ssc *)and_with)->invlist; + anded_flags = ANYOF_FLAGS(and_with); + + /* XXX This is a kludge around what appears to be deficiencies in the + * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, + * there are paths through the optimizer where it doesn't get weeded + * out when it should. And if we don't make some extra provision for + * it like the code just below, it doesn't get added when it should. + * This solution is to add it only when AND'ing, which is here, and + * only when what is being AND'ed is the pristine, original node + * matching anything. Thus it is like adding it to ssc_anything() but + * only when the result is to be AND'ed. Probably the same solution + * could be adopted for the same problem we have with /l matching, + * which is solved differently in S_ssc_init(), and that would lead to + * fewer false positives than that solution has. But if this solution + * creates bugs, the consequences are only that a warning isn't raised + * that should be; while the consequences for having /l bugs is + * incorrect matches */ + if (ssc_is_anything((regnode_ssc *)and_with)) { + anded_flags |= ANYOF_WARN_SUPER; + } + } + else { + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } - assert(PL_regkind[and_with->type] == ANYOF); + ANYOF_FLAGS(ssc) &= anded_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'and-with'; P2, its posix classes. + * 'and_with' may be inverted. When not inverted, we have the situation of + * computing: + * (C1 | P1) & (C2 | P2) + * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) + * <= ((C1 & C2) | P1 | P2) + * Alternatively, the last few steps could be: + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) + * <= (C1 | C2 | (P1 & P2)) + * We favor the second approach if either P1 or P2 is non-empty. This is + * because these components are a barrier to doing optimizations, as what + * they match cannot be known until the moment of matching as they are + * dependent on the current locale, 'AND"ing them likely will reduce or + * eliminate them. + * But we can do better if we know that C1,P1 are in their initial state (a + * frequent occurrence), each matching everything: + * () & (C2 | P2) = C2 | P2 + * Similarly, if C2,P2 are in their initial state (again a frequent + * occurrence), the result is a no-op + * (C1 | P1) & () = C1 | P1 + * + * Inverted, we have + * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) + * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) + * <= (C1 & ~C2) | (P1 & ~P2) + * */ - /* I (khw) am not sure all these restrictions are necessary XXX */ - if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) - && !(ANYOF_CLASS_TEST_ANY_SET(cl)) - && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(and_with->flags & ANYOF_LOC_FOLD) - && !(cl->flags & ANYOF_LOC_FOLD)) { - int i; + if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(and_with)) + { + unsigned int i; - if (and_with->flags & ANYOF_INVERT) - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] &= ~and_with->bitmap[i]; - else - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] &= and_with->bitmap[i]; - } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ - - if (and_with->flags & ANYOF_INVERT) { - - /* Here, the and'ed node is inverted. Get the AND of the flags that - * aren't affected by the inversion. Those that are affected are - * handled individually below */ - U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS; - cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS); - cl->flags |= affected_flags; - - /* We currently don't know how to deal with things that aren't in the - * bitmap, but we know that the intersection is no greater than what - * is already in cl, so let there be false positives that get sorted - * out after the synthetic start class succeeds, and the node is - * matched for real. */ - - /* The inversion of these two flags indicate that the resulting - * intersection doesn't have them */ - if (and_with->flags & ANYOF_UNICODE_ALL) { - cl->flags &= ~ANYOF_UNICODE_ALL; - } - if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) { - cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; - } - } - else { /* and'd node is not inverted */ - U8 outside_bitmap_but_not_utf8; /* Temp variable */ - - if (! ANYOF_NONBITMAP(and_with)) { - - /* Here 'and_with' doesn't match anything outside the bitmap - * (except possibly ANYOF_UNICODE_ALL), which means the - * intersection can't either, except for ANYOF_UNICODE_ALL, in - * which case we don't know what the intersection is, but it's no - * greater than what cl already has, so can just leave it alone, - * with possible false positives */ - if (! (and_with->flags & ANYOF_UNICODE_ALL)) { - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); - cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8; - } - } - else if (! ANYOF_NONBITMAP(cl)) { - - /* Here, 'and_with' does match something outside the bitmap, and cl - * doesn't have a list of things to match outside the bitmap. If - * cl can match all code points above 255, the intersection will - * be those above-255 code points that 'and_with' matches. If cl - * can't match all Unicode code points, it means that it can't - * match anything outside the bitmap (since the 'if' that got us - * into this block tested for that), so we leave the bitmap empty. - */ - if (cl->flags & ANYOF_UNICODE_ALL) { - ARG_SET(cl, ARG(and_with)); + ssc_intersection(ssc, + anded_cp_list, + FALSE /* Has already been inverted */ + ); - /* and_with's ARG may match things that don't require UTF8. - * And now cl's will too, in spite of this being an 'and'. See - * the comments below about the kludge */ - cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8; - } - } - else { - /* Here, both 'and_with' and cl match something outside the - * bitmap. Currently we do not do the intersection, so just match - * whatever cl had at the beginning. */ - } - - - /* Take the intersection of the two sets of flags. However, the - * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a - * kludge around the fact that this flag is not treated like the others - * which are initialized in cl_anything(). The way the optimizer works - * is that the synthetic start class (SSC) is initialized to match - * anything, and then the first time a real node is encountered, its - * values are AND'd with the SSC's with the result being the values of - * the real node. However, there are paths through the optimizer where - * the AND never gets called, so those initialized bits are set - * inappropriately, which is not usually a big deal, as they just cause - * false positives in the SSC, which will just mean a probably - * imperceptible slow down in execution. However this bit has a - * higher false positive consequence in that it can cause utf8.pm, - * utf8_heavy.pl ... to be loaded when not necessary, which is a much - * bigger slowdown and also causes significant extra memory to be used. - * In order to prevent this, the code now takes a different tack. The - * bit isn't set unless some part of the regular expression needs it, - * but once set it won't get cleared. This means that these extra - * modules won't get loaded unless there was some path through the - * pattern that would have required them anyway, and so any false - * positives that occur by not ANDing them out when they could be - * aren't as severe as they would be if we treated this bit like all - * the others */ - outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags) - & ANYOF_NONBITMAP_NON_UTF8; - cl->flags &= and_with->flags; - cl->flags |= outside_bitmap_but_not_utf8; + /* If either P1 or P2 is empty, the intersection will be also; can skip + * the loop */ + if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { + ANYOF_POSIXL_ZERO(ssc); + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + + /* Note that the Posix class component P from 'and_with' actually + * looks like: + * P = Pa | Pb | ... | Pn + * where each component is one posix class, such as in [\w\s]. + * Thus + * ~P = ~(Pa | Pb | ... | Pn) + * = ~Pa & ~Pb & ... & ~Pn + * <= ~Pa | ~Pb | ... | ~Pn + * The last is something we can easily calculate, but unfortunately + * is likely to have many false positives. We could do better + * in some (but certainly not all) instances if two classes in + * P have known relationships. For example + * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: + * So + * :lower: & :print: = :lower: + * And similarly for classes that must be disjoint. For example, + * since \s and \w can have no elements in common based on rules in + * the POSIX standard, + * \w & ^\S = nothing + * Unfortunately, some vendor locales do not meet the Posix + * standard, in particular almost everything by Microsoft. + * The loop below just changes e.g., \w into \W and vice versa */ + + regnode_charclass_posixl temp; + int add = 1; /* To calculate the index of the complement */ + + ANYOF_POSIXL_ZERO(&temp); + for (i = 0; i < ANYOF_MAX; i++) { + assert(i % 2 != 0 + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); + + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { + ANYOF_POSIXL_SET(&temp, i + add); + } + add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ + } + ANYOF_POSIXL_AND(&temp, ssc); + + } /* else ssc already has no posixes */ + } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC + in its initial state */ + else if (! is_ANYOF_SYNTHETIC(and_with) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) + { + /* But if 'ssc' is in its initial state, the result is just 'and_with'; + * copy it over 'ssc' */ + if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { + if (is_ANYOF_SYNTHETIC(and_with)) { + StructCopy(and_with, ssc, regnode_ssc); + } + else { + ssc->invlist = anded_cp_list; + ANYOF_POSIXL_ZERO(ssc); + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); + } + } + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + { + /* One or the other of P1, P2 is non-empty. */ + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } + ssc_union(ssc, anded_cp_list, FALSE); + } + else { /* P1 = P2 = empty */ + ssc_intersection(ssc, anded_cp_list, FALSE); + } } } -/* 'OR' a given class with another one. Can create false positives. 'cl' - * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if - * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void -S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) +S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *or_with) { - PERL_ARGS_ASSERT_CL_OR; - - if (or_with->flags & ANYOF_INVERT) { - - /* Here, the or'd node is to be inverted. This means we take the - * complement of everything not in the bitmap, but currently we don't - * know what that is, so give up and match anything */ - if (ANYOF_NONBITMAP(or_with)) { - cl_anything(pRExC_state, cl); - } - /* We do not use - * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) - * <= (B1 | !B2) | (CL1 | !CL2) - * which is wasteful if CL2 is small, but we ignore CL2: - * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 - * XXXX Can we handle case-fold? Unclear: - * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = - * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) - */ - else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(or_with->flags & ANYOF_LOC_FOLD) - && !(cl->flags & ANYOF_LOC_FOLD) ) { - int i; + /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either + * another SSC or a regular ANYOF class. Can create false positives if + * 'or_with' is to be inverted. */ - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] |= ~or_with->bitmap[i]; - } /* XXXX: logic is complicated otherwise */ - else { - cl_anything(pRExC_state, cl); - } + SV* ored_cp_list; + U8 ored_flags; - /* And, we can just take the union of the flags that aren't affected - * by the inversion */ - cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS; + PERL_ARGS_ASSERT_SSC_OR; - /* For the remaining flags: - ANYOF_UNICODE_ALL and inverted means to not match anything above - 255, which means that the union with cl should just be - what cl has in it, so can ignore this flag - ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord - is 127-255 to match them, but then invert that, so the - union with cl should just be what cl has in it, so can - ignore this flag - */ - } else { /* 'or_with' is not inverted */ - /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ - if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && (!(or_with->flags & ANYOF_LOC_FOLD) - || (cl->flags & ANYOF_LOC_FOLD)) ) { - int i; + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(or_with)) { + ored_cp_list = ((regnode_ssc*) or_with)->invlist; + ored_flags = ANYOF_FLAGS(or_with); + } + else { + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); + ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + } + + ANYOF_FLAGS(ssc) |= ored_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'or-with'; P2, its posix classes. + * 'or_with' may be inverted. When not inverted, we have the simple + * situation of computing: + * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) + * If P1|P2 yields a situation with both a class and its complement are + * set, like having both \w and \W, this matches all code points, and we + * can delete these from the P component of the ssc going forward. XXX We + * might be able to delete all the P components, but I (khw) am not certain + * about this, and it is better to be safe. + * + * Inverted, we have + * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) + * <= (C1 | P1) | ~C2 + * <= (C1 | ~C2) | P1 + * (which results in actually simpler code than the non-inverted case) + * */ - /* OR char bitmap and class bitmap separately */ - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] |= or_with->bitmap[i]; - if (or_with->flags & ANYOF_CLASS) { - ANYOF_CLASS_OR(or_with, cl); + if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(or_with)) + { + /* We ignore P2, leaving P1 going forward */ + } /* else Not inverted */ + else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + unsigned int i; + for (i = 0; i < ANYOF_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) + { + ssc_match_all_cp(ssc); + ANYOF_POSIXL_CLEAR(ssc, i); + ANYOF_POSIXL_CLEAR(ssc, i+1); + } } - } - else { /* XXXX: logic is complicated, leave it along for a moment. */ - cl_anything(pRExC_state, cl); - } + } + } - if (ANYOF_NONBITMAP(or_with)) { + ssc_union(ssc, + ored_cp_list, + FALSE /* Already has been inverted */ + ); +} - /* Use the added node's outside-the-bit-map match if there isn't a - * conflict. If there is a conflict (both nodes match something - * outside the bitmap, but what they match outside is not the same - * pointer, and hence not easily compared until XXX we extend - * inversion lists this far), give up and allow the start class to - * match everything outside the bitmap. If that stuff is all above - * 255, can just set UNICODE_ALL, otherwise caould be anything. */ - if (! ANYOF_NONBITMAP(cl)) { - ARG_SET(cl, ARG(or_with)); - } - else if (ARG(cl) != ARG(or_with)) { +PERL_STATIC_INLINE void +S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_UNION; - if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) { - cl_anything(pRExC_state, cl); - } - else { - cl->flags |= ANYOF_UNICODE_ALL; - } - } - } + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_union_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_intersection(pTHX_ regnode_ssc *ssc, + SV* const invlist, + const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_INTERSECTION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_intersection_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) +{ + PERL_ARGS_ASSERT_SSC_ADD_RANGE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); +} + +PERL_STATIC_INLINE void +S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) +{ + /* AND just the single code point 'cp' into the SSC 'ssc' */ + + SV* cp_list = _new_invlist(2); + + PERL_ARGS_ASSERT_SSC_CP_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + cp_list = add_cp_to_invlist(cp_list, cp); + ssc_intersection(ssc, cp_list, + FALSE /* Not inverted */ + ); + SvREFCNT_dec_NN(cp_list); +} + +PERL_STATIC_INLINE void +S_ssc_clear_locale(regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to not match any locale things */ + PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; + + assert(is_ANYOF_SYNTHETIC(ssc)); - /* Take the union */ - cl->flags |= or_with->flags; + ANYOF_POSIXL_ZERO(ssc); + ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; +} + +STATIC void +S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* The inversion list in the SSC is marked mortal; now we need a more + * permanent copy, which is stored the same way that is done in a regular + * ANYOF node, with the first 256 code points in a bit map */ + + SV* invlist = invlist_clone(ssc->invlist); + + PERL_ARGS_ASSERT_SSC_FINALIZE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* The code in this file assumes that all but these flags aren't relevant + * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the + * time we reach here */ + assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + + populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); + + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); + + /* Make sure is clone-safe */ + ssc->invlist = NULL; + + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; } + + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); } #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) -#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 ) +#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ + ? (TRIE_LIST_CUR( idx ) - 1) \ + : 0 ) #ifdef DEBUGGING @@ -1115,13 +1504,13 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, for( state = 0 ; state < trie->uniquecharcount ; state++ ) { SV ** const tmp = av_fetch( revcharmap, state, 0); if ( tmp ) { - PerlIO_printf( Perl_debug_log, "%*s", + PerlIO_printf( Perl_debug_log, "%*s", colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, PL_colors[0], PL_colors[1], (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR - ) + PERL_PV_ESCAPE_FIRSTCHAR + ) ); } } @@ -1135,10 +1524,12 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, for( state = 1 ; state < trie->statecount ; state++ ) { const U32 base = trie->states[ state ].trans.base; - PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); + PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", + (int)depth * 2 + 2,"", (UV)state); if ( trie->states[ state ].wordnum ) { - PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum ); + PerlIO_printf( Perl_debug_log, " W%4X", + trie->states[ state ].wordnum ); } else { PerlIO_printf( Perl_debug_log, "%6s", "" ); } @@ -1150,19 +1541,23 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, while( ( base + ofs < trie->uniquecharcount ) || ( base + ofs - trie->uniquecharcount < trie->lasttrans - && trie->trans[ base + ofs - trie->uniquecharcount ].check != state)) + && trie->trans[ base + ofs - trie->uniquecharcount ].check + != state)) ofs++; PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { - if ( ( base + ofs >= trie->uniquecharcount ) && - ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && - trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + if ( ( base + ofs >= trie->uniquecharcount ) + && ( base + ofs - trie->uniquecharcount + < trie->lasttrans ) + && trie->trans[ base + ofs + - trie->uniquecharcount ].check == state ) { PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, - (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next ); + (UV)trie->trans[ base + ofs + - trie->uniquecharcount ].next ); } else { PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); } @@ -1173,17 +1568,18 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, } PerlIO_printf( Perl_debug_log, "\n" ); } - PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, ""); + PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", + (int)depth*2, ""); for (word=1; word <= trie->wordcount; word++) { PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", (int)word, (int)(trie->wordinfo[word].prev), (int)(trie->wordinfo[word].len)); } PerlIO_printf(Perl_debug_log, "\n" ); -} +} /* Dumps a fully constructed but uncompressed trie in list form. - List tries normally only are used for construction when the number of + List tries normally only are used for construction when the number of possible chars (trie->uniquecharcount) is very high. Used for debugging make_trie(). */ @@ -1203,10 +1599,10 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", "------:-----+-----------------\n" ); - + for( state=1 ; state < next_alloc ; state ++ ) { U16 charid; - + PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", (int)depth * 2 + 2,"", (UV)state ); if ( ! trie->states[ state ].wordnum ) { @@ -1217,31 +1613,33 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, ); } for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { - SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); + SV ** const tmp = av_fetch( revcharmap, + TRIE_LIST_ITEM(state,charid).forid, 0); if ( tmp ) { PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, - PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), + colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + | PERL_PV_ESCAPE_FIRSTCHAR ) , TRIE_LIST_ITEM(state,charid).forid, (UV)TRIE_LIST_ITEM(state,charid).newstate ); - if (!(charid % 10)) + if (!(charid % 10)) PerlIO_printf(Perl_debug_log, "\n%*s| ", (int)((depth * 2) + 14), ""); } } PerlIO_printf( Perl_debug_log, "\n"); } -} +} /* Dumps a fully constructed but uncompressed trie in table form. - This is the normal DFA style state transition table, with a few - twists to facilitate compression later. + This is the normal DFA style state transition table, with a few + twists to facilitate compression later. Used for debugging make_trie(). */ STATIC void @@ -1256,24 +1654,24 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; - + /* print out the table precompression so that we can do a visual check that they are identical. */ - + PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { SV ** const tmp = av_fetch( revcharmap, charid, 0); if ( tmp ) { - PerlIO_printf( Perl_debug_log, "%*s", + PerlIO_printf( Perl_debug_log, "%*s", colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, PL_colors[0], PL_colors[1], (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR - ) + PERL_PV_ESCAPE_FIRSTCHAR + ) ); } } @@ -1288,7 +1686,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { - PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", + PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", (int)depth * 2 + 2,"", (UV)TRIE_NODENUM( state ) ); @@ -1300,9 +1698,11 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); } if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { - PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); + PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", + (UV)trie->trans[ state ].check ); } else { - PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, + PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", + (UV)trie->trans[ state ].check, trie->states[ TRIE_NODENUM( state ) ].wordnum ); } } @@ -1319,7 +1719,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, May be the same as tail. tail : item following the branch sequence count : words in the sequence - flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ + flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/ depth : indent depth Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. @@ -1421,7 +1821,7 @@ and would end up looking like: 8: EXACT (10) 10: END(0) - d = uvuni_to_utf8_flags(d, uv, 0); + d = uvchr_to_utf8_flags(d, uv, 0); is the recommended Unicode-aware way of saying @@ -1433,7 +1833,7 @@ is the recommended Unicode-aware way of saying if (UTF) { \ SV *zlopp = newSV(7); /* XXX: optimize me */ \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \ + unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ SvPOK_on(zlopp); \ SvUTF8_on(zlopp); \ @@ -1444,31 +1844,28 @@ is the recommended Unicode-aware way of saying } \ } STMT_END -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - /* if it is UTF then it is either already folded, or does not need folding */ \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \ - } \ - else if (folder == PL_fold_latin1) { \ - /* if we use this folder we have to obey unicode rules on latin-1 data */ \ - if ( foldlen > 0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - scan += len; \ - len = 0; \ - } else { \ - len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ - skiplen = UNISKIP(uvc); \ - foldlen -= skiplen; \ - scan = foldbuf + skiplen; \ - } \ - } else { \ - /* raw data, will be folded later if needed */ \ - uvc = (U32)*uc; \ - len = 1; \ - } \ +/* This gets the next character from the input, folding it if not already + * folded. */ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need \ + * folding */ \ + uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* This folder implies Unicode rules, which in the range expressible \ + * by not UTF is the lower case, with the two exceptions, one of \ + * which should have been taken care of before calling this */ \ + assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ + uvc = toLOWER_L1(*uc); \ + if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ + len = 1; \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ } STMT_END @@ -1511,7 +1908,8 @@ is the recommended Unicode-aware way of saying \ if ( noper_next < tail ) { \ if (!trie->jump) \ - trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ trie->jump[curword] = (U16)(noper_next - convert); \ if (!jumper) \ jumper = noper_next; \ @@ -1546,7 +1944,9 @@ is the recommended Unicode-aware way of saying #define MADE_EXACT_TRIE 4 STATIC I32 -S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) +S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + regnode *first, regnode *last, regnode *tail, + U32 word_count, U32 flags, U32 depth) { dVAR; /* first pass, loop through and scan words */ @@ -1554,7 +1954,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs HV *widecharmap = NULL; AV *revcharmap = newAV(); regnode *cur; - const U32 uniflags = UTF8_ALLOW_DEFAULT; STRLEN len = 0; UV uvc = 0; U16 curword = 0; @@ -1567,13 +1966,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 * folder = NULL; #ifdef DEBUGGING - const U32 data_slot = add_data( pRExC_state, 4, "tuuu" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu")); AV *trie_words = NULL; /* along with revcharmap, this only used during construction but both are * useful during debugging so we store them in the struct when debugging. */ #else - const U32 data_slot = add_data( pRExC_state, 2, "tu" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); STRLEN trie_charcount=0; #endif SV *re_trie_maxbuff; @@ -1588,10 +1987,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs case EXACT: break; case EXACTFA: case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; - case EXACTFL: folder = PL_fold_locale; break; default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); } @@ -1611,18 +2008,18 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs }); re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + assert(re_trie_maxbuff); if (!SvIOK(re_trie_maxbuff)) { sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); } DEBUG_TRIE_COMPILE_r({ - PerlIO_printf( Perl_debug_log, - "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", - (int)depth * 2 + 2, "", - REG_NODE_NUM(startbranch),REG_NODE_NUM(first), - REG_NODE_NUM(last), REG_NODE_NUM(tail), - (int)depth); + PerlIO_printf( Perl_debug_log, + "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + (int)depth * 2 + 2, "", + REG_NODE_NUM(startbranch),REG_NODE_NUM(first), + REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); }); - + /* Find the node we are going to overwrite */ if ( first == startbranch && OP( last ) != BRANCH ) { /* whole branch chain */ @@ -1631,7 +2028,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* branch sub-chain */ convert = NEXTOPER( first ); } - + /* -- First loop and Setup -- We first traverse the branches and scan each word to determine if it @@ -1640,9 +2037,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs have unique chars. We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store Unicode characters. We use the - native representation of the character value as the key and IV's for the - coded index. + (trie->charmap) and we use a an HV* to store Unicode characters. We use + the native representation of the character value as the key and IV's for + the coded index. *TODO* If we keep track of how many times each character is used we can remap the columns so that the table compression later on is more @@ -1659,13 +2056,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regnode *noper = NEXTOPER( cur ); const U8 *uc = (U8*)STRING( noper ); const U8 *e = uc + STR_LEN( noper ); - STRLEN foldlen = 0; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - STRLEN skiplen = 0; - const U8 *scan = (U8*)NULL; + int foldlen = 0; U32 wordlen = 0; /* required init */ - STRLEN chars = 0; - bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ + STRLEN minchars = 0; + STRLEN maxchars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the + bitmap?*/ if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -1685,13 +2081,77 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regardless of encoding */ if (OP( noper ) == EXACTFU_SS) { /* false positives are ok, so just set this */ - TRIE_BITMAP_SET(trie,0xDF); + TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); } } - for ( ; uc < e ; uc += len ) { + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; - chars++; + + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; + } + else { + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because for + * non-UTF, the latin1_safe macro is smart enough to account + * for all the unfolded characters, and because for UTF, the + * string will already have been folded earlier in the + * compilation process */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); + } + } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } + } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ if ( uvc < 256 ) { if ( folder ) { U8 folded= folder[ (U8) uvc ]; @@ -1715,13 +2175,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !UTF ) { /* store first byte of utf8 representation of variant codepoints */ - if (! UNI_IS_INVARIANT(uvc)) { + if (! UVCHR_IS_INVARIANT(uvc)) { TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); } } set_bit = 0; /* We've done our bit :-) */ } } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + SV** svpp; if ( !widecharmap ) widecharmap = newHV(); @@ -1736,30 +2204,22 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs TRIE_STORE_REVCHAR(uvc); } } - } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ if( cur == first ) { - trie->minlen = chars; - trie->maxlen = chars; - } else if (chars < trie->minlen) { - trie->minlen = chars; - } else if (chars > trie->maxlen) { - trie->maxlen = chars; + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; } - if (OP( noper ) == EXACTFU_SS) { - /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/ - if (trie->minlen > 1) - trie->minlen= 1; - } - if (OP( noper ) == EXACTFU_TRICKYFOLD) { - /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" - * - We assume that any such sequence might match a 2 byte string */ - if (trie->minlen > 2 ) - trie->minlen= 2; - } - } /* end first pass */ DEBUG_TRIE_COMPILE_r( - PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + PerlIO_printf( Perl_debug_log, + "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", (int)depth * 2 + 2,"", ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, @@ -1791,7 +2251,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); prev_states[1] = 0; - if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) { + if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) + > SvIV(re_trie_maxbuff) ) + { /* Second Pass -- Array Of Lists Representation @@ -1806,7 +2268,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STRLEN transcount = 1; - DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, "%*sCompiling trie using list compiler\n", (int)depth * 2 + 2, "")); @@ -1823,11 +2285,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ U16 charid = 0; /* sanity init */ - U8 *scan = (U8*)NULL; /* sanity init */ - STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - STRLEN skiplen = 0; if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -1846,14 +2304,18 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( uvc < 256 ) { charid = trie->charmap[ uvc ]; } else { - SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + SV** const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); if ( !svpp ) { charid = 0; } else { charid=(U16)SvIV( *svpp ); } } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ if ( charid ) { U16 check; @@ -1863,8 +2325,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !trie->states[ state ].trans.list ) { TRIE_LIST_NEW( state ); } - for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { - if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { + for ( check = 1; + check <= TRIE_LIST_USED( state ); + check++ ) + { + if ( TRIE_LIST_ITEM( state, check ).forid + == charid ) + { newstate = TRIE_LIST_ITEM( state, check ).newstate; break; } @@ -1886,7 +2353,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } /* end second pass */ /* next alloc is the NEXT state to be allocated */ - trie->statecount = next_alloc; + trie->statecount = next_alloc; trie->states = (reg_trie_state *) PerlMemShared_realloc( trie->states, next_alloc @@ -1934,7 +2401,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->trans, transcount * sizeof(reg_trie_trans) ); - Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans ); + Zero( trie->trans + (transcount / 2), + transcount / 2, + reg_trie_trans ); } base = trie->uniquecharcount + tp - minid; if ( maxid == minid ) { @@ -1942,22 +2411,27 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs for ( ; zp < tp ; zp++ ) { if ( ! trie->trans[ zp ].next ) { base = trie->uniquecharcount + zp - minid; - trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ zp ].next = TRIE_LIST_ITEM( state, + 1).newstate; trie->trans[ zp ].check = state; set = 1; break; } } if ( !set ) { - trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ tp ].next = TRIE_LIST_ITEM( state, + 1).newstate; trie->trans[ tp ].check = state; tp++; zp = tp; } } else { for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { - const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid; - trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate; + const U32 tid = base + - trie->uniquecharcount + + TRIE_LIST_ITEM( state, idx ).forid; + trie->trans[ tid ].next = TRIE_LIST_ITEM( state, + idx ).newstate; trie->trans[ tid ].check = state; } tp += ( maxid - minid + 1 ); @@ -1977,26 +2451,26 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* Second Pass -- Flat Table Representation. - we dont use the 0 slot of either trans[] or states[] so we add 1 to each. - We know that we will need Charcount+1 trans at most to store the data - (one row per char at worst case) So we preallocate both structures - assuming worst case. + we dont use the 0 slot of either trans[] or states[] so we add 1 to + each. We know that we will need Charcount+1 trans at most to store + the data (one row per char at worst case) So we preallocate both + structures assuming worst case. We then construct the trie using only the .next slots of the entry structs. - We use the .check field of the first entry of the node temporarily to - make compression both faster and easier by keeping track of how many non - zero fields are in the node. + We use the .check field of the first entry of the node temporarily + to make compression both faster and easier by keeping track of how + many non zero fields are in the node. Since trans are numbered from 1 any 0 pointer in the table is a FAIL transition. - There are two terms at use here: state as a TRIE_NODEIDX() which is a - number representing the first entry of the node, and state as a - TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and - TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there - are 2 entrys per node. eg: + There are two terms at use here: state as a TRIE_NODEIDX() which is + a number representing the first entry of the node, and state as a + TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) + and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) + if there are 2 entrys per node. eg: A B A B 1. 2 4 1. 3 7 @@ -2004,12 +2478,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs 3. 0 0 5. 0 0 4. 0 0 7. 0 0 - The table is internally in the right hand, idx form. However as we also - have to deal with the states array which is indexed by nodenum we have to - use TRIE_NODENUM() to convert. + The table is internally in the right hand, idx form. However as we + also have to deal with the states array which is indexed by nodenum + we have to use TRIE_NODENUM() to convert. */ - DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, "%*sCompiling trie using table compiler\n", (int)depth * 2 + 2, "")); @@ -2033,12 +2507,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U16 charid = 0; /* sanity init */ U32 accept_state = 0; /* sanity init */ - U8 *scan = (U8*)NULL; /* sanity init */ - STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ - STRLEN skiplen = 0; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -2057,7 +2527,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( uvc < 256 ) { charid = trie->charmap[ uvc ]; } else { - SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + SV* const * const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); charid = svpp ? (U16)SvIV(*svpp) : 0; } if ( charid ) { @@ -2073,7 +2546,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } else { Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ } } accept_state = TRIE_NODENUM( state ); @@ -2160,7 +2634,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U32 used = trie->trans[ stateidx ].check; trie->trans[ stateidx ].check = 0; - for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) { + for ( charid = 0; + used && charid < trie->uniquecharcount; + charid++ ) + { if ( flag || trie->trans[ stateidx + charid ].next ) { if ( trie->trans[ stateidx + charid ].next ) { if (o_used == 1) { @@ -2169,8 +2646,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs break; } } - trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ; - trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->states[ state ].trans.base + = zp + + trie->uniquecharcount + - charid ; + trie->trans[ zp ].next + = SAFE_TRIE_NODENUM( trie->trans[ stateidx + + charid ].next ); trie->trans[ zp ].check = state; if ( ++zp > pos ) pos = zp; break; @@ -2179,9 +2661,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( !flag ) { flag = 1; - trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ; + trie->states[ state ].trans.base + = pos + trie->uniquecharcount - charid ; } - trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->trans[ pos ].next + = SAFE_TRIE_NODENUM( + trie->trans[ stateidx + charid ].next ); trie->trans[ pos ].check = state; pos++; } @@ -2192,19 +2677,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->states, laststate * sizeof(reg_trie_state) ); DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf( Perl_debug_log, - "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", - (int)depth * 2 + 2,"", - (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), - (IV)next_alloc, - (IV)pos, - ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); + PerlIO_printf( Perl_debug_log, + "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + (int)depth * 2 + 2,"", + (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + + 1 ), + (IV)next_alloc, + (IV)pos, + ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); ); } /* end table compress */ } DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", + PerlIO_printf(Perl_debug_log, + "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", (int)depth * 2 + 2, "", (UV)trie->statecount, (UV)trie->lasttrans) @@ -2214,10 +2701,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->trans, trie->lasttrans * sizeof(reg_trie_trans) ); - { /* Modify the program and insert the new TRIE node */ + { /* Modify the program and insert the new TRIE node */ U8 nodetype =(U8)(flags & 0xFF); char *str=NULL; - + #ifdef DEBUGGING regnode *optimize = NULL; #ifdef RE_TRACK_PATTERN_OFFSETS @@ -2255,12 +2742,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs }); } DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", + PerlIO_printf(Perl_debug_log, + "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", (int)depth * 2 + 2, "", (UV)mjd_offset, (UV)mjd_nodelen) ); #endif - /* But first we check to see if there is a common prefix we can + /* But first we check to see if there is a common prefix we can split out as an EXACT and put in front of the TRIE node. */ trie->startstate= 1; if ( trie->bitmap && !widecharmap && !trie->jump ) { @@ -2319,11 +2807,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlIO_printf( Perl_debug_log, "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", (int)depth * 2 + 2, "", - (UV)state, (UV)idx, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, + (UV)state, (UV)idx, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, PL_colors[0], PL_colors[1], (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR + PERL_PV_ESCAPE_FIRSTCHAR ) ); }); @@ -2336,7 +2824,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs while (len--) *str++ = *ch++; } else { -#ifdef DEBUGGING +#ifdef DEBUGGING if (state>1) DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); #endif @@ -2387,17 +2875,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } } } - if (!jumper) - jumper = last; + if (!jumper) + jumper = last; if ( trie->maxlen ) { NEXT_OFF( convert ) = (U16)(tail - convert); ARG_SET( convert, data_slot ); - /* Store the offset to the first unabsorbed branch in - jump[0], which is otherwise unused by the jump logic. + /* Store the offset to the first unabsorbed branch in + jump[0], which is otherwise unused by the jump logic. We use this when dumping a trie and during optimisation. */ - if (trie->jump) + if (trie->jump) trie->jump[0] = (U16)(nextbranch - convert); - + /* If the start state is not accepting (meaning there is no empty string/NOTHING) * and there is a bitmap * and the first "jump target" node we found leaves enough room @@ -2412,17 +2900,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); PerlMemShared_free(trie->bitmap); trie->bitmap= NULL; - } else + } else OP( convert ) = TRIE; /* store the type in the flags */ convert->flags = nodetype; DEBUG_r({ - optimize = convert - + NODE_STEP_REGNODE + optimize = convert + + NODE_STEP_REGNODE + regarglen[ OP( convert ) ]; }); - /* XXX We really should free up the resource in trie now, + /* XXX We really should free up the resource in trie now, as we won't use them - (which resources?) dmq */ } /* needed for dumping*/ @@ -2432,8 +2920,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs while ( ++opt < optimize) { Set_Node_Offset_Length(opt,0,0); } - /* - Try to clean up some of the debris left after the + /* + Try to clean up some of the debris left after the optimisation. */ while( optimize < jumper ) { @@ -2488,32 +2976,37 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #else SvREFCNT_dec_NN(revcharmap); #endif - return trie->jump - ? MADE_JUMP_TRIE - : trie->startstate>1 - ? MADE_EXACT_TRIE + return trie->jump + ? MADE_JUMP_TRIE + : trie->startstate>1 + ? MADE_EXACT_TRIE : MADE_TRIE; } -STATIC void -S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) +STATIC regnode * +S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) { -/* The Trie is constructed and compressed now so we can build a fail array if it's needed +/* The Trie is constructed and compressed now so we can build a fail array if + * it's needed - This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the - "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and + 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, + Ullman 1985/88 ISBN 0-201-10088-6 - We find the fail state for each state in the trie, this state is the longest proper - suffix of the current state's 'word' that is also a proper prefix of another word in our - trie. State 1 represents the word '' and is thus the default fail state. This allows - the DFA not to have to restart after its tried and failed a word at a given point, it - simply continues as though it had been matching the other word in the first place. + We find the fail state for each state in the trie, this state is the longest + proper suffix of the current state's 'word' that is also a proper prefix of + another word in our trie. State 1 represents the word '' and is thus the + default fail state. This allows the DFA not to have to restart after its + tried and failed a word at a given point, it simply continues as though it + had been matching the other word in the first place. Consider 'abcdgu'=~/abcdefg|cdgu/ - When we get to 'd' we are still matching the first word, we would encounter 'g' which would - fail, which would bring us to the state representing 'd' in the second word where we would - try 'g' and succeed, proceeding to match 'cdgu'. + When we get to 'd' we are still matching the first word, we would encounter + 'g' which would fail, which would bring us to the state representing 'd' in + the second word where we would try 'g' and succeed, proceeding to match + 'cdgu'. */ /* add a fail transition */ const U32 trie_offset = ARG(source); @@ -2528,14 +3021,28 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode U32 base = trie->states[ 1 ].trans.base; U32 *fail; reg_ac_data *aho; - const U32 data_slot = add_data( pRExC_state, 1, "T" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); + regnode *stclass; GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; + PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; + PERL_UNUSED_CONTEXT; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif + if ( OP(source) == TRIE ) { + struct regnode_1 *op = (struct regnode_1 *) + PerlMemShared_calloc(1, sizeof(struct regnode_1)); + StructCopy(source,op,struct regnode_1); + stclass = (regnode *)op; + } else { + struct regnode_charclass *op = (struct regnode_charclass *) + PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); + StructCopy(source,op,struct regnode_charclass); + stclass = (regnode *)op; + } + OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */ ARG_SET( stclass, data_slot ); aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); @@ -2592,7 +3099,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode fail[ 0 ] = fail[ 1 ] = 0; DEBUG_TRIE_COMPILE_r({ PerlIO_printf(Perl_debug_log, - "%*sStclass Failtable (%"UVuf" states): 0", + "%*sStclass Failtable (%"UVuf" states): 0", (int)(depth * 2), "", (UV)numstates ); for( q_read=1; q_read%3d: %s (%d)\n", \ (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ Next ? (REG_NODE_NUM(Next)) : 0 ); \ @@ -2639,49 +3136,58 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * * If a node is to match under /i (folded), the number of characters it matches * can be different than its character length if it contains a multi-character - * fold. *min_subtract is set to the total delta of the input nodes. + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. * - * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF - * and contains LATIN SMALL LETTER SHARP S + * And *unfolded_multi_char is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when whether the fold is valid or + * not won't be known until runtime; namely for EXACTF nodes that contain LATIN + * SMALL LETTER SHARP S, as only if the target string being matched against + * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose + * folding rules depend on the locale in force at runtime. (Multi-char folds + * whose components are all above the Latin1 range are not run-time locale + * dependent, and have already been folded by the time this function is + * called.) * * This is as good a place as any to discuss the design of handling these * multi-character fold sequences. It's been wrong in Perl for a very long * time. There are three code points in Unicode whose multi-character folds * were long ago discovered to mess things up. The previous designs for * dealing with these involved assigning a special node for them. This - * approach doesn't work, as evidenced by this example: + * approach doesn't always work, as evidenced by this example: * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches - * Both these fold to "sss", but if the pattern is parsed to create a node that + * Both sides fold to "sss", but if the pattern is parsed to create a node that * would match just the \xDF, it won't be able to handle the case where a * successful match would have to cross the node's boundary. The new approach * that hopefully generally solves the problem generates an EXACTFU_SS node - * that is "sss". + * that is "sss" in this case. * * It turns out that there are problems with all multi-character folds, and not - * just these three. Now the code is general, for all such cases, but the - * three still have some special handling. The approach taken is: + * just these three. Now the code is general, for all such cases. The + * approach taken is: * 1) This routine examines each EXACTFish node that could contain multi- - * character fold sequences. It returns in *min_subtract how much to - * subtract from the the actual length of the string to get a real minimum - * match length; it is 0 if there are no multi-char folds. This delta is - * used by the caller to adjust the min length of the match, and the delta - * between min and max, so that the optimizer doesn't reject these - * possibilities based on size constraints. - * 2) Certain of these sequences require special handling by the trie code, - * so, if found, this code changes the joined node type to special ops: - * EXACTFU_TRICKYFOLD and EXACTFU_SS. - * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS + * character folded sequences. Since a single character can fold into + * such a sequence, the minimum match length for this node is less than + * the number of characters in the node. This routine returns in + * *min_subtract how many characters to subtract from the the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. + * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS * is used for an EXACTFU node that contains at least one "ss" sequence in * it. For non-UTF-8 patterns and strings, this is the only case where * there is a possible fold length change. That means that a regular * EXACTFU node without UTF-8 involvement doesn't have to concern itself * with length changes, and so can be processed faster. regexec.c takes * advantage of this. Generally, an EXACTFish node that is in UTF-8 is - * pre-folded by regcomp.c. This saves effort in regex matching. - * However, the pre-folding isn't done for non-UTF8 patterns because the - * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things - * down by forcing the pattern into UTF8 unless necessary. Also what - * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold + * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't + * known until runtime). This saves effort in regex matching. However, + * the pre-folding isn't done for non-UTF8 patterns because the fold of + * the MICRO SIGN requires UTF-8, and we don't want to slow things down by + * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, + * again, EXACTFL) nodes fold to isn't known until runtime. The fold * possibilities for the non-UTF8 patterns are quite simple, except for * the sharp s. All the ones that don't involve a UTF-8 target string are * members of a fold-pair, and arrays are set up for all of them so that @@ -2689,45 +3195,63 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * this file makes sure that in EXACTFU nodes, the sharp s gets folded to * 'ss', even if the pattern isn't UTF-8. This avoids the issues * described in the next item. - * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the - * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a - * UTF-8 pattern.) An assumption that the optimizer part of regexec.c - * (probably unwittingly, in Perl_regexec_flags()) makes is that a - * character in the pattern corresponds to at most a single character in - * the target string. (And I do mean character, and not byte here, unlike - * other parts of the documentation that have never been updated to - * account for multibyte Unicode.) sharp s in EXACTF nodes can match the - * two character string 'ss'; in EXACTFA nodes it can match - * "\x{17F}\x{17F}". These violate the assumption, and they are the only - * instances where it is violated. I'm reluctant to try to change the - * assumption, as the code involved is impenetrable to me (khw), so - * instead the code here punts. This routine examines (when the pattern - * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a - * boolean indicating whether or not the node contains a sharp s. When it - * is true, the caller sets a flag that later causes the optimizer in this - * file to not set values for the floating and fixed string lengths, and - * thus avoids the optimizer code in regexec.c that makes the invalid + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes + * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL + * nodes, violate the assumption, and they are the only instances where it + * is violated. I'm reluctant to try to change the assumption, as the + * code involved is impenetrable to me (khw), so instead the code here + * punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid * assumption. Thus, there is no optimization based on string lengths for - * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s. - * (The reason the assumption is wrong only in these two cases is that all - * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all - * other folds to their expanded versions. We can't prefold sharp s to - * 'ss' in EXACTF nodes because we don't know at compile time if it - * actually matches 'ss' or not. It will match iff the target string is - * in UTF-8, unlike the EXACTFU nodes, where it always matches; and - * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8 - * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem; - * but in a non-UTF8 pattern, folding it to that above-Latin1 string would - * require the pattern to be forced into UTF-8, the overhead of which we - * want to avoid.) - */ - -#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFA where it never does. In an EXACTFA node in + * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) + * + * Similarly, the code that generates tries doesn't currently handle + * not-already-folded multi-char folds, and it looks like a pain to change + * that. Therefore, trie generation of EXACTFA nodes with the sharp s + * doesn't work. Instead, such an EXACTFA is turned into a new regnode, + * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people + * using /iaa matching will be doing so almost entirely with ASCII + * strings, so this should rarely be encountered in practice */ + +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) { +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, + UV *min_subtract, bool *unfolded_multi_char, + U32 flags,regnode *val, U32 depth) +{ /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; @@ -2756,7 +3280,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b && NEXT_OFF(n) && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { - + if (OP(n) == TAIL || n > next) stringok = 0; if (PL_regkind[OP(n)] == NOTHING) { @@ -2773,12 +3297,13 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b const unsigned int oldl = STR_LEN(scan); regnode * const nnext = regnext(n); - /* XXX I (khw) kind of doubt that this works on platforms where - * U8_MAX is above 255 because of lots of other assumptions */ + /* XXX I (khw) kind of doubt that this works on platforms (should + * Perl ever run on one) where U8_MAX is above 255 because of lots + * of other assumptions */ /* Don't join if the sum can't fit into a single node */ if (oldl + STR_LEN(n) > U8_MAX) break; - + DEBUG_PEEP("merg",n,depth); merged++; @@ -2809,7 +3334,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b } *min_subtract = 0; - *has_exactf_sharp_s = FALSE; + *unfolded_multi_char = FALSE; /* Here, all the adjacent mergeable EXACTish nodes have been merged. We * can now analyze for sequences of problematic code points. (Prior to @@ -2817,15 +3342,68 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ if (OP(scan) != EXACT) { - const U8 * const s0 = (U8*) STRING(scan); - const U8 * s = s0; - const U8 * const s_end = s0 + STR_LEN(scan); + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ /* One pass is made over the node's string looking for all the - * possibilities. to avoid some tests in the loop, there are two main + * possibilities. To avoid some tests in the loop, there are two main * cases, for UTF-8 patterns (which can't have EXACTF nodes) and * non-UTF-8 */ if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *unfolded_multi_char = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ /* Examine the string for a multi-character fold sequence. UTF-8 * patterns have all characters pre-folded by the time this code is @@ -2833,60 +3411,32 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b while (s < s_end - 1) /* Can stop 1 before the end, as minimum length sequence we are looking for is 2 */ { - int count = 0; + int count = 0; /* How many characters in a multi-char fold */ int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); if (! len) { /* Not a multi-char fold: get next char */ s += UTF8SKIP(s); continue; } - /* Nodes with 'ss' require special handling, except for EXACTFL - * and EXACTFA for which there is no multi-char fold to this */ + /* Nodes with 'ss' require special handling, except for + * EXACTFA-ish for which there is no multi-char fold to this */ if (len == 2 && *s == 's' && *(s+1) == 's' - && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + && OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE) { count = 2; - OP(scan) = EXACTFU_SS; - s += 2; - } - else if (len == 6 /* len is the same in both ASCII and EBCDIC - for these */ - && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8 - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8, - 6) - || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8 - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8, - 6))) - { - count = 3; - - /* These two folds require special handling by trie's, so - * change the node type to indicate this. If EXACTFA and - * EXACTFL were ever to be handled by trie's, this would - * have to be changed. If this node has already been - * changed to EXACTFU_SS in this loop, leave it as is. (I - * (khw) think it doesn't matter in regexec.c for UTF - * patterns, but no need to change it */ - if (OP(scan) == EXACTFU) { - OP(scan) = EXACTFU_TRICKYFOLD; + if (OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; } - s += 6; + s += 2; } else { /* Here is a generic multi-char fold. */ - const U8* multi_end = s + len; + U8* multi_end = s + len; - /* Count how many characters in it. In the case of /l and + /* Count how many characters are in it. In the case of * /aa, no folds which contain ASCII code points are - * allowed, so check for those, and skip if found. (In - * EXACTFL, no folds are allowed to any Latin1 code point, - * not just ASCII. But there aren't any of these - * currently, nor ever likely, so don't take the time to - * test for them. The code that generates the - * is_MULTI_foo() macros croaks should one actually get put - * into Unicode .) */ - if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + * allowed, so check for those, and skip if found. */ + if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { count = utf8_length(s, multi_end); s = multi_end; } @@ -2906,70 +3456,78 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b /* The delta is how long the sequence is minus 1 (1 is how long * the character that folds to the sequence is) */ - *min_subtract += count - 1; - next_iteration: ; + total_count_delta += count - 1; + next_iteration: ; } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); } else if (OP(scan) == EXACTFA) { /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char * fold to the ASCII range (and there are no existing ones in the * upper latin1 range). But, as outlined in the comments preceding - * this function, we need to flag any occurrences of the sharp s */ + * this function, we need to flag any occurrences of the sharp s. + * This character forbids trie formation (because of added + * complexity) */ while (s < s_end) { if (*s == LATIN_SMALL_LETTER_SHARP_S) { - *has_exactf_sharp_s = TRUE; + OP(scan) = EXACTFA_NO_TRIE; + *unfolded_multi_char = TRUE; break; } s++; continue; } } - else if (OP(scan) != EXACTFL) { - - /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the - * multi-char folds that are all Latin1. (This code knows that - * there are no current multi-char folds possible with EXACTFL, - * relying on fold_grind.t to catch any errors if the very unlikely - * event happens that some get added in future Unicode versions.) - * As explained in the comments preceding this function, we look - * also for the sharp s in EXACTF nodes; it can be in the final - * position. Otherwise we can stop looking 1 byte earlier because - * have to find at least two characters for a multi-fold */ - const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; - - /* The below is perhaps overboard, but this allows us to save a - * test each time through the loop at the expense of a mask. This - * is because on both EBCDIC and ASCII machines, 'S' and 's' differ - * by a single bit. On ASCII they are 32 apart; on EBCDIC, they - * are 64. This uses an exclusive 'or' to find that bit and then - * inverts it to form a mask, with just a single 0, in the bit - * position where 'S' and 's' differ. */ - const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); - const U8 s_masked = 's' & S_or_s_mask; + else { + + /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; while (s < upper) { int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); if (! len) { /* Not a multi-char fold. */ - if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) { - *has_exactf_sharp_s = TRUE; + *unfolded_multi_char = TRUE; } s++; continue; } if (len == 2 - && ((*s & S_or_s_mask) == s_masked) - && ((*(s+1) & S_or_s_mask) == s_masked)) + && isARG2_lower_or_UPPER_ARG1('s', *s) + && isARG2_lower_or_UPPER_ARG1('s', *(s+1))) { /* EXACTF nodes need to know that the minimum length * changed so that a sharp s in the string can match this * ss in the pattern, but they remain EXACTF nodes, as they * won't match this unless the target string is is UTF-8, - * which we don't know until runtime */ - if (OP(scan) != EXACTF) { + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { OP(scan) = EXACTFU_SS; } } @@ -3003,7 +3561,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b #define INIT_AND_WITHP \ assert(!and_withp); \ - Newx(and_withp,1,struct regnode_charclass_class); \ + Newx(and_withp,1, regnode_ssc); \ SAVEFREEPV(and_withp) /* this is a chain of data about sub patterns we are processing that @@ -3014,20 +3572,19 @@ typedef struct scan_frame { regnode *last; /* last node to process in this frame */ regnode *next; /* next node to process when last is reached */ struct scan_frame *prev; /*previous frame*/ + U32 prev_recursed_depth; I32 stop; /* what stopparen do we use */ } scan_frame; -#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) - -STATIC I32 +STATIC SSize_t S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, - I32 *minlenp, I32 *deltap, + SSize_t *minlenp, SSize_t *deltap, regnode *last, scan_data_t *data, I32 stopparen, - U8* recursed, - struct regnode_charclass_class *and_withp, + U32 recursed_depth, + regnode_ssc *and_withp, U32 flags, U32 depth) /* scanp: Start here (read-write). */ /* deltap: Write maxlen-minlen here. */ @@ -3038,17 +3595,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { dVAR; - I32 min = 0; /* There must be at least this number of characters to match */ + /* There must be at least this number of characters to match */ + SSize_t min = 0; I32 pars = 0, code; regnode *scan = *scanp, *next; - I32 delta = 0; + SSize_t delta = 0; int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); int is_inf_internal = 0; /* The studied chunk is infinite */ I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; scan_data_t data_fake; SV *re_trie_maxbuff = NULL; regnode *first_non_open = scan; - I32 stopmin = I32_MAX; + SSize_t stopmin = SSize_t_MAX; scan_frame *frame = NULL; GET_RE_DEBUG_FLAGS_DECL; @@ -3057,7 +3615,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #ifdef DEBUGGING StructCopy(&zero_scan_data, &data_fake, scan_data_t); #endif - if ( depth == 0 ) { while (first_non_open && OP(first_non_open) == OPEN) first_non_open=regnext(first_non_open); @@ -3069,15 +3626,40 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, UV min_subtract = 0; /* How mmany chars to subtract from the minimum node length to get a real minimum (because the folded version may be shorter) */ - bool has_exactf_sharp_s = FALSE; + bool unfolded_multi_char = FALSE; /* Peephole optimizer: */ - DEBUG_STUDYDATA("Peep:", data,depth); - DEBUG_PEEP("Peep",scan,depth); + DEBUG_OPTIMISE_MORE_r( + { + PerlIO_printf(Perl_debug_log, + "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ", + ((int) depth*2), "", (long)stopparen, + (unsigned long)depth, (unsigned long)recursed_depth); + if (recursed_depth) { + U32 i; + U32 j; + for ( j = 0 ; j < recursed_depth ; j++ ) { + PerlIO_printf(Perl_debug_log,"["); + for ( i = 0 ; i < (U32)RExC_npar ; i++ ) + PerlIO_printf(Perl_debug_log,"%d", + PAREN_TEST(RExC_study_chunk_recursed + + (j * RExC_study_chunk_recursed_bytes), i) + ? 1 : 0 + ); + PerlIO_printf(Perl_debug_log,"]"); + } + } + PerlIO_printf(Perl_debug_log,"\n"); + } + ); + DEBUG_STUDYDATA("Peep:", data, depth); + DEBUG_PEEP("Peep", scan, depth); + - /* Its not clear to khw or hv why this is done here, and not in the - * clauses that deal with EXACT nodes. khw's guess is that it's - * because of a previous design */ - JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); + /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ + * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled + * by a different invocation of reg() -- Yves + */ + JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ @@ -3110,24 +3692,29 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, || OP(scan) == IFTHEN) { next = regnext(scan); code = OP(scan); - /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ + /* demq: the op(next)==code check is to see if we have + * "branch-branch" AFAICT */ if (OP(next) == code || code == IFTHEN) { - /* NOTE - There is similar code to this block below for handling - TRIE nodes on a re-study. If you change stuff here check there - too. */ - I32 max1 = 0, min1 = I32_MAX, num = 0; - struct regnode_charclass_class accum; + /* NOTE - There is similar code to this block below for + * handling TRIE nodes on a re-study. If you change stuff here + * check there too. */ + SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; + regnode_ssc accum; regnode * const startbranch=scan; - if (flags & SCF_DO_SUBSTR) - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ - if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); while (OP(scan) == code) { - I32 deltanext, minnext, f = 0, fake; - struct regnode_charclass_class this_class; + SSize_t deltanext, minnext, fake; + I32 f = 0; + regnode_ssc this_class; num++; data_fake.flags = 0; @@ -3144,7 +3731,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (code != BRANCH) scan = NEXTOPER(scan); if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } @@ -3152,21 +3739,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, f |= SCF_WHILEM_VISITED_POS; /* we suppose the run is continuous, last=next...*/ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - next, &data_fake, - stopparen, recursed, NULL, f,depth+1); + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f,depth+1); if (min1 > minnext) min1 = minnext; - if (deltanext == I32_MAX) { + if (deltanext == SSize_t_MAX) { is_inf = is_inf_internal = 1; - max1 = I32_MAX; + max1 = SSize_t_MAX; } else if (max1 < minnext + deltanext) max1 = minnext + deltanext; scan = next; if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; if (data_fake.flags & SCF_SEEN_ACCEPT) { - if ( stopmin > minnext) + if ( stopmin > minnext) stopmin = min + min1; flags &= ~SCF_DO_SUBSTR; if (data) @@ -3178,63 +3765,64 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); } if (code == IFTHEN && num < 2) /* Empty ELSE branch */ min1 = 0; if (flags & SCF_DO_SUBSTR) { data->pos_min += min1; - if (data->pos_delta >= I32_MAX - (max1 - min1)) - data->pos_delta = I32_MAX; + if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) + data->pos_delta = SSize_t_MAX; else data->pos_delta += max1 - min1; if (max1 != min1 || is_inf) data->longest = &(data->longest_float); } min += min1; - if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0) - delta = I32_MAX; + if (delta == SSize_t_MAX + || SSize_t_MAX - delta - (max1 - min1) < 0) + delta = SSize_t_MAX; else delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); if (min1) { - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - cl_and(data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, - struct regnode_charclass_class); + StructCopy(&accum, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); } } - if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) { + if (PERL_ENABLE_TRIE_OPTIMISATION && + OP( startbranch ) == BRANCH ) + { /* demq. - Assuming this was/is a branch we are dealing with: 'scan' now - points at the item that follows the branch sequence, whatever - it is. We now start at the beginning of the sequence and look - for subsequences of + Assuming this was/is a branch we are dealing with: 'scan' + now points at the item that follows the branch sequence, + whatever it is. We now start at the beginning of the + sequence and look for subsequences of BRANCH->EXACT=>x1 BRANCH->EXACT=>x2 tail - which would be constructed from a pattern like /A|LIST|OF|WORDS/ + which would be constructed from a pattern like + /A|LIST|OF|WORDS/ If we can find such a subsequence we need to turn the first element into a trie and then add the subsequent branch exact @@ -3242,7 +3830,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, We have two cases - 1. patterns where the whole set of branches can be converted. + 1. patterns where the whole set of branches can be + converted. 2. patterns where only a subset can be converted. @@ -3253,7 +3842,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, 'BRANCH EXACT; BRANCH EXACT; BRANCH X' becomes BRANCH TRIE; BRANCH X; - There is an additional case, that being where there is a + There is an additional case, that being where there is a common prefix, which gets split out into an EXACT like node preceding the TRIE node. @@ -3279,7 +3868,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, U32 count=0; #ifdef DEBUGGING - SV * const mysv = sv_newmortal(); /* for dumping */ + SV * const mysv = sv_newmortal(); /* for dumping */ #endif /* var tail is used because there may be a TAIL regop in the way. Ie, the exacts will point to the @@ -3294,49 +3883,60 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, tail = regnext( tail ); } - + DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, tail ); + regprop(RExC_rx, mysv, tail, NULL); PerlIO_printf( Perl_debug_log, "%*s%s%s\n", - (int)depth * 2 + 2, "", - "Looking for TRIE'able sequences. Tail node is: ", - SvPV_nolen_const( mysv ) + (int)depth * 2 + 2, "", + "Looking for TRIE'able sequences. Tail node is: ", + SvPV_nolen_const( mysv ) ); }); - + /* Step through the branches cur represents each branch, - noper is the first thing to be matched as part of that branch + noper is the first thing to be matched as part + of that branch noper_next is the regnext() of that node. - We normally handle a case like this /FOO[xyz]|BAR[pqr]/ - via a "jump trie" but we also support building with NOJUMPTRIE, - which restricts the trie logic to structures like /FOO|BAR/. - - If noper is a trieable nodetype then the branch is a possible optimization - target. If we are building under NOJUMPTRIE then we require that noper_next - is the same as scan (our current position in the regex program). - - Once we have two or more consecutive such branches we can create a - trie of the EXACT's contents and stitch it in place into the program. - - If the sequence represents all of the branches in the alternation we - replace the entire thing with a single TRIE node. - - Otherwise when it is a subsequence we need to stitch it in place and - replace only the relevant branches. This means the first branch has - to remain as it is used by the alternation logic, and its next pointer, - and needs to be repointed at the item on the branch chain following - the last branch we have optimized away. - - This could be either a BRANCH, in which case the subsequence is internal, - or it could be the item following the branch sequence in which case the - subsequence is at the end (which does not necessarily mean the first node - is the start of the alternation). - - TRIE_TYPE(X) is a define which maps the optype to a trietype. + We normally handle a case like this + /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also + support building with NOJUMPTRIE, which restricts + the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is + a possible optimization target. If we are building + under NOJUMPTRIE then we require that noper_next is + the same as scan (our current position in the regex + program). + + Once we have two or more consecutive such branches + we can create a trie of the EXACT's contents and + stitch it in place into the program. + + If the sequence represents all of the branches in + the alternation we replace the entire thing with a + single TRIE node. + + Otherwise when it is a subsequence we need to + stitch it in place and replace only the relevant + branches. This means the first branch has to remain + as it is used by the alternation logic, and its + next pointer, and needs to be repointed at the item + on the branch chain following the last branch we + have optimized away. + + This could be either a BRANCH, in which case the + subsequence is internal, or it could be the item + following the branch sequence in which case the + subsequence is at the end (which does not + necessarily mean the first node is the start of the + alternation). + + TRIE_TYPE(X) is a define which maps the optype to a + trietype. optype | trietype ----------------+----------- @@ -3344,14 +3944,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, EXACT | EXACT EXACTFU | EXACTFU EXACTFU_SS | EXACTFU - EXACTFU_TRICKYFOLD | EXACTFU - EXACTFA | 0 + EXACTFA | EXACTFA */ #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ ( EXACT == (X) ) ? EXACT : \ - ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ + ( EXACTFA == (X) ) ? EXACTFA : \ 0 ) /* dont use tail as the end marker for this traverse */ @@ -3366,27 +3966,27 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); - regprop(RExC_rx, mysv, noper); + regprop(RExC_rx, mysv, noper, NULL); PerlIO_printf( Perl_debug_log, " -> %s", SvPV_nolen_const(mysv)); if ( noper_next ) { - regprop(RExC_rx, mysv, noper_next ); + regprop(RExC_rx, mysv, noper_next, NULL); PerlIO_printf( Perl_debug_log,"\t=> %s\t", SvPV_nolen_const(mysv)); } PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), - PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] + PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] ); }); - /* Is noper a trieable nodetype that can be merged with the - * current trie (if there is one)? */ + /* Is noper a trieable nodetype that can be merged + * with the current trie (if there is one)? */ if ( noper_trietype && ( @@ -3399,10 +3999,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif && count < U16_MAX) { - /* Handle mergable triable node - * Either we are the first node in a new trieable sequence, - * in which case we do some bookkeeping, otherwise we update - * the end pointer. */ + /* Handle mergable triable node Either we are + * the first node in a new trieable sequence, + * in which case we do some bookkeeping, + * otherwise we update the end pointer. */ if ( !first ) { first = cur; if ( noper_trietype == NOTHING ) { @@ -3415,8 +4015,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( noper_next_trietype ) { trietype = noper_next_trietype; } else if (noper_next_type) { - /* a NOTHING regop is 1 regop wide. We need at least two - * for a trie so we can't merge this in */ + /* a NOTHING regop is 1 regop wide. + * We need at least two for a trie + * so we can't merge this in */ first = NULL; } } else { @@ -3432,31 +4033,39 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle mergable triable node */ else { /* handle unmergable node - - * noper may either be a triable node which can not be tried - * together with the current trie, or a non triable node */ + * noper may either be a triable node which can + * not be tried together with the current trie, + * or a non triable node */ if ( last ) { - /* If last is set and trietype is not NOTHING then we have found - * at least two triable branch sequences in a row of a similar - * trietype so we can turn them into a trie. If/when we - * allow NOTHING to start a trie sequence this condition will be - * required, and it isn't expensive so we leave it in for now. */ + /* If last is set and trietype is not + * NOTHING then we have found at least two + * triable branch sequences in a row of a + * similar trietype so we can turn them + * into a trie. If/when we allow NOTHING to + * start a trie sequence this condition + * will be required, and it isn't expensive + * so we leave it in for now. */ if ( trietype && trietype != NOTHING ) make_trie( pRExC_state, - startbranch, first, cur, tail, count, - trietype, depth+1 ); - last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ + startbranch, first, cur, tail, + count, trietype, depth+1 ); + last = NULL; /* note: we clear/update + first, trietype etc below, + so we dont do it here */ } if ( noper_trietype #ifdef NOJUMPTRIE && noper_next == tail #endif ){ - /* noper is triable, so we can start a new trie sequence */ + /* noper is triable, so we can start a new + * trie sequence */ count = 1; first = cur; trietype = noper_trietype; } else if (first) { - /* if we already saw a first but the current node is not triable then we have + /* if we already saw a first but the + * current node is not triable then we have * to reset the first information. */ count = 0; first = NULL; @@ -3465,18 +4074,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle unmergable node */ } /* loop over branches */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, - "%*s- %s (%d) \n", (int)depth * 2 + 2, + "%*s- %s (%d) \n", + (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); if ( last && trietype ) { if ( trietype != NOTHING ) { - /* the last branch of the sequence was part of a trie, - * so we have to construct it here outside of the loop - */ - made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); + /* the last branch of the sequence was part of + * a trie, so we have to construct it here + * outside of the loop */ + made= make_trie( pRExC_state, startbranch, + first, scan, tail, count, + trietype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && startbranch == first) @@ -3486,20 +4098,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( startbranch == first && scan == tail ) { - RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; } } #endif } else { - /* at this point we know whatever we have is a NOTHING sequence/branch - * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING + /* at this point we know whatever we have is a + * NOTHING sequence/branch AND if 'startbranch' + * is 'first' then we can turn the whole thing + * into a NOTHING */ if ( startbranch == first ) { regnode *opt; - /* the entire thing is a NOTHING sequence, something like this: - * (?:|) So we can turn it into a plain NOTHING op. */ + /* the entire thing is a NOTHING sequence, + * something like this: (?:|) So we can + * turn it into a plain NOTHING op. */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d) \n", (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); @@ -3513,9 +4128,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } /* end if ( last) */ } /* TRIE_MAXBUF is non zero */ - + } /* do trie */ - + } else if ( code == BRANCHJ ) { /* single branch is optimized. */ scan = NEXTOPER(NEXTOPER(scan)); @@ -3527,9 +4142,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 paren; regnode *start; regnode *end; + U32 my_recursed_depth= recursed_depth; if (OP(scan) != SUSPEND) { - /* set the pointer */ + /* set the pointer */ if (OP(scan) == GOSUB) { paren = ARG(scan); RExC_recurse[ARG2L(scan)] = scan; @@ -3540,21 +4156,33 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, start = RExC_rxi->program + 1; end = RExC_opend; } - if (!recursed) { - Newxz(recursed, (((RExC_npar)>>3) +1), U8); - SAVEFREEPV(recursed); - } - if (!PAREN_TEST(recursed,paren+1)) { - PAREN_SET(recursed,paren+1); + if (!recursed_depth + || + !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) + ) { + if (!recursed_depth) { + Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); + } else { + Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed_bytes, U8); + } + /* we havent recursed into this paren yet, so recurse into it */ + DEBUG_STUDYDATA("set:", data,depth); + PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); + my_recursed_depth= recursed_depth + 1; Newx(newframe,1,scan_frame); } else { + DEBUG_STUDYDATA("inf:", data,depth); + /* some form of infinite recursion, assume infinite length + * */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } } else { @@ -3571,17 +4199,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, newframe->last = last; newframe->stop = stopparen; newframe->prev = frame; + newframe->prev_recursed_depth = recursed_depth; + + DEBUG_STUDYDATA("frame-new:",data,depth); + DEBUG_PEEP("fnew", scan, depth); frame = newframe; scan = start; stopparen = paren; last = end; + depth = depth + 1; + recursed_depth= my_recursed_depth; continue; } } else if (OP(scan) == EXACT) { - I32 l = STR_LEN(scan); + SSize_t l = STR_LEN(scan); UV uc; if (UTF) { const U8 * const s = (U8*)STRING(scan); @@ -3597,7 +4231,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data->last_end == -1) { /* Update the start info. */ data->last_start_min = data->pos_min; data->last_start_max = is_inf - ? I32_MAX : data->pos_min + data->pos_delta; + ? SSize_t_MAX : data->pos_min + data->pos_delta; } sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); if (UTF) @@ -3608,83 +4242,48 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) mg->mg_len += utf8_length((U8*)STRING(scan), - (U8*)STRING(scan)+STR_LEN(scan)); + (U8*)STRING(scan)+STR_LEN(scan)); } data->last_end = data->pos_min + l; data->pos_min += l; /* As in the first entry. */ data->flags &= ~SF_BEFORE_EOL; } + + /* ANDing the code point leaves at most it, and not in locale, and + * can't match null string */ if (flags & SCF_DO_STCLASS_AND) { - /* Check whether it is compatible with what we know already! */ - int compat = 1; - - - /* If compatible, we or it in below. It is compatible if is - * in the bitmp and either 1) its bit or its fold is set, or 2) - * it's for a locale. Even if there isn't unicode semantics - * here, at runtime there may be because of matching against a - * utf8 string, so accept a possible false positive for - * latin1-range folds */ - if (uc >= 0x100 || - (!(data->start_class->flags & ANYOF_LOCALE) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && (!(data->start_class->flags & ANYOF_LOC_FOLD) - || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) - ) - { - compat = 0; - } - ANYOF_CLASS_ZERO(data->start_class); - ANYOF_BITMAP_ZERO(data->start_class); - if (compat) - ANYOF_BITMAP_SET(data->start_class, uc); - else if (uc >= 0x100) { - int i; - - /* Some Unicode code points fold to the Latin1 range; as - * XXX temporary code, instead of figuring out if this is - * one, just assume it is and set all the start class bits - * that could be some such above 255 code point's fold - * which will generate fals positives. As the code - * elsewhere that does compute the fold settles down, it - * can be extracted out and re-used here */ - for (i = 0; i < 256; i++){ - if (HAS_NONLATIN1_FOLD_CLOSURE(i)) { - ANYOF_BITMAP_SET(data->start_class, i); - } - } - } - CLEAR_SSC_EOS(data->start_class); - if (uc < 0x100) - data->start_class->flags &= ~ANYOF_UNICODE_ALL; + ssc_cp_and(data->start_class, uc); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ssc_clear_locale(data->start_class); } else if (flags & SCF_DO_STCLASS_OR) { - /* false positive possible if the class is case-folded */ - if (uc < 0x100) - ANYOF_BITMAP_SET(data->start_class, uc); - else - data->start_class->flags |= ANYOF_UNICODE_ALL; - CLEAR_SSC_EOS(data->start_class); - cl_and(data->start_class, and_withp); + ssc_add_cp(data->start_class, uc); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; } - else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ - I32 l = STR_LEN(scan); + else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is + EXACTFish */ + SSize_t l = STR_LEN(scan); UV uc = *((U8*)STRING(scan)); + SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 + separate code points */ + const U8 * s = (U8*)STRING(scan); /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) { assert(data); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } if (UTF) { - const U8 * const s = (U8 *)STRING(scan); uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } - if (has_exactf_sharp_s) { - RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + if (unfolded_multi_char) { + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; } min += l - min_subtract; assert (min >= 0); @@ -3699,99 +4298,164 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - if (flags & SCF_DO_STCLASS_AND) { - /* Check whether it is compatible with what we know already! */ - int compat = 1; - if (uc >= 0x100 || - (!(data->start_class->flags & ANYOF_LOCALE) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) - { - compat = 0; - } - ANYOF_CLASS_ZERO(data->start_class); - ANYOF_BITMAP_ZERO(data->start_class); - if (compat) { - ANYOF_BITMAP_SET(data->start_class, uc); - CLEAR_SSC_EOS(data->start_class); - if (OP(scan) == EXACTFL) { - /* XXX This set is probably no longer necessary, and - * probably wrong as LOCALE now is on in the initial - * state */ - data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD; - } - else { - /* Also set the other member of the fold pair. In case - * that unicode semantics is called for at runtime, use - * the full latin1 fold. (Can't do this for locale, - * because not known until runtime) */ - ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); - - /* All other (EXACTFL handled above) folds except under - * /iaa that include s, S, and sharp_s also may include - * the others */ - if (OP(scan) != EXACTFA) { - if (uc == 's' || uc == 'S') { - ANYOF_BITMAP_SET(data->start_class, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - ANYOF_BITMAP_SET(data->start_class, 's'); - ANYOF_BITMAP_SET(data->start_class, 'S'); - } - } - } - } - else if (uc >= 0x100) { - int i; - for (i = 0; i < 256; i++){ - if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) { - ANYOF_BITMAP_SET(data->start_class, i); - } - } - } + if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) { + ssc_clear_locale(data->start_class); + } + + if (! UTF) { + + /* We punt and assume can match anything if the node begins + * with a multi-character fold. Things are complicated. For + * example, /ffi/i could match any of: + * "\N{LATIN SMALL LIGATURE FFI}" + * "\N{LATIN SMALL LIGATURE FF}I" + * "F\N{LATIN SMALL LIGATURE FI}" + * plus several other things; and making sure we have all the + * possibilities is hard. */ + if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); + } + else { + + /* Any Latin1 range character can potentially match any + * other depending on the locale */ + if (OP(scan) == EXACTFL) { + _invlist_union(EXACTF_invlist, PL_Latin1, + &EXACTF_invlist); + } + else { + /* But otherwise, it matches at least itself. We can + * quickly tell if it has a distinct fold, and if so, + * it matches that as well */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (IS_IN_SOME_FOLD_L1(uc)) { + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, + PL_fold_latin1[uc]); + } + } + + /* Some characters match above-Latin1 ones under /i. This + * is true of EXACTFL ones when the locale is UTF-8 */ + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) + && (! isASCII(uc) || (OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE))) + { + add_above_Latin1_folds(pRExC_state, + (U8) uc, + &EXACTF_invlist); + } + } + } + else { /* Pattern is UTF-8 */ + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + STRLEN foldlen = UTF8SKIP(s); + const U8* e = s + STR_LEN(scan); + SV** listp; + + /* The only code points that aren't folded in a UTF EXACTFish + * node are are the problematic ones in EXACTFL nodes */ + if (OP(scan) == EXACTFL + && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) + { + /* We need to check for the possibility that this EXACTFL + * node begins with a multi-char fold. Therefore we fold + * the first few characters of it so that we can make that + * check */ + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD(*s); + s++; + } + else { + STRLEN len; + to_utf8_fold(s, d, &len); + d += len; + s += UTF8SKIP(s); + } + } + + /* And set up so the code below that looks in this folded + * buffer instead of the node's string */ + e = d; + foldlen = UTF8SKIP(folded); + s = folded; + } + + /* When we reach here 's' points to the fold of the first + * character(s) of the node; and 'e' points to far enough along + * the folded string to be just past any possible multi-char + * fold. 'foldlen' is the length in bytes of the first + * character in 's' + * + * Unlike the non-UTF-8 case, the macro for determining if a + * string is a multi-char fold requires all the characters to + * already be folded. This is because of all the complications + * if not. Note that they are folded anyway, except in EXACTFL + * nodes. Like the non-UTF case above, we punt if the node + * begins with a multi-char fold */ + + if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); + } + else { /* Single char fold */ + + /* It matches all the things that fold to it, which are + * found in PL_utf8_foldclosures (including itself) */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) s, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE) + && isASCII(c) != isASCII(uc)) + { + continue; + } + + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c); + } + } + } + } + if (flags & SCF_DO_STCLASS_AND) { + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); } else if (flags & SCF_DO_STCLASS_OR) { - if (data->start_class->flags & ANYOF_LOC_FOLD) { - /* false positive possible if the class is case-folded. - Assume that the locale settings are the same... */ - if (uc < 0x100) { - ANYOF_BITMAP_SET(data->start_class, uc); - if (OP(scan) != EXACTFL) { - - /* And set the other member of the fold pair, but - * can't do that in locale because not known until - * run-time */ - ANYOF_BITMAP_SET(data->start_class, - PL_fold_latin1[uc]); - - /* All folds except under /iaa that include s, S, - * and sharp_s also may include the others */ - if (OP(scan) != EXACTFA) { - if (uc == 's' || uc == 'S') { - ANYOF_BITMAP_SET(data->start_class, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - ANYOF_BITMAP_SET(data->start_class, 's'); - ANYOF_BITMAP_SET(data->start_class, 'S'); - } - } - } - } - CLEAR_SSC_EOS(data->start_class); - } - cl_and(data->start_class, and_withp); + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); } else if (REGNODE_VARIES(OP(scan))) { - I32 mincount, maxcount, minnext, deltanext, fl = 0; - I32 f = flags, pos_before = 0; + SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; + I32 fl = 0, f = flags; regnode * const oscan = scan; - struct regnode_charclass_class this_class; - struct regnode_charclass_class *oclass = NULL; + regnode_ssc this_class; + regnode_ssc *oclass = NULL; I32 next_is_eval = 0; switch (PL_regkind[OP(scan)]) { @@ -3812,7 +4476,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_SUBSTR) data->pos_min++; min++; - /* Fall through. */ + /* FALLTHROUGH */ case STAR: if (flags & SCF_DO_STCLASS) { mincount = 0; @@ -3821,12 +4485,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan = NEXTOPER(scan); goto do_curly; } - is_inf = is_inf_internal = 1; - scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */ + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ data->longest = &(data->longest_float); } + is_inf = is_inf_internal = 1; + scan = regnext(scan); goto optimize_curly_tail; case CURLY: if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) @@ -3847,7 +4512,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, next_is_eval = (OP(scan) == EVAL); do_curly: if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */ + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ pos_before = data->pos_min; } if (data) { @@ -3857,7 +4524,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->flags |= SF_IS_INF; } if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); oclass = data->start_class; data->start_class = &this_class; f |= SCF_DO_STCLASS_AND; @@ -3876,36 +4543,36 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, f &= ~SCF_WHILEM_VISITED_POS; /* This will finish on WHILEM, setting scan, or on NULL: */ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - last, data, stopparen, recursed, NULL, - (mincount == 0 - ? (f & ~SCF_DO_SUBSTR) : f),depth+1); + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, + last, data, stopparen, recursed_depth, NULL, + (mincount == 0 + ? (f & ~SCF_DO_SUBSTR) + : f) + ,depth+1); if (flags & SCF_DO_STCLASS) data->start_class = oclass; if (mincount == 0 || minnext == 0) { if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); } else if (flags & SCF_DO_STCLASS_AND) { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&this_class, data->start_class, - struct regnode_charclass_class); + StructCopy(&this_class, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); - cl_and(data->start_class, and_withp); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); } else if (flags & SCF_DO_STCLASS_AND) - cl_and(data->start_class, &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); flags &= ~SCF_DO_STCLASS; } if (!scan) /* It was not CURLYX, but CURLY. */ @@ -3915,24 +4582,26 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && (next_is_eval || !(mincount == 0 && maxcount == 1)) && (minnext == 0) && (deltanext == 0) && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) - && maxcount <= REG_INFTY/3) /* Complement check for big count */ + && maxcount <= REG_INFTY/3) /* Complement check for big + count */ { /* Fatal warnings may leak the regexp without this: */ SAVEFREESV(RExC_rx_sv); ckWARNreg(RExC_parse, - "Quantifier unexpected on zero-length expression"); + "Quantifier unexpected on zero-length expression"); (void)ReREFCNT_inc(RExC_rx_sv); } min += minnext * mincount; - is_inf_internal |= deltanext == I32_MAX - || (maxcount == REG_INFTY && minnext + deltanext > 0); + is_inf_internal |= deltanext == SSize_t_MAX + || (maxcount == REG_INFTY && minnext + deltanext > 0); is_inf |= is_inf_internal; - if (is_inf) - delta = I32_MAX; - else - delta += (minnext + deltanext) * maxcount - minnext * mincount; - + if (is_inf) { + delta = SSize_t_MAX; + } else { + delta += (minnext + deltanext) * maxcount + - minnext * mincount; + } /* Try powerful optimization CURLYX => CURLYN. */ if ( OP(oscan) == CURLYX && data && data->flags & SF_IN_PAR @@ -3983,7 +4652,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && !(data->flags & SF_HAS_EVAL) && !deltanext /* atom is fixed width */ && minnext != 0 /* CURLYM can't handle zero width */ - && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */ + + /* Nor characters whose fold at run-time may be + * multi-character */ + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) ) { /* XXXX How to optimize if data == 0? */ /* Optimize to a simpler form. */ @@ -4030,7 +4702,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif /* Optimize again: */ study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, - NULL, stopparen, recursed, NULL, 0,depth+1); + NULL, stopparen, recursed_depth, NULL, 0,depth+1); } else oscan->flags = 0; @@ -4055,43 +4727,32 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, pars++; if (flags & SCF_DO_SUBSTR) { SV *last_str = NULL; + STRLEN last_chrs = 0; int counted = mincount != 0; - if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ -#if defined(SPARC64_GCC_WORKAROUND) - I32 b = 0; - STRLEN l = 0; - const char *s = NULL; - I32 old = 0; - - if (pos_before >= data->last_start_min) - b = pos_before; - else - b = data->last_start_min; - - l = 0; - s = SvPV_const(data->last_found, l); - old = b - data->last_start_min; - -#else - I32 b = pos_before >= data->last_start_min + if (data->last_end > 0 && mincount != 0) { /* Ends with a + string. */ + SSize_t b = pos_before >= data->last_start_min ? pos_before : data->last_start_min; STRLEN l; const char * const s = SvPV_const(data->last_found, l); - I32 old = b - data->last_start_min; -#endif + SSize_t old = b - data->last_start_min; if (UTF) old = utf8_hop((U8*)s, old) - (U8*)s; l -= old; /* Get the added string: */ last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { + SvGROW(last_str, (mincount * l) + 1); repeatcpy(SvPVX(last_str) + l, - SvPVX_const(last_str), l, mincount - 1); + SvPVX_const(last_str), l, + mincount - 1); SvCUR_set(last_str, SvCUR(last_str) * mincount); /* Add additional parts. */ SvCUR_set(data->last_found, @@ -4103,34 +4764,41 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str) - l; + mg->mg_len += last_chrs * (mincount-1); } + last_chrs *= mincount; data->last_end += l * (mincount - 1); } } else { /* start offset must point into the last copy */ data->last_start_min += minnext * (mincount - 1); - data->last_start_max += is_inf ? I32_MAX + data->last_start_max += is_inf ? SSize_t_MAX : (maxcount - 1) * (minnext + data->pos_delta); } } /* It is counted once already... */ data->pos_min += minnext * (mincount - counted); #if 0 -PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n", - counted, deltanext, I32_MAX, minnext, maxcount, mincount); -if (deltanext != I32_MAX) -PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta); +PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf + " SSize_t_MAX=%"UVuf" minnext=%"UVuf + " maxcount=%"UVuf" mincount=%"UVuf"\n", + (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, + (UV)mincount); +if (deltanext != SSize_t_MAX) +PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", + (UV)(-counted * deltanext + (minnext + deltanext) * maxcount + - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); #endif - if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta) - data->pos_delta = I32_MAX; + if (deltanext == SSize_t_MAX + || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) + data->pos_delta = SSize_t_MAX; else data->pos_delta += - counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount; if (mincount != maxcount) { /* Cannot extend fixed substrings found inside the group. */ - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); if (mincount && last_str) { SV * const sv = data->last_found; MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? @@ -4140,12 +4808,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext mg->mg_len = -1; sv_setsv(sv, last_str); data->last_end = data->pos_min; - data->last_start_min = - data->pos_min - CHR_SVLEN(last_str); + data->last_start_min = data->pos_min - last_chrs; data->last_start_max = is_inf - ? I32_MAX - : data->pos_min + data->pos_delta - - CHR_SVLEN(last_str); + ? SSize_t_MAX + : data->pos_min + data->pos_delta - last_chrs; } data->longest = &(data->longest_float); } @@ -4160,164 +4826,212 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext NEXT_OFF(oscan) += NEXT_OFF(next); } continue; - default: /* REF, and CLUMP only? */ + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", + OP(scan)); +#endif + case REF: + case CLUMP: if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; - if (flags & SCF_DO_STCLASS_OR) - cl_anything(pRExC_state, data->start_class); + if (flags & SCF_DO_STCLASS_OR) { + if (OP(scan) == CLUMP) { + /* Actually is any start char, but very few code points + * aren't start characters */ + ssc_match_all_cp(data->start_class); + } + else { + ssc_anything(data->start_class); + } + } flags &= ~SCF_DO_STCLASS; break; } } else if (OP(scan) == LNBREAK) { if (flags & SCF_DO_STCLASS) { - int value = 0; - CLEAR_SSC_EOS(data->start_class); /* No match on empty */ if (flags & SCF_DO_STCLASS_AND) { - for (value = 0; value < 256; value++) - if (!is_VERTWS_cp(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + ssc_intersection(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } - else { - for (value = 0; value < 256; value++) - if (is_VERTWS_cp(value)) - ANYOF_BITMAP_SET(data->start_class, value); + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], + FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg for + * 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } - if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; } min++; delta++; /* Because of the 2 char string cr-lf */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += 1; data->pos_delta += 1; data->longest = &(data->longest_float); } } else if (REGNODE_SIMPLE(OP(scan))) { - int value = 0; if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min++; } min++; if (flags & SCF_DO_STCLASS) { - int loop_max = 256; - CLEAR_SSC_EOS(data->start_class); /* No match on empty */ + bool invert = 0; + SV* my_invlist = sv_2mortal(_new_invlist(0)); + U8 namedclass; + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; /* Some of the logic below assumes that switching locale on will only add false positives. */ - switch (PL_regkind[OP(scan)]) { - U8 classnum; + switch (OP(scan)) { - case SANY: default: #ifdef DEBUGGING - Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); + Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", + OP(scan)); #endif - do_default: + case CANY: + case SANY: if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_match_all_cp(data->start_class); break; + case REG_ANY: - if (OP(scan) == SANY) - goto do_default; - if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ - value = (ANYOF_BITMAP_TEST(data->start_class,'\n') - || ANYOF_CLASS_TEST_ANY_SET(data->start_class)); - cl_anything(pRExC_state, data->start_class); + { + SV* REG_ANY_invlist = _new_invlist(2); + REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, + '\n'); + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert, hence all but \n + */ + ); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert */ + ); + ssc_clear_locale(data->start_class); + } + SvREFCNT_dec_NN(REG_ANY_invlist); } - if (flags & SCF_DO_STCLASS_AND || !value) - ANYOF_BITMAP_CLEAR(data->start_class,'\n'); break; - case ANYOF: + + case ANYOF: if (flags & SCF_DO_STCLASS_AND) - cl_and(data->start_class, - (struct regnode_charclass_class*)scan); + ssc_and(pRExC_state, data->start_class, + (regnode_charclass *) scan); else - cl_or(pRExC_state, data->start_class, - (struct regnode_charclass_class*)scan); + ssc_or(pRExC_state, data->start_class, + (regnode_charclass *) scan); break; - case POSIXA: - loop_max = 128; - /* FALL THROUGH */ - case POSIXL: - case POSIXD: - case POSIXU: - classnum = FLAGS(scan); - if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1); - for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); - } - } - } - } - else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum)); - } - else { - /* Even if under locale, set the bits for non-locale - * in case it isn't a true locale-node. This will - * create false positives if it truly is locale */ - for (value = 0; value < loop_max; value++) { - if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); - } - } - } - } - break; - case NPOSIXA: - loop_max = 128; - /* FALL THROUGH */ case NPOSIXL: - case NPOSIXU: - case NPOSIXD: - classnum = FLAGS(scan); - if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum)); - for (value = 0; value < loop_max; value++) { - if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); - } - } - } - } - else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1); - } - else { + invert = 1; + /* FALLTHROUGH */ - /* Even if under locale, set the bits for non-locale in - * case it isn't a true locale-node. This will create - * false positives if it truly is locale */ - for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); - } + case POSIXL: + namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; + if (flags & SCF_DO_STCLASS_AND) { + bool was_there = cBOOL( + ANYOF_POSIXL_TEST(data->start_class, + namedclass)); + ANYOF_POSIXL_ZERO(data->start_class); + if (was_there) { /* Do an AND */ + ANYOF_POSIXL_SET(data->start_class, namedclass); } - if (PL_regkind[OP(scan)] == NPOSIXD) { - data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL; + /* No individual code points can now match */ + data->start_class->invlist + = sv_2mortal(_new_invlist(0)); + } + else { + int complement = namedclass + ((invert) ? -1 : 1); + + assert(flags & SCF_DO_STCLASS_OR); + + /* If the complement of this class was already there, + * the result is that they match all code points, + * (\d + \D == everything). Remove the classes from + * future consideration. Locale is not relevant in + * this case */ + if (ANYOF_POSIXL_TEST(data->start_class, complement)) { + ssc_match_all_cp(data->start_class); + ANYOF_POSIXL_CLEAR(data->start_class, namedclass); + ANYOF_POSIXL_CLEAR(data->start_class, complement); } + else { /* The usual case; just add this class to the + existing set */ + ANYOF_POSIXL_SET(data->start_class, namedclass); } - } - break; + } + break; + + case NPOSIXA: /* For these, we always know the exact set of + what's matched */ + invert = 1; + /* FALLTHROUGH */ + case POSIXA: + if (FLAGS(scan) == _CC_ASCII) { + my_invlist = PL_XPosix_ptrs[_CC_ASCII]; + } + else { + _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)], + PL_XPosix_ptrs[_CC_ASCII], + &my_invlist); + } + goto join_posix; + + case NPOSIXD: + case NPOSIXU: + invert = 1; + /* FALLTHROUGH */ + case POSIXD: + case POSIXU: + my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); + + /* NPOSIXD matches all upper Latin1 code points unless the + * target string being matched is UTF-8, which is + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); + } + + join_posix: + + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, my_invlist, invert); + ssc_clear_locale(data->start_class); + } + else { + assert(flags & SCF_DO_STCLASS_OR); + ssc_union(data->start_class, my_invlist, invert); + } } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } @@ -4325,7 +5039,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->flags |= (OP(scan) == MEOL ? SF_BEFORE_MEOL : SF_BEFORE_SEOL); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } else if ( PL_regkind[OP(scan)] == BRANCHJ @@ -4344,11 +5058,12 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext DEBUG_STUDYDATA("OPFAIL",data,depth); /*DEBUG_PARSE_MSG("opfail");*/ - regprop(RExC_rx, mysv_val, upto); - PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", - SvPV_nolen_const(mysv_val), - (IV)REG_NODE_NUM(upto), - (IV)(upto - scan) + regprop(RExC_rx, mysv_val, upto, NULL); + PerlIO_printf(Perl_debug_log, + "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(upto), + (IV)(upto - scan) ); }); OP(scan) = OPFAIL; @@ -4358,16 +5073,16 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext scan= upto; continue; } - if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY + if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY || OP(scan) == UNLESSM ) { /* Negative Lookahead/lookbehind In this case we can't do fixed string optimisation. */ - I32 deltanext, minnext, fake = 0; + SSize_t deltanext, minnext, fake = 0; regnode *nscan; - struct regnode_charclass_class intrnl; + regnode_ssc intrnl; int f = 0; data_fake.flags = 0; @@ -4380,7 +5095,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.pos_delta = delta; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + ssc_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4388,14 +5103,16 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext f |= SCF_WHILEM_VISITED_POS; next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, - last, &data_fake, stopparen, recursed, NULL, f, depth+1); + minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, + last, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1); if (scan->flags) { if (deltanext) { FAIL("Variable length lookbehind not implemented"); } else if (minnext > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); } scan->flags = (U8)minnext; } @@ -4414,14 +5131,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext * *** HACK *** for now just treat as "no information". * See [perl #56690]. */ - cl_init(pRExC_state, data->start_class); + ssc_init(pRExC_state, data->start_class); } else { /* AND before and after: combine and continue */ - const int was = TEST_SSC_EOS(data->start_class); - - cl_and(data->start_class, &intrnl); - if (was) - SET_SSC_EOS(data->start_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } } } @@ -4434,26 +5147,26 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext length of the pattern, something we won't know about until after the recurse. */ - I32 deltanext, fake = 0; + SSize_t deltanext, fake = 0; regnode *nscan; - struct regnode_charclass_class intrnl; + regnode_ssc intrnl; int f = 0; - /* We use SAVEFREEPV so that when the full compile - is finished perl will clean up the allocated + /* We use SAVEFREEPV so that when the full compile + is finished perl will clean up the allocated minlens when it's all done. This way we don't have to worry about freeing them when we know they wont be used, which would be a pain. */ - I32 *minnextp; - Newx( minnextp, 1, I32 ); + SSize_t *minnextp; + Newx( minnextp, 1, SSize_t ); SAVEFREEPV(minnextp); if (data) { StructCopy(data, &data_fake, scan_data_t); if ((flags & SCF_DO_SUBSTR) && data->last_found) { f |= SCF_DO_SUBSTR; - if (scan->flags) - SCAN_COMMIT(pRExC_state, &data_fake,minlenp); + if (scan->flags) + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); data_fake.last_found=newSVsv(data->last_found); } } @@ -4465,7 +5178,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.flags |= SF_IS_INF; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + ssc_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4474,14 +5187,17 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, - last, &data_fake, stopparen, recursed, NULL, f,depth+1); + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, + &deltanext, last, &data_fake, + stopparen, recursed_depth, NULL, + f,depth+1); if (scan->flags) { if (deltanext) { FAIL("Variable length lookbehind not implemented"); } else if (*minnextp > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); } scan->flags = (U8)*minnextp; } @@ -4489,11 +5205,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext *minnextp += min; if (f & SCF_DO_STCLASS_AND) { - const int was = TEST_SSC_EOS(data.start_class); - - cl_and(data->start_class, &intrnl); - if (was) - SET_SSC_EOS(data->start_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -4504,10 +5216,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { if (RExC_rx->minlen<*minnextp) RExC_rx->minlen=*minnextp; - SCAN_COMMIT(pRExC_state, &data_fake, minnextp); + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); SvREFCNT_dec_NN(data_fake.last_found); - - if ( data_fake.minlen_fixed != minlenp ) + + if ( data_fake.minlen_fixed != minlenp ) { data->offset_fixed= data_fake.offset_fixed; data->minlen_fixed= data_fake.minlen_fixed; @@ -4548,7 +5260,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext } else if ( PL_regkind[OP(scan)] == ENDLIKE ) { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); flags &= ~SCF_DO_SUBSTR; } if (data && OP(scan)==ACCEPT) { @@ -4560,26 +5272,26 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } else if (OP(scan) == GPOS) { - if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) && - !(delta || is_inf || (data && data->pos_delta))) + if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && + !(delta || is_inf || (data && data->pos_delta))) { - if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR)) - RExC_rx->extflags |= RXf_ANCH_GPOS; - if (RExC_rx->gofs < (U32)min) + if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->intflags |= PREGf_ANCH_GPOS; + if (RExC_rx->gofs < (STRLEN)min) RExC_rx->gofs = min; } else { - RExC_rx->extflags |= RXf_GPOS_FLOAT; + RExC_rx->intflags |= PREGf_GPOS_FLOAT; RExC_rx->gofs = 0; - } + } } #ifdef TRIE_STUDY_OPT #ifdef FULL_TRIE_STUDY @@ -4590,26 +5302,28 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext regnode *trie_node= scan; regnode *tail= regnext(scan); reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; - I32 max1 = 0, min1 = I32_MAX; - struct regnode_charclass_class accum; + SSize_t max1 = 0, min1 = SSize_t_MAX; + regnode_ssc accum; - if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ - SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); - + ssc_init_zero(pRExC_state, &accum); + if (!trie->jump) { min1= trie->minlen; max1= trie->maxlen; } else { const regnode *nextbranch= NULL; U32 word; - - for ( word=1 ; word <= trie->wordcount ; word++) + + for ( word=1 ; word <= trie->wordcount ; word++) { - I32 deltanext=0, minnext=0, f = 0, fake; - struct regnode_charclass_class this_class; - + SSize_t deltanext=0, minnext=0, f = 0, fake; + regnode_ssc this_class; + data_fake.flags = 0; if (data) { data_fake.whilem_c = data->whilem_c; @@ -4619,40 +5333,39 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.last_closep = &fake; data_fake.pos_delta = delta; if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } if (flags & SCF_WHILEM_VISITED_POS) f |= SCF_WHILEM_VISITED_POS; - + if (trie->jump[word]) { if (!nextbranch) nextbranch = trie_node + trie->jump[0]; scan= trie_node + trie->jump[word]; /* We go from the jump point to the branch that follows - it. Note this means we need the vestigal unused branches - even though they arent otherwise used. - */ - minnext = study_chunk(pRExC_state, &scan, minlenp, - &deltanext, (regnode *)nextbranch, &data_fake, - stopparen, recursed, NULL, f,depth+1); + it. Note this means we need the vestigal unused + branches even though they arent otherwise used. */ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, (regnode *)nextbranch, &data_fake, + stopparen, recursed_depth, NULL, f,depth+1); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) nextbranch= regnext((regnode*)nextbranch); - - if (min1 > (I32)(minnext + trie->minlen)) + + if (min1 > (SSize_t)(minnext + trie->minlen)) min1 = minnext + trie->minlen; - if (deltanext == I32_MAX) { + if (deltanext == SSize_t_MAX) { is_inf = is_inf_internal = 1; - max1 = I32_MAX; - } else if (max1 < (I32)(minnext + deltanext + trie->maxlen)) + max1 = SSize_t_MAX; + } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) max1 = minnext + deltanext + trie->maxlen; - + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; if (data_fake.flags & SCF_SEEN_ACCEPT) { - if ( stopmin > min + min1) + if ( stopmin > min + min1) stopmin = min + min1; flags &= ~SCF_DO_SUBSTR; if (data) @@ -4664,7 +5377,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); } } if (flags & SCF_DO_SUBSTR) { @@ -4676,28 +5389,25 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); if (min1) { - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - cl_and(data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, - struct regnode_charclass_class); + StructCopy(&accum, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); } } scan= tail; @@ -4707,19 +5417,20 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext else if (PL_regkind[OP(scan)] == TRIE) { reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; U8*bang=NULL; - + min += trie->minlen; delta += (trie->maxlen - trie->minlen); flags &= ~SCF_DO_STCLASS; /* xxx */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += trie->minlen; data->pos_delta += (trie->maxlen - trie->minlen); if (trie->maxlen != trie->minlen) data->longest = &(data->longest_float); } if (trie->jump) /* no more substrings -- for now /grr*/ - flags &= ~SCF_DO_SUBSTR; + flags &= ~SCF_DO_SUBSTR; } #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ @@ -4727,10 +5438,24 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext /* Else: zero-length, ignore. */ scan = regnext(scan); } + /* If we are exiting a recursion we can unset its recursed bit + * and allow ourselves to enter it again - no danger of an + * infinite loop there. + if (stopparen > -1 && recursed) { + DEBUG_STUDYDATA("unset:", data,depth); + PAREN_UNSET( recursed, stopparen); + } + */ if (frame) { + DEBUG_STUDYDATA("frame-end:",data,depth); + DEBUG_PEEP("fend", scan, depth); + /* restore previous context */ last = frame->last; scan = frame->next; stopparen = frame->stop; + recursed_depth = frame->prev_recursed_depth; + depth = depth - 1; + frame = frame->prev; goto fake_study_recurse; } @@ -4740,9 +5465,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext DEBUG_STUDYDATA("pre-fin:",data,depth); *scanp = scan; - *deltap = is_inf_internal ? I32_MAX : delta; + *deltap = is_inf_internal ? SSize_t_MAX : delta; + if (flags & SCF_DO_SUBSTR && is_inf) - data->pos_delta = I32_MAX - data->pos_min; + data->pos_delta = SSize_t_MAX - data->pos_min; if (is_par > (I32)U8_MAX) is_par = 0; if (is_par && pars==1 && data) { @@ -4754,17 +5480,25 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->flags &= ~SF_IN_PAR; } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); if (flags & SCF_TRIE_RESTUDY) data->flags |= SCF_TRIE_RESTUDY; - + DEBUG_STUDYDATA("post-fin:",data,depth); - - return min < stopmin ? min : stopmin; + + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { + RExC_maxlen = final_minlen + delta; + } + return final_minlen; + } + /* not-reached */ } STATIC U32 -S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) +S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) { U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; @@ -4782,7 +5516,8 @@ S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) return count; } -/*XXX: todo make this not included in a non debugging perl */ +/*XXX: todo make this not included in a non debugging perl, but appears to be + * used anyway there, in 'use re' */ #ifndef PERL_IN_XSUB_RE void Perl_reginitcolors(pTHX) @@ -4825,7 +5560,7 @@ Perl_reginitcolors(pTHX) } STMT_END #else #define CHECK_RESTUDY_GOTO_butfirst -#endif +#endif /* * pregcomp - compile a regular expression into internal code @@ -4834,7 +5569,7 @@ Perl_reginitcolors(pTHX) * scope */ -#ifndef PERL_IN_XSUB_RE +#ifndef PERL_IN_XSUB_RE /* return the currently in-scope regex engine (or the default if none) */ @@ -4847,7 +5582,7 @@ Perl_current_re_engine(pTHX) HV * const table = GvHV(PL_hintgv); SV **ptr; - if (!table) + if (!table || !(PL_hints & HINT_LOCALIZE_HH)) return &PL_core_reg_engine; ptr = hv_fetchs(table, "regcomp", FALSE); if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) @@ -4928,12 +5663,11 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, Newx(dst, *plen_p * 2 + 1, U8); while (s < *plen_p) { - const UV uv = NATIVE_TO_ASCII(src[s]); - if (UNI_IS_INVARIANT(uv)) - dst[d] = (U8)UTF_TO_NATIVE(uv); + if (NATIVE_BYTE_IS_INVARIANT(src[s])) + dst[d] = src[s]; else { - dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv); - dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv); + dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); + dst[d] = UTF8_EIGHT_BIT_LO(src[s]); } if (n < num_code_blocks) { if (!do_end && pRExC_state->code_blocks[n].start == s) { @@ -4986,7 +5720,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, /* if we know we have at least two args, create an empty string, * then concatenate args to that. For no args, return an empty string */ if (!pat && pat_count != 1) { - pat = newSVpvn("", 0); + pat = newSVpvs(""); SAVEFREESV(pat); alloced = TRUE; } @@ -4997,6 +5731,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, STRLEN orig_patlen = 0; bool code = 0; SV *msv = use_delim ? delim : *svp; + if (!msv) msv = &PL_sv_undef; /* if we've got a delimiter, we go round the loop twice for each * svp slot (except the last), using the delimiter the second @@ -5015,21 +5750,21 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, * The code in this block is based on S_pushav() */ AV *const av = (AV*)msv; - const I32 maxarg = AvFILL(av) + 1; + const SSize_t maxarg = AvFILL(av) + 1; SV **array; if (oplist) { assert(oplist->op_type == OP_PADAV - || oplist->op_type == OP_RV2AV); + || oplist->op_type == OP_RV2AV); oplist = oplist->op_sibling;; } if (SvRMAGICAL(av)) { - U32 i; + SSize_t i; Newx(array, maxarg, SV*); SAVEFREEPV(array); - for (i=0; i < (U32)maxarg; i++) { + for (i=0; i < maxarg; i++) { SV ** const svp = av_fetch(av, i, FALSE); array[i] = svp ? *svp : &PL_sv_undef; } @@ -5191,6 +5926,8 @@ S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, { int n = 0; STRLEN s; + + PERL_UNUSED_CONTEXT; for (s = 0; s < plen; s++) { if (n < pRExC_state->num_code_blocks @@ -5319,7 +6056,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, { Safefree(pRExC_state->code_blocks); /* use croak_sv ? */ - Perl_croak_nocontext("%s", SvPV_nolen_const(errsv)); + Perl_croak_nocontext("%"SVf, SVfARG(errsv)); } } assert(SvROK(qr_ref)); @@ -5410,20 +6147,24 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, STATIC bool -S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol) +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, + SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, + SSize_t lookbehind, SSize_t offset, SSize_t *minlen, + STRLEN longest_length, bool eol, bool meol) { /* This is the common code for setting up the floating and fixed length * string data extracted from Perl_re_op_compile() below. Returns a boolean * as to whether succeeded or not */ - I32 t,ml; + I32 t; + SSize_t ml; if (! (longest_length || (eol /* Can't have SEOL and MULTI */ && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) ) - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - || (RExC_seen & REG_SEEN_EXACTF_SHARP_S)) + /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ + || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) { return FALSE; } @@ -5441,7 +6182,7 @@ S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, S follow this item. We calculate it ahead of time as once the lookbehind offset is added in we lose the ability to correctly calculate it.*/ - ml = minlen ? *(minlen) : (I32)longest_length; + ml = minlen ? *(minlen) : (SSize_t)longest_length; *rx_end_shift = ml - offset - longest_length + (SvTAIL(sv_longest) != 0) + lookbehind; @@ -5510,7 +6251,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, char *exp; regnode *scan; I32 flags; - I32 minlen = 0; + SSize_t minlen = 0; U32 rx_flags; SV *pat; SV *code_blocksv = NULL; @@ -5521,16 +6262,18 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, I32 sawlookahead = 0; I32 sawplus = 0; I32 sawopen = 0; + I32 sawminmod = 0; + regex_charset initial_charset = get_regex_charset(orig_rx_flags); bool recompile = 0; bool runtime_code = 0; scan_data_t data; RExC_state_t RExC_state; RExC_state_t * const pRExC_state = &RExC_state; -#ifdef TRIE_STUDY_OPT +#ifdef TRIE_STUDY_OPT int restudied = 0; RExC_state_t copyRExC_state; -#endif +#endif GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_OP_COMPILE; @@ -5542,61 +6285,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, * having to test them each time otherwise */ if (! PL_AboveLatin1) { PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); - PL_ASCII = _new_invlist_C_array(ASCII_invlist); PL_Latin1 = _new_invlist_C_array(Latin1_invlist); - - PL_L1Posix_ptrs[_CC_ALPHANUMERIC] - = _new_invlist_C_array(L1PosixAlnum_invlist); - PL_Posix_ptrs[_CC_ALPHANUMERIC] - = _new_invlist_C_array(PosixAlnum_invlist); - - PL_L1Posix_ptrs[_CC_ALPHA] - = _new_invlist_C_array(L1PosixAlpha_invlist); - PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist); - - PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist); - PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist); - - /* Cased is the same as Alpha in the ASCII range */ - PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist); - PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist); - - PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist); - PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist); - - PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); - PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); - - PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist); - PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist); - - PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist); - PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist); - - PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist); - PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist); - - PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist); - PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist); - - PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist); - PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist); - PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist); - PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist); - - PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist); - PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist); - - PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist); - - PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist); - PL_L1Posix_ptrs[_CC_WORDCHAR] - = _new_invlist_C_array(L1PosixWord_invlist); - - PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist); - PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist); - - PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist); + PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); + PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); + PL_HasMultiCharFold = + _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); } #endif @@ -5712,6 +6405,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); RExC_uni_semantics = 0; RExC_contains_locale = 0; + RExC_contains_i = 0; pRExC_state->runtime_code_qr = NULL; DEBUG_COMPILE_r({ @@ -5733,11 +6427,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); /* return old regex if pattern hasn't changed */ - /* XXX: note in the below we have to check the flags as well as the pattern. + /* XXX: note in the below we have to check the flags as well as the + * pattern. * - * Things get a touch tricky as we have to compare the utf8 flag independently - * from the compile flags. - */ + * Things get a touch tricky as we have to compare the utf8 flag + * independently from the compile flags. */ if ( old_re && !recompile @@ -5754,10 +6448,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, rx_flags = orig_rx_flags; - if (initial_charset == REGEX_LOCALE_CHARSET) { - RExC_contains_locale = 1; + if (rx_flags & PMf_FOLD) { + RExC_contains_i = 1; } - else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ @@ -5785,6 +6479,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_sawback = 0; RExC_seen = 0; + RExC_maxlen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; @@ -5799,7 +6494,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_npar = 1; RExC_nestroot = 0; RExC_size = 0L; - RExC_emit = &RExC_emit_dummy; + RExC_emit = (regnode *) &RExC_emit_dummy; RExC_whilem_seen = 0; RExC_open_parens = NULL; RExC_close_parens = NULL; @@ -5809,6 +6504,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_paren_name_list = NULL; #endif RExC_recurse = NULL; + RExC_study_chunk_recursed = NULL; + RExC_study_chunk_recursed_bytes= 0; RExC_recurse_count = 0; pRExC_state->code_index = 0; @@ -5852,12 +6549,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ DEBUG_PARSE_r({ - PerlIO_printf(Perl_debug_log, + PerlIO_printf(Perl_debug_log, "Required size %"IVdf" nodes\n" - "Starting second pass (creation)\n", + "Starting second pass (creation)\n", (IV)RExC_size); - RExC_lastnum=0; - RExC_lastparse=NULL; + RExC_lastnum=0; + RExC_lastparse=NULL; }); /* The first pass could have found things that force Unicode semantics */ @@ -5876,8 +6573,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (RExC_whilem_seen > 15) RExC_whilem_seen = 15; - /* Allocate space and zero-initialize. Note, the two step process - of zeroing when in debug mode, thus anything assigned has to + /* Allocate space and zero-initialize. Note, the two step process + of zeroing when in debug mode, thus anything assigned has to happen after that */ rx = (REGEXP*) newSV_type(SVt_REGEXP); r = ReANY(rx); @@ -5887,10 +6584,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, FAIL("Regexp out of space"); #ifdef DEBUGGING /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ - Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char); -#else + Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char); +#else /* bulk initialize base fields with 0. */ - Zero(ri, sizeof(regexp_internal), char); + Zero(ri, sizeof(regexp_internal), char); #endif /* non-zero initialization begins here */ @@ -5914,14 +6612,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); - bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET); + bool has_charset = (get_regex_charset(r->extflags) + != REGEX_DEPENDS_CHARSET); /* The caret is output if there are any defaults: if not all the STD * flags are set, or if no character set specifier is needed */ bool has_default = (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) || ! has_charset); - bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) + == REG_RUN_ON_COMMENT_SEEN); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ @@ -5982,13 +6682,24 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ - - if (RExC_seen & REG_SEEN_RECURSE) { + + /* setup various meta data about recursion, this all requires + * RExC_npar to be correctly set, and a bit later on we clear it */ + if (RExC_seen & REG_RECURSE_SEEN) { Newxz(RExC_open_parens, RExC_npar,regnode *); SAVEFREEPV(RExC_open_parens); Newxz(RExC_close_parens,RExC_npar,regnode *); SAVEFREEPV(RExC_close_parens); } + if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { + /* Note, RExC_npar is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + + ((RExC_npar & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } /* Useful during FAIL. */ #ifdef RE_TRACK_PATTERN_OFFSETS @@ -6017,7 +6728,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, REGC((U8)REG_MAGIC, (char*) RExC_emit++); if (reg(pRExC_state, 0, &flags,1) == NULL) { - ReREFCNT_dec(rx); + ReREFCNT_dec(rx); Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); } /* XXXX To minimize changes to RE engine we always allocate @@ -6029,8 +6740,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } reStudy: - r->minlen = minlen = sawlookahead = sawplus = sawopen = 0; + r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; Zero(r->substrs, 1, struct reg_substr_data); + if (RExC_study_chunk_recursed) + Zero(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); #ifdef TRIE_STUDY_OPT if (!restudied) { @@ -6039,22 +6753,22 @@ reStudy: } else { U32 seen=RExC_seen; DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); - + RExC_state = copyRExC_state; - if (seen & REG_TOP_LEVEL_BRANCHES) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; else - RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; StructCopy(&zero_scan_data, &data, scan_data_t); } #else StructCopy(&zero_scan_data, &data, scan_data_t); -#endif +#endif /* Dig out information for optimizations. */ r->extflags = RExC_flags; /* was pm_op */ /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ - + if (UTF) SvUTF8_on(rx); /* Unicode in it? */ ri->regstclass = NULL; @@ -6064,20 +6778,21 @@ reStudy: /* testing for BRANCH here tells us whether there is "must appear" data in the pattern. If there is then we can use it for optimisations */ - if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */ - I32 fake; + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. + */ + SSize_t fake; STRLEN longest_float_length, longest_fixed_length; - struct regnode_charclass_class ch_class; /* pointed to by data */ + regnode_ssc ch_class; /* pointed to by data */ int stclass_flag; - I32 last_close = 0; /* pointed to by data */ + SSize_t last_close = 0; /* pointed to by data */ regnode *first= scan; regnode *first_next= regnext(first); /* * Skip introductions and multiplicators >= 1 - * so that we can extract the 'meat' of the pattern that must + * so that we can extract the 'meat' of the pattern that must * match in the large if() sequence following. * NOTE that EXACT is NOT covered here, as it is normally - * picked up by the optimiser separately. + * picked up by the optimiser separately. * * This is unfortunate as the optimiser isnt handling lookahead * properly currently. @@ -6094,16 +6809,19 @@ reStudy: (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) { - /* + /* * the only op that could be a regnode is PLUS, all the rest * will be regnode_1 or regnode_2. * + * (yves doesn't think this is true) */ if (OP(first) == PLUS) sawplus = 1; - else + else { + if (OP(first) == MINMOD) + sawminmod = 1; first += regarglen[OP(first)]; - + } first = NEXTOPER(first); first_next= regnext(first); } @@ -6120,24 +6838,10 @@ reStudy: } #ifdef TRIE_STCLASS else if (PL_regkind[OP(first)] == TRIE && - ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) + ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) { - regnode *trie_op; - /* this can happen only on restudy */ - if ( OP(first) == TRIE ) { - struct regnode_1 *trieop = (struct regnode_1 *) - PerlMemShared_calloc(1, sizeof(struct regnode_1)); - StructCopy(first,trieop,struct regnode_1); - trie_op=(regnode *)trieop; - } else { - struct regnode_charclass *trieop = (struct regnode_charclass *) - PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); - StructCopy(first,trieop,struct regnode_charclass); - trie_op=(regnode *)trieop; - } - OP(trie_op)+=2; - make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); - ri->regstclass = trie_op; + /* this can happen only on restudy */ + ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); } #endif else if (REGNODE_SIMPLE(OP(first))) @@ -6146,35 +6850,35 @@ reStudy: PL_regkind[OP(first)] == NBOUND) ri->regstclass = first; else if (PL_regkind[OP(first)] == BOL) { - r->extflags |= (OP(first) == MBOL - ? RXf_ANCH_MBOL + r->intflags |= (OP(first) == MBOL + ? PREGf_ANCH_MBOL : (OP(first) == SBOL - ? RXf_ANCH_SBOL - : RXf_ANCH_BOL)); + ? PREGf_ANCH_SBOL + : PREGf_ANCH_BOL)); first = NEXTOPER(first); goto again; } else if (OP(first) == GPOS) { - r->extflags |= RXf_ANCH_GPOS; + r->intflags |= PREGf_ANCH_GPOS; first = NEXTOPER(first); goto again; } else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks) + !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) { /* turn .* into ^.* with an implied $*=1 */ const int type = (OP(NEXTOPER(first)) == REG_ANY) - ? RXf_ANCH_MBOL - : RXf_ANCH_SBOL; - r->extflags |= type; - r->intflags |= PREGf_IMPLICIT; + ? PREGf_ANCH_MBOL + : PREGf_ANCH_SBOL; + r->intflags |= (type | PREGf_IMPLICIT); first = NEXTOPER(first); goto again; } - if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback) + if (sawplus && !sawminmod && !sawlookahead + && (!sawopen || !RExC_sawback) && !pRExC_state->num_code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -6216,15 +6920,17 @@ reStudy: SAVEFREESV(data.last_found); first = scan; if (!ri->regstclass) { - cl_init(pRExC_state, &ch_class); + ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; stclass_flag = SCF_DO_STCLASS_AND; } else /* XXXX Check for BOUND? */ stclass_flag = 0; data.last_closep = &last_close; - - minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ - &data, -1, NULL, NULL, + + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, + scan + RExC_size, /* Up to end */ + &data, -1, 0, NULL, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), 0); @@ -6236,9 +6942,11 @@ reStudy: if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen - && !(RExC_seen & REG_SEEN_VERBARG) - && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) + && !(RExC_seen & REG_VERBARG_SEEN) + && !(RExC_seen & REG_GPOS_SEEN) + ){ r->extflags |= RXf_CHECK_ALL; + } scan_commit(pRExC_state, &data,&minlen,0); longest_float_length = CHR_SVLEN(data.longest_float); @@ -6260,7 +6968,7 @@ reStudy: { r->float_min_offset = data.offset_float_min - data.lookbehind_float; r->float_max_offset = data.offset_float_max; - if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ + if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ r->float_max_offset -= data.lookbehind_float; SvREFCNT_inc_simple_void_NN(data.longest_float); } @@ -6298,126 +7006,141 @@ reStudy: if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag - && ! TEST_SSC_EOS(data.start_class) - && !cl_is_anything(data.start_class)) + && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && !ssc_is_anything(data.start_class)) { - const U32 n = add_data(pRExC_state, 1, "f"); - OP(data.start_class) = ANYOF_SYNTHETIC; + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); - Newx(RExC_rxi->data->data[n], 1, - struct regnode_charclass_class); + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); StructCopy(data.start_class, - (struct regnode_charclass_class*)RExC_rxi->data->data[n], - struct regnode_charclass_class); + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); + data.start_class = NULL; } - /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ + /* A temporary algorithm prefers floated substr to fixed one to dig + * more info. */ if (longest_fixed_length > longest_float_length) { + r->substrs->check_ix = 0; r->check_end_shift = r->anchored_end_shift; r->check_substr = r->anchored_substr; r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; - if (r->extflags & RXf_ANCH_SINGLE) - r->extflags |= RXf_NOSCAN; + if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) + r->intflags |= PREGf_NOSCAN; } else { + r->substrs->check_ix = 1; r->check_end_shift = r->float_end_shift; r->check_substr = r->float_substr; r->check_utf8 = r->float_utf8; r->check_offset_min = r->float_min_offset; r->check_offset_max = r->float_max_offset; } - /* XXXX Currently intuiting is not compatible with ANCH_GPOS. - This should be changed ASAP! */ - if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) { + if ((r->check_substr || r->check_utf8) ) { r->extflags |= RXf_USE_INTUIT; if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->extflags |= RXf_INTUIT_TAIL; } + r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) if ( (STRLEN)minlen < longest_float_length ) minlen= longest_float_length; if ( (STRLEN)minlen < longest_fixed_length ) - minlen= longest_fixed_length; + minlen= longest_fixed_length; */ } else { /* Several toplevels. Best we can is to set minlen. */ - I32 fake; - struct regnode_charclass_class ch_class; - I32 last_close = 0; + SSize_t fake; + regnode_ssc ch_class; + SSize_t last_close = 0; DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); scan = ri->program + 1; - cl_init(pRExC_state, &ch_class); + ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; data.last_closep = &last_close; - - minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, - &data, -1, NULL, NULL, - SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS - |(restudied ? SCF_TRIE_DOING_RESTUDY : 0), + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, + &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, + SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied + ? SCF_TRIE_DOING_RESTUDY + : 0), 0); - + CHECK_RESTUDY_GOTO_butfirst(NOOP); r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - if (! TEST_SSC_EOS(data.start_class) - && !cl_is_anything(data.start_class)) - { - const U32 n = add_data(pRExC_state, 1, "f"); - OP(data.start_class) = ANYOF_SYNTHETIC; + if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && ! ssc_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); - Newx(RExC_rxi->data->data[n], 1, - struct regnode_charclass_class); + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); StructCopy(data.start_class, - (struct regnode_charclass_class*)RExC_rxi->data->data[n], - struct regnode_charclass_class); + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); + data.start_class = NULL; } } + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + r->maxlen = REG_INFTY; + } + else { + r->maxlen = RExC_maxlen; + } + /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", - (IV)minlen, (IV)r->minlen); + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); }); r->minlenret = minlen; - if (r->minlen < minlen) + if (r->minlen < minlen) r->minlen = minlen; - - if (RExC_seen & REG_SEEN_GPOS) - r->extflags |= RXf_GPOS_SEEN; - if (RExC_seen & REG_SEEN_LOOKBEHIND) - r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */ + + if (RExC_seen & REG_GPOS_SEEN) + r->intflags |= PREGf_GPOS_SEEN; + if (RExC_seen & REG_LOOKBEHIND_SEEN) + r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the + lookbehind */ if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; - if (RExC_seen & REG_SEEN_CANY) - r->extflags |= RXf_CANY_SEEN; - if (RExC_seen & REG_SEEN_VERBARG) + if (RExC_seen & REG_CANY_SEEN) + r->intflags |= PREGf_CANY_SEEN; + if (RExC_seen & REG_VERBARG_SEEN) { r->intflags |= PREGf_VERBARG_SEEN; r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ } - if (RExC_seen & REG_SEEN_CUTGROUP) + if (RExC_seen & REG_CUTGROUP_SEEN) r->intflags |= PREGf_CUTGROUP_SEEN; if (pm_flags & PMf_USE_RE_EVAL) r->intflags |= PREGf_USE_RE_EVAL; @@ -6426,7 +7149,20 @@ reStudy: else RXp_PAREN_NAMES(r) = NULL; + /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED + * so it can be used in pp.c */ + if (r->intflags & PREGf_ANCH) + r->extflags |= RXf_IS_ANCHORED; + + { + /* this is used to identify "special" patterns that might result + * in Perl NOT calling the regex engine and instead doing the match "itself", + * particularly special cases in split//. By having the regex compiler + * do this pattern matching at a regop level (instead of by inspecting the pattern) + * we avoid weird issues with equivalent patterns resulting in different behavior, + * AND we allow non Perl engines to get the same optimizations by the setting the + * flags appropriately - Yves */ regnode *first = ri->program + 1; U8 fop = OP(first); regnode *next = NEXTOPER(first); @@ -6436,16 +7172,28 @@ reStudy: r->extflags |= RXf_NULL; else if (PL_regkind[fop] == BOL && nop == END) r->extflags |= RXf_START_ONLY; - else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END) + else if (fop == PLUS + && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE + && OP(regnext(first)) == END) r->extflags |= RXf_WHITE; - else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END ) + else if ( r->extflags & RXf_SPLIT + && fop == EXACT + && STR_LEN(first) == 1 + && *(STRING(first)) == ' ' + && OP(regnext(first)) == END ) r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); } + + if (RExC_contains_locale) { + RXp_EXTFLAGS(r) |= RXf_TAINTED; + } + #ifdef DEBUGGING if (RExC_paren_names) { - ri->name_list_idx = add_data( pRExC_state, 1, "a" ); - ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); + ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); + ri->data->data[ri->name_list_idx] + = (void*)SvREFCNT_inc(RExC_paren_name_list); } else #endif ri->name_list_idx = 0; @@ -6460,15 +7208,17 @@ reStudy: /* assume we don't need to swap parens around before we match */ DEBUG_DUMP_r({ + DEBUG_RExC_seen(); PerlIO_printf(Perl_debug_log,"Final program:\n"); regdump(r); }); #ifdef RE_TRACK_PATTERN_OFFSETS DEBUG_OFFSETS_r(if (ri->u.offsets) { - const U32 len = ri->u.offsets[0]; - U32 i; + const STRLEN len = ri->u.offsets[0]; + STRLEN i; GET_RE_DEBUG_FLAGS_DECL; - PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); + PerlIO_printf(Perl_debug_log, + "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", @@ -6528,7 +7278,8 @@ Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, else if (flags & RXapif_NEXTKEY) return reg_named_buff_nextkey(rx, flags); else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", + (int)flags); return NULL; } } @@ -6654,7 +7405,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) { SV *ret; AV *av; - I32 length; + SSize_t length; struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; @@ -6665,11 +7416,12 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) } else if (flags & RXapif_ONE) { ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); av = MUTABLE_AV(SvRV(ret)); - length = av_len(av); + length = av_tindex(av); SvREFCNT_dec_NN(ret); return newSViv(length + 1); } else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", + (int)flags); return NULL; } } @@ -6717,19 +7469,29 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, { struct regexp *const rx = ReANY(r); char *s = NULL; - I32 i = 0; - I32 s1, t1; + SSize_t i = 0; + SSize_t s1, t1; I32 n = paren; PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - - if ( ( n == RX_BUFF_IDX_CARET_PREMATCH + + if ( n == RX_BUFF_IDX_CARET_PREMATCH || n == RX_BUFF_IDX_CARET_FULLMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH - ) - && !(rx->extflags & RXf_PMf_KEEPCOPY) - ) - goto ret_undef; + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } if (!rx->subbeg) goto ret_undef; @@ -6745,14 +7507,14 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, i = rx->offs[0].start; s = rx->subbeg; } - else + else if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) && rx->offs[0].end != -1) { /* $', ${^POSTMATCH} */ s = rx->subbeg - rx->suboffset + rx->offs[0].end; i = rx->sublen + rx->suboffset - rx->offs[0].end; - } + } else if ( 0 <= n && n <= (I32)rx->nparens && (s1 = rx->offs[n].start) != -1 && @@ -6763,12 +7525,12 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, s = rx->subbeg + s1 - rx->suboffset; } else { goto ret_undef; - } + } assert(s >= rx->subbeg); - assert(rx->sublen >= (s - rx->subbeg) + i ); + assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); if (i >= 0) { -#if NO_TAINT_SUPPORT +#ifdef NO_TAINT_SUPPORT sv_setpvn(sv, s, i); #else const int oldtainted = TAINT_get; @@ -6776,7 +7538,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, sv_setpvn(sv, s, i); TAINT_set(oldtainted); #endif - if ( (rx->extflags & RXf_CANY_SEEN) + if ( (rx->intflags & PREGf_CANY_SEEN) ? (RXp_MATCH_UTF8(rx) && (!i || is_utf8_string((U8*)s, i))) : (RXp_MATCH_UTF8(rx)) ) @@ -6801,7 +7563,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, TAINT; SvTAINT(sv); } - } else + } else SvTAINTED_off(sv); } } else { @@ -6835,13 +7597,27 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + /* Some of this code was originally in C in F */ switch (paren) { case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - case RX_BUFF_IDX_PREMATCH: /* $` */ if (rx->offs[0].start != -1) { i = rx->offs[0].start; @@ -6854,8 +7630,6 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, return 0; case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; case RX_BUFF_IDX_POSTMATCH: /* $' */ if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; @@ -6867,13 +7641,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } return 0; - case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - - /* $& / ${^MATCH}, $1, $2, ... */ - default: + default: /* $& / ${^MATCH}, $1, $2, ... */ if (paren <= (I32)rx->nparens && (s1 = rx->offs[paren].start) != -1 && (t1 = rx->offs[paren].end) != -1) @@ -6930,7 +7698,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) PERL_ARGS_ASSERT_REG_SCAN_NAME; - if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + assert (RExC_parse <= RExC_end); + if (RExC_parse == RExC_end) NOOP; + else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { /* skip IDFIRST by using do...while */ if (UTF) do { @@ -6941,7 +7711,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) RExC_parse++; } while (isWORDCHAR(*RExC_parse)); } else { - RExC_parse++; /* so the <- from the vFAIL is after the offending character */ + RExC_parse++; /* so the <- from the vFAIL is after the offending + character */ vFAIL("Group name must start with a non-digit word character"); } if ( flags ) { @@ -7021,11 +7792,12 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) /* This section of code defines the inversion list object and its methods. The * interfaces are highly subject to change, so as much as possible is static to * this file. An inversion list is here implemented as a malloc'd C UV array - * with some added info that is placed as UVs at the beginning in a header - * portion. An inversion list for Unicode is an array of code points, sorted - * by ordinal number. The zeroth element is the first code point in the list. - * The 1th element is the first element beyond that not in the list. In other - * words, the first range is + * as an SVt_INVLIST scalar. + * + * An inversion list for Unicode is an array of code points, sorted by ordinal + * number. The zeroth element is the first code point in the list. The 1th + * element is the first element beyond that not in the list. In other words, + * the first range is * invlist[0]..(invlist[1]-1) * The other ranges follow. Thus every element whose index is divisible by two * marks the beginning of a range that is in the list, and every element not @@ -7043,9 +7815,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * Taking the complement (inverting) an inversion list is quite simple, if the * first element is 0, remove it; otherwise add a 0 element at the beginning. * This implementation reserves an element at the beginning of each inversion - * list to contain 0 when the list contains 0, and contains 1 otherwise. The - * actual beginning of the list is either that element if 0, or the next one if - * 1. + * list to always contain 0; there is an additional flag in the header which + * indicates if the list begins at the 0, or is offset to begin at the next + * element. * * More about inversion lists can be found in "Unicode Demystified" * Chapter 13 by Richard Gillam, published by Addison-Wesley. @@ -7060,36 +7832,35 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * should eventually be made public */ /* The header definitions are in F */ -#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV)) -#define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH) - -#define INVLIST_INITIAL_LEN 10 PERL_STATIC_INLINE UV* -S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) +S__invlist_array_init(SV* const invlist, const bool will_have_0) { /* Returns a pointer to the first element in the inversion list's array. * This is called upon initialization of an inversion list. Where the - * array begins depends on whether the list has the code point U+0000 - * in it or not. The other parameter tells it whether the code that - * follows this call is about to put a 0 in the inversion list or not. - * The first element is either the element with 0, if 0, or the next one, - * if 1 */ + * array begins depends on whether the list has the code point U+0000 in it + * or not. The other parameter tells it whether the code that follows this + * call is about to put a 0 in the inversion list or not. The first + * element is either the element reserved for 0, if TRUE, or the element + * after it, if FALSE */ - UV* zero = get_invlist_zero_addr(invlist); + bool* offset = get_invlist_offset_addr(invlist); + UV* zero_addr = (UV *) SvPVX(invlist); PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; /* Must be empty */ - assert(! *_get_invlist_len_addr(invlist)); + assert(! _invlist_len(invlist)); + + *zero_addr = 0; /* 1^1 = 0; 1^0 = 1 */ - *zero = 1 ^ will_have_0; - return zero + *zero; + *offset = 1 ^ will_have_0; + return zero_addr + *offset; } PERL_STATIC_INLINE UV* -S_invlist_array(pTHX_ SV* const invlist) +S_invlist_array(SV* const invlist) { /* Returns the pointer to the inversion list's array. Every time the * length changes, this needs to be called in case malloc or realloc moved @@ -7099,57 +7870,47 @@ S_invlist_array(pTHX_ SV* const invlist) /* Must not be empty. If these fail, you probably didn't check for * being non-zero before trying to get the array */ - assert(*_get_invlist_len_addr(invlist)); - assert(*get_invlist_zero_addr(invlist) == 0 - || *get_invlist_zero_addr(invlist) == 1); - - /* The array begins either at the element reserved for zero if the - * list contains 0 (that element will be set to 0), or otherwise the next - * element (in which case the reserved element will be set to 1). */ - return (UV *) (get_invlist_zero_addr(invlist) - + *get_invlist_zero_addr(invlist)); + assert(_invlist_len(invlist)); + + /* The very first element always contains zero, The array begins either + * there, or if the inversion list is offset, at the element after it. + * The offset header field determines which; it contains 0 or 1 to indicate + * how much additionally to add */ + assert(0 == *(SvPVX(invlist))); + return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist)); } PERL_STATIC_INLINE void -S_invlist_set_len(pTHX_ SV* const invlist, const UV len) +S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) { - /* Sets the current number of elements stored in the inversion list */ - + /* Sets the current number of elements stored in the inversion list. + * Updates SvCUR correspondingly */ + PERL_UNUSED_CONTEXT; PERL_ARGS_ASSERT_INVLIST_SET_LEN; - *_get_invlist_len_addr(invlist) = len; - - assert(len <= SvLEN(invlist)); - - SvCUR_set(invlist, TO_INTERNAL_SIZE(len)); - /* If the list contains U+0000, that element is part of the header, - * and should not be counted as part of the array. It will contain - * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and - * subtract: - * SvCUR_set(invlist, - * TO_INTERNAL_SIZE(len - * - (*get_invlist_zero_addr(inv_list) ^ 1))); - * But, this is only valid if len is not 0. The consequences of not doing - * this is that the memory allocation code may think that 1 more UV is - * being used than actually is, and so might do an unnecessary grow. That - * seems worth not bothering to make this the precise amount. - * - * Note that when inverting, SvCUR shouldn't change */ + assert(SvTYPE(invlist) == SVt_INVLIST); + + SvCUR_set(invlist, + (len == 0) + ? 0 + : TO_INTERNAL_SIZE(len + offset)); + assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); } PERL_STATIC_INLINE IV* -S_get_invlist_previous_index_addr(pTHX_ SV* invlist) +S_get_invlist_previous_index_addr(SV* invlist) { - /* Return the address of the UV that is reserved to hold the cached index + /* Return the address of the IV that is reserved to hold the cached index * */ - PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; - return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV))); + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->prev_index); } PERL_STATIC_INLINE IV -S_invlist_previous_index(pTHX_ SV* const invlist) +S_invlist_previous_index(SV* const invlist) { /* Returns cached index of previous search */ @@ -7159,7 +7920,7 @@ S_invlist_previous_index(pTHX_ SV* const invlist) } PERL_STATIC_INLINE void -S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index) +S_invlist_set_previous_index(SV* const invlist, const IV index) { /* Caches for later retrieval */ @@ -7171,29 +7932,20 @@ S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index) } PERL_STATIC_INLINE UV -S_invlist_max(pTHX_ SV* const invlist) +S_invlist_max(SV* const invlist) { /* Returns the maximum number of elements storable in the inversion list's * array, without having to realloc() */ PERL_ARGS_ASSERT_INVLIST_MAX; - return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ - ? _invlist_len(invlist) - : FROM_INTERNAL_SIZE(SvLEN(invlist)); -} - -PERL_STATIC_INLINE UV* -S_get_invlist_zero_addr(pTHX_ SV* invlist) -{ - /* Return the address of the UV that is reserved to hold 0 if the inversion - * list contains 0. This has to be the last element of the heading, as the - * list proper starts with either it if 0, or the next element if not. - * (But we force it to contain either 0 or 1) */ - - PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR; + assert(SvTYPE(invlist) == SVt_INVLIST); - return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV))); + /* Assumes worst case, in which the 0 element is not counted in the + * inversion list, so subtracts 1 for that */ + return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ + ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 + : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; } #ifndef PERL_IN_XSUB_RE @@ -7208,60 +7960,81 @@ Perl__new_invlist(pTHX_ IV initial_size) SV* new_list; if (initial_size < 0) { - initial_size = INVLIST_INITIAL_LEN; + initial_size = 10; } /* Allocate the initial space */ - new_list = newSV(TO_INTERNAL_SIZE(initial_size)); - invlist_set_len(new_list, 0); + new_list = newSV_type(SVt_INVLIST); - /* Force iterinit() to be used to get iteration to work */ - *get_invlist_iter_addr(new_list) = UV_MAX; + /* First 1 is in case the zero element isn't in the list; second 1 is for + * trailing NUL */ + SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1); + invlist_set_len(new_list, 0, 0); - /* This should force a segfault if a method doesn't initialize this - * properly */ - *get_invlist_zero_addr(new_list) = UV_MAX; + /* Force iterinit() to be used to get iteration to work */ + *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX; *get_invlist_previous_index_addr(new_list) = 0; - *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID; -#if HEADER_LENGTH != 5 -# error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length -#endif return new_list; } -#endif -STATIC SV* -S__new_invlist_C_array(pTHX_ UV* list) +SV* +Perl__new_invlist_C_array(pTHX_ const UV* const list) { /* Return a pointer to a newly constructed inversion list, initialized to * point to , which has to be in the exact correct inversion list * form, including internal fields. Thus this is a dangerous routine that - * should not be used in the wrong hands */ + * should not be used in the wrong hands. The passed in 'list' contains + * several header fields at the beginning that are not part of the + * inversion list body proper */ + + const STRLEN length = (STRLEN) list[0]; + const UV version_id = list[1]; + const bool offset = cBOOL(list[2]); +#define HEADER_LENGTH 3 + /* If any of the above changes in any way, you must change HEADER_LENGTH + * (if appropriate) and regenerate INVLIST_VERSION_ID by running + * perl -E 'say int(rand 2**31-1)' + */ +#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and + data structure type, so that one being + passed in can be validated to be an + inversion list of the correct vintage. + */ - SV* invlist = newSV_type(SVt_PV); + SV* invlist = newSV_type(SVt_INVLIST); PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; - SvPV_set(invlist, (char *) list); + if (version_id != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + /* The generated array passed in includes header elements that aren't part + * of the list proper, so start it just after them */ + SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); + SvLEN_set(invlist, 0); /* Means we own the contents, and the system shouldn't touch it */ - SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist))); - if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) { - Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); - } + *(get_invlist_offset_addr(invlist)) = offset; + + /* The 'length' passed to us is the physical number of elements in the + * inversion list. But if there is an offset the logical number is one + * less than that */ + invlist_set_len(invlist, length - offset, offset); - /* Initialize the iteration pointer. - * XXX This could be done at compile time in charclass_invlists.h, but I - * (khw) am not confident that the suffixes for specifying the C constant - * UV_MAX are portable, e.g. 'ull' on a 32 bit machine that is configured - * to use 64 bits; might need a Configure probe */ + invlist_set_previous_index(invlist, 0); + + /* Initialize the iteration pointer. */ invlist_iterfinish(invlist); + SvREADONLY_on(invlist); + return invlist; } +#endif /* ifndef PERL_IN_XSUB_RE */ STATIC void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) @@ -7270,24 +8043,28 @@ S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) PERL_ARGS_ASSERT_INVLIST_EXTEND; - SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max)); + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Add one to account for the zero element at the beginning which may not + * be counted by the calling parameters */ + SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); } PERL_STATIC_INLINE void -S_invlist_trim(pTHX_ SV* const invlist) +S_invlist_trim(SV* const invlist) { PERL_ARGS_ASSERT_INVLIST_TRIM; + assert(SvTYPE(invlist) == SVt_INVLIST); + /* Change the length of the inversion list to how many entries it currently * has */ - SvPV_shrink_to_cur((SV *) invlist); } -#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) - STATIC void -S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) +S__append_range_to_invlist(pTHX_ SV* const invlist, + const UV start, const UV end) { /* Subject to change or removal. Append the range from 'start' to 'end' at * the end of the inversion list. The range must be above any existing @@ -7296,11 +8073,13 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end UV* array; UV max = invlist_max(invlist); UV len = _invlist_len(invlist); + bool offset; PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; if (len == 0) { /* Empty lists must be initialized */ - array = _invlist_array_init(invlist, start == 0); + offset = start != 0; + array = _invlist_array_init(invlist, ! offset); } else { /* Here, the existing list is non-empty. The current max entry in the @@ -7315,14 +8094,15 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) { Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", - array[final_element], start, - ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); + array[final_element], start, + ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); } /* Here, it is a legal append. If the new range begins with the first * value not in the set, it is extending the set, so the new first * value not in the set is one greater than the newly extended range. * */ + offset = *get_invlist_offset_addr(invlist); if (array[final_element] == start) { if (end != UV_MAX) { array[final_element] = end + 1; @@ -7330,7 +8110,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end else { /* But if the end is the maximum representable on the machine, * just let the range that this would extend to have no end */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, offset); } return; } @@ -7340,16 +8120,18 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end len += 2; /* Includes an element each for the start and end of range */ - /* If overflows the existing space, extend, which may cause the array to be - * moved */ + /* If wll overflow the existing space, extend, which may cause the array to + * be moved */ if (max < len) { invlist_extend(invlist, len); - invlist_set_len(invlist, len); /* Have to set len here to avoid assert - failure in invlist_array() */ + + /* Have to set len here to avoid assert failure in invlist_array() */ + invlist_set_len(invlist, len, offset); + array = invlist_array(invlist); } else { - invlist_set_len(invlist, len); + invlist_set_len(invlist, len, offset); } /* The next item on the list starts the range, the one after that is @@ -7361,14 +8143,14 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end else { /* But if the end is the maximum representable on the machine, just let * the range have no end */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, offset); } } #ifndef PERL_IN_XSUB_RE IV -Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) +Perl__invlist_search(SV* const invlist, const UV cp) { /* Searches the inversion list for the entry that contains the input code * point . If is not in the list, -1 is returned. Otherwise, the @@ -7456,7 +8238,8 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) } void -Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch) +Perl__invlist_populate_swatch(SV* const invlist, + const UV start, const UV end, U8* swatch) { /* populates a swatch of a swash the same way swatch_get() does in utf8.c, * but is used when the swash has an inversion list. This makes this much @@ -7549,14 +8332,16 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV } void -Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output) +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** output) { /* Take the union of two inversion lists and point to it. *output * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. The first list, - * , may be NULL, in which case a copy of the second list is returned. - * If is TRUE, the union is taken of the complement - * (inversion) of instead of b itself. + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *output will be made correspondingly + * mortal. The first list, , may be NULL, in which case a copy of the + * second list is returned. If is TRUE, the union is taken + * of the complement (inversion) of instead of b itself. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -7571,8 +8356,8 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co * return the larger of the input lists, but then outside code might need * to keep track of whether to free the input list or not */ - UV* array_a; /* a's array */ - UV* array_b; + const UV* array_a; /* a's array */ + const UV* array_b; UV len_a; /* length of a's array */ UV len_b; @@ -7597,9 +8382,13 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co /* If either one is empty, the union is the other one */ if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + bool make_temp = FALSE; /* Should we mortalize the result? */ + if (*output == a) { if (a != NULL) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } } if (*output != b) { @@ -7608,18 +8397,27 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co _invlist_invert(*output); } } /* else *output already = b; */ + + if (make_temp) { + sv_2mortal(*output); + } return; } else if ((len_b = _invlist_len(b)) == 0) { + bool make_temp = FALSE; if (*output == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } /* The complement of an empty list is a list that has everything in it, * so the union with includes everything too */ if (complement_b) { if (a == *output) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } *output = _new_invlist(1); _append_range_to_invlist(*output, 0, UV_MAX); @@ -7628,6 +8426,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co *output = invlist_clone(a); } /* else *output already = a; */ + + if (make_temp) { + sv_2mortal(*output); + } return; } @@ -7640,23 +8442,17 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co if (complement_b) { /* To complement, we invert: if the first element is 0, remove it. To - * do this, we just pretend the array starts one later, and clear the - * flag as we don't have to do anything else later */ + * do this, we just pretend the array starts one later */ if (array_b[0] == 0) { array_b++; len_b--; - complement_b = FALSE; } else { - /* But if the first element is not zero, we unshift a 0 before the - * array. The data structure reserves a space for that 0 (which - * should be a '1' right now), so physical shifting is unneeded, - * but temporarily change that element to 0. Before exiting the - * routine, we must restore the element to '1' */ + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -7752,7 +8548,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co /* Set result to final length, which can change the pointer to array_u, so * re-find it */ if (len_u != _invlist_len(u)) { - invlist_set_len(u, len_u); + invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); invlist_trim(u); array_u = invlist_array(u); } @@ -7773,29 +8569,36 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co } } - /* If we've changed b, restore it */ - if (complement_b) { - array_b[0] = 1; - } - - /* We may be removing a reference to one of the inputs */ + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ if (a == *output || b == *output) { assert(! invlist_is_iterating(*output)); - SvREFCNT_dec_NN(*output); + if ((SvTEMP(*output))) { + sv_2mortal(u); + } + else { + SvREFCNT_dec_NN(*output); + } } *output = u; + return; } void -Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i) +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** i) { /* Take the intersection of two inversion lists and point to it. *i * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. - * If is TRUE, the result will be the intersection of - * and the complement (or inversion) of instead of directly. + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *i will be made correspondingly mortal. + * The first list, , may be NULL, in which case an empty list is + * returned. If is TRUE, the result will be the + * intersection of and the complement (or inversion) of instead of + * directly. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -7806,8 +8609,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * union above */ - UV* array_a; /* a's array */ - UV* array_b; + const UV* array_a; /* a's array */ + const UV* array_b; UV len_a; /* length of a's array */ UV len_b; @@ -7831,8 +8634,9 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, assert(a != b); /* Special case if either one is empty */ - len_a = _invlist_len(a); + len_a = (a == NULL) ? 0 : _invlist_len(a); if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { + bool make_temp = FALSE; if (len_a != 0 && complement_b) { @@ -7841,25 +8645,39 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * must be every possible code point. Thus the intersection is * simply 'a'. */ if (*i != a) { - *i = invlist_clone(a); - if (*i == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } + + *i = invlist_clone(a); } /* else *i is already 'a' */ + + if (make_temp) { + sv_2mortal(*i); + } return; } /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The * intersection must be empty */ if (*i == a) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } else if (*i == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } *i = _new_invlist(0); + if (make_temp) { + sv_2mortal(*i); + } + return; } @@ -7872,23 +8690,17 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, if (complement_b) { /* To complement, we invert: if the first element is 0, remove it. To - * do this, we just pretend the array starts one later, and clear the - * flag as we don't have to do anything else later */ + * do this, we just pretend the array starts one later */ if (array_b[0] == 0) { array_b++; len_b--; - complement_b = FALSE; } else { - /* But if the first element is not zero, we unshift a 0 before the - * array. The data structure reserves a space for that 0 (which - * should be a '1' right now), so physical shifting is unneeded, - * but temporarily change that element to 0. Before exiting the - * routine, we must restore the element to '1' */ + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -7970,7 +8782,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } /* The final length is what we've output so far plus what else is in the - * intersection. At most one of the subexpressions below will be non-zero */ + * intersection. At most one of the subexpressions below will be non-zero + * */ len_r = i_r; if (count >= 2) { len_r += (len_a - i_a) + (len_b - i_b); @@ -7979,7 +8792,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Set result to final length, which can change the pointer to array_r, so * re-find it */ if (len_r != _invlist_len(r)) { - invlist_set_len(r, len_r); + invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); invlist_trim(r); array_r = invlist_array(r); } @@ -7995,18 +8808,21 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* If we've changed b, restore it */ - if (complement_b) { - array_b[0] = 1; - } - - /* We may be removing a reference to one of the inputs */ + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ if (a == *i || b == *i) { assert(! invlist_is_iterating(*i)); - SvREFCNT_dec_NN(*i); + if (SvTEMP(*i)) { + sv_2mortal(r); + } + else { + SvREFCNT_dec_NN(*i); + } } *i = r; + return; } @@ -8053,6 +8869,35 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) return invlist; } +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, + UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + _append_range_to_invlist(invlist, element0, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; +} + #endif PERL_STATIC_INLINE SV* @@ -8068,66 +8913,19 @@ Perl__invlist_invert(pTHX_ SV* const invlist) * have a zero; removes it otherwise. As described above, the data * structure is set up so that this is very efficient */ - UV* len_pos = _get_invlist_len_addr(invlist); - PERL_ARGS_ASSERT__INVLIST_INVERT; assert(! invlist_is_iterating(invlist)); /* The inverse of matching nothing is matching everything */ - if (*len_pos == 0) { + if (_invlist_len(invlist) == 0) { _append_range_to_invlist(invlist, 0, UV_MAX); return; } - /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the - * zero element was a 0, so it is being removed, so the length decrements - * by 1; and vice-versa. SvCUR is unaffected */ - if (*get_invlist_zero_addr(invlist) ^= 1) { - (*len_pos)--; - } - else { - (*len_pos)++; - } + *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); } -void -Perl__invlist_invert_prop(pTHX_ SV* const invlist) -{ - /* Complement the input inversion list (which must be a Unicode property, - * all of which don't match above the Unicode maximum code point.) And - * Perl has chosen to not have the inversion match above that either. This - * adds a 0x110000 if the list didn't end with it, and removes it if it did - */ - - UV len; - UV* array; - - PERL_ARGS_ASSERT__INVLIST_INVERT_PROP; - - _invlist_invert(invlist); - - len = _invlist_len(invlist); - - if (len != 0) { /* If empty do nothing */ - array = invlist_array(invlist); - if (array[len - 1] != PERL_UNICODE_MAX + 1) { - /* Add 0x110000. First, grow if necessary */ - len++; - if (invlist_max(invlist) < len) { - invlist_extend(invlist, len); - array = invlist_array(invlist); - } - invlist_set_len(invlist, len); - array[len - 1] = PERL_UNICODE_MAX + 1; - } - else { /* Remove the 0x110000 */ - invlist_set_len(invlist, len - 1); - } - } - - return; -} #endif PERL_STATIC_INLINE SV* @@ -8135,44 +8933,38 @@ S_invlist_clone(pTHX_ SV* const invlist) { /* Return a new inversion list that is a copy of the input one, which is - * unchanged */ + * unchanged. The new list will not be mortal even if the old one was. */ /* Need to allocate extra space to accommodate Perl's addition of a * trailing NUL to SvPV's, since it thinks they are always strings */ SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); - STRLEN length = SvCUR(invlist); + STRLEN physical_length = SvCUR(invlist); + bool offset = *(get_invlist_offset_addr(invlist)); PERL_ARGS_ASSERT_INVLIST_CLONE; - SvCUR_set(new_invlist, length); /* This isn't done automatically */ - Copy(SvPVX(invlist), SvPVX(new_invlist), length, char); + *(get_invlist_offset_addr(new_invlist)) = offset; + invlist_set_len(new_invlist, _invlist_len(invlist), offset); + Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); return new_invlist; } -PERL_STATIC_INLINE UV* -S_get_invlist_iter_addr(pTHX_ SV* invlist) +PERL_STATIC_INLINE STRLEN* +S_get_invlist_iter_addr(SV* invlist) { /* Return the address of the UV that contains the current iteration * position */ PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; - return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV))); -} - -PERL_STATIC_INLINE UV* -S_get_invlist_version_id_addr(pTHX_ SV* invlist) -{ - /* Return the address of the UV that contains the version id. */ - - PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR; + assert(SvTYPE(invlist) == SVt_INVLIST); - return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV))); + return &(((XINVLIST*) SvANY(invlist))->iterator); } PERL_STATIC_INLINE void -S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ +S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */ { PERL_ARGS_ASSERT_INVLIST_ITERINIT; @@ -8180,7 +8972,7 @@ S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ } PERL_STATIC_INLINE void -S_invlist_iterfinish(pTHX_ SV* invlist) +S_invlist_iterfinish(SV* invlist) { /* Terminate iterator for invlist. This is to catch development errors. * Any iteration that is interrupted before completed should call this @@ -8192,11 +8984,11 @@ S_invlist_iterfinish(pTHX_ SV* invlist) PERL_ARGS_ASSERT_INVLIST_ITERFINISH; - *get_invlist_iter_addr(invlist) = UV_MAX; + *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; } STATIC bool -S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) +S_invlist_iternext(SV* invlist, UV* start, UV* end) { /* An C call on must be used to set this up. * This call sets in <*start> and <*end>, the next range in . @@ -8205,14 +8997,14 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) * <*start> and <*end> are unchanged, and the next call to this function * will start over at the beginning of the list */ - UV* pos = get_invlist_iter_addr(invlist); + STRLEN* pos = get_invlist_iter_addr(invlist); UV len = _invlist_len(invlist); UV *array; PERL_ARGS_ASSERT_INVLIST_ITERNEXT; if (*pos >= len) { - *pos = UV_MAX; /* Force iterinit() to be required next time */ + *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ return FALSE; } @@ -8231,15 +9023,15 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) } PERL_STATIC_INLINE bool -S_invlist_is_iterating(pTHX_ SV* const invlist) +S_invlist_is_iterating(SV* const invlist) { PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; - return *(get_invlist_iter_addr(invlist)) < UV_MAX; + return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; } PERL_STATIC_INLINE UV -S_invlist_highest(pTHX_ SV* const invlist) +S_invlist_highest(SV* const invlist) { /* Returns the highest code point that matches an inversion list. This API * has an ambiguity, as it returns 0 under either the highest is actually @@ -8301,51 +9093,85 @@ Perl__invlist_contents(pTHX_ SV* const invlist) } #endif -#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP +#ifndef PERL_IN_XSUB_RE void -Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header) +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, + const char * const indent, SV* const invlist) { - /* Dumps out the ranges in an inversion list. The string 'header' - * if present is output on a line before the first range */ + /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the + * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by + * the string 'indent'. The output looks like this: + [0] 0x000A .. 0x000D + [2] 0x0085 + [4] 0x2028 .. 0x2029 + [6] 0x3104 .. INFINITY + * This means that the first range of code points matched by the list are + * 0xA through 0xD; the second range contains only the single code point + * 0x85, etc. An inversion list is an array of UVs. Two array elements + * are used to define each range (except if the final range extends to + * infinity, only a single element is needed). The array index of the + * first element for the corresponding range is given in brackets. */ UV start, end; + STRLEN count = 0; PERL_ARGS_ASSERT__INVLIST_DUMP; - if (header && strlen(header)) { - PerlIO_printf(Perl_debug_log, "%s\n", header); - } if (invlist_is_iterating(invlist)) { - PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n"); + Perl_dump_indent(aTHX_ level, file, + "%sCan't dump inversion list because is in middle of iterating\n", + indent); return; } invlist_iterinit(invlist); while (invlist_iternext(invlist, &start, &end)) { if (end == UV_MAX) { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start); + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", + indent, (UV)count, start); } else if (end != start) { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", - start, end); + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", + indent, (UV)count, start, end); } else { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start); + Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", + indent, (UV)count, start); } + count += 2; + } +} + +void +Perl__load_PL_utf8_foldclosures (pTHX) +{ + assert(! PL_utf8_foldclosures); + + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* This string is just a short named one above \xff */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); } #endif -#if 0 +#ifdef PERL_ARGS_ASSERT__INVLISTEQ bool -S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) +S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) { /* Return a boolean as to if the two passed in inversion lists are * identical. The final argument, if TRUE, says to take the complement of * the second inversion list before doing the comparison */ - UV* array_a = invlist_array(a); - UV* array_b = invlist_array(b); + const UV* array_a = invlist_array(a); + const UV* array_b = invlist_array(b); UV len_a = _invlist_len(a); UV len_b = _invlist_len(b); @@ -8367,23 +9193,17 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) /* Otherwise, to complement, we invert. Here, the first element is * 0, just remove it. To do this, we just pretend the array starts - * one later, and clear the flag as we don't have to do anything - * else later */ + * one later */ array_b++; len_b--; - complement_b = FALSE; } else { - /* But if the first element is not zero, we unshift a 0 before the - * array. The data structure reserves a space for that 0 (which - * should be a '1' right now), so physical shifting is unneeded, - * but temporarily change that element to 0. Before exiting the - * routine, we must restore the element to '1' */ + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -8400,27 +9220,19 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) } } - if (complement_b) { - array_b[0] = 1; - } return retval; } #endif #undef HEADER_LENGTH -#undef INVLIST_INITIAL_LENGTH #undef TO_INTERNAL_SIZE #undef FROM_INTERNAL_SIZE -#undef INVLIST_LEN_OFFSET -#undef INVLIST_ZERO_OFFSET -#undef INVLIST_ITER_OFFSET #undef INVLIST_VERSION_ID -#undef INVLIST_PREVIOUS_INDEX_OFFSET /* End of inversion list object */ STATIC void -S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) +S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) { /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' * constructs, and updates RExC_flags with them. On input, RExC_parse @@ -8480,7 +9292,6 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) } cs = REGEX_LOCALE_CHARSET; has_charset_modifier = LOCALE_PAT_MOD; - RExC_contains_locale = 1; break; case UNICODE_PAT_MOD: if (has_charset_modifier) { @@ -8534,7 +9345,8 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); } else if (has_charset_modifier == *(RExC_parse - 1)) { - vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1)); + vFAIL2("Regexp modifier \"%c\" may not appear twice", + *(RExC_parse - 1)); } else { vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); @@ -8542,12 +9354,15 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) /*NOTREACHED*/ neg_modifier: RExC_parse++; - vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1)); + vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", + *(RExC_parse - 1)); /*NOTREACHED*/ case ONCE_PAT_MOD: /* 'o' */ case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { - const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; + const I32 wflagbit = *RExC_parse == 'o' + ? WASTED_O + : WASTED_G; if (! (wastedflags & wflagbit) ) { wastedflags |= wflagbit; /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ @@ -8600,13 +9415,17 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) RExC_flags |= posflags; RExC_flags &= ~negflags; set_regex_charset(&RExC_flags, cs); + if (RExC_flags & RXf_PMf_FOLD) { + RExC_contains_i = 1; + } return; /*NOTREACHED*/ default: fail_modifiers: - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", - RExC_parse-seqstart, seqstart); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); /*NOTREACHED*/ } @@ -8654,6 +9473,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) bool is_open = 0; I32 freeze_paren = 0; I32 after_freeze = 0; + I32 num; /* numeric backreferences */ char * parse_start = RExC_parse; /* MJD */ char * const oregcomp_parse = RExC_parse; @@ -8681,10 +9501,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) char *start_arg = NULL; unsigned char op = 0; int argok = 1; - int internal_argval = 0; /* internal_argval is only useful if !argok */ + int internal_argval = 0; /* internal_argval is only useful if + !argok */ - if (has_intervening_patws && SIZE_ONLY) { - ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated"); + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); } while ( *RExC_parse && *RExC_parse != ')' ) { if ( *RExC_parse == ':' ) { @@ -8697,9 +9519,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) verb_len = RExC_parse - start_verb; if ( start_arg ) { RExC_parse++; - while ( *RExC_parse && *RExC_parse != ')' ) + while ( *RExC_parse && *RExC_parse != ')' ) RExC_parse++; - if ( *RExC_parse != ')' ) + if ( *RExC_parse != ')' ) vFAIL("Unterminated verb pattern argument"); if ( RExC_parse == start_arg ) start_arg = NULL; @@ -8707,7 +9529,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if ( *RExC_parse != ')' ) vFAIL("Unterminated verb pattern"); } - + switch ( *start_verb ) { case 'A': /* (*ACCEPT) */ if ( memEQs(start_verb,verb_len,"ACCEPT") ) { @@ -8736,48 +9558,51 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if ( memEQs(start_verb,verb_len,"PRUNE") ) op = PRUNE; break; - case 'S': /* (*SKIP) */ - if ( memEQs(start_verb,verb_len,"SKIP") ) + case 'S': /* (*SKIP) */ + if ( memEQs(start_verb,verb_len,"SKIP") ) op = SKIP; break; case 'T': /* (*THEN) */ /* [19:06] :: is then */ if ( memEQs(start_verb,verb_len,"THEN") ) { op = CUTGROUP; - RExC_seen |= REG_SEEN_CUTGROUP; + RExC_seen |= REG_CUTGROUP_SEEN; } break; } if ( ! op ) { - RExC_parse++; - vFAIL3("Unknown verb pattern '%.*s'", - verb_len, start_verb); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL2utf8f( + "Unknown verb pattern '%"UTF8f"'", + UTF8fARG(UTF, verb_len, start_verb)); } if ( argok ) { if ( start_arg && internal_argval ) { vFAIL3("Verb pattern '%.*s' may not have an argument", - verb_len, start_verb); + verb_len, start_verb); } else if ( argok < 0 && !start_arg ) { vFAIL3("Verb pattern '%.*s' has a mandatory argument", - verb_len, start_verb); + verb_len, start_verb); } else { ret = reganode(pRExC_state, op, internal_argval); if ( ! internal_argval && ! SIZE_ONLY ) { if (start_arg) { - SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); - ARG(ret) = add_data( pRExC_state, 1, "S" ); + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); RExC_rxi->data->data[ARG(ret)]=(void*)sv; ret->flags = 0; } else { - ret->flags = 1; + ret->flags = 1; } - } + } } if (!internal_argval) - RExC_seen |= REG_SEEN_VERBARG; + RExC_seen |= REG_VERBARG_SEEN; } else if ( start_arg ) { vFAIL3("Verb pattern '%.*s' may not have an argument", - verb_len, start_verb); + verb_len, start_verb); } else { ret = reg_node(pRExC_state, op); } @@ -8787,8 +9612,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) else if (*RExC_parse == '?') { /* (?...) */ bool is_logical = 0; const char * const seqstart = RExC_parse; - if (has_intervening_patws && SIZE_ONLY) { - ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated"); + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(?...)', the '(' and '?' must be adjacent"); } RExC_parse++; @@ -8804,17 +9630,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) goto named_recursion; } else if (paren == '=') { /* (?P=...) named backref */ - /* this pretty much dupes the code for \k in regatom(), if - you change this make sure you change that */ + /* this pretty much dupes the code for \k in + * regatom(), if you change this make sure you change that + * */ char* name_start = RExC_parse; U32 num = 0; SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); if (RExC_parse == name_start || *RExC_parse != ')') + /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.3s... not terminated",parse_start); if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -8839,12 +9667,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL3("Sequence (%.*s...) not recognized", + RExC_parse-seqstart, seqstart); /*NOTREACHED*/ case '<': /* (?<...) */ if (*RExC_parse == '!') paren = ','; - else if (*RExC_parse != '=') + else if (*RExC_parse != '=') named_capture: { /* (?<...>) */ char *name_start; @@ -8853,15 +9683,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '\'': /* (?'...') */ name_start= RExC_parse; svname = reg_scan_name(pRExC_state, - SIZE_ONLY ? /* reverse test from the others */ - REG_RSN_RETURN_NAME : - REG_RSN_RETURN_NULL); - if (RExC_parse == name_start) { - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); - /*NOTREACHED*/ - } - if (*RExC_parse != paren) + SIZE_ONLY /* reverse test from the others */ + ? REG_RSN_RETURN_NAME + : REG_RSN_RETURN_NULL); + if (RExC_parse == name_start || *RExC_parse != paren) vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); if (SIZE_ONLY) { @@ -8901,20 +9726,24 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } if ( count ) { - pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); + pv = (I32*)SvGROW(sv_dat, + SvCUR(sv_dat) + sizeof(I32)+1); SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); pv[count] = RExC_npar; SvIV_set(sv_dat, SvIVX(sv_dat) + 1); } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); - sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); + sv_setpvn(sv_dat, (char *)&(RExC_npar), + sizeof(I32)); SvIOK_on(sv_dat); SvIV_set(sv_dat, 1); } #ifdef DEBUGGING - /* Yes this does cause a memory leak in debugging Perls */ - if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) + /* Yes this does cause a memory leak in debugging Perls + * */ + if (!av_store(RExC_paren_name_list, + RExC_npar, SvREFCNT_inc(svname))) SvREFCNT_dec_NN(svname); #endif @@ -8924,9 +9753,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) paren = 1; goto capturing_parens; } - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; + /* FALLTHROUGH */ case '=': /* (?=...) */ RExC_seen_zerolen++; break; @@ -8941,7 +9771,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '|': /* (?|...) */ /* branch reset, behave like a (?:...) except that buffers in alternations share the same numbers */ - paren = ':'; + paren = ':'; after_freeze = freeze_paren = RExC_npar; break; case ':': /* (?:...) */ @@ -8951,29 +9781,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '@': /* (?@...) */ vFAIL2("Sequence (?%c...) not implemented", (int)paren); break; - case '#': /* (?#...) */ - /* XXX As soon as we disallow separating the '?' and '*' (by - * spaces or (?#...) comment), it is believed that this case - * will be unreachable and can be removed. See - * [perl #117327] */ - while (*RExC_parse && *RExC_parse != ')') - RExC_parse++; - if (*RExC_parse != ')') - FAIL("Sequence (?#... not terminated"); - nextchar(pRExC_state); - *flagp = TRYAGAIN; - return NULL; case '0' : /* (?0) */ case 'R' : /* (?R) */ if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); + RExC_seen |= REG_GOSTART_SEEN; *flagp |= POSTPONED; nextchar(pRExC_state); return ret; /*notreached*/ - { /* named and numeric backreferences */ - I32 num; + /* named and numeric backreferences */ case '&': /* (?&NAME) */ parse_start = RExC_parse - 1; named_recursion: @@ -8982,6 +9800,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } + if (RExC_parse == RExC_end || *RExC_parse != ')') + vFAIL("Sequence (?&... not terminated"); goto gen_recurse_regop; assert(0); /* NOT REACHED */ case '+': @@ -8995,8 +9815,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { RExC_parse--; /* rewind to let it be handled later */ goto parse_flags; - } - /*FALLTHROUGH */ + } + /* FALLTHROUGH */ case '1': case '2': case '3': case '4': /* (?1) */ case '5': case '6': case '7': case '8': case '9': RExC_parse--; @@ -9007,7 +9827,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; while (isDIGIT(*RExC_parse)) RExC_parse++; - if (*RExC_parse!=')') + if (*RExC_parse!=')') vFAIL("Expecting close bracket"); gen_recurse_regop: @@ -9041,30 +9861,34 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ARG2L_SET( ret, RExC_recurse_count++); RExC_emit++; DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); + "Recurse #%"UVuf" to %"IVdf"\n", + (UV)ARG(ret), (IV)ARG2L(ret))); } else { RExC_size++; } - RExC_seen |= REG_SEEN_RECURSE; + RExC_seen |= REG_RECURSE_SEEN; Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ *flagp |= POSTPONED; nextchar(pRExC_state); return ret; - } /* named and numeric backreferences */ + assert(0); /* NOT REACHED */ case '?': /* (??...) */ is_logical = 1; if (*RExC_parse != '{') { RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f( + "Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); /*NOTREACHED*/ } *flagp |= POSTPONED; paren = *RExC_parse++; - /* FALL THROUGH */ + /* FALLTHROUGH */ case '{': /* (?{...}) */ { U32 n = 0; @@ -9088,14 +9912,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { OP *o = cb->block; if (cb->src_regex) { - n = add_data(pRExC_state, 2, "rl"); + n = add_data(pRExC_state, STR_WITH_LEN("rl")); RExC_rxi->data->data[n] = (void*)SvREFCNT_inc((SV*)cb->src_regex); RExC_rxi->data->data[n+1] = (void*)o; } else { - n = add_data(pRExC_state, 1, - (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); + n = add_data(pRExC_state, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); RExC_rxi->data->data[n] = (void*)o; } } @@ -9133,7 +9957,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reg_node(pRExC_state, LOGICAL); if (!SIZE_ONLY) ret->flags = 1; - + tail = reg(pRExC_state, 1, &flag, depth+1); if (flag & RESTART_UTF8) { *flagp = RESTART_UTF8; @@ -9156,7 +9980,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) (ch == '>' ? '<' : ch)); RExC_parse++; if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -9186,15 +10010,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV *sv_dat; RExC_parse++; sv_dat = reg_scan_name(pRExC_state, - SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + SIZE_ONLY + ? REG_RSN_RETURN_NULL + : REG_RSN_RETURN_DATA); parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } - ret = reganode(pRExC_state,INSUBP,parno); + ret = reganode(pRExC_state,INSUBP,parno); goto insert_if_check_paren; } else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { /* (?(1)...) */ char c; + char *tmp; parno = atoi(RExC_parse++); while (isDIGIT(*RExC_parse)) @@ -9202,8 +10029,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, GROUPP, parno); insert_if_check_paren: - if ((c = *nextchar(pRExC_state)) != ')') + if (*(tmp = nextchar(pRExC_state)) != ')') { + /* nextchar also skips comments, so undo its work + * and skip over the the next character. + */ + RExC_parse = tmp; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; vFAIL("Switch condition not recognized"); + } insert_if: REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); br = regbranch(pRExC_state, &flags, 1,depth+1); @@ -9215,14 +10048,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); } else - REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); + REGTAIL(pRExC_state, br, reganode(pRExC_state, + LONGJMP, 0)); c = *nextchar(pRExC_state); if (flags&HASWIDTH) *flagp |= HASWIDTH; if (c == '|') { - if (is_define) + if (is_define) vFAIL("(?(DEFINE)....) does not allow branches"); - lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ + + /* Fake one for optimizer. */ + lastbr = reganode(pRExC_state, IFTHEN, 0); + if (!regbranch(pRExC_state, &flags, 1,depth+1)) { if (flags & RESTART_UTF8) { *flagp = RESTART_UTF8; @@ -9254,7 +10091,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } else { - vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unknown switch condition (?(...))"); } } case '[': /* (?[ ... ]) */ @@ -9283,16 +10121,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) capturing_parens: parno = RExC_npar; RExC_npar++; - + ret = reganode(pRExC_state, OPEN, parno); if (!SIZE_ONLY ){ - if (!RExC_nestroot) + if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE + if (RExC_seen & REG_RECURSE_SEEN && !RExC_open_parens[parno-1]) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Setting open paren #%"IVdf" to %d\n", + "Setting open paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ret))); RExC_open_parens[parno-1]= ret; } @@ -9304,7 +10142,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else /* ! paren */ ret = NULL; - + parse_rest: /* Pick up the branches, linking them together. */ parse_start = RExC_parse; /* MJD */ @@ -9345,7 +10183,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { ender = reganode(pRExC_state, LONGJMP,0); - REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ + + /* Append to the previous. */ + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); } if (SIZE_ONLY) RExC_extralen += 2; /* Account for LONGJMP. */ @@ -9353,7 +10193,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (freeze_paren) { if (RExC_npar > after_freeze) after_freeze = RExC_npar; - RExC_npar = freeze_paren; + RExC_npar = freeze_paren; } br = regbranch(pRExC_state, &flags, 0, depth+1); @@ -9377,14 +10217,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case 1: case 2: ender = reganode(pRExC_state, CLOSE, parno); - if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { + if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Setting close paren #%"IVdf" to %d\n", + "Setting close paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ender))); RExC_close_parens[parno-1]= ender; - if (RExC_nestroot == parno) + if (RExC_nestroot == parno) RExC_nestroot = 0; - } + } Set_Node_Offset(ender,RExC_parse+1); /* MJD */ Set_Node_Length(ender,1); /* MJD */ break; @@ -9393,7 +10233,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '=': case '!': *flagp &= ~HASWIDTH; - /* FALL THROUGH */ + /* FALLTHROUGH */ case '>': ender = reg_node(pRExC_state, SUCCEED); break; @@ -9409,8 +10249,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("lsbr"); - regprop(RExC_rx, mysv_val1, lastbr); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, lastbr, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(lastbr), @@ -9424,20 +10264,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (have_branch && !SIZE_ONLY) { char is_nothing= 1; if (depth==1) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; /* Hook the tails of the branches to the closing node. */ for (br = ret; br; br = regnext(br)) { const U8 op = PL_regkind[OP(br)]; if (op == BRANCH) { REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); - if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender) + if ( OP(NEXTOPER(br)) != NOTHING + || regnext(NEXTOPER(br)) != ender) is_nothing= 0; } else if (op == BRANCHJ) { REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); /* for now we always disable this optimisation * / - if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender) + if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING + || regnext(NEXTOPER(NEXTOPER(br))) != ender) */ is_nothing= 0; } @@ -9448,8 +10290,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("NADA"); - regprop(RExC_rx, mysv_val1, ret); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, ret, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(ret), @@ -9643,7 +10485,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) op = *RExC_parse; - if (op == '{' && regcurly(RExC_parse, FALSE)) { + if (op == '{' && regcurly(RExC_parse)) { maxpos = NULL; #ifdef RE_TRACK_PATTERN_OFFSETS parse_start = RExC_parse; /* MJD */ @@ -9691,23 +10533,18 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, OPFAIL); return ret; } - else if (max == 0) { /* replace {0} with a nothing node */ + else if (min == max + && RExC_parse < RExC_end + && (*RExC_parse == '?' || *RExC_parse == '+')) + { if (SIZE_ONLY) { - RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING]; + ckWARN2reg(RExC_parse + 1, + "Useless use of greediness modifier '%c'", + *RExC_parse); } - else { - RExC_emit = orig_emit; - } - ret = reg_node(pRExC_state, NOTHING); - - /* But the quantifier includes any '?', the non-greedy - * modifier, after the {}, [perl #118375] - * Likewise the '+', the possessive modifier. They are mutually exclusive. - */ - if (RExC_parse < RExC_end && (*RExC_parse == '?' || *RExC_parse == '+') ) { - nextchar(pRExC_state); - } - return ret; + /* Absorb the modifier, so later code doesn't see nor use + * it */ + nextchar(pRExC_state); } do_curly: @@ -9750,6 +10587,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ARG1_SET(ret, (U16)min); ARG2_SET(ret, (U16)max); } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; goto nest_check; } @@ -9787,6 +10626,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, STAR, ret, depth+1); ret->flags = 0; RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '*') { min = 0; @@ -9796,6 +10636,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, PLUS, ret, depth+1); ret->flags = 0; RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '+') { min = 1; @@ -9808,10 +10649,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) nest_check: if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ - ckWARN3reg(RExC_parse, - "%.*s matches null string many times", - (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), - origparse); + ckWARN2reg(RExC_parse, + "%"UTF8f" matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); (void)ReREFCNT_inc(RExC_rx_sv); } @@ -9841,11 +10684,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } STATIC bool -S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, - const bool strict /* Apply stricter parsing rules? */ +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, + UV *valuep, I32 *flagp, U32 depth, bool in_char_class, + const bool strict /* Apply stricter parsing rules? */ ) { - + /* This is expected to be called by a parser routine that has recognized '\N' and needs to handle the rest. RExC_parse is expected to point at the first char following the N at the time of the call. On successful return, @@ -9900,7 +10744,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I more than one character */ GET_RE_DEBUG_FLAGS_DECL; - + PERL_ARGS_ASSERT_GROK_BSLASH_N; GET_RE_DEBUG_FLAGS; @@ -9908,27 +10752,30 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ /* The [^\n] meaning of \N ignores spaces and comments under the /x - * modifier. The other meaning does not */ + * modifier. The other meaning does not, so use a temporary until we find + * out which we are being called with */ p = (RExC_flags & RXf_PMf_EXTENDED) - ? regwhite( pRExC_state, RExC_parse ) + ? regpatws(pRExC_state, RExC_parse, + TRUE) /* means recognize comments */ : RExC_parse; /* Disambiguate between \N meaning a named character versus \N meaning * [^\n]. The former is assumed when it can't be the latter. */ - if (*p != '{' || regcurly(p, FALSE)) { + if (*p != '{' || regcurly(p)) { RExC_parse = p; if (! node_p) { - /* no bare \N in a charclass */ + /* no bare \N allowed in a charclass */ if (in_char_class) { vFAIL("\\N in a character class must be a named character: \\N{...}"); } return FALSE; } + RExC_parse--; /* Need to back off so nextchar() doesn't skip the + current char */ nextchar(pRExC_state); *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; - RExC_parse--; Set_Node_Length(*node_p, 1); /* MJD */ return TRUE; } @@ -9947,8 +10794,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ || ! (endbrace == RExC_parse /* nothing between the {} */ - || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */ - && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below + */ + && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) + */ { if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ vFAIL("\\N{NAME} must be resolved by the lexer"); @@ -10086,7 +10935,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I } FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", (UV) flags); - } + } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); RExC_parse = endbrace; @@ -10134,7 +10983,7 @@ S_reg_recode(pTHX_ const char value, SV **encp) } PERL_STATIC_INLINE U8 -S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) +S_compute_EXACTish(RExC_state_t *pRExC_state) { U8 op; @@ -10154,7 +11003,9 @@ S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) } PERL_STATIC_INLINE void -S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point) +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, + regnode *node, I32* flagp, STRLEN len, UV code_point, + bool downgradable) { /* This knows the details about sizing an EXACTish node, setting flags for * it (by setting <*flagp>, and potentially populating it with a single @@ -10169,48 +11020,111 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 * If is zero, the function assumes that the node is to contain only * the single character given by and calculates what * should be. In pass 1, it sizes the node appropriately. In pass 2, it - * additionally will populate the node's STRING with , if - * is 0. In both cases <*flagp> is appropriately set + * additionally will populate the node's STRING with or its + * fold if folding. + * + * In both cases <*flagp> is appropriately set * * It knows that under FOLD, the Latin Sharp S and UTF characters above * 255, must be folded (the former only when the rules indicate it can - * match 'ss') */ + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ bool len_passed_in = cBOOL(len != 0); U8 character[UTF8_MAXBYTES_CASE+1]; PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + /* Don't bother to check for downgrading in PASS1, as it doesn't make any + * sizing difference, and is extra work that is thrown away */ + if (downgradable && ! PASS2) { + downgradable = FALSE; + } + if (! len_passed_in) { if (UTF) { - if (FOLD && (! LOC || code_point > 255)) { - _to_uni_fold_flags(NATIVE_TO_UNI(code_point), + if (UNI_IS_INVARIANT(code_point)) { + if (LOC || ! FOLD) { /* /l defers folding until runtime */ + *character = (U8) code_point; + } + else { /* Here is /i and not /l (toFOLD() is defined on just + ASCII, which isn't the same thing as INVARIANT on + EBCDIC, but it works there, as the extra invariants + fold to themselves) */ + *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } + } + len = 1; + } + else if (FOLD && (! LOC + || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) + { /* Folding, and ok to do so now */ + UV folded = _to_uni_fold_flags( + code_point, character, &len, - FOLD_FLAGS_FULL | ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } + } + else if (code_point <= MAX_UTF8_TWO_BYTE) { + + /* Not folding this cp, and can output it directly */ + *character = UTF8_TWO_BYTE_HI(code_point); + *(character + 1) = UTF8_TWO_BYTE_LO(code_point); + len = 2; } else { uvchr_to_utf8( character, code_point); len = UTF8SKIP(character); } - } - else if (! FOLD - || code_point != LATIN_SMALL_LETTER_SHARP_S - || ASCII_FOLD_RESTRICTED - || ! AT_LEAST_UNI_SEMANTICS) - { + } /* Else pattern isn't UTF8. */ + else if (! FOLD) { *character = (U8) code_point; len = 1; - } - else { + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { *character = 's'; *(character + 1) = 's'; len = 2; } + else { + *character = LATIN_SMALL_LETTER_SHARP_S; + len = 1; + } } if (SIZE_ONLY) { @@ -10234,15 +11148,36 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 { *flagp |= SIMPLE; } + + /* The OP may not be well defined in PASS1 */ + if (PASS2 && OP(node) == EXACTFL) { + RExC_contains_locale = 1; + } } -/* - - regatom - the lowest level - Try to identify anything special at the start of the pattern. If there - is, then handle it as required. This may involve generating a single regop, - such as for an assertion; or it may involve recursing, such as to - handle a () structure. +/* return atoi(p), unless it's too big to sensibly be a backref, + * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ + +static I32 +S_backref_value(char *p) +{ + char *q = p; + + for (;isDIGIT(*q); q++) {} /* calculate length of num */ + if (q - p == 0 || q - p > 9) + return I32_MAX; + return atoi(p); +} + + +/* + - regatom - the lowest level + + Try to identify anything special at the start of the pattern. If there + is, then handle it as required. This may involve generating a single regop, + such as for an assertion; or it may involve recursing, such as to + handle a () structure. If the string doesn't start with something special then we gobble up as much literal text as we can. @@ -10296,7 +11231,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 by the other. Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with - TRYAGAIN. + TRYAGAIN. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be restarted. Otherwise does not return NULL. @@ -10311,6 +11246,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start = RExC_parse; U8 op; int invert = 0; + U8 arg; GET_RE_DEBUG_FLAGS_DECL; @@ -10393,7 +11329,8 @@ tryagain: *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", + (UV) flags); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; @@ -10406,12 +11343,6 @@ tryagain: vFAIL("Internal urp"); /* Supposed to be caught earlier. */ break; - case '{': - if (!regcurly(RExC_parse, FALSE)) { - RExC_parse++; - goto defchar; - } - /* FALL THROUGH */ case '?': case '+': case '*': @@ -10432,7 +11363,6 @@ tryagain: literal text handling code. */ switch ((U8)*++RExC_parse) { - U8 arg; /* Special Escapes */ case 'A': RExC_seen_zerolen++; @@ -10441,7 +11371,7 @@ tryagain: goto finish_meta_pat; case 'G': ret = reg_node(pRExC_state, GPOS); - RExC_seen |= REG_SEEN_GPOS; + RExC_seen |= REG_GPOS_SEEN; *flagp |= SIMPLE; goto finish_meta_pat; case 'K': @@ -10452,7 +11382,7 @@ tryagain: * be necessary here to avoid cases of memory corruption, as * with: C<$_="x" x 80; s/x\K/y/> -- rgs */ - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -10466,7 +11396,7 @@ tryagain: goto finish_meta_pat; case 'C': ret = reg_node(pRExC_state, CANY); - RExC_seen |= REG_SEEN_CANY; + RExC_seen |= REG_CANY_SEEN; *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'X': @@ -10483,30 +11413,38 @@ tryagain: case 'b': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = BOUND + get_regex_charset(RExC_flags); if (op > BOUNDA) { /* /aa is same as /a */ op = BOUNDA; } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); } goto finish_meta_pat; case 'B': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = NBOUND + get_regex_charset(RExC_flags); if (op > NBOUNDA) { /* /aa is same as /a */ op = NBOUNDA; } + else if (op == NBOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); } goto finish_meta_pat; @@ -10550,6 +11488,9 @@ tryagain: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } + else if (op == POSIXL) { + RExC_contains_locale = 1; + } join_posix_op_known: @@ -10563,12 +11504,12 @@ tryagain: } *flagp |= HASWIDTH|SIMPLE; - /* FALL THROUGH */ + /* FALLTHROUGH */ - finish_meta_pat: + finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ - break; + break; case 'p': case 'P': { @@ -10598,7 +11539,7 @@ tryagain: nextchar(pRExC_state); } break; - case 'N': + case 'N': /* Handle \N and \N{NAME} with multiple code points here and not * below because it can be multicharacter. join_exact() will join * them up later on. Also this makes sure that things like @@ -10620,10 +11561,11 @@ tryagain: break; case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: - { - char ch= RExC_parse[1]; + { + char ch= RExC_parse[1]; if (ch != '<' && ch != '\'' && ch != '{') { RExC_parse++; + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.2s... not terminated",parse_start); } else { /* this pretty much dupes the code for (?P=...) in reg(), if @@ -10634,10 +11576,11 @@ tryagain: SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; if (RExC_parse == name_start || *RExC_parse != ch) + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.3s... not terminated",parse_start); if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -10664,15 +11607,16 @@ tryagain: } break; } - case 'g': + case 'g': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { I32 num; - bool isg = *RExC_parse == 'g'; - bool isrel = 0; bool hasbrace = 0; - if (isg) { + + if (*RExC_parse == 'g') { + bool isrel = 0; + RExC_parse++; if (*RExC_parse == '{') { RExC_parse++; @@ -10684,37 +11628,52 @@ tryagain: } if (hasbrace && !isDIGIT(*RExC_parse)) { if (isrel) RExC_parse--; - RExC_parse -= 2; + RExC_parse -= 2; goto parse_named_seq; - } } - num = atoi(RExC_parse); - if (isg && num == 0) { - if (*RExC_parse == '0') { + } + + num = S_backref_value(RExC_parse); + if (num == 0) vFAIL("Reference to invalid group 0"); + else if (num == I32_MAX) { + if (isDIGIT(*RExC_parse)) + vFAIL("Reference to nonexistent group"); + else + vFAIL("Unterminated \\g... pattern"); } - else { - vFAIL("Unterminated \\g... pattern"); + + if (isrel) { + num = RExC_npar - num; + if (num < 1) + vFAIL("Reference to nonexistent or unclosed group"); } } - if (isrel) { - num = RExC_npar - num; - if (num < 1) - vFAIL("Reference to nonexistent or unclosed group"); + else { + num = S_backref_value(RExC_parse); + /* bare \NNN might be backref or octal - if it is larger than or equal + * RExC_npar then it is assumed to be and octal escape. + * Note RExC_npar is +1 from the actual number of parens*/ + if (num == I32_MAX || (num > 9 && num >= RExC_npar + && *RExC_parse != '8' && *RExC_parse != '9')) + { + /* Probably a character specified in octal, e.g. \35 */ + goto defchar; + } } - if (!isg && num > 9 && num >= RExC_npar) - /* Probably a character specified in octal, e.g. \35 */ - goto defchar; - else { + + /* at this point RExC_parse definitely points to a backref + * number */ + { #ifdef RE_TRACK_PATTERN_OFFSETS char * const parse_start = RExC_parse - 1; /* MJD */ #endif while (isDIGIT(*RExC_parse)) RExC_parse++; if (hasbrace) { - if (*RExC_parse != '}') + if (*RExC_parse != '}') vFAIL("Unterminated \\g{...} pattern"); RExC_parse++; - } + } if (!SIZE_ONLY) { if (num > (I32)RExC_rx->nparens) vFAIL("Reference to nonexistent group"); @@ -10744,7 +11703,7 @@ tryagain: case '\0': if (RExC_parse >= RExC_end) FAIL("Trailing \\"); - /* FALL THROUGH */ + /* FALLTHROUGH */ default: /* Do not generate "unrecognized" warnings here, we fall back into the quick-grab loop below */ @@ -10755,10 +11714,11 @@ tryagain: case '#': if (RExC_flags & RXf_PMf_EXTENDED) { - if ( reg_skipcomment( pRExC_state ) ) + RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); + if (RExC_parse < RExC_end) goto tryagain; } - /* FALL THROUGH */ + /* FALLTHROUGH */ default: @@ -10768,25 +11728,35 @@ tryagain: defchar: { STRLEN len = 0; - UV ender; + UV ender = 0; char *p; char *s; #define MAX_NODE_STRING_SIZE 127 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; char *s0; U8 upper_parse = MAX_NODE_STRING_SIZE; - STRLEN foldlen; - U8 node_type; + U8 node_type = compute_EXACTish(pRExC_state); bool next_is_quantifier; char * oldp = NULL; + /* We can convert EXACTF nodes to EXACTFU if they contain only + * characters that match identically regardless of the target + * string's UTF8ness. The reason to do this is that EXACTF is not + * trie-able, EXACTFU is. + * + * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * contain only above-Latin1 characters (hence must be in UTF8), + * which don't participate in folds with Latin1-range characters, + * as the latter's folds aren't known until runtime. (We don't + * need to figure this out until pass 2) */ + bool maybe_exactfu = PASS2 + && (node_type == EXACTF || node_type == EXACTFL); + /* If a folding node contains only code points that don't * participate in folds, it can be changed into an EXACT node, * which allows the optimizer more things to look for */ bool maybe_exact; - ender = 0; - node_type = compute_EXACTish(pRExC_state); ret = reg_node(pRExC_state, node_type); /* In pass1, folded, we use a temporary buffer instead of the @@ -10797,10 +11767,9 @@ tryagain: reparse: - /* We do the EXACTFish to EXACT node only if folding, and not if in - * locale, as whether a character folds or not isn't known until - * runtime */ - maybe_exact = FOLD && ! LOC; + /* We do the EXACTFish to EXACT node only if folding. (And we + * don't need to figure this out until pass 2) */ + maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -10827,7 +11796,8 @@ tryagain: oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite( pRExC_state, p ); + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ switch ((U8)*p) { case '^': case '$': @@ -10865,7 +11835,8 @@ tryagain: case 's': case 'S': /* space class */ case 'v': case 'V': /* VERTWS */ case 'w': case 'W': /* word class */ - case 'X': /* eXtended Unicode "combining character sequence" */ + case 'X': /* eXtended Unicode "combining + character sequence" */ case 'z': case 'Z': /* End of line/string assertion */ --p; goto loopdone; @@ -10913,7 +11884,7 @@ tryagain: p++; break; case 'a': - ender = ASCII_TO_NATIVE('\007'); + ender = '\a'; p++; break; case 'o': @@ -10976,12 +11947,37 @@ tryagain: } case 'c': p++; - ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); + ender = grok_bslash_c(*p++, SIZE_ONLY); break; - case '0': case '1': case '2': case '3':case '4': + case '8': case '9': /* must be a backreference */ + --p; + goto loopdone; + case '1': case '2': case '3':case '4': case '5': case '6': case '7': - if (*p == '0' || - (isDIGIT(p[1]) && atoi(p) >= RExC_npar)) + /* When we parse backslash escapes there is ambiguity + * between backreferences and octal escapes. Any escape + * from \1 - \9 is a backreference, any multi-digit + * escape which does not start with 0 and which when + * evaluated as decimal could refer to an already + * parsed capture buffer is a backslash. Anything else + * is octal. + * + * Note this implies that \118 could be interpreted as + * 118 OR as "\11" . "8" depending on whether there + * were 118 capture buffers defined already in the + * pattern. */ + + /* NOTE, RExC_npar is 1 more than the actual number of + * parens we have seen so far, hence the < RExC_npar below. */ + + if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) + { /* Not to be treated as an octal constant, go + find backref */ + --p; + goto loopdone; + } + /* FALLTHROUGH */ + case '0': { I32 flags = PERL_SCAN_SILENT_ILLDIGIT; STRLEN numlen = 3; @@ -11000,11 +11996,6 @@ tryagain: form_short_octal_warning(p, numlen)); } } - else { /* Not to be treated as an octal constant, go - find backref */ - --p; - goto loopdone; - } if (PL_encoding && ender < 0x100) goto recode_encoding; break; @@ -11020,7 +12011,7 @@ tryagain: case '\0': if (p >= RExC_end) FAIL("Trailing \\"); - /* FALL THROUGH */ + /* FALLTHROUGH */ default: if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { /* Include any { following the alpha to emphasize @@ -11032,17 +12023,18 @@ tryagain: goto normal_default; } /* End of switch on '\' */ break; + case '{': + /* Currently we don't warn when the lbrace is at the start + * of a construct. This catches it in the middle of a + * literal string, or when its the first thing after + * something like "\b" */ + if (! SIZE_ONLY + && (len || (p > RExC_start && isALPHA_A(*(p -1))))) + { + ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through"); + } + /*FALLTHROUGH*/ default: /* A literal character */ - - if (! SIZE_ONLY - && RExC_flags & RXf_PMf_EXTENDED - && ckWARN_d(WARN_DEPRECATED) - && is_PATWS_non_low(p, UTF)) - { - vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), - "Escape literal pattern white space under /x"); - } - normal_default: if (UTF8_IS_START(*p) && UTF) { STRLEN numlen; @@ -11060,7 +12052,8 @@ tryagain: */ if ( RExC_flags & RXf_PMf_EXTENDED) - p = regwhite( pRExC_state, p ); + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ /* If the next thing is a quantifier, it applies to this * character only, which means that this character has to be in @@ -11075,7 +12068,10 @@ tryagain: goto loopdone; } - if (! FOLD) { + if (! FOLD /* The simple case, just append the literal */ + || (LOC /* Also don't fold for tricky chars under /l */ + && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) + { if (UTF) { const STRLEN unilen = reguni(pRExC_state, ender, s); if (unilen > 0) { @@ -11093,81 +12089,100 @@ tryagain: else { REGC((char)ender, s++); } + + /* Can get here if folding only if is one of the /l + * characters whose fold depends on the locale. The + * occurrence of any of these indicate that we can't + * simplify things */ + if (FOLD) { + maybe_exact = FALSE; + maybe_exactfu = FALSE; + } } - else /* FOLD */ + else /* FOLD */ if (! ( UTF /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S))) { - *(s++) = (char) ender; - maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender); - } - else { /* UTF */ - - /* Prime the casefolded buffer. Locale rules, which apply - * only to code points < 256, aren't known until execution, - * so for them, just output the original character using - * utf8. If we start to fold non-UTF patterns, be sure to - * update join_exact() */ - if (LOC && ender < 256) { - if (UNI_IS_INVARIANT(ender)) { - *s = (U8) ender; - foldlen = 1; - } else { - *s = UTF8_TWO_BYTE_HI(ender); - *(s + 1) = UTF8_TWO_BYTE_LO(ender); - foldlen = 2; + /* Here, are folding and are not UTF-8 encoded; therefore + * the character must be in the range 0-255, and is not /l + * (Not /l because we already handled these under /l in + * is_PROBLEMATIC_LOCALE_FOLD_cp */ + if (IS_IN_SOME_FOLD_L1(ender)) { + maybe_exact = FALSE; + + /* See if the character's fold differs between /d and + * /u. This includes the multi-char fold SHARP S to + * 'ss' */ + if (maybe_exactfu + && (PL_fold[ender] != PL_fold_latin1[ender] + || ender == LATIN_SMALL_LETTER_SHARP_S + || (len > 0 + && isARG2_lower_or_UPPER_ARG1('s', ender) + && isARG2_lower_or_UPPER_ARG1('s', + *(s-1))))) + { + maybe_exactfu = FALSE; } } + + /* Even when folding, we store just the input character, as + * we have an array that finds its fold quickly */ + *(s++) = (char) ender; + } + else { /* FOLD and UTF */ + /* Unlike the non-fold case, we do actually have to + * calculate the results here in pass 1. This is for two + * reasons, the folded length may be longer than the + * unfolded, and we have to calculate how many EXACTish + * nodes it will take; and we may run out of room in a node + * in the middle of a potential multi-char fold, and have + * to back off accordingly. (Hence we can't use REGC for + * the simple case just below.) */ + + UV folded; + if (isASCII(ender)) { + folded = toFOLD(ender); + *(s)++ = (U8) folded; + } else { - UV folded = _to_uni_fold_flags( - ender, - (U8 *) s, - &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) - ); - - /* If this node only contains non-folding code points - * so far, see if this new one is also non-folding */ - if (maybe_exact) { - if (folded != ender) { + STRLEN foldlen; + + folded = _to_uni_fold_flags( + ender, + (U8 *) s, + &foldlen, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + s += foldlen; + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len += foldlen - 1; + } + /* If this node only contains non-folding code points so + * far, see if this new one is also non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have to check + * further to see if anything folds to it */ + if (_invlist_contains_cp(PL_utf8_foldable, + ender)) + { maybe_exact = FALSE; } - else { - /* Here the fold is the original; we have - * to check further to see if anything - * folds to it */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", - "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = - _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } - if (_invlist_contains_cp(PL_utf8_foldable, - ender)) - { - maybe_exact = FALSE; - } - } } - ender = folded; } - s += foldlen; - - /* The loop increments each time, as all but this - * path (and one other) through it add a single byte to the - * EXACTish node. But this one has changed len to be the - * correct final value, so subtract one to cancel out the - * increment that follows */ - len += foldlen - 1; + ender = folded; } if (next_is_quantifier) { @@ -11216,9 +12231,8 @@ tryagain: if (! UTF) { - /* These two have no multi-char folds to non-UTF characters - */ - if (ASCII_FOLD_RESTRICTED || LOC) { + /* This has no multi-char folds to non-UTF characters */ + if (ASCII_FOLD_RESTRICTED) { goto loopdone; } @@ -11249,12 +12263,8 @@ tryagain: } } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - - /* No Latin1 characters participate in multi-char - * folds under /l */ - if (LOC - || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI( - *s, *(s+1)))) + if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + *s, *(s+1)))) { break; } @@ -11291,6 +12301,15 @@ tryagain: * do any better */ if (len == 0) { len = full_len; + + /* If the node ends in an 's' we make sure it stays EXACTF, + * as if it turns into an EXACTFU, it could later get + * joined with another 's' that would then wrongly match + * the sharp s */ + if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) + { + maybe_exactfu = FALSE; + } } else { /* Here, the node does contain some characters that aren't @@ -11349,14 +12368,26 @@ tryagain: if (len == 0) { OP(ret) = NOTHING; } - else{ - - /* If 'maybe_exact' is still set here, means there are no - * code points in the node that participate in folds */ - if (FOLD && maybe_exact) { - OP(ret) = EXACT; + else { + if (FOLD) { + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds; + * similarly for 'maybe_exactfu' and code points that match + * differently depending on UTF8ness of the target string + * (for /u), or depending on locale for /l */ + if (maybe_exact) { + OP(ret) = EXACT; + } + else if (maybe_exactfu) { + OP(ret) = EXACTFU; + } } - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); } RExC_parse = p - 1; @@ -11377,39 +12408,11 @@ tryagain: } STATIC char * -S_regwhite( RExC_state_t *pRExC_state, char *p ) -{ - const char *e = RExC_end; - - PERL_ARGS_ASSERT_REGWHITE; - - while (p < e) { - if (isSPACE(*p)) - ++p; - else if (*p == '#') { - bool ended = 0; - do { - if (*p++ == '\n') { - ended = 1; - break; - } - } while (p < e); - if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; - } - else - break; - } - return p; -} - -STATIC char * -S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) +S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) { /* Returns the next non-pattern-white space, non-comment character (the * latter only if 'recognize_comment is true) in the string p, which is - * ended by RExC_end. If there is no line break ending a comment, - * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */ + * ended by RExC_end. See also reg_skipcomment */ const char *e = RExC_end; PERL_ARGS_ASSERT_REGPATWS; @@ -11420,16 +12423,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) p += len; } else if (recognize_comment && *p == '#') { - bool ended = 0; - do { - p++; - if (is_LNBREAK_safe(p, e, UTF)) { - ended = 1; - break; - } - } while (p < e); - if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + p = reg_skipcomment(pRExC_state, p); } else break; @@ -11437,6 +12431,72 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) return p; } +STATIC void +S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) +{ + /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It + * sets up the bitmap and any flags, removing those code points from the + * inversion list, setting it to NULL should it become completely empty */ + + PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; + assert(PL_regkind[OP(node)] == ANYOF); + + ANYOF_BITMAP_ZERO(node); + if (*invlist_ptr) { + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through *invlist_ptr */ + invlist_iterinit(*invlist_ptr); + while (invlist_iternext(*invlist_ptr, &start, &end)) { + UV high; + int i; + + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; + } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } + + /* Quit if are above what we should change */ + if (start > 255) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < 255) ? end : 255; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(node, i)) { + ANYOF_BITMAP_SET(node, i); + } + } + } + invlist_iterfinish(*invlist_ptr); + + /* Done with loop; remove any code points that are in the bitmap from + * *invlist_ptr; similarly for code points above latin1 if we have a + * flag to match all of them anyways */ + if (change_invlist) { + _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + } + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + } + + /* If have completely emptied it, remove it completely */ + if (_invlist_len(*invlist_ptr) == 0) { + SvREFCNT_dec_NN(*invlist_ptr); + *invlist_ptr = NULL; + } + } +} + /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. Character classes ([:foo:]) can also be negated ([:^foo:]). Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. @@ -11555,8 +12615,9 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) } if (namedclass == OOB_NAMEDCLASS) - Simple_vFAIL3("POSIX class [:%.*s:] unknown", - t - s - 1, s + 1); + vFAIL2utf8f( + "POSIX class [:%"UTF8f":] unknown", + UTF8fARG(UTF, t - s - 1, s + 1)); /* The #defines are structured so each complement is +1 to * the normal one */ @@ -11591,7 +12652,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) } STATIC bool -S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) +S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state) { /* This applies some heuristics at the current parse position (which should * be at a '[') to see if what follows might be intended to be a [:posix:] @@ -11644,8 +12705,9 @@ S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) } STATIC regnode * -S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth, - char * const oregcomp_parse) +S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, + I32 *flagp, U32 depth, + char * const oregcomp_parse) { /* Handle the (?[...]) construct to do set operations */ @@ -11681,16 +12743,19 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__REGEX_SETS), "The regex_sets feature is experimental" REPORT_LOCATION, - (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse); + UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), + UTF8fARG(UTF, + RExC_end - RExC_start - (RExC_parse - RExC_precomp), + RExC_precomp + (RExC_parse - RExC_precomp))); while (RExC_parse < RExC_end) { SV* current = NULL; RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + TRUE); /* means recognize comments */ switch (*RExC_parse) { case '?': if (RExC_parse[1] == '[') depth++, RExC_parse++; - /* FALL THROUGH */ + /* FALLTHROUGH */ default: break; case '\\': @@ -11803,7 +12868,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f /* Skip white space */ RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + TRUE /* means recognize comments */ ); if (RExC_parse >= RExC_end) { Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); } @@ -11877,7 +12942,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f RExC_flags = save_flags; goto handle_operand; } - /* FALL THROUGH */ + /* FALLTHROUGH */ default: RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; @@ -11973,7 +13038,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f top_index -= 2; SvREFCNT_dec_NN(lparen); - /* FALL THROUGH */ + /* FALLTHROUGH */ } handle_operand: @@ -12132,10 +13197,79 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f } #undef IS_OPERAND +STATIC void +S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) +{ + /* This hard-codes the Latin1/above-Latin1 folding rules, so that an + * innocent-looking character class, like /[ks]/i won't have to go out to + * disk to find the possible matches. + * + * This should be called only for a Latin1-range code points, cp, which is + * known to be involved in a fold with other code points above Latin1. It + * would give false results if /aa has been specified. Multi-char folds + * are outside the scope of this, and must be handled specially. + * + * XXX It would be better to generate these via regen, in case a new + * version of the Unicode standard adds new mappings, though that is not + * really likely, and may be caught by the default: case of the switch + * below. */ + + PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; + + switch (cp) { + case 'k': + case 'K': + *invlist = + add_cp_to_invlist(*invlist, KELVIN_SIGN); + break; + case 's': + case 'S': + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); + break; + case MICRO_SIGN: + *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); + *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist = add_cp_to_invlist(*invlist, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character folds from code points + * that require UTF8 to express, so they can't match unless the + * target string is in UTF-8, so no action here is necessary, as + * regexec.c properly handles the general case for UTF-8 matching + * and multi-char folds */ + break; + default: + /* Use deprecated warning to increase the chances of this being + * output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); + break; + } +} + /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ -#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len) +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ + (SvCUR(listsv) != initial_listsv_len) STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, @@ -12185,8 +13319,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ SV* properties = NULL; /* Code points that match \p{} \P{} */ - SV* posixes = NULL; /* Code points that match classes like, [:word:], - extended beyond the Latin1 range */ + SV* posixes = NULL; /* Code points that match classes like [:word:], + extended beyond the Latin1 range. These have to + be kept separate from other code points for much + of this function because their handling is + different under /i, and for most classes under + /d as well */ + SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept + separate for a while from the non-complemented + versions because of complications with /d + matching */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ AV * multi_char_matches = NULL; /* Code points that fold to more than one @@ -12213,11 +13355,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * string is in UTF-8. (Because is under /d) */ SV* depends_list = NULL; - /* inversion list of code points this node matches. For much of the - * function, it includes only those that match regardless of the utf8ness - * of the target string */ + /* Inversion list of code points this node matches regardless of things + * like locale, folding, utf8ness of the target string */ SV* cp_list = NULL; + /* Like cp_list, but code points on this list need to be checked for things + * that fold to/from them under /i */ + SV* cp_foldable_list = NULL; + + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + #ifdef EBCDIC /* In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ @@ -12225,14 +13374,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, #endif bool invert = FALSE; /* Is this class to be complemented */ - /* Is there any thing like \W or [:^digit:] that matches above the legal - * Unicode range? */ - bool runtime_posix_matches_above_Unicode = FALSE; + bool warn_super = ALWAYS_WARN_SUPER; regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; - const I32 orig_size = RExC_size; + const SSize_t orig_size = RExC_size; + bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -12253,9 +13401,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ANYOF_FLAGS(ret) = 0; RExC_emit += ANYOF_SKIP; - if (LOC) { - ANYOF_FLAGS(ret) |= ANYOF_LOCALE; - } listsv = newSVpvs_flags("# comment\n", SVs_TEMP); initial_listsv_len = SvCUR(listsv); SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ @@ -12263,7 +13408,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ @@ -12273,7 +13418,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_naughty++; if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } } @@ -12311,7 +13456,7 @@ parseit: if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (UCHARAT(RExC_parse) == ']') { @@ -12377,7 +13522,7 @@ parseit: case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { - /* We only pay attention to the first char of + /* We only pay attention to the first char of multichar strings being returned. I kinda wonder if this makes sense as it does change the behaviour from earlier versions, OTOH that behaviour was broken @@ -12398,7 +13543,12 @@ parseit: char *e; /* We will handle any undefined properties ourselves */ - U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF; + U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF + /* And we actually would prefer to get + * the straight inversion list of the + * swash, since we will be accessing it + * anyway, to save a little time */ + |_CORE_SWASH_INIT_ACCEPT_INVLIST; if (RExC_parse >= RExC_end) vFAIL2("Empty \\%c{}", (U8)value); @@ -12407,12 +13557,12 @@ parseit: e = strchr(RExC_parse++, '}'); if (!e) vFAIL2("Missing right brace on \\%c{}", c); - while (isSPACE(UCHARAT(RExC_parse))) + while (isSPACE(*RExC_parse)) RExC_parse++; if (e == RExC_parse) vFAIL2("Empty \\%c{}", c); n = e - RExC_parse; - while (isSPACE(UCHARAT(RExC_parse + n - 1))) + while (isSPACE(*(RExC_parse + n - 1))) n--; } else { @@ -12431,7 +13581,7 @@ parseit: * that bit) */ value ^= 'P' ^ 'p'; - while (isSPACE(UCHARAT(RExC_parse))) { + while (isSPACE(*RExC_parse)) { RExC_parse++; n--; } @@ -12441,14 +13591,13 @@ parseit: * will have its name be <__NAME_i>. The design is * discussed in commit * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ - Newx(name, n + sizeof("_i__\n"), char); - - sprintf(name, "%s%.*s%s\n", - (FOLD) ? "__" : "", - (int)n, - RExC_parse, - (FOLD) ? "_i" : "" - ); + name = savepv(Perl_form(aTHX_ + "%s%.*s%s\n", + (FOLD) ? "__" : "", + (int)n, + RExC_parse, + (FOLD) ? "_i" : "" + )); /* Look up the property name, and get its swash and * inversion list, if the property is found */ @@ -12462,6 +13611,9 @@ parseit: &swash_init_flags ); if (! swash || ! (invlist = _get_swash_invlist(swash))) { + HV* curpkg = (IN_PERL_COMPILETIME) + ? PL_curstash + : CopSTASH(PL_curcop); if (swash) { SvREFCNT_dec_NN(swash); swash = NULL; @@ -12473,11 +13625,29 @@ parseit: * otherwise add it to the list for run-time look up */ if (ret_invlist) { RExC_parse = e + 1; - vFAIL3("Property '%.*s' is unknown", (int) n, name); + vFAIL2utf8f( + "Property '%"UTF8f"' is unknown", + UTF8fARG(UTF, n, name)); + } + + /* If the property name doesn't already have a package + * name, add the current one to it so that it can be + * referred to outside it. [perl #121777] */ + if (curpkg && ! instr(name, "::")) { + char* pkgname = HvNAME(curpkg); + if (strNE(pkgname, "main")) { + char* full_name = Perl_form(aTHX_ + "%s::%s", + pkgname, + name); + n = strlen(full_name); + Safefree(name); + name = savepvn(full_name, n); + } } - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", (value == 'p' ? '+' : '!'), - name); + UTF8fARG(UTF, n, name)); has_user_defined_property = TRUE; /* We don't know yet, so have to assume that the @@ -12486,7 +13656,7 @@ parseit: * would cause things in to match * inappropriately, except that any \p{}, including * this one forces Unicode semantics, which means there - * is */ + * is no */ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; } else { @@ -12494,9 +13664,23 @@ parseit: /* Here, did get the swash and its inversion list. If * the swash is from a user-defined property, then this * whole character class should be regarded as such */ - has_user_defined_property = - (swash_init_flags - & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY); + if (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) + { + has_user_defined_property = TRUE; + } + else if + /* We warn on matching an above-Unicode code point + * if the match would return true, except don't + * warn for \p{All}, which has exactly one element + * = 0 */ + (_invlist_contains_cp(invlist, 0x110000) + && (! (_invlist_len(invlist) == 1 + && *invlist_array(invlist) == 0))) + { + warn_super = TRUE; + } + /* Invert if asking for the complement */ if (value == 'P') { @@ -12530,7 +13714,7 @@ parseit: case 'f': value = '\f'; break; case 'b': value = '\b'; break; case 'e': value = ASCII_TO_NATIVE('\033');break; - case 'a': value = ASCII_TO_NATIVE('\007');break; + case 'a': value = '\a'; break; case 'o': RExC_parse--; /* function expects to be pointed at the 'o' */ { @@ -12570,7 +13754,7 @@ parseit: goto recode_encoding; break; case 'c': - value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY); + value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': @@ -12642,31 +13826,8 @@ parseit: /* Here, we have the current token in 'value' */ - /* What matches in a locale is not known until runtime. This includes - * what the Posix classes (like \w, [:space:]) match. Room must be - * reserved (one time per class) to store such classes, either if Perl - * is compiled so that locale nodes always should have this space, or - * if there is such class info to be stored. The space will contain a - * bit for each named class that is to be matched against. This isn't - * needed for \p{} and pseudo-classes, as they are not affected by - * locale, and hence are dealt with separately */ - if (LOC - && ! need_class - && (ANYOF_LOCALE == ANYOF_CLASS - || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX))) - { - need_class = 1; - if (SIZE_ONLY) { - RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP; - } - else { - RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP; - ANYOF_CLASS_ZERO(ret); - } - ANYOF_FLAGS(ret) |= ANYOF_CLASS; - } - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + U8 classnum; /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a * literal, as is the character that began the false range, i.e. @@ -12677,16 +13838,19 @@ parseit: ? RExC_parse - rangebegin : 0; if (strict) { - vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin); + vFAIL2utf8f( + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); } else { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ - ckWARN4reg(RExC_parse, - "False [] range \"%*.*s\"", - w, w, rangebegin); + ckWARN2reg(RExC_parse, + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); (void)ReREFCNT_inc(RExC_rx_sv); cp_list = add_cp_to_invlist(cp_list, '-'); - cp_list = add_cp_to_invlist(cp_list, prevvalue); + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, + prevvalue); } } @@ -12694,16 +13858,94 @@ parseit: element_count += 2; /* So counts for three values */ } - if (! SIZE_ONLY) { - U8 classnum = namedclass_to_classnum(namedclass); - if (namedclass >= ANYOF_MAX) { /* If a special class */ + classnum = namedclass_to_classnum(namedclass); + + if (LOC && namedclass < ANYOF_POSIXL_MAX +#ifndef HAS_ISASCII + && classnum != _CC_ASCII +#endif + ) { + /* What the Posix classes (like \w, [:space:]) match in locale + * isn't knowable under locale until actual match time. Room + * must be reserved (one time per outer bracketed class) to + * store such classes. The space will contain a bit for each + * named class that is to be matched against. This isn't + * needed for \p{} and pseudo-classes, as they are not affected + * by locale, and hence are dealt with separately */ + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_POSIXL_ZERO(ret); + } + + /* Coverity thinks it is possible for this to be negative; both + * jhi and khw think it's not, but be safer */ + assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL) + || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0); + + /* See if it already matches the complement of this POSIX + * class */ + if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) + && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) + ? -1 + : 1))) + { + posixl_matches_all = TRUE; + break; /* No need to continue. Since it matches both + e.g., \w and \W, it matches everything, and the + bracketed class can be optimized into qr/./s */ + } + + /* Add this class to those that should be checked at runtime */ + ANYOF_POSIXL_SET(ret, namedclass); + + /* The above-Latin1 characters are not subject to locale rules. + * Just add them, in the second pass, to the + * unconditionally-matched list */ + if (! SIZE_ONLY) { + SV* scratch_list = NULL; + + /* Get the list of the above-Latin1 code points this + * matches */ + _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, + PL_XPosix_ptrs[classnum], + + /* Odd numbers are complements, like + * NDIGIT, NASCII, ... */ + namedclass % 2 != 0, + &scratch_list); + /* Checking if 'cp_list' is NULL first saves an extra + * clone. Its reference count will be decremented at the + * next union, etc, or if this is the only instance, at the + * end of the routine */ + if (! cp_list) { + cp_list = scratch_list; + } + else { + _invlist_union(cp_list, scratch_list, &cp_list); + SvREFCNT_dec_NN(scratch_list); + } + continue; /* Go get next character */ + } + } + else if (! SIZE_ONLY) { + + /* Here, not in pass1 (in that pass we skip calculating the + * contents of this class), and is /l, or is a POSIX class for + * which /l doesn't matter (or is a Unicode property, which is + * skipped here). */ + if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ - /* Here, should be \h, \H, \v, or \V. Neither /d nor - * /l make a difference in what these match. There - * would be problems if these characters had folds - * other than themselves, as cp_list is subject to - * folding. */ + /* Here, should be \h, \H, \v, or \V. None of /d, /i + * nor /l make a difference in what these match, + * therefore we just add what they match to cp_list. */ if (classnum != _CC_VERTSPACE) { assert( namedclass == ANYOF_HORIZWS || namedclass == ANYOF_NHORIZWS); @@ -12716,246 +13958,25 @@ parseit: _invlist_union_maybe_complement_2nd( cp_list, PL_XPosix_ptrs[classnum], - cBOOL(namedclass % 2), /* Complement if odd + namedclass % 2 != 0, /* Complement if odd (NHORIZWS, NVERTWS) */ &cp_list); } } - else if (classnum == _CC_ASCII) { -#ifdef HAS_ISASCII - if (LOC) { - ANYOF_CLASS_SET(ret, namedclass); - } - else -#endif /* Not isascii(); just use the hard-coded definition for it */ - _invlist_union_maybe_complement_2nd( - posixes, - PL_ASCII, - cBOOL(namedclass % 2), /* Complement if odd - (NASCII) */ - &posixes); - } - else { /* Garden variety class */ - - /* The ascii range inversion list */ - SV* ascii_source = PL_Posix_ptrs[classnum]; - - /* The full Latin1 range inversion list */ - SV* l1_source = PL_L1Posix_ptrs[classnum]; - - /* This code is structured into two major clauses. The - * first is for classes whose complete definitions may not - * already be known. It not, the Latin1 definition - * (guaranteed to already known) is used plus code is - * generated to load the rest at run-time (only if needed). - * If the complete definition is known, it drops down to - * the second clause, where the complete definition is - * known */ - - if (classnum < _FIRST_NON_SWASH_CC) { - - /* Here, the class has a swash, which may or not - * already be loaded */ - - /* The name of the property to use to match the full - * eXtended Unicode range swash for this character - * class */ - const char *Xname = swash_property_names[classnum]; - - /* If returning the inversion list, we can't defer - * getting this until runtime */ - if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) { - PL_utf8_swash_ptrs[classnum] = - _core_swash_init("utf8", Xname, &PL_sv_undef, - 1, /* binary */ - 0, /* not tr/// */ - NULL, /* No inversion list */ - NULL /* No flags */ - ); - assert(PL_utf8_swash_ptrs[classnum]); - } - if ( ! PL_utf8_swash_ptrs[classnum]) { - if (namedclass % 2 == 0) { /* A non-complemented - class */ - /* If not /a matching, there are code points we - * don't know at compile time. Arrange for the - * unknown matches to be loaded at run-time, if - * needed */ - if (! AT_LEAST_ASCII_RESTRICTED) { - Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n", - Xname); - } - if (LOC) { /* Under locale, set run-time - lookup */ - ANYOF_CLASS_SET(ret, namedclass); - } - else { - /* Add the current class's code points to - * the running total */ - _invlist_union(posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : l1_source, - &posixes); - } - } - else { /* A complemented class */ - if (AT_LEAST_ASCII_RESTRICTED) { - /* Under /a should match everything above - * ASCII, plus the complement of the set's - * ASCII matches */ - _invlist_union_complement_2nd(posixes, - ascii_source, - &posixes); - } - else { - /* Arrange for the unknown matches to be - * loaded at run-time, if needed */ - Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n", - Xname); - runtime_posix_matches_above_Unicode = TRUE; - if (LOC) { - ANYOF_CLASS_SET(ret, namedclass); - } - else { - - /* We want to match everything in - * Latin1, except those things that - * l1_source matches */ - SV* scratch_list = NULL; - _invlist_subtract(PL_Latin1, l1_source, - &scratch_list); - - /* Add the list from this class to the - * running total */ - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, - scratch_list, - &posixes); - SvREFCNT_dec_NN(scratch_list); - } - if (DEPENDS_SEMANTICS) { - ANYOF_FLAGS(ret) - |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - } - } - goto namedclass_done; - } - - /* Here, there is a swash loaded for the class. If no - * inversion list for it yet, get it */ - if (! PL_XPosix_ptrs[classnum]) { - PL_XPosix_ptrs[classnum] - = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]); - } - } - - /* Here there is an inversion list already loaded for the - * entire class */ - - if (namedclass % 2 == 0) { /* A non-complemented class, - like ANYOF_PUNCT */ - if (! LOC) { - /* For non-locale, just add it to any existing list - * */ - _invlist_union(posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : PL_XPosix_ptrs[classnum], - &posixes); - } - else { /* Locale */ - SV* scratch_list = NULL; - - /* For above Latin1 code points, we use the full - * Unicode range */ - _invlist_intersection(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - &scratch_list); - /* And set the output to it, adding instead if - * there already is an output. Checking if - * 'posixes' is NULL first saves an extra clone. - * Its reference count will be decremented at the - * next union, etc, or if this is the only - * instance, at the end of the routine */ - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } - -#ifndef HAS_ISBLANK - if (namedclass != ANYOF_BLANK) { -#endif - /* Set this class in the node for runtime - * matching */ - ANYOF_CLASS_SET(ret, namedclass); -#ifndef HAS_ISBLANK - } - else { - /* No isblank(), use the hard-coded ASCII-range - * blanks, adding them to the running total. */ - - _invlist_union(posixes, ascii_source, &posixes); - } -#endif - } - } - else { /* A complemented class, like ANYOF_NPUNCT */ - if (! LOC) { - _invlist_union_complement_2nd( - posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : PL_XPosix_ptrs[classnum], - &posixes); - /* Under /d, everything in the upper half of the - * Latin1 range matches this complement */ - if (DEPENDS_SEMANTICS) { - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - else { /* Locale */ - SV* scratch_list = NULL; - _invlist_subtract(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - &scratch_list); - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } -#ifndef HAS_ISBLANK - if (namedclass != ANYOF_NBLANK) { -#endif - ANYOF_CLASS_SET(ret, namedclass); -#ifndef HAS_ISBLANK - } - else { - /* Get the list of all code points in Latin1 - * that are not ASCII blanks, and add them to - * the running total */ - _invlist_subtract(PL_Latin1, ascii_source, - &scratch_list); - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } -#endif - } - } + else { /* Garden variety class. If is NASCII, NDIGIT, ... + complement and use nposixes */ + SV** posixes_ptr = namedclass % 2 == 0 + ? &posixes + : &nposixes; + SV** source_ptr = &PL_XPosix_ptrs[classnum]; + _invlist_union_maybe_complement_2nd( + *posixes_ptr, + *source_ptr, + namedclass % 2 != 0, + posixes_ptr); } - namedclass_done: - continue; /* Go get next character */ + continue; /* Go get next character */ } } /* end of namedclass \blah */ @@ -12968,13 +13989,15 @@ parseit: if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (range) { if (prevvalue > value) /* b-a */ { const int w = RExC_parse - rangebegin; - Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); + vFAIL2utf8f( + "Invalid [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); range = 0; /* not a valid range */ } } @@ -13055,11 +14078,9 @@ parseit: value, foldbuf, &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) + FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED + ? FOLD_FLAGS_NOMIX_ASCII + : 0) ); /* Here, should be the first character of the @@ -13078,7 +14099,7 @@ parseit: AV* this_array; STRLEN cp_count = utf8_length(foldbuf, foldbuf + foldlen); - SV* multi_fold = sv_2mortal(newSVpvn("", 0)); + SV* multi_fold = sv_2mortal(newSVpvs("")); Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); @@ -13127,7 +14148,8 @@ parseit: /* Deal with this element of the class */ if (! SIZE_ONLY) { #ifndef EBCDIC - cp_list = _add_range_to_invlist(cp_list, prevvalue, value); + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); #else SV* this_range = _new_invlist(1); _append_range_to_invlist(this_range, prevvalue, value); @@ -13141,13 +14163,18 @@ parseit: * included. literal_endpoint==2 means both ends of the range used * a literal character, not \x{foo} */ if (literal_endpoint == 2 - && (prevvalue >= 'a' && value <= 'z') - || (prevvalue >= 'A' && value <= 'Z')) + && ((prevvalue >= 'a' && value <= 'z') + || (prevvalue >= 'A' && value <= 'Z'))) { - _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA], + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII], &this_range); - } - _invlist_union(cp_list, this_range, &cp_list); + + /* Since this above only contains ascii, the intersection of it + * with anything will still yield only ascii */ + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], + &this_range); + } + _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); literal_endpoint = 0; #endif } @@ -13177,7 +14204,7 @@ parseit: #endif /* Look at the longest folds first */ - for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) { + for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { if (av_exists(multi_char_matches, cp_count)) { AV** this_array_ptr; @@ -13232,15 +14259,29 @@ parseit: return ret; } - /* If the character class contains only a single element, it may be - * optimizable into another node type which is smaller and runs faster. - * Check if this is the case for this class */ - if (element_count == 1 && ! ret_invlist) { + /* Here, we've gone through the entire class and dealt with multi-char + * folds. We are now in a position that we can do some checks to see if we + * can optimize this ANYOF node into a simpler one, even in Pass 1. + * Currently we only do two checks: + * 1) is in the unlikely event that the user has specified both, eg. \w and + * \W under /l, then the class matches everything. (This optimization + * is done only to make the optimizer code run later work.) + * 2) if the character class contains only a single element (including a + * single range), we see if there is an equivalent node for it. + * Other checks are possible */ + if (! ret_invlist /* Can't optimize if returning the constructed + inversion list */ + && (UNLIKELY(posixl_matches_all) || element_count == 1)) + { U8 op = END; U8 arg = 0; - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or - [:digit:] or \p{foo} */ + if (UNLIKELY(posixl_matches_all)) { + op = SANY; + } + else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like + \w or [:digit:] or \p{foo} + */ /* All named classes are mapped into POSIXish nodes, with its FLAG * argument giving which class it is */ @@ -13296,14 +14337,6 @@ parseit: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } -#ifndef HAS_ISBLANK - if (op == POSIXL - && (namedclass == ANYOF_BLANK - || namedclass == ANYOF_NBLANK)) - { - op = POSIXA; - } -#endif join_posix: /* The odd numbered ones are the complements of the @@ -13341,6 +14374,26 @@ parseit: op = POSIXA; } } + else if (prevvalue == 'A') { + if (value == 'Z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; + op = POSIXA; + } + } + else if (prevvalue == 'a') { + if (value == 'z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; + op = POSIXA; + } + } } /* Here, we have changed away from its initial value iff we found @@ -13358,13 +14411,16 @@ parseit: /* To get locale nodes to not use the full ANYOF size would * require moving the code above that writes the portions * of it that aren't in other nodes to after this point. - * e.g. ANYOF_CLASS_SET */ + * e.g. ANYOF_POSIXL_SET */ RExC_size = orig_size; } } else { RExC_emit = (regnode *)orig_emit; if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } if (invert) { op += NPOSIXD - POSIXD; } @@ -13380,13 +14436,17 @@ parseit: *flagp |= HASWIDTH|SIMPLE; } else if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } RExC_parse = (char *) cur_parse; SvREFCNT_dec(posixes); + SvREFCNT_dec(nposixes); SvREFCNT_dec(cp_list); + SvREFCNT_dec(cp_foldable_list); return ret; } } @@ -13397,238 +14457,179 @@ parseit: /* If folding, we calculate all characters that could fold to or from the * ones already on the list */ - if (FOLD && cp_list) { - UV start, end; /* End points of code point ranges */ - - SV* fold_intersection = NULL; - - /* If the highest code point is within Latin1, we can use the - * compiled-in Alphas list, and not have to go out to disk. This - * yields two false positives, the masculine and feminine ordinal - * indicators, which are weeded out below using the - * IS_IN_SOME_FOLD_L1() macro */ - if (invlist_highest(cp_list) < 256) { - _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list, - &fold_intersection); - } - else { - - /* Here, there are non-Latin1 code points, so we will have to go - * fetch the list of all the characters that participate in folds - */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } - - /* This is a hash that for a particular fold gives all characters - * that are involved in it */ - if (! PL_utf8_foldclosures) { - - /* If we were unable to find any folds, then we likely won't be - * able to find the closures. So just create an empty list. - * Folding will effectively be restricted to the non-Unicode - * rules hard-coded into Perl. (This case happens legitimately - * during compilation of Perl itself before the Unicode tables - * are generated) */ - if (_invlist_len(PL_utf8_foldable) == 0) { - PL_utf8_foldclosures = newHV(); - } - else { - /* If the folds haven't been read in, call a fold function - * to force that */ - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - - /* This string is just a short named one above \xff */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); - assert(PL_utf8_tofold); /* Verify that worked */ - } - PL_utf8_foldclosures = - _swash_inversion_hash(PL_utf8_tofold); - } + if (cp_foldable_list) { + if (FOLD) { + UV start, end; /* End points of code point ranges */ + + SV* fold_intersection = NULL; + SV** use_list; + + /* Our calculated list will be for Unicode rules. For locale + * matching, we have to keep a separate list that is consulted at + * runtime only when the locale indicates Unicode rules. For + * non-locale, we just use to the general list */ + if (LOC) { + use_list = &only_utf8_locale_list; + } + else { + use_list = &cp_list; } /* Only the characters in this class that participate in folds need * be checked. Get the intersection of this class and all the * possible characters that are foldable. This can quickly narrow * down a large class */ - _invlist_intersection(PL_utf8_foldable, cp_list, + _invlist_intersection(PL_utf8_foldable, cp_foldable_list, &fold_intersection); - } - /* Now look at the foldable characters in this class individually */ - invlist_iterinit(fold_intersection); - while (invlist_iternext(fold_intersection, &start, &end)) { - UV j; + /* The folds for all the Latin1 characters are hard-coded into this + * program, but we have to go out to disk to get the others. */ + if (invlist_highest(cp_foldable_list) >= 256) { - /* Locale folding for Latin1 characters is deferred until runtime */ - if (LOC && start < 256) { - start = 256; + /* This is a hash that for a particular fold gives all + * characters that are involved in it */ + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } } - /* Look at every character in the range */ - for (j = start; j <= end; j++) { - - U8 foldbuf[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - SV** listp; + /* Now look at the foldable characters in this class individually */ + invlist_iterinit(fold_intersection); + while (invlist_iternext(fold_intersection, &start, &end)) { + UV j; - if (j < 256) { + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + SV** listp; - /* We have the latin1 folding rules hard-coded here so that - * an innocent-looking character class, like /[ks]/i won't - * have to go out to disk to find the possible matches. - * XXX It would be better to generate these via regen, in - * case a new version of the Unicode standard adds new - * mappings, though that is not really likely, and may be - * caught by the default: case of the switch below. */ + if (j < 256) { - if (IS_IN_SOME_FOLD_L1(j)) { + if (IS_IN_SOME_FOLD_L1(j)) { - /* ASCII is always matched; non-ASCII is matched only - * under Unicode rules */ - if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) { - cp_list = - add_cp_to_invlist(cp_list, PL_fold_latin1[j]); + /* ASCII is always matched; non-ASCII is matched + * only under Unicode rules (which could happen + * under /l if the locale is a UTF-8 one */ + if (isASCII(j) || ! DEPENDS_SEMANTICS) { + *use_list = add_cp_to_invlist(*use_list, + PL_fold_latin1[j]); + } + else { + depends_list = + add_cp_to_invlist(depends_list, + PL_fold_latin1[j]); + } } - else { - depends_list = - add_cp_to_invlist(depends_list, PL_fold_latin1[j]); + + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + { + add_above_Latin1_folds(pRExC_state, + (U8) j, + use_list); } + continue; } - if (HAS_NONLATIN1_FOLD_CLOSURE(j) - && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + /* Here is an above Latin1 character. We don't have the + * rules hard-coded for it. First, get its fold. This is + * the simple fold, as the multi-character folds have been + * handled earlier and separated out */ + _to_uni_fold_flags(j, foldbuf, &foldlen, + (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0); + + /* Single character fold of above Latin1. Add everything in + * its fold closure to the list that this node should match. + * The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) { - /* Certain Latin1 characters have matches outside - * Latin1. To get here, is one of those - * characters. None of these matches is valid for - * ASCII characters under /aa, which is why the 'if' - * just above excludes those. These matches only - * happen when the target string is utf8. The code - * below adds the single fold closures for to the - * inversion list. */ - switch (j) { - case 'k': - case 'K': - cp_list = - add_cp_to_invlist(cp_list, KELVIN_SIGN); - break; - case 's': - case 'S': - cp_list = add_cp_to_invlist(cp_list, - LATIN_SMALL_LETTER_LONG_S); - break; - case MICRO_SIGN: - cp_list = add_cp_to_invlist(cp_list, - GREEK_CAPITAL_LETTER_MU); - cp_list = add_cp_to_invlist(cp_list, - GREEK_SMALL_LETTER_MU); - break; - case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: - case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - cp_list = - add_cp_to_invlist(cp_list, ANGSTROM_SIGN); - break; - case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - cp_list = add_cp_to_invlist(cp_list, - LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); - break; - case LATIN_SMALL_LETTER_SHARP_S: - cp_list = add_cp_to_invlist(cp_list, - LATIN_CAPITAL_LETTER_SHARP_S); - break; - case 'F': case 'f': - case 'I': case 'i': - case 'L': case 'l': - case 'T': case 't': - case 'A': case 'a': - case 'H': case 'h': - case 'J': case 'j': - case 'N': case 'n': - case 'W': case 'w': - case 'Y': case 'y': - /* These all are targets of multi-character - * folds from code points that require UTF8 to - * express, so they can't match unless the - * target string is in UTF-8, so no action here - * is necessary, as regexec.c properly handles - * the general case for UTF-8 matching and - * multi-char folds */ - break; - default: - /* Use deprecated warning to increase the - * chances of this being output */ - ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); - break; - } - } - continue; - } + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j)))) + { + continue; + } - /* Here is an above Latin1 character. We don't have the rules - * hard-coded for it. First, get its fold. This is the simple - * fold, as the multi-character folds have been handled earlier - * and separated out */ - _to_uni_fold_flags(j, foldbuf, &foldlen, - ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); - - /* Single character fold of above Latin1. Add everything in - * its fold closure to the list that this node should match. - * The fold closures data structure is a hash with the keys - * being the UTF-8 of every character that is folded to, like - * 'k', and the values each an array of all code points that - * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. - * Multi-character folds are not included */ - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) foldbuf, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_len(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c = SvUV(*c_p); - - /* /aa doesn't allow folds between ASCII and non-; /l - * doesn't allow them between above and below 256 */ - if ((ASCII_FOLD_RESTRICTED - && (isASCII(c) != isASCII(j))) - || (LOC && c < 256)) { - continue; - } + /* Folds under /l which cross the 255/256 boundary + * are added to a separate list. (These are valid + * only when the locale is UTF-8.) */ + if (c < 256 && LOC) { + *use_list = add_cp_to_invlist(*use_list, c); + continue; + } - /* Folds involving non-ascii Latin1 characters - * under /d are added to a separate list */ - if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) - { - cp_list = add_cp_to_invlist(cp_list, c); - } - else { - depends_list = add_cp_to_invlist(depends_list, c); + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + /* Similarly folds involving non-ascii Latin1 + * characters under /d are added to their list */ + depends_list = add_cp_to_invlist(depends_list, + c); + } } } } } - } - SvREFCNT_dec_NN(fold_intersection); + SvREFCNT_dec_NN(fold_intersection); + } + + /* Now that we have finished adding all the folds, there is no reason + * to keep the foldable list separate */ + _invlist_union(cp_list, cp_foldable_list, &cp_list); + SvREFCNT_dec_NN(cp_foldable_list); } /* And combine the result (if any) with any inversion list from posix * classes. The lists are kept separate up to now because we don't want to * fold the classes (folding of those is automatically handled by the swash * fetching code) */ - if (posixes) { + if (posixes || nposixes) { + if (posixes && AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, nothing above ASCII matches these */ + _invlist_intersection(posixes, + PL_XPosix_ptrs[_CC_ASCII], + &posixes); + } + if (nposixes) { + if (DEPENDS_SEMANTICS) { + /* Under /d, everything in the upper half of the Latin1 range + * matches these complements */ + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; + } + else if (AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, everything above ASCII matches these + * complements */ + _invlist_union_complement_2nd(nposixes, + PL_XPosix_ptrs[_CC_ASCII], + &nposixes); + } + if (posixes) { + _invlist_union(posixes, nposixes, &posixes); + SvREFCNT_dec_NN(nposixes); + } + else { + posixes = nposixes; + } + } if (! DEPENDS_SEMANTICS) { if (cp_list) { _invlist_union(cp_list, posixes, &cp_list); @@ -13642,10 +14643,8 @@ parseit: /* Under /d, we put into a separate list the Latin1 things that * match only when the target string is utf8 */ SV* nonascii_but_latin1_properties = NULL; - _invlist_intersection(posixes, PL_Latin1, + _invlist_intersection(posixes, PL_UpperLatin1, &nonascii_but_latin1_properties); - _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII, - &nonascii_but_latin1_properties); _invlist_subtract(posixes, nonascii_but_latin1_properties, &posixes); if (cp_list) { @@ -13679,7 +14678,6 @@ parseit: * , because having a Unicode property forces Unicode * semantics */ if (properties) { - bool warn_super = ! has_user_defined_property; if (cp_list) { /* If it matters to the final outcome, see if a non-property @@ -13690,14 +14688,8 @@ parseit: * are using above-Unicode code points indicates they should know * the issues involved */ if (warn_super) { - bool non_prop_matches_above_Unicode = - runtime_posix_matches_above_Unicode - | (invlist_highest(cp_list) > PERL_UNICODE_MAX); - if (invert) { - non_prop_matches_above_Unicode = - ! non_prop_matches_above_Unicode; - } - warn_super = ! non_prop_matches_above_Unicode; + warn_super = ! (invert + ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); } _invlist_union(properties, cp_list, &cp_list); @@ -13708,7 +14700,7 @@ parseit: } if (warn_super) { - OP(ret) = ANYOF_WARN_SUPER; + ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; } } @@ -13721,12 +14713,33 @@ parseit: * shouldn't. Therefore we can't invert folded locale now, as it won't be * folded until runtime */ + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching). We know to set the flag if we have a non-NULL list for UTF-8 + * locales, or the class matches at least one 0-255 range code point */ + if (LOC && FOLD) { + if (only_utf8_locale_list) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + else if (cp_list) { /* Look to see if there a 0-255 code point is in + the list */ + UV start, end; + invlist_iterinit(cp_list); + if (invlist_iternext(cp_list, &start, &end) && start < 256) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + invlist_iterfinish(cp_list); + } + } + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known * at compile time. Besides not inverting folded locale now, we can't * invert if there are things such as \w, which aren't known until runtime * */ - if (invert - && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS))) + if (cp_list + && invert + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) && ! depends_list && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { @@ -13756,15 +14769,6 @@ parseit: return orig_emit; } - /* If we didn't do folding, it's because some information isn't available - * until runtime; set the run-time fold flag for these. (We don't have to - * worry about properties folding, as that is taken care of by the swash - * fetching) */ - if (FOLD && LOC) - { - ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; - } - /* Some character classes are equivalent to other nodes. Such nodes take * up less room and generally fewer operations to execute than ANYOF nodes. * Above, we checked for and optimized into some such equivalents for @@ -13783,8 +14787,13 @@ parseit: if (cp_list && ! invert && ! depends_list - && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS) - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + + /* We don't optimize if we are supposed to make sure all non-Unicode + * code points raise a warning, as only ANYOF nodes have this check. + * */ + && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) { UV start, end; U8 op = END; /* The optimzation node-type */ @@ -13808,7 +14817,7 @@ parseit: && (start < 256 || UTF)) { /* Here, the list contains a single code point. Can optimize - * into an EXACT node */ + * into an EXACTish node */ value = start; @@ -13838,12 +14847,6 @@ parseit: } } else { - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } if (_invlist_contains_cp(PL_utf8_foldable, value)) { op = EXACT; } @@ -13883,7 +14886,9 @@ parseit: RExC_parse = (char *)cur_parse; if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } SvREFCNT_dec_NN(cp_list); @@ -13896,55 +14901,8 @@ parseit: * for things that belong in the bitmap, put them there, and delete from * . While we are at it, see if everything above 255 is in the * list, and if so, set a flag to speed up execution */ - ANYOF_BITMAP_ZERO(ret); - if (cp_list) { - /* This gets set if we actually need to modify things */ - bool change_invlist = FALSE; - - UV start, end; - - /* Start looking through */ - invlist_iterinit(cp_list); - while (invlist_iternext(cp_list, &start, &end)) { - UV high; - int i; - - if (end == UV_MAX && start <= 256) { - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; - } - - /* Quit if are above what we should change */ - if (start > 255) { - break; - } - - change_invlist = TRUE; - - /* Set all the bits in the range, up to the max that we are doing */ - high = (end < 255) ? end : 255; - for (i = start; i <= (int) high; i++) { - if (! ANYOF_BITMAP_TEST(ret, i)) { - ANYOF_BITMAP_SET(ret, i); - prevvalue = value; - value = i; - } - } - } - invlist_iterfinish(cp_list); - - /* Done with loop; remove any code points that are in the bitmap from - * */ - if (change_invlist) { - _invlist_subtract(cp_list, PL_Latin1, &cp_list); - } - - /* If have completely emptied it, remove it completely */ - if (_invlist_len(cp_list) == 0) { - SvREFCNT_dec_NN(cp_list); - cp_list = NULL; - } - } + populate_ANYOF_from_invlist(ret, &cp_list); if (invert) { ANYOF_FLAGS(ret) |= ANYOF_INVERT; @@ -13961,6 +14919,7 @@ parseit: else { cp_list = depends_list; } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; } /* If there is a swash and more than one element, we can't use the swash in @@ -13970,82 +14929,130 @@ parseit: swash = NULL; } - if (! cp_list - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - { - ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); + set_ANYOF_arg(pRExC_state, ret, cp_list, + (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv : NULL, + only_utf8_locale_list, + swash, has_user_defined_property); + + *flagp |= HASWIDTH|SIMPLE; + + if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + + return ret; +} + +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + +STATIC void +S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, + regnode* const node, + SV* const cp_list, + SV* const runtime_defns, + SV* const only_utf8_locale_list, + SV* const swash, + const bool has_user_defined_property) +{ + /* Sets the arg field of an ANYOF-type node 'node', using information about + * the node passed-in. If there is nothing outside the node's bitmap, the + * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to + * the count returned by add_data(), having allocated and stored an array, + * av, that that count references, as follows: + * av[0] stores the character class description in its textual form. + * This is used later (regexec.c:Perl_regclass_swash()) to + * initialize the appropriate swash, and is also useful for dumping + * the regnode. This is set to &PL_sv_undef if the textual + * description is not needed at run-time (as happens if the other + * elements completely define the class) + * av[1] if &PL_sv_undef, is a placeholder to later contain the swash + * computed from av[0]. But if no further computation need be done, + * the swash is stored here now (and av[0] is &PL_sv_undef). + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead + * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. + * (Otherwise everything needed is already in av[0] and av[1]) + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ + + UV n; + + PERL_ARGS_ASSERT_SET_ANYOF_ARG; + + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); + ARG_SET(node, ANYOF_NONBITMAP_EMPTY); } else { - /* av[0] stores the character class description in its textual form: - * used later (regexec.c:Perl_regclass_swash()) to initialize the - * appropriate swash, and is also useful for dumping the regnode. - * av[1] if NULL, is a placeholder to later contain the swash computed - * from av[0]. But if no further computation need be done, the - * swash is stored there now. - * av[2] stores the cp_list inversion list for use in addition or - * instead of av[0]; used only if av[1] is NULL - * av[3] is set if any component of the class is from a user-defined - * property; used only if av[1] is NULL */ AV * const av = newAV(); SV *rv; - av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - ? SvREFCNT_inc(listsv) : &PL_sv_undef); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + av_store(av, 0, (runtime_defns) + ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); if (swash) { + assert(cp_list); av_store(av, 1, swash); SvREFCNT_dec_NN(cp_list); } else { - av_store(av, 1, NULL); + av_store(av, 1, &PL_sv_undef); if (cp_list) { - av_store(av, 2, cp_list); - av_store(av, 3, newSVuv(has_user_defined_property)); + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); } } + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + rv = newRV_noinc(MUTABLE_SV(av)); - n = add_data(pRExC_state, 1, "s"); + n = add_data(pRExC_state, STR_WITH_LEN("s")); RExC_rxi->data->data[n] = (void*)rv; - ARG_SET(ret, n); + ARG_SET(node, n); } - - *flagp |= HASWIDTH|SIMPLE; - return ret; } -#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* reg_skipcomment() - Absorbs an /x style # comments from the input stream. - Returns true if there is more text remaining in the stream. - Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment - terminates the pattern without including a newline. + Absorbs an /x style # comment from the input stream, + returning a pointer to the first character beyond the comment, or if the + comment terminates the pattern without anything following it, this returns + one past the final character of the pattern (in other words, RExC_end) and + sets the REG_RUN_ON_COMMENT_SEEN flag. - Note its the callers responsibility to ensure that we are + Note it's the callers responsibility to ensure that we are actually in /x mode */ -STATIC bool -S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) +PERL_STATIC_INLINE char* +S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) { - bool ended = 0; - PERL_ARGS_ASSERT_REG_SKIPCOMMENT; - while (RExC_parse < RExC_end) - if (*RExC_parse++ == '\n') { - ended = 1; - break; + assert(*p = '#'); + + while (p < RExC_end) { + if (*(++p) == '\n') { + return p+1; } - if (!ended) { - /* we ran off the end of the pattern without ending - the comment, so we have to add an \n when wrapping */ - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; - return 0; - } else - return 1; + } + + /* we ran off the end of the pattern without ending the comment, so we have + * to add an \n when wrapping */ + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; + return p; } /* nextchar() @@ -14083,16 +15090,14 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) continue; } if (RExC_flags & RXf_PMf_EXTENDED) { - if (isSPACE(*RExC_parse)) { - RExC_parse++; - continue; - } - else if (*RExC_parse == '#') { - if ( reg_skipcomment( pRExC_state ) ) - continue; - } + char * p = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + if (p != RExC_parse) { + RExC_parse = p; + continue; + } } - return retval; + return retval; } } @@ -14116,21 +15121,22 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) } if (RExC_emit >= RExC_emit_bound) Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", - op, RExC_emit, RExC_emit_bound); + op, (void*)RExC_emit, (void*)RExC_emit_bound); NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE(ptr, op); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", - "reg_node", __LINE__, + MJD_OFFSET_DEBUG( + ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", + "reg_node", __LINE__, PL_reg_name[op], - (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(RExC_emit - RExC_emit_start), (UV)(RExC_parse - RExC_start), - (UV)RExC_offsets[0])); + (UV)RExC_offsets[0])); Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); } #endif @@ -14154,16 +15160,16 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); RExC_size += 2; - /* + /* We can't do this: - - assert(2==regarglen[op]+1); + + assert(2==regarglen[op]+1); Anything larger than this has to allocate the extra amount. If we changed this to be: - + RExC_size += (1 + regarglen[op]); - + then it wouldn't matter. Its not clear what side effect might come from that so its not done so far. -- dmq @@ -14172,25 +15178,26 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) } if (RExC_emit >= RExC_emit_bound) Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", - op, RExC_emit, RExC_emit_bound); + op, (void*)RExC_emit, (void*)RExC_emit_bound); NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE_ARG(ptr, op, arg); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reganode", __LINE__, PL_reg_name[op], - (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(RExC_emit - RExC_emit_start), (UV)(RExC_parse - RExC_start), - (UV)RExC_offsets[0])); + (UV)RExC_offsets[0])); Set_Cur_Node_Offset; } -#endif +#endif RExC_emit = ptr; return(ret); } @@ -14198,7 +15205,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) /* - reguni - emit (if appropriate) a Unicode character */ -STATIC STRLEN +PERL_STATIC_INLINE STRLEN S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) { dVAR; @@ -14225,6 +15232,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGINSERT; + PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(depth); /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); @@ -14259,30 +15267,32 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) StructCopy(--src, --dst, regnode); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD 20010112 */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", "reg_insert", __LINE__, PL_reg_name[op], - (UV)(dst - RExC_emit_start) > RExC_offsets[0] + (UV)(dst - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(src - RExC_emit_start), (UV)(dst - RExC_emit_start), - (UV)RExC_offsets[0])); + (UV)RExC_offsets[0])); Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); } #endif } - + place = opnd; /* Op node, where operand used to be. */ #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reginsert", __LINE__, PL_reg_name[op], - (UV)(place - RExC_emit_start) > RExC_offsets[0] + (UV)(place - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(place - RExC_emit_start), (UV)(RExC_parse - RExC_start), @@ -14290,7 +15300,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) Set_Node_Offset(place, RExC_parse); Set_Node_Length(place, 1); } -#endif +#endif src = NEXTOPER(place); FILL_ADVANCE_NODE(place, op); Zero(src, offset, regnode); @@ -14302,7 +15312,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) */ /* TODO: All three parms should be const */ STATIC void -S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) { dVAR; regnode *scan; @@ -14323,7 +15334,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tail" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), @@ -14349,7 +15360,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de - Look for optimizable sequences at the same time. - currently only looks for EXACT chains. -This is experimental code. The idea is to use this routine to perform +This is experimental code. The idea is to use this routine to perform in place optimizations on branches and groups as they are constructed, with the long term intention of removing optimization from study_chunk so that it is purely analytical. @@ -14361,7 +15372,8 @@ to control which is which. /* TODO: All four parms should be const */ STATIC U8 -S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) { dVAR; regnode *scan; @@ -14384,8 +15396,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN if (PL_regkind[OP(scan)] == EXACT) { - bool has_exactf_sharp_s; /* Unexamined in this routine */ - if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1)) + bool unfolded_multi_char; /* Unexamined in this routine */ + if (join_exact(pRExC_state, scan, &min, + &unfolded_multi_char, 1, val, depth+1)) return EXACT; } #endif @@ -14393,10 +15406,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, switch (OP(scan)) { case EXACT: case EXACTF: + case EXACTFA_NO_TRIE: case EXACTFA: case EXACTFU: case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); @@ -14411,7 +15424,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), @@ -14424,8 +15437,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, DEBUG_PARSE_r({ SV * const mysv_val=sv_newmortal(); DEBUG_PARSE_MSG(""); - regprop(RExC_rx, mysv_val, val); - PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n", + regprop(RExC_rx, mysv_val, val, NULL); + PerlIO_printf(Perl_debug_log, + "~ attach to %s (%"IVdf") offset to %"IVdf"\n", SvPV_nolen_const(mysv_val), (IV)REG_NODE_NUM(val), (IV)(val - scan) @@ -14446,23 +15460,49 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ #ifdef DEBUGGING -static void + +static void +S_regdump_intflags(pTHX_ const char *lead, const U32 flags) +{ + int bit; + int set=0; + + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bitanchored_substr) { - RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), RE_SV_DUMPLEN(r->anchored_substr), 30); PerlIO_printf(Perl_debug_log, "anchored %s%s at %"IVdf" ", s, RE_SV_TAIL(r->anchored_substr), (IV)r->anchored_offset); } else if (r->anchored_utf8) { - RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), RE_SV_DUMPLEN(r->anchored_utf8), 30); PerlIO_printf(Perl_debug_log, "anchored utf8 %s%s at %"IVdf" ", s, RE_SV_TAIL(r->anchored_utf8), (IV)r->anchored_offset); - } + } if (r->float_substr) { - RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), RE_SV_DUMPLEN(r->float_substr), 30); PerlIO_printf(Perl_debug_log, "floating %s%s at %"IVdf"..%"UVuf" ", s, RE_SV_TAIL(r->float_substr), (IV)r->float_min_offset, (UV)r->float_max_offset); } else if (r->float_utf8) { - RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), RE_SV_DUMPLEN(r->float_utf8), 30); PerlIO_printf(Perl_debug_log, "floating utf8 %s%s at %"IVdf"..%"UVuf" ", @@ -14545,7 +15585,7 @@ Perl_regdump(pTHX_ const regexp *r) (r->check_substr == r->float_substr && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored")); - if (r->extflags & RXf_NOSCAN) + if (r->intflags & PREGf_NOSCAN) PerlIO_printf(Perl_debug_log, " noscan"); if (r->extflags & RXf_CHECK_ALL) PerlIO_printf(Perl_debug_log, " isall"); @@ -14553,22 +15593,22 @@ Perl_regdump(pTHX_ const regexp *r) PerlIO_printf(Perl_debug_log, ") "); if (ri->regstclass) { - regprop(r, sv, ri->regstclass); + regprop(r, sv, ri->regstclass, NULL); PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); } - if (r->extflags & RXf_ANCH) { + if (r->intflags & PREGf_ANCH) { PerlIO_printf(Perl_debug_log, "anchored"); - if (r->extflags & RXf_ANCH_BOL) + if (r->intflags & PREGf_ANCH_BOL) PerlIO_printf(Perl_debug_log, "(BOL)"); - if (r->extflags & RXf_ANCH_MBOL) + if (r->intflags & PREGf_ANCH_MBOL) PerlIO_printf(Perl_debug_log, "(MBOL)"); - if (r->extflags & RXf_ANCH_SBOL) + if (r->intflags & PREGf_ANCH_SBOL) PerlIO_printf(Perl_debug_log, "(SBOL)"); - if (r->extflags & RXf_ANCH_GPOS) + if (r->intflags & PREGf_ANCH_GPOS) PerlIO_printf(Perl_debug_log, "(GPOS)"); PerlIO_putc(Perl_debug_log, ' '); } - if (r->extflags & RXf_GPOS_SEEN) + if (r->intflags & PREGf_GPOS_SEEN) PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); if (r->intflags & PREGf_SKIP) PerlIO_printf(Perl_debug_log, "plus "); @@ -14578,7 +15618,10 @@ Perl_regdump(pTHX_ const regexp *r) if (r->extflags & RXf_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); - DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); + DEBUG_FLAGS_r({ + regdump_extflags("r->extflags: ",r->extflags); + regdump_intflags("r->intflags: ",r->intflags); + }); #else PERL_ARGS_ASSERT_REGDUMP; PERL_UNUSED_CONTEXT; @@ -14587,21 +15630,11 @@ Perl_regdump(pTHX_ const regexp *r) } /* -- regprop - printable representation of opcode +- regprop - printable representation of opcode, with run time support */ -#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ -STMT_START { \ - if (do_sep) { \ - Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ - if (flags & ANYOF_INVERT) \ - /*make sure the invert info is in each */ \ - sv_catpvs(sv, "^"); \ - do_sep = 0; \ - } \ -} STMT_END void -Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) { #ifdef DEBUGGING dVAR; @@ -14617,10 +15650,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) || _CC_VERTSPACE != 16 #error Need to adjust order of anyofs[] #endif - "[\\w]", - "[\\W]", - "[\\d]", - "[\\D]", + "\\w", + "\\W", + "\\d", + "\\D", "[:alpha:]", "[:^alpha:]", "[:lower:]", @@ -14637,8 +15670,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^graph:]", "[:cased:]", "[:^cased:]", - "[\\s]", - "[\\S]", + "\\s", + "\\S", "[:blank:]", "[:^blank:]", "[:xdigit:]", @@ -14649,12 +15682,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^cntrl:]", "[:ascii:]", "[:^ascii:]", - "[\\v]", - "[\\V]" + "\\v", + "\\V" }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; - + PERL_ARGS_ASSERT_REGPROP; sv_setpvs(sv, ""); @@ -14662,16 +15695,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ /* It would be nice to FAIL() here, but this may be called from regexec.c, and it would be hard to supply pRExC_state. */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(o), (int)REGNODE_MAX); sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[OP(o)]; if (k == EXACT) { sv_catpvs(sv, " "); - /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) - * is a crude hack but it may be the best for now since - * we have no flag "this EXACTish node was UTF-8" + /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) + * is a crude hack but it may be the best for now since + * we have no flag "this EXACTish node was UTF-8" * --jhi */ pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], PERL_PV_ESCAPE_UNI_DETECT | @@ -14690,44 +15724,28 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) NULL; const reg_trie_data * const trie = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; - + Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); DEBUG_TRIE_COMPILE_r( - Perl_sv_catpvf(aTHX_ sv, - "", - (UV)trie->startstate, - (IV)trie->statecount-1, /* -1 because of the unused 0 element */ - (UV)trie->wordcount, - (UV)trie->minlen, - (UV)trie->maxlen, - (UV)TRIE_CHARCOUNT(trie), - (UV)trie->uniquecharcount - ) + Perl_sv_catpvf(aTHX_ sv, + "", + (UV)trie->startstate, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ); ); if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { - int i; - int rangestart = -1; - U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); sv_catpvs(sv, "["); - for (i = 0; i <= 256; i++) { - if (i < 256 && BITMAP_TEST(bitmap,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - rangestart = -1; - } - } + (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) + ? ANYOF_BITMAP(o) + : TRIE_BITMAP(trie)); sv_catpvs(sv, "]"); - } - + } + } else if (k == CURLY) { if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ @@ -14735,7 +15753,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); - else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { + else if (k == REF || k == OPEN || k == CLOSE + || k == GROUPP || OP(o)==ACCEPT) + { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { if ( k != REF || (OP(o) < NREF)) { @@ -14743,7 +15763,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) SV **name= av_fetch(list, ARG(o), 0 ); if (name) Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); - } + } else { AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); @@ -14758,22 +15778,37 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); } } - } - } else if (k == GOSUB) - Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */ + } + if ( k == REF && reginfo) { + U32 n = ARG(o); /* which paren pair */ + I32 ln = prog->offs[n].start; + if (prog->lastparen < n || ln == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } + } else if (k == GOSUB) + /* Paren and offset */ + Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); else if (k == VERB) { - if (!o->flags) - Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + if (!o->flags) + Perl_sv_catpvf(aTHX_ sv, ":%"SVf, SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) - Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + /* 2: embedded, otherwise 1 */ + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { - int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; - if (flags & ANYOF_LOCALE) + if (flags & ANYOF_LOCALE_FLAGS) sv_catpvs(sv, "{loc}"); if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); @@ -14782,140 +15817,137 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) sv_catpvs(sv, "^"); /* output what the standard cp 0-255 bitmap matches */ - for (i = 0; i <= 256; i++) { - if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - do_sep = 1; - rangestart = -1; - } - } - - EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - /* output any special charclass tests (used entirely under use locale) */ - if (ANYOF_CLASS_TEST_ANY_SET(o)) - for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) - if (ANYOF_CLASS_TEST(o,i)) { + do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); + + /* output any special charclass tests (used entirely under use + * locale) * */ + if (ANYOF_POSIXL_TEST_ANY_SET(o)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(o,i)) { sv_catpv(sv, anyofs[i]); do_sep = 1; } - - EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - - if (flags & ANYOF_NON_UTF8_LATIN1_ALL) { - sv_catpvs(sv, "{non-utf8-latin1-all}"); - } - - /* output information about the unicode matching */ - if (flags & ANYOF_UNICODE_ALL) - sv_catpvs(sv, "{unicode_all}"); - else if (ANYOF_NONBITMAP(o)) - sv_catpvs(sv, "{unicode}"); - if (flags & ANYOF_NONBITMAP_NON_UTF8) - sv_catpvs(sv, "{outside bitmap}"); - - if (ANYOF_NONBITMAP(o)) { - SV *lv; /* Set if there is something outside the bit map */ - SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL); - bool byte_output = FALSE; /* If something in the bitmap has been - output */ - - if (lv && lv != &PL_sv_undef) { - if (sw) { - U8 s[UTF8_MAXBYTES_CASE+1]; - - for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */ - uvchr_to_utf8(s, i); - - if (i < 256 - && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate - things already - output as part - of the bitmap */ - && swash_fetch(sw, s, TRUE)) - { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - byte_output = TRUE; - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) { - put_byte(sv, rangestart); - } - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i-1); - } - rangestart = -1; - } - } - } + } + } - { - char *s = savesvpv(lv); - char * const origs = s; + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) + { + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + if (flags & ANYOF_INVERT) + /*make sure the invert info is in each */ + sv_catpvs(sv, "^"); + } - while (*s && *s != '\n') - s++; + if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { + sv_catpvs(sv, "{non-utf8-latin1-all}"); + } - if (*s == '\n') { - const char * const t = ++s; + /* output information about the unicode matching */ + if (flags & ANYOF_ABOVE_LATIN1_ALL) + sv_catpvs(sv, "{unicode_all}"); + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { + SV *lv; /* Set if there is something outside the bit map. */ + bool byte_output = FALSE; /* If something in the bitmap has + been output */ + SV *only_utf8_locale; + + /* Get the stuff that wasn't in the bitmap */ + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); + if (lv && lv != &PL_sv_undef) { + char *s = savesvpv(lv); + char * const origs = s; + + while (*s && *s != '\n') + s++; + + if (*s == '\n') { + const char * const t = ++s; + + if (flags & ANYOF_NONBITMAP_NON_UTF8) { + sv_catpvs(sv, "{outside bitmap}"); + } + else { + sv_catpvs(sv, "{utf8}"); + } if (byte_output) { sv_catpvs(sv, " "); } - while (*s) { - if (*s == '\n') { + while (*s) { + if (*s == '\n') { /* Truncate very long output */ - if (s - origs > 256) { - Perl_sv_catpvf(aTHX_ sv, - "%.*s...", - (int) (s - origs - 1), - t); - goto out_dump; - } - *s = ' '; - } - else if (*s == '\t') { - *s = '-'; - } - s++; - } - if (s[-1] == ' ') - s[-1] = 0; + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } + *s = ' '; + } + else if (*s == '\t') { + *s = '-'; + } + s++; + } + if (s[-1] == ' ') + s[-1] = 0; - sv_catpv(sv, t); - } + sv_catpv(sv, t); + } - out_dump: + out_dump: - Safefree(origs); - } - SvREFCNT_dec_NN(lv); - } + Safefree(origs); + SvREFCNT_dec_NN(lv); + } + + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } + } + invlist_iterfinish(only_utf8_locale); + } + } } Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; - if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { - Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + if (index < C_ARRAY_LENGTH(anyofs)) { + if (*anyofs[index] != '[') { + sv_catpv(sv, "["); + } + sv_catpv(sv, anyofs[index]); + if (*anyofs[index] != '[') { + sv_catpv(sv, "]"); + } } else { - sv_catpv(sv, anyofs[index]); + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); } } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) @@ -14925,9 +15957,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) PERL_UNUSED_ARG(sv); PERL_UNUSED_ARG(o); PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); #endif /* DEBUGGING */ } + + SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ @@ -14957,17 +15992,17 @@ Perl_re_intuit_string(pTHX_ REGEXP * const r) return prog->check_substr ? prog->check_substr : prog->check_utf8; } -/* - pregfree() - - handles refcounting and freeing the perl core regexp structure. When - it is necessary to actually free the structure the first thing it +/* + pregfree() + + handles refcounting and freeing the perl core regexp structure. When + it is necessary to actually free the structure the first thing it does is call the 'free' method of the regexp_engine associated to - the regexp, allowing the handling of the void *pprivate; member - first. (This routine is not overridable by extensions, which is why + the regexp, allowing the handling of the void *pprivate; member + first. (This routine is not overridable by extensions, which is why the extensions free is called first.) - - See regdupe and regdupe_internal if you change anything here. + + See regdupe and regdupe_internal if you change anything here. */ #ifndef PERL_IN_XSUB_RE void @@ -14991,7 +16026,7 @@ Perl_pregfree2(pTHX_ REGEXP *rx) CALLREGFREE_PVT(rx); /* free the private data */ SvREFCNT_dec(RXp_PAREN_NAMES(r)); Safefree(r->xpv_len_u.xpvlenu_pv); - } + } if (r->substrs) { SvREFCNT_dec(r->anchored_substr); SvREFCNT_dec(r->anchored_utf8); @@ -15009,22 +16044,22 @@ Perl_pregfree2(pTHX_ REGEXP *rx) } /* reg_temp_copy() - + This is a hacky workaround to the structural issue of match results being stored in the regexp structure which is in turn stored in PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern could be PL_curpm in multiple contexts, and could require multiple result sets being associated with the pattern simultaneously, such as when doing a recursive match with (??{$qr}) - - The solution is to make a lightweight copy of the regexp structure + + The solution is to make a lightweight copy of the regexp structure when a qr// is returned from the code executed by (??{$qr}) this lightweight copy doesn't actually own any of its data except for - the starp/end and the actual regexp structure itself. - -*/ - - + the starp/end and the actual regexp structure itself. + +*/ + + REGEXP * Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) { @@ -15057,7 +16092,7 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) sv_force_normal(sv) is called. */ SvFAKE_on(ret_x); ret = ReANY(ret_x); - + SvFLAGS(ret_x) |= SvUTF8(rx); /* We share the same string buffer as the original regexp, on which we hold a reference count, incremented when mother_re is set below. @@ -15088,23 +16123,23 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) #endif ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); SvREFCNT_inc_void(ret->qr_anoncv); - + return ret_x; } #endif -/* regfree_internal() +/* regfree_internal() + + Free the private data in a regexp. This is overloadable by + extensions. Perl takes care of the regexp structure in pregfree(), + this covers the *pprivate pointer which technically perl doesn't + know about, however of course we have to handle the + regexp_internal structure when no extension is in use. - Free the private data in a regexp. This is overloadable by - extensions. Perl takes care of the regexp structure in pregfree(), - this covers the *pprivate pointer which technically perl doesn't - know about, however of course we have to handle the - regexp_internal structure when no extension is in use. - - Note this is called before freeing anything in the regexp - structure. + Note this is called before freeing anything in the regexp + structure. */ - + void Perl_regfree_internal(pTHX_ REGEXP * const rx) { @@ -15122,7 +16157,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RX_UTF8(rx), dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); - PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", + PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", PL_colors[4],PL_colors[5],s); } }); @@ -15156,7 +16191,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) case 'l': case 'L': break; - case 'T': + case 'T': { /* Aho Corasick add-on structure for a trie node. Used in stclass optimization only */ U32 refcount; @@ -15169,7 +16204,16 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) PerlMemShared_free(aho->fail); /* do this last!!!! */ PerlMemShared_free(ri->data->data[n]); - PerlMemShared_free(ri->regstclass); + /* we should only ever get called once, so + * assert as much, and also guard the free + * which /might/ happen twice. At the least + * it will make code anlyzers happy and it + * doesn't cost much. - Yves */ + assert(ri->regstclass); + if (ri->regstclass) { + PerlMemShared_free(ri->regstclass); + ri->regstclass = 0; + } } } break; @@ -15196,7 +16240,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) } break; default: - Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]); + Perl_croak(aTHX_ "panic: regfree data code '%c'", + ri->data->what[n]); } } Safefree(ri->data->what); @@ -15210,9 +16255,9 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t)) #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) -/* - re_dup - duplicate a regexp. - +/* + re_dup - duplicate a regexp. + This routine is expected to clone a given regexp structure. It is only compiled under USE_ITHREADS. @@ -15221,7 +16266,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) stored in the *pprivate pointer. This allows extensions to handle any duplication it needs to do. - See pregfree() and regfree_internal() if you change anything here. + See pregfree() and regfree_internal() if you change anything here. */ #if defined(USE_ITHREADS) #ifndef PERL_IN_XSUB_RE @@ -15232,7 +16277,7 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) I32 npar; const struct regexp *r = ReANY(sstr); struct regexp *ret = ReANY(dstr); - + PERL_ARGS_ASSERT_RE_DUP_GUTS; npar = r->nparens+1; @@ -15299,21 +16344,20 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) so we need to copy it locally. */ RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); ret->mother_re = NULL; - ret->gofs = 0; } #endif /* PERL_IN_XSUB_RE */ /* regdupe_internal() - + This is the internal complement to regdupe() which is used to copy the structure pointed to by the *pprivate pointer in the regexp. This is the core version of the extension overridable cloning hook. The regexp structure being duplicated will be copied by perl prior - to this and will be provided as the regexp *r argument, however + to this and will be provided as the regexp *r argument, however with the /old/ structures pprivate pointer value. Thus this routine may override any copying normally done by perl. - + It returns a pointer to the new regexp_internal structure. */ @@ -15327,10 +16371,11 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) RXi_GET_DECL(r,ri); PERL_ARGS_ASSERT_REGDUPE_INTERNAL; - + len = ProgLen(ri); - - Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); + + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), + char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); reti->num_code_blocks = ri->num_code_blocks; @@ -15372,9 +16417,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) break; case 'f': /* This is cheating. */ - Newx(d->data[i], 1, struct regnode_charclass_class); - StructCopy(ri->data->data[i], d->data[i], - struct regnode_charclass_class); + Newx(d->data[i], 1, regnode_ssc); + StructCopy(ri->data->data[i], d->data[i], regnode_ssc); reti->regstclass = (regnode*)d->data[i]; break; case 'T': @@ -15383,18 +16427,19 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) * when the corresponding reg_ac_data struct is freed. */ reti->regstclass= ri->regstclass; - /* Fall through */ + /* FALLTHROUGH */ case 't': OP_REFCNT_LOCK; ((reg_trie_data*)ri->data->data[i])->refcount++; OP_REFCNT_UNLOCK; - /* Fall through */ + /* FALLTHROUGH */ case 'l': case 'L': d->data[i] = ri->data->data[i]; break; default: - Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]); + Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", + ri->data->what[i]); } } @@ -15434,7 +16479,8 @@ Perl_regnext(pTHX_ regnode *p) return(NULL); if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX); + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(p), (int)REGNODE_MAX); } offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); @@ -15446,7 +16492,7 @@ Perl_regnext(pTHX_ regnode *p) #endif STATIC void -S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) +S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) { va_list args; STRLEN l1 = strlen(pat1); @@ -15465,20 +16511,15 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) Copy(pat2, buf + l1, l2 , char); buf[l1 + l2] = '\n'; buf[l1 + l2 + 1] = '\0'; -#ifdef I_STDARG - /* ANSI variant takes additional second argument */ va_start(args, pat2); -#else - va_start(args); -#endif msv = vmess(buf, &args); va_end(args); message = SvPV_const(msv,l1); if (l1 > 512) l1 = 512; Copy(message, buf, l1 , char); - buf[l1-1] = '\0'; /* Overwrite \n */ - Perl_croak(aTHX_ "%s", buf); + /* l1-1 to avoid \n */ + Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); } /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ @@ -15496,7 +16537,8 @@ Perl_save_re_context(pTHX) U32 i; for (i = 1; i <= RX_NPARENS(rx); i++) { char digits[TYPE_CHARS(long)]; - const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); + const STRLEN len = my_snprintf(digits, sizeof(digits), + "%lu", (long)i); GV *const *const gvp = (GV**)hv_fetch(PL_defstash, digits, len, 0); @@ -15518,26 +16560,18 @@ S_put_byte(pTHX_ SV *sv, int c) { PERL_ARGS_ASSERT_PUT_BYTE; - /* Our definition of isPRINT() ignores locales, so only bytes that are - not part of UTF-8 are considered printable. I assume that the same - holds for UTF-EBCDIC. - Also, code point 255 is not printable in either (it's E0 in EBCDIC, - which Wikipedia says: - - EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all - ones (binary 1111 1111, hexadecimal FF). It is similar, but not - identical, to the ASCII delete (DEL) or rubout control character. ... - it is typically mapped to hexadecimal code 9F, in order to provide a - unique character mapping in both directions) - - So the old condition can be simplified to !isPRINT(c) */ if (!isPRINT(c)) { - if (c < 256) { - Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c); - } - else { - Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); - } + switch (c) { + case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; + case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; + case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; + case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; + case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; + + default: + Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + break; + } } else { const char string = c; @@ -15547,25 +16581,104 @@ S_put_byte(pTHX_ SV *sv, int c) } } +STATIC void +S_put_range(pTHX_ SV *sv, UV start, UV end) +{ + + /* Appends to 'sv' a displayable version of the range of code points from + * 'start' to 'end' */ + + assert(start <= end); + + PERL_ARGS_ASSERT_PUT_RANGE; + + if (end - start < 3) { /* Individual chars in short ranges */ + for (; start <= end; start++) + put_byte(sv, start); + } + else if ( end > 255 + || ! isALPHANUMERIC(start) + || ! isALPHANUMERIC(end) + || isDIGIT(start) != isDIGIT(end) + || isUPPER(start) != isUPPER(end) + || isLOWER(start) != isLOWER(end) + + /* This final test should get optimized out except on EBCDIC + * platforms, where it causes ranges that cross discontinuities + * like i/j to be shown as hex instead of the misleading, + * e.g. H-K (since that range includes more than H, I, J, K). + * */ + || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start)) + { + Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", + start, + (end < 256) ? end : 255); + } + else { /* Here, the ends of the range are both digits, or both uppercase, + or both lowercase; and there's no discontinuity in the range + (which could happen on EBCDIC platforms) */ + put_byte(sv, start); + sv_catpvs(sv, "-"); + put_byte(sv, end); + } +} + +STATIC bool +S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually + * output anything */ + + int i; + bool has_output_anything = FALSE; + + PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; + + for (i = 0; i < 256; i++) { + if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) { + + /* The character at index i should be output. Find the next + * character that should NOT be output */ + int j; + for (j = i + 1; j <= 256; j++) { + if (! BITMAP_TEST((U8 *) bitmap, j)) { + break; + } + } + + /* Everything between them is a single range that should be output + * */ + put_range(sv, i, j - 1); + has_output_anything = TRUE; + i = j; + } + } + + return has_output_anything; +} #define CLEAR_OPTSTART \ - if (optstart) STMT_START { \ - DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ - optstart=NULL; \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ + " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ } STMT_END -#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); +#define DUMPUNTIL(b,e) \ + CLEAR_OPTSTART; \ + node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); STATIC const regnode * S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, - const regnode *last, const regnode *plast, + const regnode *last, const regnode *plast, SV* sv, I32 indent, U32 depth) { dVAR; U8 op = PSEUDO; /* Arbitrary non-END op. */ const regnode *next; const regnode *optstart= NULL; - + RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; @@ -15575,11 +16688,12 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, last ? last-start : 0,plast ? plast-start : 0); #endif - - if (plast && plast < last) + + if (plast && plast < last) last= plast; while (PL_regkind[op] != END && (!last || node < last)) { + assert(node); /* While that wasn't END last time... */ NODE_ALIGN(node); op = OP(node); @@ -15596,20 +16710,21 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else CLEAR_OPTSTART; - regprop(r, sv, node); + regprop(r, sv, node, NULL); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); - - if (OP(node) != OPTIMIZED) { + + if (OP(node) != OPTIMIZED) { if (next == NULL) /* Next ptr. */ PerlIO_printf(Perl_debug_log, " (0)"); - else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) + else if (PL_regkind[(U8)op] == BRANCH + && PL_regkind[OP(next)] != BRANCH ) PerlIO_printf(Perl_debug_log, " (FAIL)"); - else + else PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); - (void)PerlIO_putc(Perl_debug_log, '\n'); + (void)PerlIO_putc(Perl_debug_log, '\n'); } - + after_print: if (PL_regkind[(U8)op] == BRANCHJ) { assert(next); @@ -15636,7 +16751,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const reg_trie_data * const trie = (reg_trie_data*)ri->data->data[optrie]; #ifdef DEBUGGING - AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); + AV *const trie_words + = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); #endif const regnode *nextbranch= NULL; I32 word_idx; @@ -15646,21 +16762,25 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, PerlIO_printf(Perl_debug_log, "%*s%s ", (int)(2*(indent+3)), "", - elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, - PL_colors[0], PL_colors[1], - (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELLIPSES | - PERL_PV_PRETTY_LTGT + elem_ptr + ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), + SvCUR(*elem_ptr), 60, + PL_colors[0], PL_colors[1], + (SvUTF8(*elem_ptr) + ? PERL_PV_ESCAPE_UNI + : 0) + | PERL_PV_PRETTY_ELLIPSES + | PERL_PV_PRETTY_LTGT ) - : "???" + : "???" ); if (trie->jump) { U16 dist= trie->jump[word_idx+1]; PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", - (UV)((dist ? this_trie + dist : next) - start)); + (UV)((dist ? this_trie + dist : next) - start)); if (dist) { if (!nextbranch) - nextbranch= this_trie + trie->jump[0]; + nextbranch= this_trie + trie->jump[0]; DUMPUNTIL(this_trie + dist, nextbranch); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) @@ -15687,8 +16807,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else if (PL_regkind[(U8)op] == ANYOF) { /* arglen 1 + class block */ - node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS) - ? ANYOF_CLASS_SKIP : ANYOF_SKIP); + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); node = NEXTOPER(node); } else if (PL_regkind[(U8)op] == EXACT) { @@ -15704,7 +16825,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, indent++; } CLEAR_OPTSTART; -#ifdef DEBUG_DUMPUNTIL +#ifdef DEBUG_DUMPUNTIL PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); #endif return node; diff --git a/src/5019001/orig/regexec.c b/src/5021001/orig/regexec.c similarity index 76% rename from src/5019001/orig/regexec.c rename to src/5021001/orig/regexec.c index 12548d5..1aafcc7 100644 --- a/src/5019001/orig/regexec.c +++ b/src/5021001/orig/regexec.c @@ -37,16 +37,6 @@ #include "re_top.h" #endif -/* At least one required character in the target string is expressible only in - * UTF-8. */ -static const char* const non_utf8_target_but_utf8_required - = "Can't match, because target string needs to be in UTF-8\n"; - -#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ - goto target; \ -} STMT_END - /* * pregcomp and pregexec -- regsub and regerror are not used in perl * @@ -93,16 +83,28 @@ static const char* const non_utf8_target_but_utf8_required #include "inline_invlist.c" #include "unicode_constants.h" +#ifdef DEBUGGING +/* At least one required character in the target string is expressible only in + * UTF-8. */ +static const char* const non_utf8_target_but_utf8_required + = "Can't match, because target string needs to be in UTF-8\n"; +#endif + +#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ + goto target; \ +} STMT_END + #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #ifndef STATIC #define STATIC static #endif -/* Valid for non-utf8 strings: avoids the reginclass +/* Valid only for non-utf8 strings: avoids the reginclass * call if there are no complications: i.e., if everything matchable is * straight forward in the bitmap */ -#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \ : ANYOF_BITMAP_TEST(p,*(c))) /* @@ -117,6 +119,7 @@ static const char* const non_utf8_target_but_utf8_required ? reghop3((U8*)pos, off, \ (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ : (U8*)(pos + off)) + #define HOPBACKc(pos, off) \ (char*)(reginfo->is_utf8_target \ ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \ @@ -127,6 +130,24 @@ static const char* const non_utf8_target_but_utf8_required #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) +/* lim must be +ve. Returns NULL on overshoot */ +#define HOPMAYBE3(pos,off,lim) \ + (reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ + : ((U8*)pos + off <= lim) \ + ? (U8*)pos + off \ + : NULL) + +/* like HOP3, but limits the result to <= lim even for the non-utf8 case. + * off must be >=0; args should be vars rather than expressions */ +#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ + ? reghop3((U8*)(pos), off, (U8*)(lim)) \ + : (U8*)((pos + off) > lim ? lim : (pos + off))) + +#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ + ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ + : (U8*)(pos + off)) +#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ #define NEXTCHR_IS_EOS (nextchr < 0) @@ -139,11 +160,11 @@ static const char* const non_utf8_target_but_utf8_required SET_nextchr -#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \ +#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \ if (!swash_ptr) { \ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \ swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \ - 1, 0, NULL, &flags); \ + 1, 0, invlist, &flags); \ assert(swash_ptr); \ } \ } STMT_END @@ -152,28 +173,33 @@ static const char* const non_utf8_target_but_utf8_required #ifdef DEBUGGING # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ property_name, \ + invlist, \ utf8_char_in_property) \ - LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \ assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); #else # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ property_name, \ + invlist, \ utf8_char_in_property) \ - LOAD_UTF8_CHARCLASS(swash_ptr, property_name) + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) #endif #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ PL_utf8_swash_ptrs[_CC_WORDCHAR], \ - swash_property_names[_CC_WORDCHAR], \ - GREEK_SMALL_LETTER_IOTA_UTF8) + "", \ + PL_XPosix_ptrs[_CC_WORDCHAR], \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ STMT_START { \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ "_X_regular_begin", \ - GREEK_SMALL_LETTER_IOTA_UTF8); \ + NULL, \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ "_X_extend", \ + NULL, \ COMBINING_GRAVE_ACCENT_UTF8); \ } STMT_END @@ -190,14 +216,14 @@ static const char* const non_utf8_target_but_utf8_required * although it may be done at run time beause of the REF possibility - more * investigation required. -- demerphq */ -#define JUMPABLE(rn) ( \ - OP(rn) == OPEN || \ +#define JUMPABLE(rn) ( \ + OP(rn) == OPEN || \ (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ - OP(rn) == EVAL || \ - OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ - OP(rn) == PLUS || OP(rn) == MINMOD || \ - OP(rn) == KEEPS || \ - (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ + OP(rn) == EVAL || \ + OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ + OP(rn) == PLUS || OP(rn) == MINMOD || \ + OP(rn) == KEEPS || \ + (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ ) #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) @@ -207,13 +233,13 @@ static const char* const non_utf8_target_but_utf8_required /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -223,7 +249,7 @@ static const char* const non_utf8_target_but_utf8_required Search for mandatory following text node; for lookahead, the text must follow but for lookbehind (rn->flags != 0) we skip to the next step. */ -#define FIND_NEXT_IMPT(rn) STMT_START { \ +#define FIND_NEXT_IMPT(rn) STMT_START { \ while (JUMPABLE(rn)) { \ const OPCODE type = OP(rn); \ if (type == SUSPEND || PL_regkind[type] == CURLY) \ @@ -274,8 +300,9 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) PERL_ARGS_ASSERT_REGCPPUSH; if (paren_elems_to_push < 0) - Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0", - paren_elems_to_push); + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u", + (int)paren_elems_to_push, (int)maxopenparen, + (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS); if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf @@ -296,8 +323,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) ); for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ - SSPUSHINT(rex->offs[p].end); - SSPUSHINT(rex->offs[p].start); + SSPUSHIV(rex->offs[p].end); + SSPUSHIV(rex->offs[p].start); SSPUSHINT(rex->offs[p].start_tmp); DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", @@ -369,10 +396,10 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) ); paren = *maxopenparen_p; for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { - I32 tmps; + SSize_t tmps; rex->offs[paren].start_tmp = SSPOPINT; - rex->offs[paren].start = SSPOPINT; - tmps = SSPOPINT; + rex->offs[paren].start = SSPOPIV; + tmps = SSPOPIV; if (paren <= rex->lastparen) rex->offs[paren].end = tmps; DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, @@ -484,7 +511,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) } else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { return isFOO_lc(classnum, - TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1))); + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); } if (classnum < _FIRST_NON_SWASH_CC) { @@ -492,8 +519,11 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) /* Initialize the swash unless done already */ if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", - swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags); + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) @@ -526,7 +556,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) */ I32 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, - char *strbeg, I32 minend, SV *screamer, U32 nosave) + char *strbeg, SSize_t minend, SV *screamer, U32 nosave) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -543,68 +573,70 @@ Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, } #endif -/* - * Need to implement the following flags for reg_anch: - * - * USE_INTUIT_NOML - Useful to call re_intuit_start() first - * USE_INTUIT_ML - * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer - * INTUIT_AUTORITATIVE_ML - * INTUIT_ONCE_NOML - Intuit can match in one location only. - * INTUIT_ONCE_ML - * - * Another flag for this function: SECOND_TIME (so that float substrs - * with giant delta may be not rechecked). - */ - -/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ - -/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend. - Otherwise, only SvCUR(sv) is used to get strbeg. */ -/* XXXX We assume that strpos is strbeg unless sv. */ -/* XXXX Some places assume that there is a fixed substring. - An update may be needed if optimizer marks as "INTUITable" - RExen without fixed substrings. Similarly, it is assumed that - lengths of all the strings are no more than minlen, thus they - cannot come from lookahead. - (Or minlen should take into account lookahead.) - NOTE: Some of this comment is not correct. minlen does now take account - of lookahead/behind. Further research is required. -- demerphq - -*/ - -/* A failure to find a constant substring means that there is no need to make - an expensive call to REx engine, thus we celebrate a failure. Similarly, - finding a substring too deep into the string means that fewer calls to - regtry() should be needed. - - REx compiler's optimizer found 4 possible hints: - a) Anchored substring; - b) Fixed substring; - c) Whether we are anchored (beginning-of-line or \G); - d) First node (of those at offset 0) which may distinguish positions; - We use a)b)d) and multiline-part of c), and try to find a position in the - string which does not contradict any of them. - */ - -/* Most of decisions we do here should have been done at compile time. - The nodes of the REx which we used for the search should have been - deleted from the finite automaton. */ - -/* args: - * rx: the regex to match against - * sv: the SV being matched: only used for utf8 flag; the string - * itself is accessed via the pointers below. Note that on - * something like an overloaded SV, SvPOK(sv) may be false - * and the string pointers may point to something unrelated to - * the SV itself. - * strbeg: real beginning of string - * strpos: the point in the string at which to begin matching - * strend: pointer to the byte following the last char of the string - * flags currently unused; set to 0 - * data: currently unused; set to NULL +/* re_intuit_start(): + * + * Based on some optimiser hints, try to find the earliest position in the + * string where the regex could match. + * + * rx: the regex to match against + * sv: the SV being matched: only used for utf8 flag; the string + * itself is accessed via the pointers below. Note that on + * something like an overloaded SV, SvPOK(sv) may be false + * and the string pointers may point to something unrelated to + * the SV itself. + * strbeg: real beginning of string + * strpos: the point in the string at which to begin matching + * strend: pointer to the byte following the last char of the string + * flags currently unused; set to 0 + * data: currently unused; set to NULL + * + * The basic idea of re_intuit_start() is to use some known information + * about the pattern, namely: + * + * a) the longest known anchored substring (i.e. one that's at a + * constant offset from the beginning of the pattern; but not + * necessarily at a fixed offset from the beginning of the + * string); + * b) the longest floating substring (i.e. one that's not at a constant + * offset from the beginning of the pattern); + * c) Whether the pattern is anchored to the string; either + * an absolute anchor: /^../, or anchored to \n: /^.../m, + * or anchored to pos(): /\G/; + * d) A start class: a real or synthetic character class which + * represents which characters are legal at the start of the pattern; + * + * to either quickly reject the match, or to find the earliest position + * within the string at which the pattern might match, thus avoiding + * running the full NFA engine at those earlier locations, only to + * eventually fail and retry further along. + * + * Returns NULL if the pattern can't match, or returns the address within + * the string which is the earliest place the match could occur. + * + * The longest of the anchored and floating substrings is called 'check' + * and is checked first. The other is called 'other' and is checked + * second. The 'other' substring may not be present. For example, + * + * /(abc|xyz)ABC\d{0,3}DEFG/ + * + * will have + * + * check substr (float) = "DEFG", offset 6..9 chars + * other substr (anchored) = "ABC", offset 3..3 chars + * stclass = [ax] + * + * Be aware that during the course of this function, sometimes 'anchored' + * refers to a substring being anchored relative to the start of the + * pattern, and sometimes to the pattern itself being anchored relative to + * the string. For example: + * + * /\dabc/: "abc" is anchored to the pattern; + * /^\dabc/: "abc" is anchored to the pattern and the string; + * /\d+abc/: "abc" is anchored to neither the pattern nor the string; + * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, + * but the pattern is anchored to the string. */ char * @@ -619,34 +651,60 @@ Perl_re_intuit_start(pTHX_ { dVAR; struct regexp *const prog = ReANY(rx); - I32 start_shift = 0; + SSize_t start_shift = prog->check_offset_min; /* Should be nonnegative! */ - I32 end_shift = 0; - char *s; + SSize_t end_shift = 0; + /* current lowest pos in string where the regex can start matching */ + char *rx_origin = strpos; SV *check; - char *t; const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ - I32 ml_anch; - char *other_last = NULL; /* other substr checked before this */ + U8 other_ix = 1 - prog->substrs->check_ix; + bool ml_anch = 0; + char *other_last = strpos;/* latest pos 'other' substr already checked to */ char *check_at = NULL; /* check substr found at this pos */ - char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/ const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; RXi_GET_DECL(prog,progi); regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ regmatch_info *const reginfo = ®info_buf; -#ifdef DEBUGGING - const char * const i_strpos = strpos; -#endif GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_INTUIT_START; PERL_UNUSED_ARG(flags); PERL_UNUSED_ARG(data); - /* CHR_DIST() would be more correct here but it makes things slow. */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: trying to determine minimum start position...\n")); + + /* for now, assume that all substr offsets are positive. If at some point + * in the future someone wants to do clever things with look-behind and + * -ve offsets, they'll need to fix up any code in this function + * which uses these offsets. See the thread beginning + * <20140113145929.GF27210@iabyn.com> + */ + assert(prog->substrs->data[0].min_offset >= 0); + assert(prog->substrs->data[0].max_offset >= 0); + assert(prog->substrs->data[1].min_offset >= 0); + assert(prog->substrs->data[1].max_offset >= 0); + assert(prog->substrs->data[2].min_offset >= 0); + assert(prog->substrs->data[2].max_offset >= 0); + + /* for now, assume that if both present, that the floating substring + * doesn't start before the anchored substring. + * If you break this assumption (e.g. doing better optimisations + * with lookahead/behind), then you'll need to audit the code in this + * function carefully first + */ + assert( + ! ( (prog->anchored_utf8 || prog->anchored_substr) + && (prog->float_utf8 || prog->float_substr)) + || (prog->float_min_offset >= prog->anchored_offset)); + + /* byte rather than char calculation for efficiency. It fails + * to quickly reject some cases that can't match, but will reject + * them later after doing full char arithmetic */ if (prog->minlen > strend - strpos) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "String too short... [re_intuit_start]\n")); + " String too short...\n")); goto fail; } @@ -671,551 +729,702 @@ Perl_re_intuit_start(pTHX_ } check = prog->check_substr; } - if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */ - ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE) - || ( (prog->extflags & RXf_ANCH_BOL) - && !multiline ) ); /* Check after \n? */ - - if (!ml_anch) { - if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */ - && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ - && (strpos != strbeg)) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); - goto fail; - } - if (prog->check_offset_min == prog->check_offset_max - && !(prog->extflags & RXf_CANY_SEEN) - && ! multiline) /* /m can cause \n's to match that aren't - accounted for in the string max length. - See [perl #115242] */ - { - /* Substring at constant offset from beg-of-str... */ - I32 slen; - - s = HOP3c(strpos, prog->check_offset_min, strend); - - if (SvTAIL(check)) { - slen = SvCUR(check); /* >= 1 */ - if ( strend - s > slen || strend - s < slen - 1 - || (strend - s == slen && strend[-1] != '\n')) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); - goto fail_finish; - } - /* Now should match s[0..slen-2] */ - slen--; - if (slen && (*SvPVX_const(check) != *s - || (slen > 1 - && memNE(SvPVX_const(check), s, slen)))) { - report_neq: - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); - goto fail_finish; - } + /* dump the various substring data */ + DEBUG_OPTIMISE_MORE_r({ + int i; + for (i=0; i<=2; i++) { + SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr + : prog->substrs->data[i].substr); + if (!sv) + continue; + + PerlIO_printf(Perl_debug_log, + " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf + " useful=%"IVdf" utf8=%d [%s]\n", + i, + (IV)prog->substrs->data[i].min_offset, + (IV)prog->substrs->data[i].max_offset, + (IV)prog->substrs->data[i].end_shift, + BmUSEFUL(sv), + utf8_target ? 1 : 0, + SvPEEK(sv)); + } + }); + + if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ + + /* ml_anch: check after \n? + * + * A note about IMPLICIT: on an un-anchored pattern beginning + * with /.*.../, these flags will have been added by the + * compiler: + * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL + * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL + */ + ml_anch = (prog->intflags & PREGf_ANCH_MBOL) + && !(prog->intflags & PREGf_IMPLICIT); + + if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { + /* we are only allowed to match at BOS or \G */ + + /* trivially reject if there's a BOS anchor and we're not at BOS. + * + * Note that we don't try to do a similar quick reject for + * \G, since generally the caller will have calculated strpos + * based on pos() and gofs, so the string is already correctly + * anchored by definition; and handling the exceptions would + * be too fiddly (e.g. REXEC_IGNOREPOS). + */ + if ( strpos != strbeg + && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Not at start...\n")); + goto fail; } - else if (*SvPVX_const(check) != *s - || ((slen = SvCUR(check)) > 1 - && memNE(SvPVX_const(check), s, slen))) - goto report_neq; - check_at = s; - goto success_at_start; - } - } - /* Match is anchored, but substr is not anchored wrt beg-of-str. */ - s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->check_end_shift; - - if (!ml_anch) { - const I32 end = prog->check_offset_max + CHR_SVLEN(check) - - (SvTAIL(check) != 0); - const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; - if (end_shift < eshift) - end_shift = eshift; + /* in the presence of an anchor, the anchored (relative to the + * start of the regex) substr must also be anchored relative + * to strpos. So quickly reject if substr isn't found there. + * This works for \G too, because the caller will already have + * subtracted gofs from pos, and gofs is the offset from the + * \G to the start of the regex. For example, in /.abc\Gdef/, + * where substr="abcdef", pos()=3, gofs=4, offset_min=1: + * caller will have set strpos=pos()-4; we look for the substr + * at position pos()-4+1, which lines up with the "a" */ + + if (prog->check_offset_min == prog->check_offset_max + && !(prog->intflags & PREGf_CANY_SEEN)) + { + /* Substring at constant offset from beg-of-str... */ + SSize_t slen = SvCUR(check); + char *s = HOP3c(strpos, prog->check_offset_min, strend); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Looking for check substr at fixed offset %"IVdf"...\n", + (IV)prog->check_offset_min)); + + if (SvTAIL(check)) { + /* In this case, the regex is anchored at the end too. + * Unless it's a multiline match, the lengths must match + * exactly, give or take a \n. NB: slen >= 1 since + * the last char of check is \n */ + if (!multiline + && ( strend - s > slen + || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n'))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too long...\n")); + goto fail_finish; + } + /* Now should match s[0..slen-2] */ + slen--; + } + if (slen && (*SvPVX_const(check) != *s + || (slen > 1 && memNE(SvPVX_const(check), s, slen)))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String not equal...\n")); + goto fail_finish; + } + + check_at = s; + goto success_at_start; + } } } - else { /* Can match at random position */ - ml_anch = 0; - s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->check_end_shift; - - /* end shift should be non negative here */ - } -#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */ + end_shift = prog->check_end_shift; + +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", (IV)end_shift, RX_PRECOMP(prog)); #endif restart: - /* Find a possible match in the region s..strend by looking for - the "check" substring in the region corrected by start/end_shift. */ + /* This is the (re)entry point of the main loop in this function. + * The goal of this loop is to: + * 1) find the "check" substring in the region rx_origin..strend + * (adjusted by start_shift / end_shift). If not found, reject + * immediately. + * 2) If it exists, look for the "other" substr too if defined; for + * example, if the check substr maps to the anchored substr, then + * check the floating substr, and vice-versa. If not found, go + * back to (1) with rx_origin suitably incremented. + * 3) If we find an rx_origin position that doesn't contradict + * either of the substrings, then check the possible additional + * constraints on rx_origin of /^.../m or a known start class. + * If these fail, then depending on which constraints fail, jump + * back to here, or to various other re-entry points further along + * that skip some of the first steps. + * 4) If we pass all those tests, update the BmUSEFUL() count on the + * substring. If the start position was determined to be at the + * beginning of the string - so, not rejected, but not optimised, + * since we have to run regmatch from position 0 - decrement the + * BmUSEFUL() count. Otherwise increment it. + */ + + + /* first, look for the 'check' substring */ + { - I32 srch_start_shift = start_shift; - I32 srch_end_shift = end_shift; U8* start_point; U8* end_point; - if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { - srch_end_shift -= ((strbeg - s) - srch_start_shift); - srch_start_shift = strbeg - s; - } - DEBUG_OPTIMISE_MORE_r({ - PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n", - (IV)prog->check_offset_min, - (IV)srch_start_shift, - (IV)srch_end_shift, - (IV)prog->check_end_shift); - }); + + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, + " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf + " Start shift: %"IVdf" End shift %"IVdf + " Real end Shift: %"IVdf"\n", + (IV)(rx_origin - strpos), + (IV)prog->check_offset_min, + (IV)start_shift, + (IV)end_shift, + (IV)prog->check_end_shift); + }); - if (prog->extflags & RXf_CANY_SEEN) { - start_point= (U8*)(s + srch_start_shift); - end_point= (U8*)(strend - srch_end_shift); + if (prog->intflags & PREGf_CANY_SEEN) { + start_point= (U8*)(rx_origin + start_shift); + end_point= (U8*)(strend - end_shift); + if (start_point > end_point) + goto fail_finish; } else { - start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend); - end_point= HOP3(strend, -srch_end_shift, strbeg); + end_point = HOP3(strend, -end_shift, strbeg); + start_point = HOPMAYBE3(rx_origin, start_shift, end_point); + if (!start_point) + goto fail_finish; } + + + /* If the regex is absolutely anchored to either the start of the + * string (BOL,SBOL) or to pos() (ANCH_GPOS), then + * check_offset_max represents an upper bound on the string where + * the substr could start. For the ANCH_GPOS case, we assume that + * the caller of intuit will have already set strpos to + * pos()-gofs, so in this case strpos + offset_max will still be + * an upper bound on the substr. + */ + if (!ml_anch + && prog->intflags & PREGf_ANCH + && prog->check_offset_max != SSize_t_MAX) + { + SSize_t len = SvCUR(check) - !!SvTAIL(check); + const char * const anchor = + (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); + + /* do a bytes rather than chars comparison. It's conservative; + * so it skips doing the HOP if the result can't possibly end + * up earlier than the old value of end_point. + */ + if ((char*)end_point - anchor > prog->check_offset_max) { + end_point = HOP3lim((U8*)anchor, + prog->check_offset_max, + end_point -len) + + len; + } + } + DEBUG_OPTIMISE_MORE_r({ - PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n", + PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n", (int)(end_point - start_point), (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), start_point); }); - s = fbm_instr( start_point, end_point, + check_at = fbm_instr( start_point, end_point, check, multiline ? FBMrf_MULTILINE : 0); + + /* Update the count-of-usability, remove useless subpatterns, + unshift s. */ + + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(check), RE_SV_DUMPLEN(check), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s", + (check_at ? "Found" : "Did not find"), + (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) + ? "anchored" : "floating"), + quoted, + RE_SV_TAIL(check), + (check_at ? " at offset " : "...\n") ); + }); + + if (!check_at) + goto fail_finish; + /* Finish the diagnostic message */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) ); + + /* set rx_origin to the minimum position where the regex could start + * matching, given the constraint of the just-matched check substring. + * But don't set it lower than previously. + */ + + if (check_at - rx_origin > prog->check_offset_max) + rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); } - /* Update the count-of-usability, remove useless subpatterns, - unshift s. */ - - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(check), RE_SV_DUMPLEN(check), 30); - PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s", - (s ? "Found" : "Did not find"), - (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) - ? "anchored" : "floating"), - quoted, - RE_SV_TAIL(check), - (s ? " at offset " : "...\n") ); - }); - if (!s) - goto fail_finish; - /* Finish the diagnostic message */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); - /* XXX dmq: first branch is for positive lookbehind... - Our check string is offset from the beginning of the pattern. - So we need to do any stclass tests offset forward from that - point. I think. :-( - */ - - - - check_at=s; - - - /* Got a candidate. Check MBOL anchoring, and the *other* substr. - Start with the other substr. - XXXX no SCREAM optimization yet - and a very coarse implementation - XXXX /ttx+/ results in anchored="ttx", floating="x". floating will - *always* match. Probably should be marked during compile... - Probably it is right to do no SCREAM here... - */ + /* now look for the 'other' substring if defined */ - if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8) - : (prog->float_substr && prog->anchored_substr)) + if (utf8_target ? prog->substrs->data[other_ix].utf8_substr + : prog->substrs->data[other_ix].substr) { /* Take into account the "other" substring. */ - /* XXXX May be hopelessly wrong for UTF... */ - if (!other_last) - other_last = strpos; - if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) { - do_other_anchored: - { - char * const last = HOP3c(s, -start_shift, strbeg); - char *last1, *last2; - char * const saved_s = s; - SV* must; - - t = s - prog->check_offset_max; - if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!utf8_target - || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos)) - && t > strpos))) - NOOP; - else - t = strpos; - t = HOP3c(t, prog->anchored_offset, strend); - if (t < other_last) /* These positions already checked */ - t = other_last; - last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); - if (last < last1) - last1 = last; - /* XXXX It is not documented what units *_offsets are in. - We assume bytes, but this is clearly wrong. - Meaning this code needs to be carefully reviewed for errors. - dmq. - */ - - /* On end-of-str: see comment below. */ - must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr; - if (must == &PL_sv_undef) { - s = (char*)NULL; - DEBUG_r(must = prog->anchored_utf8); /* for debug */ - } - else - s = fbm_instr( - (unsigned char*)t, - HOP3(HOP3(last1, prog->anchored_offset, strend) - + SvCUR(must), -(SvTAIL(must)!=0), strbeg), - must, - multiline ? FBMrf_MULTILINE : 0 - ); - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(must), RE_SV_DUMPLEN(must), 30); - PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s", - (s ? "Found" : "Contradicts"), - quoted, RE_SV_TAIL(must)); - }); - - - if (!s) { - if (last1 >= last2) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", giving up...\n")); - goto fail_finish; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", trying floating at offset %ld...\n", - (long)(HOP3c(saved_s, 1, strend) - i_strpos))); - other_last = HOP3c(last1, prog->anchored_offset+1, strend); - s = HOP3c(last, 1, strend); - goto restart; - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", - (long)(s - i_strpos))); - t = HOP3c(s, -prog->anchored_offset, strbeg); - other_last = HOP3c(s, 1, strend); - s = saved_s; - if (t == strpos) - goto try_at_start; - goto try_at_offset; - } - } - } - else { /* Take into account the floating substring. */ - char *last, *last1; - char * const saved_s = s; - SV* must; - - t = HOP3c(s, -start_shift, strbeg); - last1 = last = - HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg); - if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset) - last = HOP3c(t, prog->float_max_offset, strend); - s = HOP3c(t, prog->float_min_offset, strend); - if (s < other_last) - s = other_last; - /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ - must = utf8_target ? prog->float_utf8 : prog->float_substr; - /* fbm_instr() takes into account exact value of end-of-str - if the check is SvTAIL(ed). Since false positives are OK, - and end-of-str is not later than strend we are OK. */ - if (must == &PL_sv_undef) { - s = (char*)NULL; - DEBUG_r(must = prog->float_utf8); /* for debug message */ - } - else - s = fbm_instr((unsigned char*)s, - (unsigned char*)last + SvCUR(must) - - (SvTAIL(must)!=0), - must, multiline ? FBMrf_MULTILINE : 0); - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(must), RE_SV_DUMPLEN(must), 30); - PerlIO_printf(Perl_debug_log, "%s floating substr %s%s", - (s ? "Found" : "Contradicts"), - quoted, RE_SV_TAIL(must)); - }); - if (!s) { - if (last1 == last) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", giving up...\n")); - goto fail_finish; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", trying anchored starting at offset %ld...\n", - (long)(saved_s + 1 - i_strpos))); - other_last = last; - s = HOP3c(t, 1, strend); - goto restart; - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", - (long)(s - i_strpos))); - other_last = s; /* Fix this later. --Hugo */ - s = saved_s; - if (t == strpos) - goto try_at_start; - goto try_at_offset; - } - } + char *last, *last1; + char *s; + SV* must; + struct reg_substr_datum *other; + + do_other_substr: + other = &prog->substrs->data[other_ix]; + + /* if "other" is anchored: + * we've previously found a floating substr starting at check_at. + * This means that the regex origin must lie somewhere + * between min (rx_origin): HOP3(check_at, -check_offset_max) + * and max: HOP3(check_at, -check_offset_min) + * (except that min will be >= strpos) + * So the fixed substr must lie somewhere between + * HOP3(min, anchored_offset) + * HOP3(max, anchored_offset) + SvCUR(substr) + */ + + /* if "other" is floating + * Calculate last1, the absolute latest point where the + * floating substr could start in the string, ignoring any + * constraints from the earlier fixed match. It is calculated + * as follows: + * + * strend - prog->minlen (in chars) is the absolute latest + * position within the string where the origin of the regex + * could appear. The latest start point for the floating + * substr is float_min_offset(*) on from the start of the + * regex. last1 simply combines thee two offsets. + * + * (*) You might think the latest start point should be + * float_max_offset from the regex origin, and technically + * you'd be correct. However, consider + * /a\d{2,4}bcd\w/ + * Here, float min, max are 3,5 and minlen is 7. + * This can match either + * /a\d\dbcd\w/ + * /a\d\d\dbcd\w/ + * /a\d\d\d\dbcd\w/ + * In the first case, the regex matches minlen chars; in the + * second, minlen+1, in the third, minlen+2. + * In the first case, the floating offset is 3 (which equals + * float_min), in the second, 4, and in the third, 5 (which + * equals float_max). In all cases, the floating string bcd + * can never start more than 4 chars from the end of the + * string, which equals minlen - float_min. As the substring + * starts to match more than float_min from the start of the + * regex, it makes the regex match more than minlen chars, + * and the two cancel each other out. So we can always use + * float_min - minlen, rather than float_max - minlen for the + * latest position in the string. + * + * Note that -minlen + float_min_offset is equivalent (AFAIKT) + * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift + */ + + assert(prog->minlen >= other->min_offset); + last1 = HOP3c(strend, + other->min_offset - prog->minlen, strbeg); + + if (other_ix) {/* i.e. if (other-is-float) */ + /* last is the latest point where the floating substr could + * start, *given* any constraints from the earlier fixed + * match. This constraint is that the floating string starts + * <= float_max_offset chars from the regex origin (rx_origin). + * If this value is less than last1, use it instead. + */ + assert(rx_origin <= last1); + last = + /* this condition handles the offset==infinity case, and + * is a short-cut otherwise. Although it's comparing a + * byte offset to a char length, it does so in a safe way, + * since 1 char always occupies 1 or more bytes, + * so if a string range is (last1 - rx_origin) bytes, + * it will be less than or equal to (last1 - rx_origin) + * chars; meaning it errs towards doing the accurate HOP3 + * rather than just using last1 as a short-cut */ + (last1 - rx_origin) < other->max_offset + ? last1 + : (char*)HOP3lim(rx_origin, other->max_offset, last1); + } + else { + assert(strpos + start_shift <= check_at); + last = HOP4c(check_at, other->min_offset - start_shift, + strbeg, strend); + } + + s = HOP3c(rx_origin, other->min_offset, strend); + if (s < other_last) /* These positions already checked */ + s = other_last; + + must = utf8_target ? other->utf8_substr : other->substr; + assert(SvPOK(must)); + s = fbm_instr( + (unsigned char*)s, + (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0), + must, + multiline ? FBMrf_MULTILINE : 0 + ); + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s", + s ? "Found" : "Contradicts", + other_ix ? "floating" : "anchored", + quoted, RE_SV_TAIL(must)); + }); + + + if (!s) { + /* last1 is latest possible substr location. If we didn't + * find it before there, we never will */ + if (last >= last1) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + + /* try to find the check substr again at a later + * position. Maybe next time we'll find the "other" substr + * in range too */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", trying %s at offset %ld...\n", + (other_ix ? "floating" : "anchored"), + (long)(HOP3c(check_at, 1, strend) - strpos))); + + other_last = HOP3c(last, 1, strend) /* highest failure */; + rx_origin = + other_ix /* i.e. if other-is-float */ + ? HOP3c(rx_origin, 1, strend) + : HOP4c(last, 1 - other->min_offset, strbeg, strend); + goto restart; + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + + if (other_ix) { /* if (other-is-float) */ + /* other_last is set to s, not s+1, since its possible for + * a floating substr to fail first time, then succeed + * second time at the same floating position; e.g.: + * "-AB--AABZ" =~ /\wAB\d*Z/ + * The first time round, anchored and float match at + * "-(AB)--AAB(Z)" then fail on the initial \w character + * class. Second time round, they match at "-AB--A(AB)(Z)". + */ + other_last = s; + } + else { + rx_origin = HOP3c(s, -other->min_offset, strbeg); + other_last = HOP3c(s, 1, strend); + } + } + } + else { + DEBUG_OPTIMISE_MORE_r( + PerlIO_printf(Perl_debug_log, + " Check-only match: offset min:%"IVdf" max:%"IVdf + " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf + " strend-strpos:%"IVdf"\n", + (IV)prog->check_offset_min, + (IV)prog->check_offset_max, + (IV)(check_at-strpos), + (IV)(rx_origin-strpos), + (IV)(rx_origin-check_at), + (IV)(strend-strpos) + ) + ); } - - t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos); - - DEBUG_OPTIMISE_MORE_r( - PerlIO_printf(Perl_debug_log, - "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n", - (IV)prog->check_offset_min, - (IV)prog->check_offset_max, - (IV)(s-strpos), - (IV)(t-strpos), - (IV)(t-s), - (IV)(strend-strpos) - ) - ); + postprocess_substr_matches: - if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!utf8_target - || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos))) - && t > strpos))) - { - /* Fixed substring is found far enough so that the match - cannot start at strpos. */ - try_at_offset: - if (ml_anch && t[-1] != '\n') { - /* Eventually fbm_*() should handle this, but often - anchored_offset is not 0, so this check will not be wasted. */ - /* XXXX In the code below we prefer to look for "^" even in - presence of anchored substrings. And we search even - beyond the found float position. These pessimizations - are historical artefacts only. */ - find_anchor: - while (t < strend - prog->minlen) { - if (*t == '\n') { - if (t < check_at - prog->check_offset_min) { - if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { - /* Since we moved from the found position, - we definitely contradict the found anchored - substr. Due to the above check we do not - contradict "check" substr. - Thus we can arrive here only if check substr - is float. Redo checking for "other"=="fixed". - */ - strpos = t + 1; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); - goto do_other_anchored; - } - /* We don't contradict the found floating substring. */ - /* XXXX Why not check for STCLASS? */ - s = t + 1; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(s - i_strpos))); - goto set_useful; - } - /* Position contradicts check-string */ - /* XXXX probably better to look for check-string - than for "\n", so one should lower the limit for t? */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos))); - other_last = strpos = s = t + 1; - goto restart; - } - t++; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", - PL_colors[0], PL_colors[1])); - goto fail_finish; - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n", - PL_colors[0], PL_colors[1])); - } - s = t; - set_useful: - ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + /* handle the extra constraint of /^.../m if present */ + + if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { + char *s; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for /^/m anchor")); + + /* we have failed the constraint of a \n before rx_origin. + * Find the next \n, if any, even if it's beyond the current + * anchored and/or floating substrings. Whether we should be + * scanning ahead for the next \n or the next substr is debatable. + * On the one hand you'd expect rare substrings to appear less + * often than \n's. On the other hand, searching for \n means + * we're effectively flipping been check_substr and "\n" on each + * iteration as the current "rarest" string candidate, which + * means for example that we'll quickly reject the whole string if + * hasn't got a \n, rather than trying every substr position + * first + */ + + s = HOP3c(strend, - prog->minlen, strpos); + if (s <= rx_origin || + ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Did not find /%s^%s/m...\n", + PL_colors[0], PL_colors[1])); + goto fail_finish; + } + + /* earliest possible origin is 1 char after the \n. + * (since *rx_origin == '\n', it's safe to ++ here rather than + * HOP(rx_origin, 1)) */ + rx_origin++; + + if (prog->substrs->check_ix == 0 /* check is anchored */ + || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) + { + /* Position contradicts check-string; either because + * check was anchored (and thus has no wiggle room), + * or check was float and rx_origin is above the float range */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + goto restart; + } + + /* if we get here, the check substr must have been float, + * is in range, and we may or may not have had an anchored + * "other" substr which still contradicts */ + assert(prog->substrs->check_ix); /* check is float */ + + if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { + /* whoops, the anchored "other" substr exists, so we still + * contradict. On the other hand, the float "check" substr + * didn't contradict, so just retry the anchored "other" + * substr */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos), + (long)(rx_origin - strpos + prog->anchored_offset))); + goto do_other_substr; + } + + /* success: we don't contradict the found floating substring + * (and there's no anchored substr). */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); } else { - /* The found string does not prohibit matching at strpos, - - no optimization of calling REx engine can be performed, - unless it was an MBOL and we are not after MBOL, - or a future STCLASS check will fail this. */ - try_at_start: - /* Even in this situation we may use MBOL flag if strpos is offset - wrt the start of the string. */ - if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n' - /* May be due to an implicit anchor of m{.*foo} */ - && !(prog->intflags & PREGf_IMPLICIT)) - { - t = strpos; - goto find_anchor; - } - DEBUG_EXECUTE_r( if (ml_anch) - PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n", - (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]); - ); - success_at_start: - if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */ - && (utf8_target ? ( - prog->check_utf8 /* Could be deleted already */ - && --BmUSEFUL(prog->check_utf8) < 0 - && (prog->check_utf8 == prog->float_utf8) - ) : ( - prog->check_substr /* Could be deleted already */ - && --BmUSEFUL(prog->check_substr) < 0 - && (prog->check_substr == prog->float_substr) - ))) - { - /* If flags & SOMETHING - do not do it many times on the same match */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); - /* XXX Does the destruction order has to change with utf8_target? */ - SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); - SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); - prog->check_substr = prog->check_utf8 = NULL; /* disable */ - prog->float_substr = prog->float_utf8 = NULL; /* clear */ - check = NULL; /* abort */ - s = strpos; - /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag - see http://bugs.activestate.com/show_bug.cgi?id=87173 */ - if (prog->intflags & PREGf_IMPLICIT) - prog->extflags &= ~RXf_ANCH_MBOL; - /* XXXX This is a remnant of the old implementation. It - looks wasteful, since now INTUIT can use many - other heuristics. */ - prog->extflags &= ~RXf_USE_INTUIT; - /* XXXX What other flags might need to be cleared in this branch? */ - } - else - s = strpos; + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " (multiline anchor test skipped)\n")); } - /* Last resort... */ - /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ - /* trie stclasses are too expensive to use here, we are better off to - leave it to regmatch itself */ + success_at_start: + + + /* if we have a starting character class, then test that extra constraint. + * (trie stclasses are too expensive to use here, we are better off to + * leave it to regmatch itself) */ + if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { - /* minlen == 0 is possible if regstclass is \b or \B, - and the fixed substr is ''$. - Since minlen is already taken into account, s+1 is before strend; - accidentally, minlen >= 1 guaranties no false positives at s + 1 - even for \b or \B. But (minlen? 1 : 0) below assumes that - regstclass does not come from lookahead... */ - /* If regstclass takes bytelength more than 1: If charlength==1, OK. - This leaves EXACTF-ish only, which are dealt with in find_byclass(). */ const U8* const str = (U8*)STRING(progi->regstclass); + + /* XXX this value could be pre-computed */ const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT - ? CHR_DIST(str+STR_LEN(progi->regstclass), str) + ? (reginfo->is_utf8_pat + ? utf8_distance(str + STR_LEN(progi->regstclass), str) + : STR_LEN(progi->regstclass)) : 1); char * endpos; + char *s; + /* latest pos that a matching float substr constrains rx start to */ + char *rx_max_float = NULL; + + /* if the current rx_origin is anchored, either by satisfying an + * anchored substring constraint, or a /^.../m constraint, then we + * can reject the current origin if the start class isn't found + * at the current position. If we have a float-only match, then + * rx_origin is constrained to a range; so look for the start class + * in that range. if neither, then look for the start class in the + * whole rest of the string */ + + /* XXX DAPM it's not clear what the minlen test is for, and why + * it's not used in the floating case. Nothing in the test suite + * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. + * Here are some old comments, which may or may not be correct: + * + * minlen == 0 is possible if regstclass is \b or \B, + * and the fixed substr is ''$. + * Since minlen is already taken into account, rx_origin+1 is + * before strend; accidentally, minlen >= 1 guaranties no false + * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : + * 0) below assumes that regstclass does not come from lookahead... + * If regstclass takes bytelength more than 1: If charlength==1, OK. + * This leaves EXACTF-ish only, which are dealt with in + * find_byclass(). + */ + if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) - endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend); - else if (prog->float_substr || prog->float_utf8) - endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend); + endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend); + else if (prog->float_substr || prog->float_utf8) { + rx_max_float = HOP3c(check_at, -start_shift, strbeg); + endpos= HOP3c(rx_max_float, cl_l, strend); + } else endpos= strend; - if (checked_upto < s) - checked_upto = s; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for class: start_shift: %"IVdf" check_at: %"IVdf + " rx_origin: %"IVdf" endpos: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), + (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); - t = s; - s = find_byclass(prog, progi->regstclass, checked_upto, endpos, + s = find_byclass(prog, progi->regstclass, rx_origin, endpos, reginfo); - if (s) { - checked_upto = s; - } else { -#ifdef DEBUGGING - const char *what = NULL; -#endif + if (!s) { if (endpos == strend) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Could not match STCLASS...\n") ); + " Could not match STCLASS...\n") ); goto fail; } DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "This position contradicts STCLASS...\n") ); - if ((prog->extflags & RXf_ANCH) && !ml_anch) + " This position contradicts STCLASS...\n") ); + if ((prog->intflags & PREGf_ANCH) && !ml_anch + && !(prog->intflags & PREGf_IMPLICIT)) goto fail; - checked_upto = HOPBACKc(endpos, start_shift); - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + /* Contradict one of substrings */ if (prog->anchored_substr || prog->anchored_utf8) { - if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) { - DEBUG_EXECUTE_r( what = "anchored" ); - hop_and_restart: - s = HOP3c(t, 1, strend); - if (s + start_shift + end_shift > strend) { - /* XXXX Should be taken into account earlier? */ - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Could not match STCLASS...\n") ); - goto fail; - } - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for %s substr starting at offset %ld...\n", - what, (long)(s + start_shift - i_strpos)) ); - goto restart; - } - /* Have both, check_string is floating */ - if (t + start_shift >= check_at) /* Contradicts floating=check */ - goto retry_floating_check; - /* Recheck anchored substring, but not floating... */ - s = check_at; - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for anchored substr starting at offset %ld...\n", - (long)(other_last - i_strpos)) ); - goto do_other_anchored; - } - /* Another way we could have checked stclass at the - current position only: */ - if (ml_anch) { - s = t = t + 1; - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for /%s^%s/m starting at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(t - i_strpos)) ); - goto try_at_offset; - } - if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ - goto fail; - /* Check is floating substring. */ - retry_floating_check: - t = check_at - start_shift; - DEBUG_EXECUTE_r( what = "floating" ); - goto hop_and_restart; + if (prog->substrs->check_ix == 1) { /* check is float */ + /* Have both, check_string is floating */ + assert(rx_origin + start_shift <= check_at); + if (rx_origin + start_shift != check_at) { + /* not at latest position float substr could match: + * Recheck anchored substring, but not floating. + * The condition above is in bytes rather than + * chars for efficiency. It's conservative, in + * that it errs on the side of doing 'goto + * do_other_substr', where a more accurate + * char-based calculation will be done */ + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for anchored substr starting at offset %ld...\n", + (long)(other_last - strpos)) ); + goto do_other_substr; + } + } + } + else { + /* float-only */ + + if (ml_anch) { + /* In the presence of ml_anch, we might be able to + * find another \n without breaking the current float + * constraint. */ + + /* strictly speaking this should be HOP3c(..., 1, ...), + * but since we goto a block of code that's going to + * search for the next \n if any, its safe here */ + rx_origin++; + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for /%s^%s/m starting at offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos)) ); + goto postprocess_substr_matches; + } + + /* strictly speaking this can never be true; but might + * be if we ever allow intuit without substrings */ + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + goto fail; + + rx_origin = rx_max_float; + } + + /* at this point, any matching substrings have been + * contradicted. Start again... */ + + rx_origin = HOP3c(rx_origin, 1, strend); + + /* uses bytes rather than char calculations for efficiency. + * It's conservative: it errs on the side of doing 'goto restart', + * where there is code that does a proper char-based test */ + if (rx_origin + start_shift + end_shift > strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); + goto fail; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for %s substr starting at offset %ld...\n", + (prog->substrs->check_ix ? "floating" : "anchored"), + (long)(rx_origin + start_shift - strpos)) ); + goto restart; + } + + /* Success !!! */ + + if (rx_origin != s) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " By STCLASS: moving %ld --> %ld\n", + (long)(rx_origin - strpos), (long)(s - strpos)) + ); + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Does not contradict STCLASS...\n"); + ); + } + } + + /* Decide whether using the substrings helped */ + + if (rx_origin != strpos) { + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n")); + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + } + else { + /* The found rx_origin position does not prohibit matching at + * strpos, so calling intuit didn't gain us anything. Decrement + * the BmUSEFUL() count on the check substring, and if we reach + * zero, free it. */ + if (!(prog->intflags & PREGf_NAUGHTY) + && (utf8_target ? ( + prog->check_utf8 /* Could be deleted already */ + && --BmUSEFUL(prog->check_utf8) < 0 + && (prog->check_utf8 == prog->float_utf8) + ) : ( + prog->check_substr /* Could be deleted already */ + && --BmUSEFUL(prog->check_substr) < 0 + && (prog->check_substr == prog->float_substr) + ))) + { + /* If flags & SOMETHING - do not do it many times on the same match */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n")); + /* XXX Does the destruction order has to change with utf8_target? */ + SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); + SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); + prog->check_substr = prog->check_utf8 = NULL; /* disable */ + prog->float_substr = prog->float_utf8 = NULL; /* clear */ + check = NULL; /* abort */ + /* XXXX This is a remnant of the old implementation. It + looks wasteful, since now INTUIT can use many + other heuristics. */ + prog->extflags &= ~RXf_USE_INTUIT; } - if (t != s) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "By STCLASS: moving %ld --> %ld\n", - (long)(t - i_strpos), (long)(s - i_strpos)) - ); - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "Does not contradict STCLASS...\n"); - ); - } } - giveup: - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n", - PL_colors[4], (check ? "Guessed" : "Giving up"), - PL_colors[5], (long)(s - i_strpos)) ); - return s; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) ); + + return rx_origin; fail_finish: /* Substring not found */ if (prog->check_substr || prog->check_utf8) /* could be removed already */ @@ -1226,46 +1435,57 @@ Perl_re_intuit_start(pTHX_ return NULL; } + #define DECL_TRIE_TYPE(scan) \ - const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ trie_type = ((scan->flags == EXACT) \ ? (utf8_target ? trie_utf8 : trie_plain) \ - : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + : (scan->flags == EXACTFA) \ + ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ -STMT_START { \ +STMT_START { \ STRLEN skiplen; \ + U8 flags = FOLD_FLAGS_FULL; \ switch (trie_type) { \ + case trie_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALLTHROUGH */ \ case trie_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ - uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \ + uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ len = UTF8SKIP(uc); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ + case trie_latin_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALLTHROUGH */ \ case trie_latin_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ case trie_utf8: \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ break; \ case trie_plain: \ uvc = (UV)*uc; \ @@ -1336,28 +1556,28 @@ REXEC_FBC_SCAN( \ tmp = 1; \ ) -#define REXEC_FBC_TRYIT \ +#define REXEC_FBC_TRYIT \ if ((reginfo->intuit || regtry(reginfo, &s))) \ goto got_it #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ - if (utf8_target) { \ + if (utf8_target) { \ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ } \ else { \ REXEC_FBC_CLASS_SCAN(CoNd); \ } -#define DUMP_EXEC_POS(li,s,doutf8) \ +#define DUMP_EXEC_POS(li,s,doutf8) \ dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ startpos, doutf8) -#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ +#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ tmp = TEST_NON_UTF8(tmp); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1366,18 +1586,19 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ ); \ -#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ +#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ if (s == reginfo->strbeg) { \ tmp = '\n'; \ } \ else { \ U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \ + tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ + 0, UTF8_ALLOW_DEFAULT); \ } \ tmp = TeSt1_UtF8; \ - LOAD_UTF8_CHARCLASS_ALNUM(); \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! (TeSt2_UtF8)) { \ + if (tmp == ! (TeSt2_UtF8)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1412,9 +1633,9 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ * one, and compare it with the wordness of this one. If they differ, we have * a boundary. At the beginning of the string, pretend that the previous * character was a new-line */ -#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ +#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ if (utf8_target) { \ - UTF8_CODE \ + UTF8_CODE \ } \ else { /* Not utf8 */ \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ @@ -1429,7 +1650,7 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ ); \ } \ - if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ + if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ goto got_it; /* We know what class REx starts with. Try to find this position... */ @@ -1468,11 +1689,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* We know what class it must start with. */ switch (OP(c)) { case ANYOF: - case ANYOF_SYNTHETIC: - case ANYOF_WARN_SUPER: if (utf8_target) { REXEC_FBC_UTF8_CLASS_SCAN( - reginclass(prog, c, (U8*)s, utf8_target)); + reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); } else { REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); @@ -1487,6 +1706,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ case EXACTFA: if (is_utf8_pat || utf8_target) { utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; @@ -1496,10 +1718,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, folder = foldEQ_latin1; /* /a, except the sharp s one which */ goto do_exactf_non_utf8; /* isn't dealt with by these */ - case EXACTF: + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); if (utf8_target) { - - /* regcomp.c already folded this if pattern is in UTF-8 */ utf8_fold_flags = 0; goto do_exactf_utf8; } @@ -1508,8 +1729,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto do_exactf_non_utf8; case EXACTFL: - if (is_utf8_pat || utf8_target) { - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { + utf8_fold_flags = FOLDEQ_LOCALE; goto do_exactf_utf8; } fold_array = PL_fold_locale; @@ -1522,7 +1743,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } goto do_exactf_utf8; - case EXACTFU_TRICKYFOLD: case EXACTFU: if (is_utf8_pat || utf8_target) { utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -1535,7 +1755,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, fold_array = PL_fold_latin1; folder = foldEQ_latin1; - /* FALL THROUGH */ + /* FALLTHROUGH */ do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there are no glitches with fold-length differences @@ -1555,7 +1775,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * characters, and there are only 2 availabe, we know without * trying that it will fail; so don't start a match past the * required minimum number from the far end */ - e = HOP3c(strend, -((I32)ln), s); + e = HOP3c(strend, -((SSize_t)ln), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1601,7 +1821,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * only 2 are left, it's guaranteed to fail, so don't start a * match that would require us to go beyond the end of the string */ - e = HOP3c(strend, -((I32)lnc), s); + e = HOP3c(strend, -((SSize_t)lnc), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1627,15 +1847,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; } case BOUNDL: - RXp_MATCH_TAINTED_on(prog); FBC_BOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case NBOUNDL: - RXp_MATCH_TAINTED_on(prog); FBC_NBOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case BOUND: @@ -1682,7 +1900,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* FALLTHROUGH */ case POSIXL: - RXp_MATCH_TAINTED_on(prog); REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); break; @@ -1700,9 +1917,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case NPOSIXA: if (utf8_target) { /* The complement of something that matches only ASCII matches all - * UTF-8 variant code points, plus everything in ASCII that isn't - * in the class */ - REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s) + * non-ASCII, plus everything in ASCII that isn't in the class. */ + REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s) || ! _generic_isCC_A(*s, FLAGS(c))); break; } @@ -1746,7 +1962,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, classnum))) || (UTF8_IS_DOWNGRADEABLE_START(*s) && to_complement ^ cBOOL( - _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)), + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), classnum)))) { if (tmp && (reginfo->intuit || regtry(reginfo, &s))) @@ -1765,7 +1982,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, macros */ case _CC_ENUM_SPACE: /* XXX would require separate code if we revert the change of \v matching this */ - /* FALL THROUGH */ + /* FALLTHROUGH */ case _CC_ENUM_PSXSPC: REXEC_FBC_UTF8_CLASS_SCAN( @@ -1804,8 +2021,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = - _core_swash_init("utf8", swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } /* This is a copy of the loop above for swash classes, though using the @@ -2040,20 +2259,169 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; default: Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); - break; } return 0; got_it: return s; } +/* set RX_SAVED_COPY, RX_SUBBEG etc. + * flags have same meanings as with regexec_flags() */ + +static void +S_reg_set_capture_string(pTHX_ REGEXP * const rx, + char *strbeg, + char *strend, + SV *sv, + U32 flags, + bool utf8_target) +{ + struct regexp *const prog = ReANY(rx); + + if (flags & REXEC_COPY_STR) { +#ifdef PERL_ANY_COW + if (SvCANCOW(sv)) { + if (DEBUG_C_TEST) { + PerlIO_printf(Perl_debug_log, + "Copy on write: regexp capture, type %d\n", + (int) SvTYPE(sv)); + } + /* Create a new COW SV to share the match string and store + * in saved_copy, unless the current COW SV in saved_copy + * is valid and suitable for our purpose */ + if (( prog->saved_copy + && SvIsCOW(prog->saved_copy) + && SvPOKp(prog->saved_copy) + && SvIsCOW(sv) + && SvPOKp(sv) + && SvPVX(sv) == SvPVX(prog->saved_copy))) + { + /* just reuse saved_copy SV */ + if (RXp_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RXp_MATCH_COPIED_off(prog); + } + } + else { + /* create new COW SV to share string */ + RX_MATCH_COPY_FREE(rx); + prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); + } + prog->subbeg = (char *)SvPVX_const(prog->saved_copy); + assert (SvPOKp(prog->saved_copy)); + prog->sublen = strend - strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + } else +#endif + { + SSize_t min = 0; + SSize_t max = strend - strbeg; + SSize_t sublen; + + if ( (flags & REXEC_COPY_SKIP_POST) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_RIGHT) + ) { /* don't copy $' part of string */ + U32 n = 0; + max = -1; + /* calculate the right-most part of the string covered + * by a capture. Due to look-ahead, this may be to + * the right of $&, so we have to scan all captures */ + while (n <= prog->lastparen) { + if (prog->offs[n].end > max) + max = prog->offs[n].end; + n++; + } + if (max == -1) + max = (PL_sawampersand & SAWAMPERSAND_LEFT) + ? prog->offs[0].start + : 0; + assert(max >= 0 && max <= strend - strbeg); + } + + if ( (flags & REXEC_COPY_SKIP_PRE) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_LEFT) + ) { /* don't copy $` part of string */ + U32 n = 0; + min = max; + /* calculate the left-most part of the string covered + * by a capture. Due to look-behind, this may be to + * the left of $&, so we have to scan all captures */ + while (min && n <= prog->lastparen) { + if ( prog->offs[n].start != -1 + && prog->offs[n].start < min) + { + min = prog->offs[n].start; + } + n++; + } + if ((PL_sawampersand & SAWAMPERSAND_RIGHT) + && min > prog->offs[0].end + ) + min = prog->offs[0].end; + + } + + assert(min >= 0 && min <= max && min <= strend - strbeg); + sublen = max - min; + + if (RX_MATCH_COPIED(rx)) { + if (sublen > prog->sublen) + prog->subbeg = + (char*)saferealloc(prog->subbeg, sublen+1); + } + else + prog->subbeg = (char*)safemalloc(sublen+1); + Copy(strbeg + min, prog->subbeg, sublen, char); + prog->subbeg[sublen] = '\0'; + prog->suboffset = min; + prog->sublen = sublen; + RX_MATCH_COPIED_on(rx); + } + prog->subcoffset = prog->suboffset; + if (prog->suboffset && utf8_target) { + /* Convert byte offset to chars. + * XXX ideally should only compute this if @-/@+ + * has been seen, a la PL_sawampersand ??? */ + + /* If there's a direct correspondence between the + * string which we're matching and the original SV, + * then we can use the utf8 len cache associated with + * the SV. In particular, it means that under //g, + * sv_pos_b2u() will use the previously cached + * position to speed up working out the new length of + * subcoffset, rather than counting from the start of + * the string each time. This stops + * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; + * from going quadratic */ + if (SvPOKp(sv) && SvPVX(sv) == strbeg) + prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, + SV_GMAGIC|SV_CONST_RETURN); + else + prog->subcoffset = utf8_length((U8*)strbeg, + (U8*)(strbeg+prog->suboffset)); + } + } + else { + RX_MATCH_COPY_FREE(rx); + prog->subbeg = strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + prog->sublen = strend - strbeg; + } +} + + + /* - regexec_flags - match a regexp against a string */ I32 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, - char *strbeg, I32 minend, SV *sv, void *data, U32 flags) + char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -2061,21 +2429,17 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* sv: SV being matched: only used for utf8 flag, pos() etc; string * itself is accessed via the pointers above */ /* data: May be used for some additional optimizations. - Currently its only used, with a U32 cast, for transmitting - the ganch offset when doing a /g match. This will change */ -/* nosave: For optimizations. */ + Currently unused. */ +/* flags: For optimizations. See REXEC_* in regexp.h */ { dVAR; struct regexp *const prog = ReANY(rx); char *s; regnode *c; - char *startpos = stringarg; - I32 minlen; /* must match at least this many chars */ - I32 dontbother = 0; /* how many characters not to try at end */ - I32 end_shift = 0; /* Same for the end. */ /* CC */ - I32 scream_pos = -1; /* Internal iterator of scream. */ - char *scream_olds = NULL; + char *startpos; + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ const bool utf8_target = cBOOL(DO_UTF8(sv)); I32 multiline; RXi_GET_DECL(prog,progi); @@ -2089,16 +2453,69 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, PERL_UNUSED_ARG(data); /* Be paranoid... */ - if (prog == NULL || startpos == NULL) { + if (prog == NULL || stringarg == NULL) { Perl_croak(aTHX_ "NULL regexp parameter"); - return 0; } DEBUG_EXECUTE_r( - debug_start_match(rx, utf8_target, startpos, strend, + debug_start_match(rx, utf8_target, stringarg, strend, "Matching"); ); + startpos = stringarg; + + if (prog->intflags & PREGf_GPOS_SEEN) { + MAGIC *mg; + + /* set reginfo->ganch, the position where \G can match */ + + reginfo->ganch = + (flags & REXEC_IGNOREPOS) + ? stringarg /* use start pos rather than pos() */ + : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) + /* Defined pos(): */ + ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) + : strbeg; /* pos() not defined; use start of string */ + + DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, + "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg))); + + /* in the presence of \G, we may need to start looking earlier in + * the string than the suggested start point of stringarg: + * if prog->gofs is set, then that's a known, fixed minimum + * offset, such as + * /..\G/: gofs = 2 + * /ab|c\G/: gofs = 1 + * or if the minimum offset isn't known, then we have to go back + * to the start of the string, e.g. /w+\G/ + */ + + if (prog->intflags & PREGf_ANCH_GPOS) { + startpos = reginfo->ganch - prog->gofs; + if (startpos < + ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) + { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "fail: ganch-gofs before earliest possible start\n")); + return 0; + } + } + else if (prog->gofs) { + if (startpos - prog->gofs < strbeg) + startpos = strbeg; + else + startpos -= prog->gofs; + } + else if (prog->intflags & PREGf_GPOS_FLOAT) + startpos = strbeg; + } + + minlen = prog->minlen; + if ((startpos + minlen) > strend || startpos < strbeg) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "Regex match can't succeed, so not even tried\n")); + return 0; + } /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), * which will call destuctors to reset PL_regmatch_state, free higher @@ -2107,10 +2524,54 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, oldsave = PL_savestack_ix; + s = startpos; + + if ((prog->extflags & RXf_USE_INTUIT) + && !(flags & REXEC_CHECKED)) + { + s = re_intuit_start(rx, sv, strbeg, startpos, strend, + flags, NULL); + if (!s) + return 0; + + if (prog->extflags & RXf_CHECK_ALL) { + /* we can match based purely on the result of INTUIT. + * Set up captures etc just for $& and $-[0] + * (an intuit-only match wont have $1,$2,..) */ + assert(!prog->nparens); + + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (s < stringarg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + + /* match via INTUIT shouldn't have any captures. + * Let @-, @+, $^N know */ + prog->lastparen = prog->lastcloseparen = 0; + RX_MATCH_UTF8_set(rx, utf8_target); + prog->offs[0].start = s - strbeg; + prog->offs[0].end = utf8_target + ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + : s - strbeg + prog->minlenret; + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, strend, + sv, flags, utf8_target); + + return 1; + } + } + multiline = prog->extflags & RXf_PMf_MULTILINE; - minlen = prog->minlen; - if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { + if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too short [regexec_flags]...\n")); goto phooey; @@ -2133,7 +2594,20 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, reginfo->poscache_maxiter = 0; /* not yet started a countdown */ reginfo->strend = strend; /* see how far we have to get to not match where we matched before */ - reginfo->till = startpos+minend; + reginfo->till = stringarg + minend; + + if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { + /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after + S_cleanup_regmatch_info_aux has executed (registered by + SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies + magic belonging to this SV. + Not newSVsv, either, as it does not COW. + */ + assert(!IS_PADGV(sv)); + reginfo->sv = newSV(0); + SvSetSV_nosteal(reginfo->sv, sv); + SAVEFREESV(reginfo->sv); + } /* reserve next 2 or 3 slots in PL_regmatch_state: * slot N+0: may currently be in use: skip it @@ -2186,43 +2660,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } /* If there is a "must appear" string, look for it. */ - s = startpos; - if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */ - MAGIC *mg; - if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */ - reginfo->ganch = startpos + prog->gofs; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs)); - } else if (sv && SvTYPE(sv) >= SVt_PVMG - && SvMAGIC(sv) - && (mg = mg_find(sv, PERL_MAGIC_regex_global)) - && mg->mg_len >= 0) { - reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */ - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len)); - - if (prog->extflags & RXf_ANCH_GPOS) { - if (s > reginfo->ganch) - goto phooey; - s = reginfo->ganch - prog->gofs; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs)); - if (s < strbeg) - goto phooey; - } - } - else if (data) { - reginfo->ganch = strbeg + PTR2UV(data); - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data))); - - } else { /* pos() not defined */ - reginfo->ganch = strbeg; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS: reginfo->ganch = strbeg\n")); - } - } if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { /* We have to be careful. If the previous successful match was from this regex we don't want a subsequent partially @@ -2241,27 +2679,13 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, PTR2UV(prog->offs) )); } - if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) { - re_scream_pos_data d; - - d.scream_olds = &scream_olds; - d.scream_pos = &scream_pos; - s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d); - if (!s) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n")); - goto phooey; /* not present */ - } - } - - /* Simplest case: anchored match need be tried only once. */ /* [unless only anchor is BOL and multiline is set] */ - if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) { - if (s == startpos && regtry(reginfo, &startpos)) + if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { + if (s == startpos && regtry(reginfo, &s)) goto got_it; - else if (multiline || (prog->intflags & PREGf_IMPLICIT) - || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */ + else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */ { char *end; @@ -2335,14 +2759,15 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } /* end search for newline */ } /* end anchored/multiline check string search */ goto phooey; - } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) + } else if (prog->intflags & PREGf_ANCH_GPOS) { - /* the warning about reginfo->ganch being used without initialization - is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN - and we only enter this block when the same bit is set. */ - char *tmp_s = reginfo->ganch - prog->gofs; - - if (tmp_s >= strbeg && regtry(reginfo, &tmp_s)) + /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ + assert(prog->intflags & PREGf_GPOS_SEEN); + /* For anchored \G, the only position it can match from is + * (ganch-gofs); we already set startpos to this above; if intuit + * moved us on from there, we can't possibly succeed */ + assert(startpos == reginfo->ganch - prog->gofs); + if (s == startpos && regtry(reginfo, &s)) goto got_it; goto phooey; } @@ -2398,8 +2823,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, || ((prog->float_substr != NULL || prog->float_utf8 != NULL) && prog->float_max_offset < strend - s)) { SV *must; - I32 back_max; - I32 back_min; + SSize_t back_max; + SSize_t back_min; char *last; char *last1; /* Last position checked before */ #ifdef DEBUGGING @@ -2444,7 +2869,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, last = strend; } else { last = HOP3c(strend, /* Cannot start after this */ - -(I32)(CHR_SVLEN(must) + -(SSize_t)(CHR_SVLEN(must) - (SvTAIL(must) != 0) + back_min), strbeg); } if (s > reginfo->strbeg) @@ -2454,11 +2879,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* XXXX check_substr already used to find "s", can optimize if check_substr==must. */ - scream_pos = -1; - dontbother = end_shift; + dontbother = 0; strend = HOPc(strend, -dontbother); while ( (s <= last) && - (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), + (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), (unsigned char*)strend, must, multiline ? FBMrf_MULTILINE : 0)) ) { DEBUG_EXECUTE_r( did_match = 1 ); @@ -2511,7 +2935,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, c); + regprop(prog, prop, c, reginfo); { RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), s,strend-s,60); @@ -2648,6 +3072,18 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, goto phooey; got_it: + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (prog->offs[0].start < stringarg - strbeg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + DEBUG_BUFFERS_r( if (swap) PerlIO_printf(Perl_debug_log, @@ -2670,123 +3106,10 @@ got_it: RX_MATCH_UTF8_set(rx, utf8_target); /* make sure $`, $&, $', and $digit will work later */ - if ( !(flags & REXEC_NOT_FIRST) ) { - if (flags & REXEC_COPY_STR) { -#ifdef PERL_ANY_COW - if (SvCANCOW(sv)) { - if (DEBUG_C_TEST) { - PerlIO_printf(Perl_debug_log, - "Copy on write: regexp capture, type %d\n", - (int) SvTYPE(sv)); - } - RX_MATCH_COPY_FREE(rx); - prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); - prog->subbeg = (char *)SvPVX_const(prog->saved_copy); - assert (SvPOKp(prog->saved_copy)); - prog->sublen = reginfo->strend - strbeg; - prog->suboffset = 0; - prog->subcoffset = 0; - } else -#endif - { - I32 min = 0; - I32 max = reginfo->strend - strbeg; - I32 sublen; - - if ( (flags & REXEC_COPY_SKIP_POST) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ - && !(PL_sawampersand & SAWAMPERSAND_RIGHT) - ) { /* don't copy $' part of string */ - U32 n = 0; - max = -1; - /* calculate the right-most part of the string covered - * by a capture. Due to look-ahead, this may be to - * the right of $&, so we have to scan all captures */ - while (n <= prog->lastparen) { - if (prog->offs[n].end > max) - max = prog->offs[n].end; - n++; - } - if (max == -1) - max = (PL_sawampersand & SAWAMPERSAND_LEFT) - ? prog->offs[0].start - : 0; - assert(max >= 0 && max <= reginfo->strend - strbeg); - } - - if ( (flags & REXEC_COPY_SKIP_PRE) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ - && !(PL_sawampersand & SAWAMPERSAND_LEFT) - ) { /* don't copy $` part of string */ - U32 n = 0; - min = max; - /* calculate the left-most part of the string covered - * by a capture. Due to look-behind, this may be to - * the left of $&, so we have to scan all captures */ - while (min && n <= prog->lastparen) { - if ( prog->offs[n].start != -1 - && prog->offs[n].start < min) - { - min = prog->offs[n].start; - } - n++; - } - if ((PL_sawampersand & SAWAMPERSAND_RIGHT) - && min > prog->offs[0].end - ) - min = prog->offs[0].end; - - } - - assert(min >= 0 && min <= max - && min <= reginfo->strend - strbeg); - sublen = max - min; - - if (RX_MATCH_COPIED(rx)) { - if (sublen > prog->sublen) - prog->subbeg = - (char*)saferealloc(prog->subbeg, sublen+1); - } - else - prog->subbeg = (char*)safemalloc(sublen+1); - Copy(strbeg + min, prog->subbeg, sublen, char); - prog->subbeg[sublen] = '\0'; - prog->suboffset = min; - prog->sublen = sublen; - RX_MATCH_COPIED_on(rx); - } - prog->subcoffset = prog->suboffset; - if (prog->suboffset && utf8_target) { - /* Convert byte offset to chars. - * XXX ideally should only compute this if @-/@+ - * has been seen, a la PL_sawampersand ??? */ - - /* If there's a direct correspondence between the - * string which we're matching and the original SV, - * then we can use the utf8 len cache associated with - * the SV. In particular, it means that under //g, - * sv_pos_b2u() will use the previously cached - * position to speed up working out the new length of - * subcoffset, rather than counting from the start of - * the string each time. This stops - * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; - * from going quadratic */ - if (SvPOKp(sv) && SvPVX(sv) == strbeg) - sv_pos_b2u(sv, &(prog->subcoffset)); - else - prog->subcoffset = utf8_length((U8*)strbeg, - (U8*)(strbeg+prog->suboffset)); - } - } - else { - RX_MATCH_COPY_FREE(rx); - prog->subbeg = strbeg; - prog->suboffset = 0; - prog->subcoffset = 0; - /* use reginfo->strend, as strend may have been modified */ - prog->sublen = reginfo->strend - strbeg; - } - } + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, reginfo->strend, + sv, flags, utf8_target); return 1; @@ -2817,7 +3140,7 @@ phooey: /* Set which rex is pointed to by PL_reg_curpm, handling ref counting. * Do inc before dec, in case old and new rex are the same */ -#define SET_reg_curpm(Re2) \ +#define SET_reg_curpm(Re2) \ if (reginfo->info_aux_eval) { \ (void)ReREFCNT_inc(Re2); \ ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ @@ -2835,7 +3158,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) CHECKPOINT lastcp; REGEXP *const rx = reginfo->prog; regexp *const prog = ReANY(rx); - I32 result; + SSize_t result; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -3082,11 +3405,11 @@ regmatch(), slabs allocated since entry are freed. #define DEBUG_STATE_pp(pp) \ DEBUG_STATE_r({ \ - DUMP_EXEC_POS(locinput, scan, utf8_target); \ + DUMP_EXEC_POS(locinput, scan, utf8_target); \ PerlIO_printf(Perl_debug_log, \ " %*s"pp" %s%s%s%s%s\n", \ depth*2, "", \ - PL_reg_name[st->resume_state], \ + PL_reg_name[st->resume_state], \ ((st==yes_state||st==mark_state) ? "[" : ""), \ ((st==yes_state) ? "Y" : ""), \ ((st==mark_state) ? "M" : ""), \ @@ -3202,7 +3525,7 @@ S_dump_exec_pos(pTHX_ const char *locinput, * or 0 if non of the buffers matched. */ STATIC I32 -S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) +S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan) { I32 n; RXi_GET_DECL(rex,rexi); @@ -3290,6 +3613,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, dVAR; U8 *pat = (U8*)STRING(text_node); + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; if (OP(text_node) == EXACT) { @@ -3309,136 +3633,186 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, c2 = c1 = valid_utf8_to_uvchr(pat, NULL); } } - else /* an EXACTFish node */ - if ((is_utf8_pat - && is_MULTI_CHAR_FOLD_utf8_safe(pat, - pat + STR_LEN(text_node))) - || (!is_utf8_pat - && is_MULTI_CHAR_FOLD_latin1_safe(pat, - pat + STR_LEN(text_node)))) - { - /* Multi-character folds require more context to sort out. Also - * PL_utf8_foldclosures used below doesn't handle them, so have to be - * handled outside this routine */ - use_chrtest_void = TRUE; - } - else { /* an EXACTFish node which doesn't begin with a multi-char fold */ - c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; - if (c1 > 256) { - /* Load the folds hash, if not already done */ - SV** listp; - if (! PL_utf8_foldclosures) { - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - - /* Force loading this by folding an above-Latin1 char */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); - assert(PL_utf8_tofold); /* Verify that worked */ + else { /* an EXACTFish node */ + U8 *pat_end = pat + STR_LEN(text_node); + + /* An EXACTFL node has at least some characters unfolded, because what + * they match is not known until now. So, now is the time to fold + * the first few of them, as many as are needed to determine 'c1' and + * 'c2' later in the routine. If the pattern isn't UTF-8, we only need + * to fold if in a UTF-8 locale, and then only the Sharp S; everything + * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we + * need to fold as many characters as a single character can fold to, + * so that later we can check if the first ones are such a multi-char + * fold. But, in such a pattern only locale-problematic characters + * aren't folded, so we can skip this completely if the first character + * in the node isn't one of the tricky ones */ + if (OP(text_node) == EXACTFL) { + + if (! is_utf8_pat) { + if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) + { + folded[0] = folded[1] = 's'; + pat = folded; + pat_end = folded + 2; } - PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); } + else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { + U8 *s = pat; + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD_LC(*s); + s++; + } + else { + STRLEN len; + _to_utf8_fold_flags(s, + d, + &len, + FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); + d += len; + s += UTF8SKIP(s); + } + } - /* The fold closures data structure is a hash with the keys being - * the UTF-8 of every character that is folded to, like 'k', and - * the values each an array of all code points that fold to its - * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are - * not included */ - if ((! (listp = hv_fetch(PL_utf8_foldclosures, - (char *) pat, - UTF8SKIP(pat), - FALSE)))) - { - /* Not found in the hash, therefore there are no folds - * containing it, so there is only a single character that - * could match */ - c2 = c1; + pat = folded; + pat_end = d; } - else { /* Does participate in folds */ - AV* list = (AV*) *listp; - if (av_len(list) != 1) { + } - /* If there aren't exactly two folds to this, it is outside - * the scope of this function */ - use_chrtest_void = TRUE; + if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) + || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) + { + /* Multi-character folds require more context to sort out. Also + * PL_utf8_foldclosures used below doesn't handle them, so have to + * be handled outside this routine */ + use_chrtest_void = TRUE; + } + else { /* an EXACTFish node which doesn't begin with a multi-char fold */ + c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; + if (c1 > 255) { + /* Load the folds hash, if not already done */ + SV** listp; + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); } - else { /* There are two. Get them */ - SV** c_p = av_fetch(list, 0, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c1 = SvUV(*c_p); - c_p = av_fetch(list, 1, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + /* The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((! (listp = hv_fetch(PL_utf8_foldclosures, + (char *) pat, + UTF8SKIP(pat), + FALSE)))) + { + /* Not found in the hash, therefore there are no folds + * containing it, so there is only a single character that + * could match */ + c2 = c1; + } + else { /* Does participate in folds */ + AV* list = (AV*) *listp; + if (av_tindex(list) != 1) { + + /* If there aren't exactly two folds to this, it is + * outside the scope of this function */ + use_chrtest_void = TRUE; } - c2 = SvUV(*c_p); - - /* Folds that cross the 255/256 boundary are forbidden if - * EXACTFL, or EXACTFA and one is ASCIII. Since the - * pattern character is above 256, and its only other match - * is below 256, the only legal match will be to itself. - * We have thrown away the original, so have to compute - * which is the one above 255 */ - if ((c1 < 256) != (c2 < 256)) { - if (OP(text_node) == EXACTFL - || (OP(text_node) == EXACTFA - && (isASCII(c1) || isASCII(c2)))) - { - if (c1 < 256) { - c1 = c2; - } - else { - c2 = c1; + else { /* There are two. Get them */ + SV** c_p = av_fetch(list, 0, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c1 = SvUV(*c_p); + + c_p = av_fetch(list, 1, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c2 = SvUV(*c_p); + + /* Folds that cross the 255/256 boundary are forbidden + * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and + * one is ASCIII. Since the pattern character is above + * 255, and its only other match is below 256, the only + * legal match will be to itself. We have thrown away + * the original, so have to compute which is the one + * above 255. */ + if ((c1 < 256) != (c2 < 256)) { + if ((OP(text_node) == EXACTFL + && ! IN_UTF8_CTYPE_LOCALE) + || ((OP(text_node) == EXACTFA + || OP(text_node) == EXACTFA_NO_TRIE) + && (isASCII(c1) || isASCII(c2)))) + { + if (c1 < 256) { + c1 = c2; + } + else { + c2 = c1; + } } } } } } - } - else /* Here, c1 is < 255 */ - if (utf8_target - && HAS_NONLATIN1_FOLD_CLOSURE(c1) - && OP(text_node) != EXACTFL - && (OP(text_node) != EXACTFA || ! isASCII(c1))) - { - /* Here, there could be something above Latin1 in the target which - * folds to this character in the pattern. All such cases except - * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters - * involved in their folds, so are outside the scope of this - * function */ - if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { - c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; - } - else { - use_chrtest_void = TRUE; + else /* Here, c1 is <= 255 */ + if (utf8_target + && HAS_NONLATIN1_FOLD_CLOSURE(c1) + && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) + && ((OP(text_node) != EXACTFA + && OP(text_node) != EXACTFA_NO_TRIE) + || ! isASCII(c1))) + { + /* Here, there could be something above Latin1 in the target + * which folds to this character in the pattern. All such + * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more + * than two characters involved in their folds, so are outside + * the scope of this function */ + if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + } + else { + use_chrtest_void = TRUE; + } } - } - else { /* Here nothing above Latin1 can fold to the pattern character */ - switch (OP(text_node)) { + else { /* Here nothing above Latin1 can fold to the pattern + character */ + switch (OP(text_node)) { - case EXACTFL: /* /l rules */ - c2 = PL_fold_locale[c1]; - break; + case EXACTFL: /* /l rules */ + c2 = PL_fold_locale[c1]; + break; - case EXACTF: - if (! utf8_target) { /* /d rules */ - c2 = PL_fold[c1]; + case EXACTF: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + if (! utf8_target) { /* /d rules */ + c2 = PL_fold[c1]; + break; + } + /* FALLTHROUGH */ + /* /u rules for all these. This happens to work for + * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA_NO_TRIE: /* This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: + c2 = PL_fold_latin1[c1]; break; - } - /* FALLTHROUGH */ - /* /u rules for all these. This happens to work for - * EXACTFA as nothing in Latin1 folds to ASCII */ - case EXACTFA: - case EXACTFU_TRICKYFOLD: - case EXACTFU_SS: - case EXACTFU: - c2 = PL_fold_latin1[c1]; - break; - default: - Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); - assert(0); /* NOTREACHED */ + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); + assert(0); /* NOTREACHED */ + } } } } @@ -3480,7 +3854,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } /* returns -1 on failure, $+[0] on success */ -STATIC I32 +STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { #if PERL_VERSION < 9 && !defined(PERL_CORE) @@ -3498,7 +3872,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) regnode *scan; regnode *next; U32 n = 0; /* general value; init to avoid compiler warning */ - I32 ln = 0; /* len or last; init to avoid compiler warning */ + SSize_t ln = 0; /* len or last; init to avoid compiler warning */ char *locinput = startpos; char *pushinput; /* where to continue after a PUSH */ I32 nextchr; /* is always set to UCHARAT(locinput) */ @@ -3526,7 +3900,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) during a successful match */ U32 lastopen = 0; /* last open we saw */ bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; - SV* const oreplsv = GvSV(PL_replgv); + SV* const oreplsv = GvSVn(PL_replgv); /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop * iteration, and are not preserved or restored by state pushes/pops @@ -3556,6 +3930,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) GET_RE_DEBUG_FLAGS_DECL; #endif + /* protect against undef(*^R) */ + SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ multicall_oldcatch = 0; multicall_cv = NULL; @@ -3581,7 +3958,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SV * const prop = sv_newmortal(); regnode *rnext=regnext(scan); DUMP_EXEC_POS( locinput, scan, utf8_target ); - regprop(rex, prop, scan); + regprop(rex, prop, scan, reginfo); PerlIO_printf(Perl_debug_log, "%3"IVdf":%*s%s(%"IVdf")\n", @@ -3603,7 +3980,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); switch (state_num) { - case BOL: /* /^../ */ + case BOL: /* /^../ */ + case SBOL: /* /^../s */ if (locinput == reginfo->strbeg) break; sayNO; @@ -3616,11 +3994,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } sayNO; - case SBOL: /* /^../s */ - if (locinput == reginfo->strbeg) - break; - sayNO; - case GPOS: /* \G */ if (locinput == reginfo->ganch) break; @@ -3638,16 +4011,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO_SILENT; assert(0); /*NOTREACHED*/ - case EOL: /* /..$/ */ - goto seol; - case MEOL: /* /..$/m */ if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; break; + case EOL: /* /..$/ */ + /* FALLTHROUGH */ case SEOL: /* /..$/s */ - seol: if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; if (reginfo->strend - locinput > 1) @@ -3691,7 +4062,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO_SILENT; assert(0); /* NOTREACHED */ } - /* FALL THROUGH */ + /* FALLTHROUGH */ case TRIE: /* (ab|cd) */ /* the basic plan of execution of the trie is: * At the beginning, run though all the states, and @@ -3945,7 +4316,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (chars) { if (utf8_target) { - uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, uniflags); uc += len; } @@ -3958,7 +4329,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (foldlen) { if (!--chars) break; - uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, uniflags); uscan += len; foldlen -= len; @@ -4049,7 +4420,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) l++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { sayNO; } l += 2; @@ -4072,7 +4444,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) { + if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { sayNO; } s += 2; @@ -4102,27 +4475,31 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const char * s; U32 fold_utf8_flags; - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; - fold_utf8_flags = FOLDEQ_UTF8_LOCALE; + fold_utf8_flags = FOLDEQ_LOCALE; goto do_exactf; case EXACTFU_SS: /* /\x{df}/iu */ - case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */ case EXACTFU: /* /abc/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; goto do_exactf; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ case EXACTFA: /* /abc/iaa */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; - case EXACTF: /* /abc/i */ + case EXACTF: /* /abc/i This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); folder = foldEQ; fold_array = PL_fold; fold_utf8_flags = 0; @@ -4131,7 +4508,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) { + if (utf8_target + || is_utf8_pat + || state_num == EXACTFU_SS + || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) + { /* Either target or the pattern are utf8, or has the issue where * the fold lengths may differ. */ const char * const l = locinput; @@ -4166,8 +4547,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * have to set the FLAGS fields of these */ case BOUNDL: /* /\b/l */ case NBOUNDL: /* /\B/l */ - RX_MATCH_TAINTED_on(reginfo->prog); - /* FALL THROUGH */ case BOUND: /* /\b/ */ case BOUNDU: /* /\b/u */ case BOUNDA: /* /\b/a */ @@ -4185,7 +4564,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 * const r = reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); - ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); + ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, + 0, uniflags); } if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { ln = isWORDCHAR_uni(ln); @@ -4198,7 +4578,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } else { - ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln)); + ln = isWORDCHAR_LC_uvchr(ln); n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); } } @@ -4237,7 +4617,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; default: Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan)); - break; } } /* Note requires that all BOUNDs be lower than all NBOUNDs in @@ -4247,11 +4626,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; case ANYOF: /* /[abc]/ */ - case ANYOF_WARN_SUPER: if (NEXTCHR_IS_EOS) sayNO; if (utf8_target) { - if (!reginclass(rex, scan, (U8*)locinput, utf8_target)) + if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + utf8_target)) sayNO; locinput += UTF8SKIP(locinput); } @@ -4273,10 +4652,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (NEXTCHR_IS_EOS) sayNO; - /* The locale hasn't influenced the outcome before this, so defer - * tainting until now */ - RX_MATCH_TAINTED_on(reginfo->prog); - /* Use isFOO_lc() for characters within Latin1. (Note that * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else * wouldn't be invariant) */ @@ -4287,7 +4662,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), - (U8) TWO_BYTE_UTF8_TO_UNI(nextchr, + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)))))) { sayNO; @@ -4368,9 +4743,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr, + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)), - FLAGS(scan))))) + FLAGS(scan))))) { sayNO; } @@ -4386,8 +4761,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", - swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } if (! (to_complement ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], @@ -4649,11 +5025,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 *fold_array; UV utf8_fold_flags; - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; type = REFFL; - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + utf8_fold_flags = FOLDEQ_LOCALE; goto do_nref; case NREFFA: /* /\g{name}/iaa */ @@ -4694,10 +5069,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto do_nref_ref_common; case REFFL: /* /\1/il */ - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + utf8_fold_flags = FOLDEQ_LOCALE; goto do_ref; case REFFA: /* /\1/iaa */ @@ -4737,8 +5111,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = reginfo->strbeg + ln; if (type != REF /* REF can do byte comparison */ - && (utf8_target || type == REFFU)) - { /* XXX handle REFFL better */ + && (utf8_target || type == REFFU || type == REFFL)) + { char * limit = reginfo->strend; /* This call case insensitively compares the entire buffer @@ -4812,7 +5186,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) startpoint = rei->program+1; ST.close_paren = 0; } + + /* Save all the positions seen so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + + /* and then jump to the code we share with EVAL */ goto eval_recurse_doit; + assert(0); /* NOTREACHED */ case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ @@ -4924,8 +5305,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) rex->offs[0].end = locinput - reginfo->strbeg; if (reginfo->info_aux_eval->pos_magic) - reginfo->info_aux_eval->pos_magic->mg_len - = locinput - reginfo->strbeg; + MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, + reginfo->sv, reginfo->strbeg, + locinput - reginfo->strbeg); if (sv_yes_mark) { SV *sv_mrk = get_sv("REGMARK", 1); @@ -4963,20 +5345,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) else { /* /(??{}) */ /* if its overloaded, let the regex compiler handle * it; otherwise extract regex, or stringify */ + if (SvGMAGICAL(ret)) + ret = sv_mortalcopy(ret); if (!SvAMAGIC(ret)) { SV *sv = ret; if (SvROK(sv)) sv = SvRV(sv); if (SvTYPE(sv) == SVt_REGEXP) re_sv = (REGEXP*) sv; - else if (SvSMAGICAL(sv)) { - MAGIC *mg = mg_find(sv, PERL_MAGIC_qr); + else if (SvSMAGICAL(ret)) { + MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); if (mg) re_sv = (REGEXP *) mg->mg_obj; } - /* force any magic, undef warnings here */ - if (!re_sv) { + /* force any undef warnings here */ + if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { ret = sv_mortalcopy(ret); (void) SvPV_force_nolen(ret); } @@ -4991,6 +5375,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) PL_op = oop; PL_curcop = ocurcop; S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); + PL_curpm = PL_reg_curpm; if (logical != 2) break; @@ -5029,17 +5414,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) pm_flags); if (!(SvFLAGS(ret) - & (SVs_TEMP | SVs_PADTMP | SVf_READONLY - | SVs_GMG))) { + & (SVs_TEMP | SVs_GMG | SVf_ROK)) + && (!SvPADTMP(ret) || SvREADONLY(ret))) { /* This isn't a first class regexp. Instead, it's caching a regexp onto an existing, Perl visible scalar. */ sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); } - /* safe to do now that any $1 etc has been - * interpolated into the new pattern string and - * compiled */ - S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); } SAVEFREESV(re_sv); re = ReANY(re_sv); @@ -5049,6 +5430,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) re->sublen = rex->sublen; re->suboffset = rex->suboffset; re->subcoffset = rex->subcoffset; + re->lastparen = 0; + re->lastcloseparen = 0; rei = RXi_GET(re); DEBUG_EXECUTE_r( debug_start_match(re_sv, utf8_target, locinput, @@ -5056,18 +5439,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) ); startpoint = rei->program + 1; ST.close_paren = 0; /* only used for GOSUB */ + /* Save all the seen positions so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + /* and set maxopenparen to 0, since we are starting a "fresh" match */ + maxopenparen = 0; + /* run the pattern returned from (??{...}) */ - eval_recurse_doit: /* Share code with GOSUB below this line */ - /* run the pattern returned from (??{...}) */ - - /* Save *all* the positions. */ - ST.cp = regcppush(rex, 0, maxopenparen); - REGCP_SET(ST.lastcp); - - re->lastparen = 0; - re->lastcloseparen = 0; - - maxopenparen = 0; + eval_recurse_doit: /* Share code with GOSUB below this line + * At this point we expect the stack context to be + * set up correctly */ /* invalidate the S-L poscache. We're now executing a * different set of WHILEM ops (and their associated @@ -5079,6 +5460,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * pattern again */ reginfo->poscache_maxiter = 0; + /* the new regexp might have a different is_utf8_pat than we do */ is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); ST.prev_rex = rex_sv; @@ -5103,7 +5485,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SET_reg_curpm(rex_sv); rex = ReANY(rex_sv); rexi = RXi_GET(rex); - regcpblow(ST.cp); + { + /* preserve $^R across LEAVE's. See Bug 121070. */ + SV *save_sv= GvSV(PL_replgv); + SvREFCNT_inc(save_sv); + regcpblow(ST.cp); /* LEAVE in disguise */ + sv_setsv(GvSV(PL_replgv), save_sv); + SvREFCNT_dec(save_sv); + } cur_eval = ST.prev_eval; cur_curlyx = ST.prev_curlyx; @@ -5453,10 +5842,10 @@ NULL if (reginfo->poscache_iter-- == 0) { /* initialise cache */ - const I32 size = (reginfo->poscache_maxiter + 7)/8; + const SSize_t size = (reginfo->poscache_maxiter + 7)/8; regmatch_info_aux *const aux = reginfo->info_aux; if (aux->poscache) { - if ((I32)reginfo->poscache_size < size) { + if ((SSize_t)reginfo->poscache_size < size) { Renew(aux->poscache, size, char); reginfo->poscache_size = size; } @@ -5474,7 +5863,7 @@ NULL if (reginfo->poscache_iter < 0) { /* have we already failed at this position? */ - I32 offset, mask; + SSize_t offset, mask; reginfo->poscache_iter = -1; /* stop eventual underflow */ offset = (scan->flags & 0xf) - 1 @@ -5535,7 +5924,7 @@ NULL assert(0); /* NOTREACHED */ case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ - /* FALL THROUGH */ + /* FALLTHROUGH */ case WHILEM_A_pre_fail: /* just failed to match even minimal A */ REGCP_UNWIND(ST.lastcp); regcppop(rex, &maxopenparen); @@ -5612,7 +6001,7 @@ NULL if (next == scan) next = NULL; scan = NEXTOPER(scan); - /* FALL THROUGH */ + /* FALLTHROUGH */ case BRANCH: /* /(...|A|...)/ */ scan = NEXTOPER(scan); /* scan now points to inner node */ @@ -5799,7 +6188,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n", + "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", (int)(REPORT_CODE_OFF+(depth*2)),"", valid_utf8_to_uvchr((U8 *) locinput, NULL), valid_utf8_to_uvchr(ST.c1_utf8, NULL), @@ -5813,7 +6202,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n", + "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", (int)(REPORT_CODE_OFF+(depth*2)),"", (int) nextchr, ST.c1, ST.c2) ); @@ -6167,7 +6556,7 @@ NULL assert(0); /* NOTREACHED */ } } - /* FALL THROUGH */ + /* FALLTHROUGH */ case CURLY_B_max_fail: /* failed to find B in a greedy match */ @@ -6288,7 +6677,7 @@ NULL case IFMATCH_A_fail: /* body of (?...A) failed */ ST.wanted = !ST.wanted; - /* FALL THROUGH */ + /* FALLTHROUGH */ case IFMATCH_A: /* body of (?...A) succeeded */ if (ST.logical) { @@ -6452,7 +6841,7 @@ NULL /* push a state that backtracks on success */ st->u.yes.prev_yes_state = yes_state; yes_state = st; - /* FALL THROUGH */ + /* FALLTHROUGH */ push_state: /* push a new regex state, then continue at scan */ { @@ -6552,6 +6941,10 @@ yes: * When popping the save stack, all these locals would be undone; * bypass this by setting the outermost saved $^R to the latest * value */ + /* I dont know if this is needed or works properly now. + * see code related to PL_replgv elsewhere in this file. + * Yves + */ if (oreplsv != GvSV(PL_replgv)) sv_setsv(oreplsv, GvSV(PL_replgv)); } @@ -6607,6 +7000,8 @@ no_silent: sv_commit = &PL_sv_yes; sv_yes_mark = &PL_sv_no; } + assert(sv_err); + assert(sv_mrk); sv_setsv(sv_err, sv_commit); sv_setsv(sv_mrk, sv_yes_mark); } @@ -6754,7 +7149,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple loop */ - c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); while (scan < loceol && UCHARAT(scan) == c) { scan++; } @@ -6781,21 +7176,23 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + /* FALLTHROUGH */ case EXACTFA: - utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; case EXACTFL: - RXp_MATCH_TAINTED_on(prog); - utf8_flags = FOLDEQ_UTF8_LOCALE; + utf8_flags = FOLDEQ_LOCALE; goto do_exactf; - case EXACTF: - utf8_flags = 0; - goto do_exactf; + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + utf8_flags = 0; + goto do_exactf; case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -6859,11 +7256,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, break; } case ANYOF: - case ANYOF_WARN_SUPER: if (utf8_target) { while (hardcount < max && scan < loceol - && reginclass(prog, p, (U8*)scan, utf8_target)) + && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; @@ -6881,7 +7277,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* FALLTHROUGH */ case POSIXL: - RXp_MATCH_TAINTED_on(prog); if (! utf8_target) { while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan))) @@ -6923,7 +7318,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, to_complement = 1; goto utf8_posix; } - /* FALL THROUGH */ + /* FALLTHROUGH */ case NPOSIXA: if (! utf8_target) { @@ -6934,10 +7329,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, else { /* The complement of something that matches only ASCII matches all - * UTF-8 variant code points, plus everything in ASCII that isn't - * in the class. */ + * non-ASCII, plus everything in ASCII that isn't in the class. */ while (hardcount < max && scan < loceol - && (! UTF8_IS_INVARIANT(*scan) + && (! isASCII_utf8(scan) || ! _generic_isCC_A((U8) *scan, FLAGS(p)))) { scan += UTF8SKIP(scan); @@ -6981,8 +7375,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan, - *(scan + 1)), + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), classnum)))) { break; @@ -7005,7 +7399,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, case _CC_ENUM_SPACE: /* XXX would require separate code if we revert the change of \v matching this */ - /* FALL THROUGH */ + /* FALLTHROUGH */ case _CC_ENUM_PSXSPC: while (hardcount < max && scan < loceol @@ -7064,8 +7458,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = _core_swash_init( - "utf8", swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + "utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } while (hardcount < max && scan < loceol @@ -7133,7 +7529,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, GET_RE_DEBUG_FLAGS_DECL; DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, p); + regprop(prog, prop, p, reginfo); PerlIO_printf(Perl_debug_log, "%*s %s can match %"IVdf" times out of %"IVdf"...\n", REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); @@ -7159,31 +7555,39 @@ Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, *altsvp = NULL; } - return newSVsv(core_regclass_swash(prog, node, doinit, listsvp)); + return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL)); } -#endif -STATIC SV * -S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp) +SV * +Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, + const regnode* node, + bool doinit, + SV** listsvp, + SV** only_utf8_locale_ptr) { - /* Returns the swash for the input 'node' in the regex 'prog'. - * If is true, will attempt to create the swash if not already + /* For internal core use only. + * Returns the swash for the input 'node' in the regex 'prog'. + * If is 'true', will attempt to create the swash if not already * done. - * If is non-null, will return the swash initialization string in - * it. + * If is non-null, will return the printable contents of the + * swash. This can be used to get debugging information even before the + * swash exists, by calling this function with 'doinit' set to false, in + * which case the components that will be used to eventually create the + * swash are returned (in a printable form). * Tied intimately to how regcomp.c sets up the data structure */ dVAR; SV *sw = NULL; - SV *si = NULL; + SV *si = NULL; /* Input swash initialization string */ SV* invlist = NULL; RXi_GET_DECL(prog,progi); const struct reg_data * const data = prog ? progi->data : NULL; - PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH; + PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; - assert(ANYOF_NONBITMAP(node)); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); if (data && data->count) { const U32 n = ARG(node); @@ -7196,26 +7600,40 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit si = *ary; /* ary[0] = the string to initialize the swash with */ - /* Elements 2 and 3 are either both present or both absent. [2] is - * any inversion list generated at compile time; [3] indicates if + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if * that inversion list has any user-defined properties in it. */ - if (av_len(av) >= 2) { - invlist = ary[2]; - if (SvUV(ary[3])) { - swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + if (av_tindex(av) >= 2) { + if (only_utf8_locale_ptr + && ary[2] + && ary[2] != &PL_sv_undef) + { + *only_utf8_locale_ptr = ary[2]; + } + else { + assert(only_utf8_locale_ptr); + *only_utf8_locale_ptr = NULL; + } + + if (av_tindex(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } + } + else { + invlist = NULL; } - } - else { - invlist = NULL; } /* Element [1] is reserved for the set-up swash. If already there, * return it; if not, create it and store it there */ - if (SvROK(ary[1])) { + if (ary[1] && SvROK(ary[1])) { sw = ary[1]; } - else if (si && doinit) { - + else if (doinit && ((si && si != &PL_sv_undef) + || (invlist && invlist != &PL_sv_undef))) { + assert(si); sw = _core_swash_init("utf8", /* the utf8 package */ "", /* nameless */ si, @@ -7228,16 +7646,18 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit } } + /* If requested, return a printable version of what this swash matches */ if (listsvp) { - SV* matches_string = newSVpvn("", 0); + SV* matches_string = newSVpvs(""); - /* Use the swash, if any, which has to have incorporated into it all - * possibilities */ + /* The swash should be used, if possible, to get the data, as it + * contains the resolved data. But this function can be called at + * compile-time, before everything gets resolved, in which case we + * return the currently best available information, which is the string + * that will eventually be used to do that resolving, 'si' */ if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) && (si && si != &PL_sv_undef)) { - - /* If no swash, use the input initialization string, if available */ sv_catsv(matches_string, si); } @@ -7251,12 +7671,14 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit return sw; } +#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ /* - reginclass - determine if a character falls into a character class n is the ANYOF regnode p is the target string + p_end points to one byte beyond the end of the target string utf8_target tells whether p is in UTF-8. Returns true if matched; false otherwise. @@ -7268,7 +7690,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit */ STATIC bool -S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target) +S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) { dVAR; const char flags = ANYOF_FLAGS(n); @@ -7281,7 +7703,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_IS_INVARIANT() works even if not in UTF-8 */ if (! UTF8_IS_INVARIANT(c) && utf8_target) { STRLEN c_len = 0; - c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len, + c = utf8n_to_uvchr(p, p_end - p, &c_len, (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for @@ -7294,21 +7716,19 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const if (c < 256) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; - else if (flags & ANYOF_NON_UTF8_LATIN1_ALL + else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL && ! utf8_target && ! isASCII(c)) { match = TRUE; } - else if (flags & ANYOF_LOCALE) { - RXp_MATCH_TAINTED_on(prog); - - if ((flags & ANYOF_LOC_FOLD) - && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) - { - match = TRUE; - } - else if (ANYOF_CLASS_TEST_ANY_SET(n)) { + else if (flags & ANYOF_LOCALE_FLAGS) { + if (flags & ANYOF_LOC_FOLD) { + if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { + match = TRUE; + } + } + if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { /* The data structure is arranged so bits 0, 2, 4, ... are set * if the class includes the Posix character class given by @@ -7342,8 +7762,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const int count = 0; int to_complement = 0; + while (count < ANYOF_MAX) { - if (ANYOF_CLASS_TEST(n, count) + if (ANYOF_POSIXL_TEST(n, count) && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) { match = TRUE; @@ -7356,60 +7777,63 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } } + /* If the bitmap didn't (or couldn't) match, and something outside the - * bitmap could match, try that. Locale nodes specify completely the - * behavior of code points in the bit map (otherwise, a utf8 target would - * cause them to be treated as Unicode and not locale), except in - * the very unlikely event when this node is a synthetic start class, which - * could be a combination of locale and non-locale nodes. So allow locale - * to match for the synthetic start class, which will give a false - * positive that will be resolved when the match is done again as not part - * of the synthetic start class */ + * bitmap could match, try that. */ if (!match) { - if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { + if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { match = TRUE; /* Everything above 255 matches */ } - else if (ANYOF_NONBITMAP(n) - && ((flags & ANYOF_NONBITMAP_NON_UTF8) - || (utf8_target - && (c >=256 - || (! (flags & ANYOF_LOCALE)) - || OP(n) == ANYOF_SYNTHETIC)))) - { - SV * const sw = core_regclass_swash(prog, n, TRUE, 0); + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && (flags & ANYOF_UTF8)) + || ((flags & ANYOF_LOC_FOLD) + && IN_UTF8_CTYPE_LOCALE + && ARG(n) != ANYOF_NONBITMAP_EMPTY)) + { + SV* only_utf8_locale = NULL; + SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale); if (sw) { + U8 utf8_buffer[2]; U8 * utf8_p; if (utf8_target) { utf8_p = (U8 *) p; } else { /* Convert to utf8 */ - STRLEN len = 1; - utf8_p = bytes_to_utf8(p, &len); + utf8_p = utf8_buffer; + append_utf8_from_native_byte(*p, &utf8_p); + utf8_p = utf8_buffer; } if (swash_fetch(sw, utf8_p, TRUE)) { match = TRUE; } - - /* If we allocated a string above, free it */ - if (! utf8_target) Safefree(utf8_p); } + if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { + match = _invlist_contains_cp(only_utf8_locale, c); + } } if (UNICODE_IS_SUPER(c) - && OP(n) == ANYOF_WARN_SUPER + && (flags & ANYOF_WARN_SUPER) && ckWARN_d(WARN_NON_UNICODE)) { Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), - "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c); + "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c); } } +#if ANYOF_INVERT != 1 + /* Depending on compiler optimization cBOOL takes time, so if don't have to + * use it, don't */ +# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, +#endif + /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ - return cBOOL(flags & ANYOF_INVERT) ^ match; + return (flags & ANYOF_INVERT) ^ match; } STATIC U8 * -S_reghop3(U8 *s, I32 off, const U8* lim) +S_reghop3(U8 *s, SSize_t off, const U8* lim) { /* return the position 'off' UTF-8 characters away from 's', forward if * 'off' >= 0, backwards if negative. But don't go outside of position @@ -7438,13 +7862,8 @@ S_reghop3(U8 *s, I32 off, const U8* lim) return s; } -#ifdef XXX_dmq -/* there are a bunch of places where we use two reghop3's that should - be replaced with this routine. but since thats not done yet - we ifdef it out - dmq -*/ STATIC U8 * -S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) +S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) { dVAR; @@ -7468,10 +7887,12 @@ S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) } return s; } -#endif + +/* like reghop3, but returns NULL on overrun, rather than returning last + * char pos */ STATIC U8 * -S_reghopmaybe3(U8* s, I32 off, const U8* lim) +S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) { dVAR; @@ -7532,19 +7953,14 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) DEFSV_set(reginfo->sv); } - if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv) - && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) { + if (!(mg = mg_find_mglob(reginfo->sv))) { /* prepare for quick setting of pos */ -#ifdef PERL_OLD_COPY_ON_WRITE - if (SvIsCOW(reginfo->sv)) - sv_force_normal_flags(reginfo->sv, 0); -#endif - mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global, - &PL_vtbl_mglob, NULL, 0); + mg = sv_magicext_mglob(reginfo->sv); mg->mg_len = -1; } eval_state->pos_magic = mg; eval_state->pos = mg->mg_len; + eval_state->pos_flags = mg->mg_flags; } else eval_state->pos_magic = NULL; @@ -7561,7 +7977,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) /* this regexp is also owned by the new PL_reg_curpm, which will try to free it. */ av_push(PL_regex_padav, repointer); - PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav); + PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); PL_regex_pad = AvARRAY(PL_regex_padav); } #endif @@ -7619,7 +8035,12 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg) RXp_MATCH_COPIED_on(rex); } if (eval_state->pos_magic) + { eval_state->pos_magic->mg_len = eval_state->pos; + eval_state->pos_magic->mg_flags = + (eval_state->pos_magic->mg_flags & ~MGf_BYTES) + | (eval_state->pos_flags & MGf_BYTES); + } PL_curpm = eval_state->curpm; } diff --git a/src/5019001/regcomp.c b/src/5021001/regcomp.c similarity index 72% rename from src/5019001/regcomp.c rename to src/5021001/regcomp.c index bdc684d..d8dcede 100644 --- a/src/5019001/regcomp.c +++ b/src/5021001/regcomp.c @@ -81,7 +81,7 @@ #define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" -extern const struct regexp_engine my_reg_engine; +EXTERN_C const struct regexp_engine my_reg_engine; #else # include "regcomp.h" #endif @@ -91,51 +91,48 @@ extern const struct regexp_engine my_reg_engine; #include "inline_invlist.c" #include "unicode_constants.h" -#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) -#ifdef op -#undef op -#endif /* op */ - -#ifdef MSDOS -# if defined(BUGGY_MSC6) - /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ -# pragma optimize("a",off) - /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ -# pragma optimize("w",on ) -# endif /* BUGGY_MSC6 */ -#endif /* MSDOS */ - #ifndef STATIC #define STATIC static #endif -typedef struct RExC_state_t { +struct RExC_state_t { U32 flags; /* RXf_* are we folding, multilining? */ U32 pm_flags; /* PMf_* stuff from the calling PMOP */ char *precomp; /* uncompiled string. */ REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ - regexp_internal *rxi; /* internal data for regexp object pprivate field */ + regexp_internal *rxi; /* internal data for regexp object + pprivate field */ char *start; /* Start of input for compile */ char *end; /* End of input for compile */ char *parse; /* Input-scan pointer. */ - I32 whilem_seen; /* number of WHILEM in this expr */ + SSize_t whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ - regnode *emit_bound; /* First regnode outside of the allocated space */ + regnode *emit_bound; /* First regnode outside of the + allocated space */ regnode *emit; /* Code-emit pointer; if = &emit_dummy, implies compiling, so don't emit */ - regnode emit_dummy; /* placeholder for emit to point to */ + regnode_ssc emit_dummy; /* placeholder for emit to point to; + large enough for the largest + non-EXACTish node, so can use it as + scratch in pass1 */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ U32 seen; - I32 size; /* Code size. */ - I32 npar; /* Capture buffer count, (OPEN). */ - I32 cpar; /* Capture buffer count, (CLOSE). */ - I32 nestroot; /* root parens we are in - used by accept */ + SSize_t size; /* Code size. */ + I32 npar; /* Capture buffer count, (OPEN) plus + one. ("par" 0 is the whole + pattern)*/ + I32 nestroot; /* root parens we are in - used by + accept */ I32 extralen; I32 seen_zerolen; regnode **open_parens; /* pointers to open parens */ @@ -152,15 +149,20 @@ typedef struct RExC_state_t { regnode **recurse; /* Recurse regops */ I32 recurse_count; /* Number of recurse regops */ + U8 *study_chunk_recursed; /* bitmap of which parens we have moved + through */ + U32 study_chunk_recursed_bytes; /* bytes in bitmap */ I32 in_lookbehind; I32 contains_locale; + I32 contains_i; I32 override_recoding; I32 in_multi_char_class; struct reg_code_block *code_blocks; /* positions of literal (?{}) within pattern */ int num_code_blocks; /* size of code_blocks[] */ int code_index; /* next code_blocks[] slot */ -#if ADD_TO_REGEXEC + SSize_t maxlen; /* mininum possible number of chars in string to match */ +#ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif @@ -173,7 +175,7 @@ typedef struct RExC_state_t { #define RExC_lastnum (pRExC_state->lastnum) #define RExC_paren_name_list (pRExC_state->paren_name_list) #endif -} RExC_state_t; +}; #define RExC_flags (pRExC_state->flags) #define RExC_pm_flags (pRExC_state->pm_flags) @@ -186,7 +188,8 @@ typedef struct RExC_state_t { #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) #ifdef RE_TRACK_PATTERN_OFFSETS -#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the + others */ #endif #define RExC_emit (pRExC_state->emit) #define RExC_emit_dummy (pRExC_state->emit_dummy) @@ -196,6 +199,7 @@ typedef struct RExC_state_t { #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) #define RExC_npar (pRExC_state->npar) #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) @@ -209,19 +213,20 @@ typedef struct RExC_state_t { #define RExC_paren_names (pRExC_state->paren_names) #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) +#define RExC_study_chunk_recursed_bytes \ + (pRExC_state->study_chunk_recursed_bytes) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) #define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_contains_i (pRExC_state->contains_i) #define RExC_override_recoding (pRExC_state->override_recoding) #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ - ((*s) == '{' && regcurly(s, FALSE))) + ((*s) == '{' && regcurly(s))) -#ifdef SPSTART -#undef SPSTART /* dratted cpp namespace... */ -#endif /* * Flags to be passed up and down. */ @@ -267,6 +272,11 @@ typedef struct RExC_state_t { #define namedclass_to_classnum(class) ((int) ((class) / 2)) #define classnum_to_namedclass(classnum) ((classnum) * 2) +#define _invlist_union_complement_2nd(a, b, output) \ + _invlist_union_maybe_complement_2nd(a, b, TRUE, output) +#define _invlist_intersection_complement_2nd(a, b, output) \ + _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) + /* About scan_data_t. During optimisation we recurse through the regexp program performing @@ -301,7 +311,7 @@ typedef struct RExC_state_t { - max_offset Only used for floating strings. This is the rightmost point that - the string can appear at. If set to I32 max it indicates that the + the string can appear at. If set to SSize_t_MAX it indicates that the string can occur infinitely far to the right. - minlenp @@ -345,28 +355,46 @@ typedef struct RExC_state_t { typedef struct scan_data_t { /*I32 len_min; unused */ /*I32 len_delta; unused */ - I32 pos_min; - I32 pos_delta; + SSize_t pos_min; + SSize_t pos_delta; SV *last_found; - I32 last_end; /* min value, <0 unless valid. */ - I32 last_start_min; - I32 last_start_max; + SSize_t last_end; /* min value, <0 unless valid. */ + SSize_t last_start_min; + SSize_t last_start_max; SV **longest; /* Either &l_fixed, or &l_float. */ SV *longest_fixed; /* longest fixed string found in pattern */ - I32 offset_fixed; /* offset where it starts */ - I32 *minlen_fixed; /* pointer to the minlen relevant to the string */ + SSize_t offset_fixed; /* offset where it starts */ + SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ I32 lookbehind_fixed; /* is the position of the string modfied by LB */ SV *longest_float; /* longest floating string found in pattern */ - I32 offset_float_min; /* earliest point in string it can appear */ - I32 offset_float_max; /* latest point in string it can appear */ - I32 *minlen_float; /* pointer to the minlen relevant to the string */ - I32 lookbehind_float; /* is the position of the string modified by LB */ + SSize_t offset_float_min; /* earliest point in string it can appear */ + SSize_t offset_float_max; /* latest point in string it can appear */ + SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ + SSize_t lookbehind_float; /* is the pos of the string modified by LB */ I32 flags; I32 whilem_c; - I32 *last_closep; - struct regnode_charclass_class *start_class; + SSize_t *last_closep; + regnode_ssc *start_class; } scan_data_t; +/* The below is perhaps overboard, but this allows us to save a test at the + * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' + * and 'a' differ by a single bit; the same with the upper and lower case of + * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; + * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and + * then inverts it to form a mask, with just a single 0, in the bit position + * where the upper- and lowercase differ. XXX There are about 40 other + * instances in the Perl core where this micro-optimization could be used. + * Should decide if maintenance cost is worse, before changing those + * + * Returns a boolean as to whether or not 'v' is either a lowercase or + * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a + * compile-time constant, the generated code is better than some optimizing + * compilers figure out, amounting to a mask and test. The results are + * meaningless if 'c' is not one of [A-Za-z] */ +#define isARG2_lower_or_UPPER_ARG1(c, v) \ + (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) + /* * Forward declarations for pregcomp()'s friends. */ @@ -380,13 +408,8 @@ static const scan_data_t zero_scan_data = #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) -#ifdef NO_UNARY_PLUS -# define SF_FIX_SHIFT_EOL (0+2) -# define SF_FL_SHIFT_EOL (0+4) -#else -# define SF_FIX_SHIFT_EOL (+2) -# define SF_FL_SHIFT_EOL (+4) -#endif +#define SF_FIX_SHIFT_EOL (+2) +#define SF_FL_SHIFT_EOL (+4) #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) @@ -411,15 +434,25 @@ static const scan_data_t zero_scan_data = /* The enums for all these are ordered so things work out correctly */ #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) -#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ + == REGEX_DEPENDS_CHARSET) #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) -#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) -#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) -#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) -#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ + >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) +/* For programs that want to be strictly Unicode compatible by dying if any + * attempt is made to match a non-Unicode code point against a Unicode + * property. */ +#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) + #define OOB_NAMEDCLASS -1 /* There is no code point that is out-of-bounds, so this is problematic. But @@ -442,7 +475,12 @@ static const scan_data_t zero_scan_data = #define MARKER1 "<-- HERE" /* marker as it appears in the description */ #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ -#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" +#define REPORT_LOCATION " in regex; marked by " MARKER1 \ + " in m/%"UTF8f MARKER2 "%"UTF8f"/" + +#define REPORT_LOCATION_ARGS(offset) \ + UTF8fARG(UTF, offset, RExC_precomp), \ + UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) /* * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given @@ -464,12 +502,12 @@ static const scan_data_t zero_scan_data = } STMT_END #define FAIL(msg) _FAIL( \ - Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ - msg, (int)len, RExC_precomp, ellipses)) + Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ + msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) #define FAIL2(msg,arg) _FAIL( \ - Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ - arg, (int)len, RExC_precomp, ellipses)) + Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ + arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) /* * Simple_vFAIL -- like FAIL, but marks the current location in the scan @@ -477,7 +515,7 @@ static const scan_data_t zero_scan_data = #define Simple_vFAIL(m) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -494,8 +532,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL2(m,a1) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -513,8 +551,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL3(m, a1, a2) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END /* @@ -531,8 +569,8 @@ static const scan_data_t zero_scan_data = */ #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vFAIL4(m,a1,a2,a3) STMT_START { \ @@ -541,80 +579,90 @@ static const scan_data_t zero_scan_data = Simple_vFAIL4(m, a1, a2, a3); \ } STMT_END +/* A specialized version of vFAIL2 that works with UTF8f */ +#define vFAIL2utf8f(m, a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + /* m is not necessarily a "literal string", in this macro */ #define reg_warn_non_literal_string(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + m, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNreg(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN_dep(loc, m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARNregdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ m REPORT_LOCATION, \ - (int)offset, RExC_precomp, RExC_precomp + offset); \ + REPORT_LOCATION_ARGS(offset)); \ } STMT_END -#define ckWARN2regdep(loc,m, a1) STMT_START { \ +#define ckWARN2reg_d(loc,m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ m REPORT_LOCATION, \ - a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN2reg(loc, m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN3(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN3reg(loc, m, a1, a2) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ } STMT_END #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \ + a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ } STMT_END @@ -649,7 +697,8 @@ static const scan_data_t zero_scan_data = MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ __LINE__, (int)(node), (int)(byte))); \ if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", \ + (int)(node)); \ } else { \ RExC_offsets[2*(node)-1] = (byte); \ } \ @@ -665,7 +714,8 @@ static const scan_data_t zero_scan_data = MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ __LINE__, (int)(node), (int)(len))); \ if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \ + Perl_croak(aTHX_ "value of node is %d in Length macro", \ + (int)(node)); \ } else { \ RExC_offsets[2*(node)] = (len); \ } \ @@ -691,6 +741,49 @@ static const scan_data_t zero_scan_data = #define EXPERIMENTAL_INPLACESCAN #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ +#define DEBUG_RExC_seen() \ + DEBUG_OPTIMISE_MORE_r({ \ + PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ + \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ + \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ + \ + if (RExC_seen & REG_GPOS_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ + \ + if (RExC_seen & REG_CANY_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ + \ + if (RExC_seen & REG_RECURSE_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ + \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ + \ + if (RExC_seen & REG_VERBARG_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ + \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ + \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ + \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ + \ + if (RExC_seen & REG_GOSTART_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ + PerlIO_printf(Perl_debug_log,"\n"); \ + }); + #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ PerlIO_printf(Perl_debug_log, \ @@ -730,7 +823,8 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ floating substrings if needed. */ STATIC void -S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf) +S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, + SSize_t *minlenp, int is_inf) { const STRLEN l = CHR_SVLEN(data->last_found); const STRLEN old_l = CHR_SVLEN(*data->longest); @@ -754,9 +848,12 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min data->offset_float_min = l ? data->last_start_min : data->pos_min; data->offset_float_max = (l ? data->last_start_max - : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta)); - if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX) - data->offset_float_max = I32_MAX; + : (data->pos_delta == SSize_t_MAX + ? SSize_t_MAX + : data->pos_min + data->pos_delta)); + if (is_inf + || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) + data->offset_float_max = SSize_t_MAX; if (data->flags & SF_BEFORE_EOL) data->flags |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); @@ -780,299 +877,591 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min DEBUG_STUDYDATA("commit: ",data,0); } -/* These macros set, clear and test whether the synthetic start class ('ssc', - * given by the parameter) matches an empty string (EOS). This uses the - * 'next_off' field in the node, to save a bit in the flags field. The ssc - * stands alone, so there is never a next_off, so this field is otherwise - * unused. The EOS information is used only for compilation, but theoretically - * it could be passed on to the execution code. This could be used to store - * more than one bit of information, but only this one is currently used. */ -#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END -#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END -#define TEST_SSC_EOS(node) cBOOL((node)->next_off) - -/* Can match anything (initialization) */ +/* An SSC is just a regnode_charclass_posix with an extra field: the inversion + * list that describes which code points it matches */ + +STATIC void +S_ssc_anything(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to match an empty string or any code point */ + + PERL_ARGS_ASSERT_SSC_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ + _append_range_to_invlist(ssc->invlist, 0, UV_MAX); + ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ +} + +STATIC int +S_ssc_is_anything(const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' can match the empty string and any code + * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys + * us anything: if the function returns TRUE, 'ssc' hasn't been restricted + * in any way, so there's no point in using it */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { + return FALSE; + } + + /* See if the list consists solely of the range 0 - Infinity */ + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (ret) { + return TRUE; + } + + /* If e.g., both \w and \W are set, matches everything */ + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { + return TRUE; + } + } + } + + return FALSE; +} + STATIC void -S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) { - PERL_ARGS_ASSERT_CL_ANYTHING; + /* Initializes the SSC 'ssc'. This includes setting it to match an empty + * string, any code point, or any posix class under locale */ - ANYOF_BITMAP_SETALL(cl); - cl->flags = ANYOF_UNICODE_ALL; - SET_SSC_EOS(cl); + PERL_ARGS_ASSERT_SSC_INIT; + + Zero(ssc, 1, regnode_ssc); + set_ANYOF_SYNTHETIC(ssc); + ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); + ssc_anything(ssc); /* If any portion of the regex is to operate under locale rules, * initialization includes it. The reason this isn't done for all regexes * is that the optimizer was written under the assumption that locale was * all-or-nothing. Given the complexity and lack of documentation in the - * optimizer, and that there are inadequate test cases for locale, so many + * optimizer, and that there are inadequate test cases for locale, many * parts of it may not work properly, it is safest to avoid locale unless * necessary. */ if (RExC_contains_locale) { - ANYOF_CLASS_SETALL(cl); /* /l uses class */ - cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD; + ANYOF_POSIXL_SETALL(ssc); } else { - ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */ + ANYOF_POSIXL_ZERO(ssc); } } -/* Can match anything (initialization) */ STATIC int -S_cl_is_anything(const struct regnode_charclass_class *cl) +S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, + const regnode_ssc *ssc) { - int value; + /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only + * to the list of code points matched, and locale posix classes; hence does + * not check its flags) */ - PERL_ARGS_ASSERT_CL_IS_ANYTHING; + UV start, end; + bool ret; - for (value = 0; value < ANYOF_MAX; value += 2) - if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) - return 1; - if (!(cl->flags & ANYOF_UNICODE_ALL)) - return 0; - if (!ANYOF_BITMAP_TESTALLSET((const void*)cl)) - return 0; - return 1; + PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (! ret) { + return FALSE; + } + + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { + return FALSE; + } + + return TRUE; } -/* Can match anything (initialization) */ -STATIC void -S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) +STATIC SV* +S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, + const regnode_charclass* const node) { - PERL_ARGS_ASSERT_CL_INIT; + /* Returns a mortal inversion list defining which code points are matched + * by 'node', which is of type ANYOF. Handles complementing the result if + * appropriate. If some code points aren't knowable at this time, the + * returned list must, and will, contain every code point that is a + * possibility. */ + + SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; + unsigned int i; + const U32 n = ARG(node); + bool new_node_has_latin1 = FALSE; + + PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; + + /* Look at the data structure created by S_set_ANYOF_arg() */ + if (n != ANYOF_NONBITMAP_EMPTY) { + SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + assert(RExC_rxi->data->what[n] == 's'); + + if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ + invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]))); + } + else if (ary[0] && ary[0] != &PL_sv_undef) { + + /* Here, no compile-time swash, and there are things that won't be + * known until runtime -- we have to assume it could be anything */ + return _add_range_to_invlist(invlist, 0, UV_MAX); + } + else if (ary[3] && ary[3] != &PL_sv_undef) { + + /* Here no compile-time swash, and no run-time only data. Use the + * node's inversion list */ + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; + } + } - Zero(cl, 1, struct regnode_charclass_class); - cl->type = ANYOF; - cl_anything(pRExC_state, cl); - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); + /* An ANYOF node contains a bitmap for the first 256 code points, and an + * inversion list for the others, but if there are code points that should + * match only conditionally on the target string being UTF-8, those are + * placed in the inversion list, and not the bitmap. Since there are + * circumstances under which they could match, they are included in the + * SSC. But if the ANYOF node is to be inverted, we have to exclude them + * here, so that when we invert below, the end result actually does include + * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here + * before we add the unconditionally matched code points */ + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_intersection_complement_2nd(invlist, + PL_UpperLatin1, + &invlist); + } + + /* Add in the points from the bit map */ + for (i = 0; i < 256; i++) { + if (ANYOF_BITMAP_TEST(node, i)) { + invlist = add_cp_to_invlist(invlist, i); + new_node_has_latin1 = TRUE; + } + } + + /* If this can match all upper Latin1 code points, have to add them + * as well */ + if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + _invlist_union(invlist, PL_UpperLatin1, &invlist); + } + + /* Similarly for these */ + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + } + + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_invert(invlist); + } + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + + /* Under /li, any 0-255 could fold to any other 0-255, depending on the + * locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { + _invlist_union_maybe_complement_2nd(invlist, + only_utf8_locale_invlist, + ANYOF_FLAGS(node) & ANYOF_INVERT, + &invlist); + } + + return invlist; } /* These two functions currently do the exact same thing */ -#define cl_init_zero S_cl_init +#define ssc_init_zero ssc_init + +#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) +#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) + +/* 'AND' a given class with another one. Can create false positives. 'ssc' + * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ -/* 'AND' a given class with another one. Can create false positives. 'cl' - * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if - * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void -S_cl_and(struct regnode_charclass_class *cl, - const struct regnode_charclass_class *and_with) +S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *and_with) { - PERL_ARGS_ASSERT_CL_AND; + /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either + * another SSC or a regular ANYOF class. Can create false positives. */ + + SV* anded_cp_list; + U8 anded_flags; + + PERL_ARGS_ASSERT_SSC_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(and_with)) { + anded_cp_list = ((regnode_ssc *)and_with)->invlist; + anded_flags = ANYOF_FLAGS(and_with); + + /* XXX This is a kludge around what appears to be deficiencies in the + * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, + * there are paths through the optimizer where it doesn't get weeded + * out when it should. And if we don't make some extra provision for + * it like the code just below, it doesn't get added when it should. + * This solution is to add it only when AND'ing, which is here, and + * only when what is being AND'ed is the pristine, original node + * matching anything. Thus it is like adding it to ssc_anything() but + * only when the result is to be AND'ed. Probably the same solution + * could be adopted for the same problem we have with /l matching, + * which is solved differently in S_ssc_init(), and that would lead to + * fewer false positives than that solution has. But if this solution + * creates bugs, the consequences are only that a warning isn't raised + * that should be; while the consequences for having /l bugs is + * incorrect matches */ + if (ssc_is_anything((regnode_ssc *)and_with)) { + anded_flags |= ANYOF_WARN_SUPER; + } + } + else { + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } + + ANYOF_FLAGS(ssc) &= anded_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'and-with'; P2, its posix classes. + * 'and_with' may be inverted. When not inverted, we have the situation of + * computing: + * (C1 | P1) & (C2 | P2) + * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) + * <= ((C1 & C2) | P1 | P2) + * Alternatively, the last few steps could be: + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) + * <= (C1 | C2 | (P1 & P2)) + * We favor the second approach if either P1 or P2 is non-empty. This is + * because these components are a barrier to doing optimizations, as what + * they match cannot be known until the moment of matching as they are + * dependent on the current locale, 'AND"ing them likely will reduce or + * eliminate them. + * But we can do better if we know that C1,P1 are in their initial state (a + * frequent occurrence), each matching everything: + * () & (C2 | P2) = C2 | P2 + * Similarly, if C2,P2 are in their initial state (again a frequent + * occurrence), the result is a no-op + * (C1 | P1) & () = C1 | P1 + * + * Inverted, we have + * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) + * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) + * <= (C1 & ~C2) | (P1 & ~P2) + * */ - assert(PL_regkind[and_with->type] == ANYOF); + if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(and_with)) + { + unsigned int i; - /* I (khw) am not sure all these restrictions are necessary XXX */ - if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) - && !(ANYOF_CLASS_TEST_ANY_SET(cl)) - && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(and_with->flags & ANYOF_LOC_FOLD) - && !(cl->flags & ANYOF_LOC_FOLD)) { - int i; + ssc_intersection(ssc, + anded_cp_list, + FALSE /* Has already been inverted */ + ); - if (and_with->flags & ANYOF_INVERT) - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] &= ~and_with->bitmap[i]; - else - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] &= and_with->bitmap[i]; - } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ - - if (and_with->flags & ANYOF_INVERT) { - - /* Here, the and'ed node is inverted. Get the AND of the flags that - * aren't affected by the inversion. Those that are affected are - * handled individually below */ - U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS; - cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS); - cl->flags |= affected_flags; - - /* We currently don't know how to deal with things that aren't in the - * bitmap, but we know that the intersection is no greater than what - * is already in cl, so let there be false positives that get sorted - * out after the synthetic start class succeeds, and the node is - * matched for real. */ - - /* The inversion of these two flags indicate that the resulting - * intersection doesn't have them */ - if (and_with->flags & ANYOF_UNICODE_ALL) { - cl->flags &= ~ANYOF_UNICODE_ALL; - } - if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) { - cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; - } - } - else { /* and'd node is not inverted */ - U8 outside_bitmap_but_not_utf8; /* Temp variable */ - - if (! ANYOF_NONBITMAP(and_with)) { - - /* Here 'and_with' doesn't match anything outside the bitmap - * (except possibly ANYOF_UNICODE_ALL), which means the - * intersection can't either, except for ANYOF_UNICODE_ALL, in - * which case we don't know what the intersection is, but it's no - * greater than what cl already has, so can just leave it alone, - * with possible false positives */ - if (! (and_with->flags & ANYOF_UNICODE_ALL)) { - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); - cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8; - } - } - else if (! ANYOF_NONBITMAP(cl)) { - - /* Here, 'and_with' does match something outside the bitmap, and cl - * doesn't have a list of things to match outside the bitmap. If - * cl can match all code points above 255, the intersection will - * be those above-255 code points that 'and_with' matches. If cl - * can't match all Unicode code points, it means that it can't - * match anything outside the bitmap (since the 'if' that got us - * into this block tested for that), so we leave the bitmap empty. - */ - if (cl->flags & ANYOF_UNICODE_ALL) { - ARG_SET(cl, ARG(and_with)); + /* If either P1 or P2 is empty, the intersection will be also; can skip + * the loop */ + if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { + ANYOF_POSIXL_ZERO(ssc); + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + + /* Note that the Posix class component P from 'and_with' actually + * looks like: + * P = Pa | Pb | ... | Pn + * where each component is one posix class, such as in [\w\s]. + * Thus + * ~P = ~(Pa | Pb | ... | Pn) + * = ~Pa & ~Pb & ... & ~Pn + * <= ~Pa | ~Pb | ... | ~Pn + * The last is something we can easily calculate, but unfortunately + * is likely to have many false positives. We could do better + * in some (but certainly not all) instances if two classes in + * P have known relationships. For example + * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: + * So + * :lower: & :print: = :lower: + * And similarly for classes that must be disjoint. For example, + * since \s and \w can have no elements in common based on rules in + * the POSIX standard, + * \w & ^\S = nothing + * Unfortunately, some vendor locales do not meet the Posix + * standard, in particular almost everything by Microsoft. + * The loop below just changes e.g., \w into \W and vice versa */ + + regnode_charclass_posixl temp; + int add = 1; /* To calculate the index of the complement */ + + ANYOF_POSIXL_ZERO(&temp); + for (i = 0; i < ANYOF_MAX; i++) { + assert(i % 2 != 0 + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); + + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { + ANYOF_POSIXL_SET(&temp, i + add); + } + add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ + } + ANYOF_POSIXL_AND(&temp, ssc); - /* and_with's ARG may match things that don't require UTF8. - * And now cl's will too, in spite of this being an 'and'. See - * the comments below about the kludge */ - cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8; + } /* else ssc already has no posixes */ + } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC + in its initial state */ + else if (! is_ANYOF_SYNTHETIC(and_with) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) + { + /* But if 'ssc' is in its initial state, the result is just 'and_with'; + * copy it over 'ssc' */ + if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { + if (is_ANYOF_SYNTHETIC(and_with)) { + StructCopy(and_with, ssc, regnode_ssc); + } + else { + ssc->invlist = anded_cp_list; + ANYOF_POSIXL_ZERO(ssc); + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); + } } } - else { - /* Here, both 'and_with' and cl match something outside the - * bitmap. Currently we do not do the intersection, so just match - * whatever cl had at the beginning. */ - } - - - /* Take the intersection of the two sets of flags. However, the - * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a - * kludge around the fact that this flag is not treated like the others - * which are initialized in cl_anything(). The way the optimizer works - * is that the synthetic start class (SSC) is initialized to match - * anything, and then the first time a real node is encountered, its - * values are AND'd with the SSC's with the result being the values of - * the real node. However, there are paths through the optimizer where - * the AND never gets called, so those initialized bits are set - * inappropriately, which is not usually a big deal, as they just cause - * false positives in the SSC, which will just mean a probably - * imperceptible slow down in execution. However this bit has a - * higher false positive consequence in that it can cause utf8.pm, - * utf8_heavy.pl ... to be loaded when not necessary, which is a much - * bigger slowdown and also causes significant extra memory to be used. - * In order to prevent this, the code now takes a different tack. The - * bit isn't set unless some part of the regular expression needs it, - * but once set it won't get cleared. This means that these extra - * modules won't get loaded unless there was some path through the - * pattern that would have required them anyway, and so any false - * positives that occur by not ANDing them out when they could be - * aren't as severe as they would be if we treated this bit like all - * the others */ - outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags) - & ANYOF_NONBITMAP_NON_UTF8; - cl->flags &= and_with->flags; - cl->flags |= outside_bitmap_but_not_utf8; + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + { + /* One or the other of P1, P2 is non-empty. */ + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } + ssc_union(ssc, anded_cp_list, FALSE); + } + else { /* P1 = P2 = empty */ + ssc_intersection(ssc, anded_cp_list, FALSE); + } } } -/* 'OR' a given class with another one. Can create false positives. 'cl' - * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if - * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void -S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) +S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *or_with) { - PERL_ARGS_ASSERT_CL_OR; - - if (or_with->flags & ANYOF_INVERT) { - - /* Here, the or'd node is to be inverted. This means we take the - * complement of everything not in the bitmap, but currently we don't - * know what that is, so give up and match anything */ - if (ANYOF_NONBITMAP(or_with)) { - cl_anything(pRExC_state, cl); - } - /* We do not use - * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) - * <= (B1 | !B2) | (CL1 | !CL2) - * which is wasteful if CL2 is small, but we ignore CL2: - * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 - * XXXX Can we handle case-fold? Unclear: - * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = - * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) - */ - else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(or_with->flags & ANYOF_LOC_FOLD) - && !(cl->flags & ANYOF_LOC_FOLD) ) { - int i; - - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] |= ~or_with->bitmap[i]; - } /* XXXX: logic is complicated otherwise */ - else { - cl_anything(pRExC_state, cl); - } + /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either + * another SSC or a regular ANYOF class. Can create false positives if + * 'or_with' is to be inverted. */ - /* And, we can just take the union of the flags that aren't affected - * by the inversion */ - cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS; + SV* ored_cp_list; + U8 ored_flags; - /* For the remaining flags: - ANYOF_UNICODE_ALL and inverted means to not match anything above - 255, which means that the union with cl should just be - what cl has in it, so can ignore this flag - ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord - is 127-255 to match them, but then invert that, so the - union with cl should just be what cl has in it, so can - ignore this flag - */ - } else { /* 'or_with' is not inverted */ - /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ - if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && (!(or_with->flags & ANYOF_LOC_FOLD) - || (cl->flags & ANYOF_LOC_FOLD)) ) { - int i; + PERL_ARGS_ASSERT_SSC_OR; - /* OR char bitmap and class bitmap separately */ - for (i = 0; i < ANYOF_BITMAP_SIZE; i++) - cl->bitmap[i] |= or_with->bitmap[i]; - if (or_with->flags & ANYOF_CLASS) { - ANYOF_CLASS_OR(or_with, cl); - } - } - else { /* XXXX: logic is complicated, leave it along for a moment. */ - cl_anything(pRExC_state, cl); - } + assert(is_ANYOF_SYNTHETIC(ssc)); - if (ANYOF_NONBITMAP(or_with)) { + /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(or_with)) { + ored_cp_list = ((regnode_ssc*) or_with)->invlist; + ored_flags = ANYOF_FLAGS(or_with); + } + else { + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); + ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + } - /* Use the added node's outside-the-bit-map match if there isn't a - * conflict. If there is a conflict (both nodes match something - * outside the bitmap, but what they match outside is not the same - * pointer, and hence not easily compared until XXX we extend - * inversion lists this far), give up and allow the start class to - * match everything outside the bitmap. If that stuff is all above - * 255, can just set UNICODE_ALL, otherwise caould be anything. */ - if (! ANYOF_NONBITMAP(cl)) { - ARG_SET(cl, ARG(or_with)); - } - else if (ARG(cl) != ARG(or_with)) { + ANYOF_FLAGS(ssc) |= ored_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'or-with'; P2, its posix classes. + * 'or_with' may be inverted. When not inverted, we have the simple + * situation of computing: + * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) + * If P1|P2 yields a situation with both a class and its complement are + * set, like having both \w and \W, this matches all code points, and we + * can delete these from the P component of the ssc going forward. XXX We + * might be able to delete all the P components, but I (khw) am not certain + * about this, and it is better to be safe. + * + * Inverted, we have + * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) + * <= (C1 | P1) | ~C2 + * <= (C1 | ~C2) | P1 + * (which results in actually simpler code than the non-inverted case) + * */ - if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) { - cl_anything(pRExC_state, cl); - } - else { - cl->flags |= ANYOF_UNICODE_ALL; + if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(or_with)) + { + /* We ignore P2, leaving P1 going forward */ + } /* else Not inverted */ + else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + unsigned int i; + for (i = 0; i < ANYOF_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) + { + ssc_match_all_cp(ssc); + ANYOF_POSIXL_CLEAR(ssc, i); + ANYOF_POSIXL_CLEAR(ssc, i+1); } } } + } + + ssc_union(ssc, + ored_cp_list, + FALSE /* Already has been inverted */ + ); +} + +PERL_STATIC_INLINE void +S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_UNION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_union_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_intersection(pTHX_ regnode_ssc *ssc, + SV* const invlist, + const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_INTERSECTION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_intersection_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) +{ + PERL_ARGS_ASSERT_SSC_ADD_RANGE; - /* Take the union */ - cl->flags |= or_with->flags; + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); +} + +PERL_STATIC_INLINE void +S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) +{ + /* AND just the single code point 'cp' into the SSC 'ssc' */ + + SV* cp_list = _new_invlist(2); + + PERL_ARGS_ASSERT_SSC_CP_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + cp_list = add_cp_to_invlist(cp_list, cp); + ssc_intersection(ssc, cp_list, + FALSE /* Not inverted */ + ); + SvREFCNT_dec_NN(cp_list); +} + +PERL_STATIC_INLINE void +S_ssc_clear_locale(regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to not match any locale things */ + PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ANYOF_POSIXL_ZERO(ssc); + ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; +} + +STATIC void +S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* The inversion list in the SSC is marked mortal; now we need a more + * permanent copy, which is stored the same way that is done in a regular + * ANYOF node, with the first 256 code points in a bit map */ + + SV* invlist = invlist_clone(ssc->invlist); + + PERL_ARGS_ASSERT_SSC_FINALIZE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* The code in this file assumes that all but these flags aren't relevant + * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the + * time we reach here */ + assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + + populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); + + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); + + /* Make sure is clone-safe */ + ssc->invlist = NULL; + + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; } + + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); } #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) -#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 ) +#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ + ? (TRIE_LIST_CUR( idx ) - 1) \ + : 0 ) #ifdef DEBUGGING @@ -1135,10 +1524,12 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, for( state = 1 ; state < trie->statecount ; state++ ) { const U32 base = trie->states[ state ].trans.base; - PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state); + PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", + (int)depth * 2 + 2,"", (UV)state); if ( trie->states[ state ].wordnum ) { - PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum ); + PerlIO_printf( Perl_debug_log, " W%4X", + trie->states[ state ].wordnum ); } else { PerlIO_printf( Perl_debug_log, "%6s", "" ); } @@ -1150,19 +1541,23 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, while( ( base + ofs < trie->uniquecharcount ) || ( base + ofs - trie->uniquecharcount < trie->lasttrans - && trie->trans[ base + ofs - trie->uniquecharcount ].check != state)) + && trie->trans[ base + ofs - trie->uniquecharcount ].check + != state)) ofs++; PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { - if ( ( base + ofs >= trie->uniquecharcount ) && - ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && - trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + if ( ( base + ofs >= trie->uniquecharcount ) + && ( base + ofs - trie->uniquecharcount + < trie->lasttrans ) + && trie->trans[ base + ofs + - trie->uniquecharcount ].check == state ) { PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, - (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next ); + (UV)trie->trans[ base + ofs + - trie->uniquecharcount ].next ); } else { PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); } @@ -1173,7 +1568,8 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, } PerlIO_printf( Perl_debug_log, "\n" ); } - PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, ""); + PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", + (int)depth*2, ""); for (word=1; word <= trie->wordcount; word++) { PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", (int)word, (int)(trie->wordinfo[word].prev), @@ -1217,14 +1613,16 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, ); } for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { - SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); + SV ** const tmp = av_fetch( revcharmap, + TRIE_LIST_ITEM(state,charid).forid, 0); if ( tmp ) { PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), + colwidth, PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + | PERL_PV_ESCAPE_FIRSTCHAR ) , TRIE_LIST_ITEM(state,charid).forid, (UV)TRIE_LIST_ITEM(state,charid).newstate @@ -1300,9 +1698,11 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); } if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { - PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); + PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", + (UV)trie->trans[ state ].check ); } else { - PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, + PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", + (UV)trie->trans[ state ].check, trie->states[ TRIE_NODENUM( state ) ].wordnum ); } } @@ -1319,7 +1719,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, May be the same as tail. tail : item following the branch sequence count : words in the sequence - flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ + flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/ depth : indent depth Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. @@ -1421,7 +1821,7 @@ and would end up looking like: 8: EXACT (10) 10: END(0) - d = uvuni_to_utf8_flags(d, uv, 0); + d = uvchr_to_utf8_flags(d, uv, 0); is the recommended Unicode-aware way of saying @@ -1433,7 +1833,7 @@ is the recommended Unicode-aware way of saying if (UTF) { \ SV *zlopp = newSV(7); /* XXX: optimize me */ \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \ + unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ SvPOK_on(zlopp); \ SvUTF8_on(zlopp); \ @@ -1444,31 +1844,28 @@ is the recommended Unicode-aware way of saying } \ } STMT_END -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - /* if it is UTF then it is either already folded, or does not need folding */ \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \ - } \ - else if (folder == PL_fold_latin1) { \ - /* if we use this folder we have to obey unicode rules on latin-1 data */ \ - if ( foldlen > 0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - scan += len; \ - len = 0; \ - } else { \ - len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ - skiplen = UNISKIP(uvc); \ - foldlen -= skiplen; \ - scan = foldbuf + skiplen; \ - } \ - } else { \ - /* raw data, will be folded later if needed */ \ - uvc = (U32)*uc; \ - len = 1; \ - } \ +/* This gets the next character from the input, folding it if not already + * folded. */ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need \ + * folding */ \ + uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* This folder implies Unicode rules, which in the range expressible \ + * by not UTF is the lower case, with the two exceptions, one of \ + * which should have been taken care of before calling this */ \ + assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ + uvc = toLOWER_L1(*uc); \ + if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ + len = 1; \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ } STMT_END @@ -1511,7 +1908,8 @@ is the recommended Unicode-aware way of saying \ if ( noper_next < tail ) { \ if (!trie->jump) \ - trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ trie->jump[curword] = (U16)(noper_next - convert); \ if (!jumper) \ jumper = noper_next; \ @@ -1546,7 +1944,9 @@ is the recommended Unicode-aware way of saying #define MADE_EXACT_TRIE 4 STATIC I32 -S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) +S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + regnode *first, regnode *last, regnode *tail, + U32 word_count, U32 flags, U32 depth) { dVAR; /* first pass, loop through and scan words */ @@ -1554,7 +1954,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs HV *widecharmap = NULL; AV *revcharmap = newAV(); regnode *cur; - const U32 uniflags = UTF8_ALLOW_DEFAULT; STRLEN len = 0; UV uvc = 0; U16 curword = 0; @@ -1567,13 +1966,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 * folder = NULL; #ifdef DEBUGGING - const U32 data_slot = add_data( pRExC_state, 4, "tuuu" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu")); AV *trie_words = NULL; /* along with revcharmap, this only used during construction but both are * useful during debugging so we store them in the struct when debugging. */ #else - const U32 data_slot = add_data( pRExC_state, 2, "tu" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); STRLEN trie_charcount=0; #endif SV *re_trie_maxbuff; @@ -1588,10 +1987,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs case EXACT: break; case EXACTFA: case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; - case EXACTFL: folder = PL_fold_locale; break; default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); } @@ -1611,16 +2008,16 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs }); re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + assert(re_trie_maxbuff); if (!SvIOK(re_trie_maxbuff)) { sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); } DEBUG_TRIE_COMPILE_r({ - PerlIO_printf( Perl_debug_log, - "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", - (int)depth * 2 + 2, "", - REG_NODE_NUM(startbranch),REG_NODE_NUM(first), - REG_NODE_NUM(last), REG_NODE_NUM(tail), - (int)depth); + PerlIO_printf( Perl_debug_log, + "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + (int)depth * 2 + 2, "", + REG_NODE_NUM(startbranch),REG_NODE_NUM(first), + REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); }); /* Find the node we are going to overwrite */ @@ -1640,9 +2037,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs have unique chars. We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store Unicode characters. We use the - native representation of the character value as the key and IV's for the - coded index. + (trie->charmap) and we use a an HV* to store Unicode characters. We use + the native representation of the character value as the key and IV's for + the coded index. *TODO* If we keep track of how many times each character is used we can remap the columns so that the table compression later on is more @@ -1659,13 +2056,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regnode *noper = NEXTOPER( cur ); const U8 *uc = (U8*)STRING( noper ); const U8 *e = uc + STR_LEN( noper ); - STRLEN foldlen = 0; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - STRLEN skiplen = 0; - const U8 *scan = (U8*)NULL; + int foldlen = 0; U32 wordlen = 0; /* required init */ - STRLEN chars = 0; - bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ + STRLEN minchars = 0; + STRLEN maxchars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the + bitmap?*/ if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -1685,13 +2081,77 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regardless of encoding */ if (OP( noper ) == EXACTFU_SS) { /* false positives are ok, so just set this */ - TRIE_BITMAP_SET(trie,0xDF); + TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); } } - for ( ; uc < e ; uc += len ) { + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; - chars++; + + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; + } + else { + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because for + * non-UTF, the latin1_safe macro is smart enough to account + * for all the unfolded characters, and because for UTF, the + * string will already have been folded earlier in the + * compilation process */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); + } + } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } + } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ if ( uvc < 256 ) { if ( folder ) { U8 folded= folder[ (U8) uvc ]; @@ -1715,13 +2175,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !UTF ) { /* store first byte of utf8 representation of variant codepoints */ - if (! UNI_IS_INVARIANT(uvc)) { + if (! UVCHR_IS_INVARIANT(uvc)) { TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); } } set_bit = 0; /* We've done our bit :-) */ } } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + SV** svpp; if ( !widecharmap ) widecharmap = newHV(); @@ -1736,30 +2204,22 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs TRIE_STORE_REVCHAR(uvc); } } - } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ if( cur == first ) { - trie->minlen = chars; - trie->maxlen = chars; - } else if (chars < trie->minlen) { - trie->minlen = chars; - } else if (chars > trie->maxlen) { - trie->maxlen = chars; - } - if (OP( noper ) == EXACTFU_SS) { - /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/ - if (trie->minlen > 1) - trie->minlen= 1; + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; } - if (OP( noper ) == EXACTFU_TRICKYFOLD) { - /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" - * - We assume that any such sequence might match a 2 byte string */ - if (trie->minlen > 2 ) - trie->minlen= 2; - } - } /* end first pass */ DEBUG_TRIE_COMPILE_r( - PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + PerlIO_printf( Perl_debug_log, + "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", (int)depth * 2 + 2,"", ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, @@ -1791,7 +2251,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); prev_states[1] = 0; - if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) { + if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) + > SvIV(re_trie_maxbuff) ) + { /* Second Pass -- Array Of Lists Representation @@ -1823,11 +2285,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ U16 charid = 0; /* sanity init */ - U8 *scan = (U8*)NULL; /* sanity init */ - STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; - STRLEN skiplen = 0; if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -1846,14 +2304,18 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( uvc < 256 ) { charid = trie->charmap[ uvc ]; } else { - SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + SV** const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); if ( !svpp ) { charid = 0; } else { charid=(U16)SvIV( *svpp ); } } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ if ( charid ) { U16 check; @@ -1863,8 +2325,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !trie->states[ state ].trans.list ) { TRIE_LIST_NEW( state ); } - for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) { - if ( TRIE_LIST_ITEM( state, check ).forid == charid ) { + for ( check = 1; + check <= TRIE_LIST_USED( state ); + check++ ) + { + if ( TRIE_LIST_ITEM( state, check ).forid + == charid ) + { newstate = TRIE_LIST_ITEM( state, check ).newstate; break; } @@ -1934,7 +2401,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->trans, transcount * sizeof(reg_trie_trans) ); - Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans ); + Zero( trie->trans + (transcount / 2), + transcount / 2, + reg_trie_trans ); } base = trie->uniquecharcount + tp - minid; if ( maxid == minid ) { @@ -1942,22 +2411,27 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs for ( ; zp < tp ; zp++ ) { if ( ! trie->trans[ zp ].next ) { base = trie->uniquecharcount + zp - minid; - trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ zp ].next = TRIE_LIST_ITEM( state, + 1).newstate; trie->trans[ zp ].check = state; set = 1; break; } } if ( !set ) { - trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate; + trie->trans[ tp ].next = TRIE_LIST_ITEM( state, + 1).newstate; trie->trans[ tp ].check = state; tp++; zp = tp; } } else { for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { - const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid; - trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate; + const U32 tid = base + - trie->uniquecharcount + + TRIE_LIST_ITEM( state, idx ).forid; + trie->trans[ tid ].next = TRIE_LIST_ITEM( state, + idx ).newstate; trie->trans[ tid ].check = state; } tp += ( maxid - minid + 1 ); @@ -1977,26 +2451,26 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* Second Pass -- Flat Table Representation. - we dont use the 0 slot of either trans[] or states[] so we add 1 to each. - We know that we will need Charcount+1 trans at most to store the data - (one row per char at worst case) So we preallocate both structures - assuming worst case. + we dont use the 0 slot of either trans[] or states[] so we add 1 to + each. We know that we will need Charcount+1 trans at most to store + the data (one row per char at worst case) So we preallocate both + structures assuming worst case. We then construct the trie using only the .next slots of the entry structs. - We use the .check field of the first entry of the node temporarily to - make compression both faster and easier by keeping track of how many non - zero fields are in the node. + We use the .check field of the first entry of the node temporarily + to make compression both faster and easier by keeping track of how + many non zero fields are in the node. Since trans are numbered from 1 any 0 pointer in the table is a FAIL transition. - There are two terms at use here: state as a TRIE_NODEIDX() which is a - number representing the first entry of the node, and state as a - TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and - TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there - are 2 entrys per node. eg: + There are two terms at use here: state as a TRIE_NODEIDX() which is + a number representing the first entry of the node, and state as a + TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) + and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) + if there are 2 entrys per node. eg: A B A B 1. 2 4 1. 3 7 @@ -2004,9 +2478,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs 3. 0 0 5. 0 0 4. 0 0 7. 0 0 - The table is internally in the right hand, idx form. However as we also - have to deal with the states array which is indexed by nodenum we have to - use TRIE_NODENUM() to convert. + The table is internally in the right hand, idx form. However as we + also have to deal with the states array which is indexed by nodenum + we have to use TRIE_NODENUM() to convert. */ DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, @@ -2033,12 +2507,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U16 charid = 0; /* sanity init */ U32 accept_state = 0; /* sanity init */ - U8 *scan = (U8*)NULL; /* sanity init */ - STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ - STRLEN skiplen = 0; - U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); @@ -2057,7 +2527,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( uvc < 256 ) { charid = trie->charmap[ uvc ]; } else { - SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0); + SV* const * const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); charid = svpp ? (U16)SvIV(*svpp) : 0; } if ( charid ) { @@ -2073,7 +2546,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } else { Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); } - /* charid is now 0 if we dont know the char read, or nonzero if we do */ + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ } } accept_state = TRIE_NODENUM( state ); @@ -2160,7 +2634,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U32 used = trie->trans[ stateidx ].check; trie->trans[ stateidx ].check = 0; - for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) { + for ( charid = 0; + used && charid < trie->uniquecharcount; + charid++ ) + { if ( flag || trie->trans[ stateidx + charid ].next ) { if ( trie->trans[ stateidx + charid ].next ) { if (o_used == 1) { @@ -2169,8 +2646,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs break; } } - trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ; - trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->states[ state ].trans.base + = zp + + trie->uniquecharcount + - charid ; + trie->trans[ zp ].next + = SAFE_TRIE_NODENUM( trie->trans[ stateidx + + charid ].next ); trie->trans[ zp ].check = state; if ( ++zp > pos ) pos = zp; break; @@ -2179,9 +2661,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( !flag ) { flag = 1; - trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ; + trie->states[ state ].trans.base + = pos + trie->uniquecharcount - charid ; } - trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next ); + trie->trans[ pos ].next + = SAFE_TRIE_NODENUM( + trie->trans[ stateidx + charid ].next ); trie->trans[ pos ].check = state; pos++; } @@ -2192,19 +2677,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->states, laststate * sizeof(reg_trie_state) ); DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf( Perl_debug_log, - "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", - (int)depth * 2 + 2,"", - (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), - (IV)next_alloc, - (IV)pos, - ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); + PerlIO_printf( Perl_debug_log, + "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + (int)depth * 2 + 2,"", + (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + + 1 ), + (IV)next_alloc, + (IV)pos, + ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); ); } /* end table compress */ } DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", + PerlIO_printf(Perl_debug_log, + "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", (int)depth * 2 + 2, "", (UV)trie->statecount, (UV)trie->lasttrans) @@ -2255,7 +2742,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs }); } DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", + PerlIO_printf(Perl_debug_log, + "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", (int)depth * 2 + 2, "", (UV)mjd_offset, (UV)mjd_nodelen) ); @@ -2496,25 +2984,30 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs : MADE_TRIE; } -STATIC void -S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) +STATIC regnode * +S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) { -/* The Trie is constructed and compressed now so we can build a fail array if it's needed +/* The Trie is constructed and compressed now so we can build a fail array if + * it's needed - This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the - "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and + 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, + Ullman 1985/88 ISBN 0-201-10088-6 - We find the fail state for each state in the trie, this state is the longest proper - suffix of the current state's 'word' that is also a proper prefix of another word in our - trie. State 1 represents the word '' and is thus the default fail state. This allows - the DFA not to have to restart after its tried and failed a word at a given point, it - simply continues as though it had been matching the other word in the first place. + We find the fail state for each state in the trie, this state is the longest + proper suffix of the current state's 'word' that is also a proper prefix of + another word in our trie. State 1 represents the word '' and is thus the + default fail state. This allows the DFA not to have to restart after its + tried and failed a word at a given point, it simply continues as though it + had been matching the other word in the first place. Consider 'abcdgu'=~/abcdefg|cdgu/ - When we get to 'd' we are still matching the first word, we would encounter 'g' which would - fail, which would bring us to the state representing 'd' in the second word where we would - try 'g' and succeed, proceeding to match 'cdgu'. + When we get to 'd' we are still matching the first word, we would encounter + 'g' which would fail, which would bring us to the state representing 'd' in + the second word where we would try 'g' and succeed, proceeding to match + 'cdgu'. */ /* add a fail transition */ const U32 trie_offset = ARG(source); @@ -2529,14 +3022,28 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode U32 base = trie->states[ 1 ].trans.base; U32 *fail; reg_ac_data *aho; - const U32 data_slot = add_data( pRExC_state, 1, "T" ); + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); + regnode *stclass; GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; + PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; + PERL_UNUSED_CONTEXT; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif + if ( OP(source) == TRIE ) { + struct regnode_1 *op = (struct regnode_1 *) + PerlMemShared_calloc(1, sizeof(struct regnode_1)); + StructCopy(source,op,struct regnode_1); + stclass = (regnode *)op; + } else { + struct regnode_charclass *op = (struct regnode_charclass *) + PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); + StructCopy(source,op,struct regnode_charclass); + stclass = (regnode *)op; + } + OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */ ARG_SET( stclass, data_slot ); aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); @@ -2602,26 +3109,16 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode PerlIO_printf(Perl_debug_log, "\n"); }); Safefree(q); - /*RExC_seen |= REG_SEEN_TRIEDFA;*/ + /*RExC_seen |= REG_TRIEDFA_SEEN;*/ + return stclass; } -/* - * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2. - * These need to be revisited when a newer toolchain becomes available. - */ -#if defined(__sparc64__) && defined(__GNUC__) -# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) -# undef SPARC64_GCC_WORKAROUND -# define SPARC64_GCC_WORKAROUND 1 -# endif -#endif - #define DEBUG_PEEP(str,scan,depth) \ DEBUG_OPTIMISE_r({if (scan){ \ SV * const mysv=sv_newmortal(); \ regnode *Next = regnext(scan); \ - regprop(RExC_rx, mysv, scan); \ + regprop(RExC_rx, mysv, scan, NULL); \ PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \ (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ Next ? (REG_NODE_NUM(Next)) : 0 ); \ @@ -2640,49 +3137,58 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * * If a node is to match under /i (folded), the number of characters it matches * can be different than its character length if it contains a multi-character - * fold. *min_subtract is set to the total delta of the input nodes. + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. * - * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF - * and contains LATIN SMALL LETTER SHARP S + * And *unfolded_multi_char is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when whether the fold is valid or + * not won't be known until runtime; namely for EXACTF nodes that contain LATIN + * SMALL LETTER SHARP S, as only if the target string being matched against + * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose + * folding rules depend on the locale in force at runtime. (Multi-char folds + * whose components are all above the Latin1 range are not run-time locale + * dependent, and have already been folded by the time this function is + * called.) * * This is as good a place as any to discuss the design of handling these * multi-character fold sequences. It's been wrong in Perl for a very long * time. There are three code points in Unicode whose multi-character folds * were long ago discovered to mess things up. The previous designs for * dealing with these involved assigning a special node for them. This - * approach doesn't work, as evidenced by this example: + * approach doesn't always work, as evidenced by this example: * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches - * Both these fold to "sss", but if the pattern is parsed to create a node that + * Both sides fold to "sss", but if the pattern is parsed to create a node that * would match just the \xDF, it won't be able to handle the case where a * successful match would have to cross the node's boundary. The new approach * that hopefully generally solves the problem generates an EXACTFU_SS node - * that is "sss". + * that is "sss" in this case. * * It turns out that there are problems with all multi-character folds, and not - * just these three. Now the code is general, for all such cases, but the - * three still have some special handling. The approach taken is: + * just these three. Now the code is general, for all such cases. The + * approach taken is: * 1) This routine examines each EXACTFish node that could contain multi- - * character fold sequences. It returns in *min_subtract how much to - * subtract from the the actual length of the string to get a real minimum - * match length; it is 0 if there are no multi-char folds. This delta is - * used by the caller to adjust the min length of the match, and the delta - * between min and max, so that the optimizer doesn't reject these - * possibilities based on size constraints. - * 2) Certain of these sequences require special handling by the trie code, - * so, if found, this code changes the joined node type to special ops: - * EXACTFU_TRICKYFOLD and EXACTFU_SS. - * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS + * character folded sequences. Since a single character can fold into + * such a sequence, the minimum match length for this node is less than + * the number of characters in the node. This routine returns in + * *min_subtract how many characters to subtract from the the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. + * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS * is used for an EXACTFU node that contains at least one "ss" sequence in * it. For non-UTF-8 patterns and strings, this is the only case where * there is a possible fold length change. That means that a regular * EXACTFU node without UTF-8 involvement doesn't have to concern itself * with length changes, and so can be processed faster. regexec.c takes * advantage of this. Generally, an EXACTFish node that is in UTF-8 is - * pre-folded by regcomp.c. This saves effort in regex matching. - * However, the pre-folding isn't done for non-UTF8 patterns because the - * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things - * down by forcing the pattern into UTF8 unless necessary. Also what - * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold + * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't + * known until runtime). This saves effort in regex matching. However, + * the pre-folding isn't done for non-UTF8 patterns because the fold of + * the MICRO SIGN requires UTF-8, and we don't want to slow things down by + * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, + * again, EXACTFL) nodes fold to isn't known until runtime. The fold * possibilities for the non-UTF8 patterns are quite simple, except for * the sharp s. All the ones that don't involve a UTF-8 target string are * members of a fold-pair, and arrays are set up for all of them so that @@ -2690,45 +3196,63 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * this file makes sure that in EXACTFU nodes, the sharp s gets folded to * 'ss', even if the pattern isn't UTF-8. This avoids the issues * described in the next item. - * 4) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the - * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a - * UTF-8 pattern.) An assumption that the optimizer part of regexec.c - * (probably unwittingly, in Perl_regexec_flags()) makes is that a - * character in the pattern corresponds to at most a single character in - * the target string. (And I do mean character, and not byte here, unlike - * other parts of the documentation that have never been updated to - * account for multibyte Unicode.) sharp s in EXACTF nodes can match the - * two character string 'ss'; in EXACTFA nodes it can match - * "\x{17F}\x{17F}". These violate the assumption, and they are the only - * instances where it is violated. I'm reluctant to try to change the - * assumption, as the code involved is impenetrable to me (khw), so - * instead the code here punts. This routine examines (when the pattern - * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a - * boolean indicating whether or not the node contains a sharp s. When it - * is true, the caller sets a flag that later causes the optimizer in this - * file to not set values for the floating and fixed string lengths, and - * thus avoids the optimizer code in regexec.c that makes the invalid + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes + * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL + * nodes, violate the assumption, and they are the only instances where it + * is violated. I'm reluctant to try to change the assumption, as the + * code involved is impenetrable to me (khw), so instead the code here + * punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid * assumption. Thus, there is no optimization based on string lengths for - * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s. - * (The reason the assumption is wrong only in these two cases is that all - * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all - * other folds to their expanded versions. We can't prefold sharp s to - * 'ss' in EXACTF nodes because we don't know at compile time if it - * actually matches 'ss' or not. It will match iff the target string is - * in UTF-8, unlike the EXACTFU nodes, where it always matches; and - * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8 - * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem; - * but in a non-UTF8 pattern, folding it to that above-Latin1 string would - * require the pattern to be forced into UTF-8, the overhead of which we - * want to avoid.) - */ - -#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFA where it never does. In an EXACTFA node in + * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) + * + * Similarly, the code that generates tries doesn't currently handle + * not-already-folded multi-char folds, and it looks like a pain to change + * that. Therefore, trie generation of EXACTFA nodes with the sharp s + * doesn't work. Instead, such an EXACTFA is turned into a new regnode, + * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people + * using /iaa matching will be doing so almost entirely with ASCII + * strings, so this should rarely be encountered in practice */ + +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) { +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, + UV *min_subtract, bool *unfolded_multi_char, + U32 flags,regnode *val, U32 depth) +{ /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; @@ -2774,8 +3298,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b const unsigned int oldl = STR_LEN(scan); regnode * const nnext = regnext(n); - /* XXX I (khw) kind of doubt that this works on platforms where - * U8_MAX is above 255 because of lots of other assumptions */ + /* XXX I (khw) kind of doubt that this works on platforms (should + * Perl ever run on one) where U8_MAX is above 255 because of lots + * of other assumptions */ /* Don't join if the sum can't fit into a single node */ if (oldl + STR_LEN(n) > U8_MAX) break; @@ -2810,7 +3335,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b } *min_subtract = 0; - *has_exactf_sharp_s = FALSE; + *unfolded_multi_char = FALSE; /* Here, all the adjacent mergeable EXACTish nodes have been merged. We * can now analyze for sequences of problematic code points. (Prior to @@ -2818,15 +3343,68 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ if (OP(scan) != EXACT) { - const U8 * const s0 = (U8*) STRING(scan); - const U8 * s = s0; - const U8 * const s_end = s0 + STR_LEN(scan); + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ /* One pass is made over the node's string looking for all the - * possibilities. to avoid some tests in the loop, there are two main + * possibilities. To avoid some tests in the loop, there are two main * cases, for UTF-8 patterns (which can't have EXACTF nodes) and * non-UTF-8 */ if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *unfolded_multi_char = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ /* Examine the string for a multi-character fold sequence. UTF-8 * patterns have all characters pre-folded by the time this code is @@ -2834,60 +3412,32 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b while (s < s_end - 1) /* Can stop 1 before the end, as minimum length sequence we are looking for is 2 */ { - int count = 0; + int count = 0; /* How many characters in a multi-char fold */ int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); if (! len) { /* Not a multi-char fold: get next char */ s += UTF8SKIP(s); continue; } - /* Nodes with 'ss' require special handling, except for EXACTFL - * and EXACTFA for which there is no multi-char fold to this */ + /* Nodes with 'ss' require special handling, except for + * EXACTFA-ish for which there is no multi-char fold to this */ if (len == 2 && *s == 's' && *(s+1) == 's' - && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + && OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE) { count = 2; - OP(scan) = EXACTFU_SS; - s += 2; - } - else if (len == 6 /* len is the same in both ASCII and EBCDIC - for these */ - && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8 - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8, - 6) - || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8 - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8, - 6))) - { - count = 3; - - /* These two folds require special handling by trie's, so - * change the node type to indicate this. If EXACTFA and - * EXACTFL were ever to be handled by trie's, this would - * have to be changed. If this node has already been - * changed to EXACTFU_SS in this loop, leave it as is. (I - * (khw) think it doesn't matter in regexec.c for UTF - * patterns, but no need to change it */ - if (OP(scan) == EXACTFU) { - OP(scan) = EXACTFU_TRICKYFOLD; + if (OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; } - s += 6; + s += 2; } else { /* Here is a generic multi-char fold. */ - const U8* multi_end = s + len; + U8* multi_end = s + len; - /* Count how many characters in it. In the case of /l and + /* Count how many characters are in it. In the case of * /aa, no folds which contain ASCII code points are - * allowed, so check for those, and skip if found. (In - * EXACTFL, no folds are allowed to any Latin1 code point, - * not just ASCII. But there aren't any of these - * currently, nor ever likely, so don't take the time to - * test for them. The code that generates the - * is_MULTI_foo() macros croaks should one actually get put - * into Unicode .) */ - if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + * allowed, so check for those, and skip if found. */ + if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { count = utf8_length(s, multi_end); s = multi_end; } @@ -2907,70 +3457,78 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b /* The delta is how long the sequence is minus 1 (1 is how long * the character that folds to the sequence is) */ - *min_subtract += count - 1; + total_count_delta += count - 1; next_iteration: ; } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); } else if (OP(scan) == EXACTFA) { /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char * fold to the ASCII range (and there are no existing ones in the * upper latin1 range). But, as outlined in the comments preceding - * this function, we need to flag any occurrences of the sharp s */ + * this function, we need to flag any occurrences of the sharp s. + * This character forbids trie formation (because of added + * complexity) */ while (s < s_end) { if (*s == LATIN_SMALL_LETTER_SHARP_S) { - *has_exactf_sharp_s = TRUE; + OP(scan) = EXACTFA_NO_TRIE; + *unfolded_multi_char = TRUE; break; } s++; continue; } } - else if (OP(scan) != EXACTFL) { - - /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the - * multi-char folds that are all Latin1. (This code knows that - * there are no current multi-char folds possible with EXACTFL, - * relying on fold_grind.t to catch any errors if the very unlikely - * event happens that some get added in future Unicode versions.) - * As explained in the comments preceding this function, we look - * also for the sharp s in EXACTF nodes; it can be in the final - * position. Otherwise we can stop looking 1 byte earlier because - * have to find at least two characters for a multi-fold */ - const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; - - /* The below is perhaps overboard, but this allows us to save a - * test each time through the loop at the expense of a mask. This - * is because on both EBCDIC and ASCII machines, 'S' and 's' differ - * by a single bit. On ASCII they are 32 apart; on EBCDIC, they - * are 64. This uses an exclusive 'or' to find that bit and then - * inverts it to form a mask, with just a single 0, in the bit - * position where 'S' and 's' differ. */ - const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); - const U8 s_masked = 's' & S_or_s_mask; + else { + + /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; while (s < upper) { int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); if (! len) { /* Not a multi-char fold. */ - if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) { - *has_exactf_sharp_s = TRUE; + *unfolded_multi_char = TRUE; } s++; continue; } if (len == 2 - && ((*s & S_or_s_mask) == s_masked) - && ((*(s+1) & S_or_s_mask) == s_masked)) + && isARG2_lower_or_UPPER_ARG1('s', *s) + && isARG2_lower_or_UPPER_ARG1('s', *(s+1))) { /* EXACTF nodes need to know that the minimum length * changed so that a sharp s in the string can match this * ss in the pattern, but they remain EXACTF nodes, as they * won't match this unless the target string is is UTF-8, - * which we don't know until runtime */ - if (OP(scan) != EXACTF) { + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { OP(scan) = EXACTFU_SS; } } @@ -3004,7 +3562,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b #define INIT_AND_WITHP \ assert(!and_withp); \ - Newx(and_withp,1,struct regnode_charclass_class); \ + Newx(and_withp,1, regnode_ssc); \ SAVEFREEPV(and_withp) /* this is a chain of data about sub patterns we are processing that @@ -3015,20 +3573,19 @@ typedef struct scan_frame { regnode *last; /* last node to process in this frame */ regnode *next; /* next node to process when last is reached */ struct scan_frame *prev; /*previous frame*/ + U32 prev_recursed_depth; I32 stop; /* what stopparen do we use */ } scan_frame; -#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) - -STATIC I32 +STATIC SSize_t S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, - I32 *minlenp, I32 *deltap, + SSize_t *minlenp, SSize_t *deltap, regnode *last, scan_data_t *data, I32 stopparen, - U8* recursed, - struct regnode_charclass_class *and_withp, + U32 recursed_depth, + regnode_ssc *and_withp, U32 flags, U32 depth) /* scanp: Start here (read-write). */ /* deltap: Write maxlen-minlen here. */ @@ -3039,17 +3596,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { dVAR; - I32 min = 0; /* There must be at least this number of characters to match */ + /* There must be at least this number of characters to match */ + SSize_t min = 0; I32 pars = 0, code; regnode *scan = *scanp, *next; - I32 delta = 0; + SSize_t delta = 0; int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); int is_inf_internal = 0; /* The studied chunk is infinite */ I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; scan_data_t data_fake; SV *re_trie_maxbuff = NULL; regnode *first_non_open = scan; - I32 stopmin = I32_MAX; + SSize_t stopmin = SSize_t_MAX; scan_frame *frame = NULL; GET_RE_DEBUG_FLAGS_DECL; @@ -3058,7 +3616,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #ifdef DEBUGGING StructCopy(&zero_scan_data, &data_fake, scan_data_t); #endif - if ( depth == 0 ) { while (first_non_open && OP(first_non_open) == OPEN) first_non_open=regnext(first_non_open); @@ -3070,19 +3627,44 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, UV min_subtract = 0; /* How mmany chars to subtract from the minimum node length to get a real minimum (because the folded version may be shorter) */ - bool has_exactf_sharp_s = FALSE; + bool unfolded_multi_char = FALSE; /* Peephole optimizer: */ - DEBUG_STUDYDATA("Peep:", data,depth); - DEBUG_PEEP("Peep",scan,depth); - - /* Its not clear to khw or hv why this is done here, and not in the - * clauses that deal with EXACT nodes. khw's guess is that it's - * because of a previous design */ - JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); - - /* Follow the next-chain of the current node and optimize - away all the NOTHINGs from it. */ - if (OP(scan) != CURLYX) { + DEBUG_OPTIMISE_MORE_r( + { + PerlIO_printf(Perl_debug_log, + "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ", + ((int) depth*2), "", (long)stopparen, + (unsigned long)depth, (unsigned long)recursed_depth); + if (recursed_depth) { + U32 i; + U32 j; + for ( j = 0 ; j < recursed_depth ; j++ ) { + PerlIO_printf(Perl_debug_log,"["); + for ( i = 0 ; i < (U32)RExC_npar ; i++ ) + PerlIO_printf(Perl_debug_log,"%d", + PAREN_TEST(RExC_study_chunk_recursed + + (j * RExC_study_chunk_recursed_bytes), i) + ? 1 : 0 + ); + PerlIO_printf(Perl_debug_log,"]"); + } + } + PerlIO_printf(Perl_debug_log,"\n"); + } + ); + DEBUG_STUDYDATA("Peep:", data, depth); + DEBUG_PEEP("Peep", scan, depth); + + + /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ + * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled + * by a different invocation of reg() -- Yves + */ + JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); + + /* Follow the next-chain of the current node and optimize + away all the NOTHINGs from it. */ + if (OP(scan) != CURLYX) { const int max = (reg_off_by_arg[OP(scan)] ? I32_MAX /* I32 may be smaller than U16 on CRAYs! */ @@ -3111,24 +3693,29 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, || OP(scan) == IFTHEN) { next = regnext(scan); code = OP(scan); - /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ + /* demq: the op(next)==code check is to see if we have + * "branch-branch" AFAICT */ if (OP(next) == code || code == IFTHEN) { - /* NOTE - There is similar code to this block below for handling - TRIE nodes on a re-study. If you change stuff here check there - too. */ - I32 max1 = 0, min1 = I32_MAX, num = 0; - struct regnode_charclass_class accum; + /* NOTE - There is similar code to this block below for + * handling TRIE nodes on a re-study. If you change stuff here + * check there too. */ + SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; + regnode_ssc accum; regnode * const startbranch=scan; - if (flags & SCF_DO_SUBSTR) - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); + ssc_init_zero(pRExC_state, &accum); while (OP(scan) == code) { - I32 deltanext, minnext, f = 0, fake; - struct regnode_charclass_class this_class; + SSize_t deltanext, minnext, fake; + I32 f = 0; + regnode_ssc this_class; num++; data_fake.flags = 0; @@ -3145,7 +3732,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (code != BRANCH) scan = NEXTOPER(scan); if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } @@ -3153,14 +3740,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, f |= SCF_WHILEM_VISITED_POS; /* we suppose the run is continuous, last=next...*/ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - next, &data_fake, - stopparen, recursed, NULL, f,depth+1); + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f,depth+1); if (min1 > minnext) min1 = minnext; - if (deltanext == I32_MAX) { + if (deltanext == SSize_t_MAX) { is_inf = is_inf_internal = 1; - max1 = I32_MAX; + max1 = SSize_t_MAX; } else if (max1 < minnext + deltanext) max1 = minnext + deltanext; scan = next; @@ -3179,63 +3766,64 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); } if (code == IFTHEN && num < 2) /* Empty ELSE branch */ min1 = 0; if (flags & SCF_DO_SUBSTR) { data->pos_min += min1; - if (data->pos_delta >= I32_MAX - (max1 - min1)) - data->pos_delta = I32_MAX; + if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) + data->pos_delta = SSize_t_MAX; else data->pos_delta += max1 - min1; if (max1 != min1 || is_inf) data->longest = &(data->longest_float); } min += min1; - if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0) - delta = I32_MAX; + if (delta == SSize_t_MAX + || SSize_t_MAX - delta - (max1 - min1) < 0) + delta = SSize_t_MAX; else delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); if (min1) { - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - cl_and(data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, - struct regnode_charclass_class); + StructCopy(&accum, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); } } - if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) { + if (PERL_ENABLE_TRIE_OPTIMISATION && + OP( startbranch ) == BRANCH ) + { /* demq. - Assuming this was/is a branch we are dealing with: 'scan' now - points at the item that follows the branch sequence, whatever - it is. We now start at the beginning of the sequence and look - for subsequences of + Assuming this was/is a branch we are dealing with: 'scan' + now points at the item that follows the branch sequence, + whatever it is. We now start at the beginning of the + sequence and look for subsequences of BRANCH->EXACT=>x1 BRANCH->EXACT=>x2 tail - which would be constructed from a pattern like /A|LIST|OF|WORDS/ + which would be constructed from a pattern like + /A|LIST|OF|WORDS/ If we can find such a subsequence we need to turn the first element into a trie and then add the subsequent branch exact @@ -3243,7 +3831,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, We have two cases - 1. patterns where the whole set of branches can be converted. + 1. patterns where the whole set of branches can be + converted. 2. patterns where only a subset can be converted. @@ -3280,7 +3869,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, U32 count=0; #ifdef DEBUGGING - SV * const mysv = sv_newmortal(); /* for dumping */ + SV * const mysv = sv_newmortal(); /* for dumping */ #endif /* var tail is used because there may be a TAIL regop in the way. Ie, the exacts will point to the @@ -3297,11 +3886,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, tail ); + regprop(RExC_rx, mysv, tail, NULL); PerlIO_printf( Perl_debug_log, "%*s%s%s\n", - (int)depth * 2 + 2, "", - "Looking for TRIE'able sequences. Tail node is: ", - SvPV_nolen_const( mysv ) + (int)depth * 2 + 2, "", + "Looking for TRIE'able sequences. Tail node is: ", + SvPV_nolen_const( mysv ) ); }); @@ -3309,35 +3898,46 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, Step through the branches cur represents each branch, - noper is the first thing to be matched as part of that branch + noper is the first thing to be matched as part + of that branch noper_next is the regnext() of that node. - We normally handle a case like this /FOO[xyz]|BAR[pqr]/ - via a "jump trie" but we also support building with NOJUMPTRIE, - which restricts the trie logic to structures like /FOO|BAR/. - - If noper is a trieable nodetype then the branch is a possible optimization - target. If we are building under NOJUMPTRIE then we require that noper_next - is the same as scan (our current position in the regex program). - - Once we have two or more consecutive such branches we can create a - trie of the EXACT's contents and stitch it in place into the program. - - If the sequence represents all of the branches in the alternation we - replace the entire thing with a single TRIE node. - - Otherwise when it is a subsequence we need to stitch it in place and - replace only the relevant branches. This means the first branch has - to remain as it is used by the alternation logic, and its next pointer, - and needs to be repointed at the item on the branch chain following - the last branch we have optimized away. - - This could be either a BRANCH, in which case the subsequence is internal, - or it could be the item following the branch sequence in which case the - subsequence is at the end (which does not necessarily mean the first node - is the start of the alternation). - - TRIE_TYPE(X) is a define which maps the optype to a trietype. + We normally handle a case like this + /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also + support building with NOJUMPTRIE, which restricts + the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is + a possible optimization target. If we are building + under NOJUMPTRIE then we require that noper_next is + the same as scan (our current position in the regex + program). + + Once we have two or more consecutive such branches + we can create a trie of the EXACT's contents and + stitch it in place into the program. + + If the sequence represents all of the branches in + the alternation we replace the entire thing with a + single TRIE node. + + Otherwise when it is a subsequence we need to + stitch it in place and replace only the relevant + branches. This means the first branch has to remain + as it is used by the alternation logic, and its + next pointer, and needs to be repointed at the item + on the branch chain following the last branch we + have optimized away. + + This could be either a BRANCH, in which case the + subsequence is internal, or it could be the item + following the branch sequence in which case the + subsequence is at the end (which does not + necessarily mean the first node is the start of the + alternation). + + TRIE_TYPE(X) is a define which maps the optype to a + trietype. optype | trietype ----------------+----------- @@ -3345,14 +3945,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, EXACT | EXACT EXACTFU | EXACTFU EXACTFU_SS | EXACTFU - EXACTFU_TRICKYFOLD | EXACTFU - EXACTFA | 0 + EXACTFA | EXACTFA */ #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ ( EXACT == (X) ) ? EXACT : \ - ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ + ( EXACTFA == (X) ) ? EXACTFA : \ 0 ) /* dont use tail as the end marker for this traverse */ @@ -3367,16 +3967,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); - regprop(RExC_rx, mysv, noper); + regprop(RExC_rx, mysv, noper, NULL); PerlIO_printf( Perl_debug_log, " -> %s", SvPV_nolen_const(mysv)); if ( noper_next ) { - regprop(RExC_rx, mysv, noper_next ); + regprop(RExC_rx, mysv, noper_next, NULL); PerlIO_printf( Perl_debug_log,"\t=> %s\t", SvPV_nolen_const(mysv)); } @@ -3386,8 +3986,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ); }); - /* Is noper a trieable nodetype that can be merged with the - * current trie (if there is one)? */ + /* Is noper a trieable nodetype that can be merged + * with the current trie (if there is one)? */ if ( noper_trietype && ( @@ -3400,10 +4000,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif && count < U16_MAX) { - /* Handle mergable triable node - * Either we are the first node in a new trieable sequence, - * in which case we do some bookkeeping, otherwise we update - * the end pointer. */ + /* Handle mergable triable node Either we are + * the first node in a new trieable sequence, + * in which case we do some bookkeeping, + * otherwise we update the end pointer. */ if ( !first ) { first = cur; if ( noper_trietype == NOTHING ) { @@ -3416,8 +4016,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( noper_next_trietype ) { trietype = noper_next_trietype; } else if (noper_next_type) { - /* a NOTHING regop is 1 regop wide. We need at least two - * for a trie so we can't merge this in */ + /* a NOTHING regop is 1 regop wide. + * We need at least two for a trie + * so we can't merge this in */ first = NULL; } } else { @@ -3433,31 +4034,39 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle mergable triable node */ else { /* handle unmergable node - - * noper may either be a triable node which can not be tried - * together with the current trie, or a non triable node */ + * noper may either be a triable node which can + * not be tried together with the current trie, + * or a non triable node */ if ( last ) { - /* If last is set and trietype is not NOTHING then we have found - * at least two triable branch sequences in a row of a similar - * trietype so we can turn them into a trie. If/when we - * allow NOTHING to start a trie sequence this condition will be - * required, and it isn't expensive so we leave it in for now. */ + /* If last is set and trietype is not + * NOTHING then we have found at least two + * triable branch sequences in a row of a + * similar trietype so we can turn them + * into a trie. If/when we allow NOTHING to + * start a trie sequence this condition + * will be required, and it isn't expensive + * so we leave it in for now. */ if ( trietype && trietype != NOTHING ) make_trie( pRExC_state, - startbranch, first, cur, tail, count, - trietype, depth+1 ); - last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ + startbranch, first, cur, tail, + count, trietype, depth+1 ); + last = NULL; /* note: we clear/update + first, trietype etc below, + so we dont do it here */ } if ( noper_trietype #ifdef NOJUMPTRIE && noper_next == tail #endif ){ - /* noper is triable, so we can start a new trie sequence */ + /* noper is triable, so we can start a new + * trie sequence */ count = 1; first = cur; trietype = noper_trietype; } else if (first) { - /* if we already saw a first but the current node is not triable then we have + /* if we already saw a first but the + * current node is not triable then we have * to reset the first information. */ count = 0; first = NULL; @@ -3466,18 +4075,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle unmergable node */ } /* loop over branches */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, - "%*s- %s (%d) \n", (int)depth * 2 + 2, + "%*s- %s (%d) \n", + (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); if ( last && trietype ) { if ( trietype != NOTHING ) { - /* the last branch of the sequence was part of a trie, - * so we have to construct it here outside of the loop - */ - made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); + /* the last branch of the sequence was part of + * a trie, so we have to construct it here + * outside of the loop */ + made= make_trie( pRExC_state, startbranch, + first, scan, tail, count, + trietype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && startbranch == first) @@ -3487,20 +4099,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( startbranch == first && scan == tail ) { - RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; } } #endif } else { - /* at this point we know whatever we have is a NOTHING sequence/branch - * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING + /* at this point we know whatever we have is a + * NOTHING sequence/branch AND if 'startbranch' + * is 'first' then we can turn the whole thing + * into a NOTHING */ if ( startbranch == first ) { regnode *opt; - /* the entire thing is a NOTHING sequence, something like this: - * (?:|) So we can turn it into a plain NOTHING op. */ + /* the entire thing is a NOTHING sequence, + * something like this: (?:|) So we can + * turn it into a plain NOTHING op. */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur); + regprop(RExC_rx, mysv, cur, NULL); PerlIO_printf( Perl_debug_log, "%*s- %s (%d) \n", (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); @@ -3528,9 +4143,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 paren; regnode *start; regnode *end; + U32 my_recursed_depth= recursed_depth; if (OP(scan) != SUSPEND) { - /* set the pointer */ + /* set the pointer */ if (OP(scan) == GOSUB) { paren = ARG(scan); RExC_recurse[ARG2L(scan)] = scan; @@ -3541,21 +4157,33 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, start = RExC_rxi->program + 1; end = RExC_opend; } - if (!recursed) { - Newxz(recursed, (((RExC_npar)>>3) +1), U8); - SAVEFREEPV(recursed); - } - if (!PAREN_TEST(recursed,paren+1)) { - PAREN_SET(recursed,paren+1); + if (!recursed_depth + || + !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) + ) { + if (!recursed_depth) { + Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); + } else { + Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed_bytes, U8); + } + /* we havent recursed into this paren yet, so recurse into it */ + DEBUG_STUDYDATA("set:", data,depth); + PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); + my_recursed_depth= recursed_depth + 1; Newx(newframe,1,scan_frame); } else { + DEBUG_STUDYDATA("inf:", data,depth); + /* some form of infinite recursion, assume infinite length + * */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } } else { @@ -3572,17 +4200,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, newframe->last = last; newframe->stop = stopparen; newframe->prev = frame; + newframe->prev_recursed_depth = recursed_depth; + + DEBUG_STUDYDATA("frame-new:",data,depth); + DEBUG_PEEP("fnew", scan, depth); frame = newframe; scan = start; stopparen = paren; last = end; + depth = depth + 1; + recursed_depth= my_recursed_depth; continue; } } else if (OP(scan) == EXACT) { - I32 l = STR_LEN(scan); + SSize_t l = STR_LEN(scan); UV uc; if (UTF) { const U8 * const s = (U8*)STRING(scan); @@ -3598,7 +4232,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data->last_end == -1) { /* Update the start info. */ data->last_start_min = data->pos_min; data->last_start_max = is_inf - ? I32_MAX : data->pos_min + data->pos_delta; + ? SSize_t_MAX : data->pos_min + data->pos_delta; } sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); if (UTF) @@ -3609,83 +4243,48 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) mg->mg_len += utf8_length((U8*)STRING(scan), - (U8*)STRING(scan)+STR_LEN(scan)); + (U8*)STRING(scan)+STR_LEN(scan)); } data->last_end = data->pos_min + l; data->pos_min += l; /* As in the first entry. */ data->flags &= ~SF_BEFORE_EOL; } - if (flags & SCF_DO_STCLASS_AND) { - /* Check whether it is compatible with what we know already! */ - int compat = 1; - - - /* If compatible, we or it in below. It is compatible if is - * in the bitmp and either 1) its bit or its fold is set, or 2) - * it's for a locale. Even if there isn't unicode semantics - * here, at runtime there may be because of matching against a - * utf8 string, so accept a possible false positive for - * latin1-range folds */ - if (uc >= 0x100 || - (!(data->start_class->flags & ANYOF_LOCALE) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && (!(data->start_class->flags & ANYOF_LOC_FOLD) - || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) - ) - { - compat = 0; - } - ANYOF_CLASS_ZERO(data->start_class); - ANYOF_BITMAP_ZERO(data->start_class); - if (compat) - ANYOF_BITMAP_SET(data->start_class, uc); - else if (uc >= 0x100) { - int i; - /* Some Unicode code points fold to the Latin1 range; as - * XXX temporary code, instead of figuring out if this is - * one, just assume it is and set all the start class bits - * that could be some such above 255 code point's fold - * which will generate fals positives. As the code - * elsewhere that does compute the fold settles down, it - * can be extracted out and re-used here */ - for (i = 0; i < 256; i++){ - if (HAS_NONLATIN1_FOLD_CLOSURE(i)) { - ANYOF_BITMAP_SET(data->start_class, i); - } - } - } - CLEAR_SSC_EOS(data->start_class); - if (uc < 0x100) - data->start_class->flags &= ~ANYOF_UNICODE_ALL; + /* ANDing the code point leaves at most it, and not in locale, and + * can't match null string */ + if (flags & SCF_DO_STCLASS_AND) { + ssc_cp_and(data->start_class, uc); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ssc_clear_locale(data->start_class); } else if (flags & SCF_DO_STCLASS_OR) { - /* false positive possible if the class is case-folded */ - if (uc < 0x100) - ANYOF_BITMAP_SET(data->start_class, uc); - else - data->start_class->flags |= ANYOF_UNICODE_ALL; - CLEAR_SSC_EOS(data->start_class); - cl_and(data->start_class, and_withp); + ssc_add_cp(data->start_class, uc); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; } - else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ - I32 l = STR_LEN(scan); + else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is + EXACTFish */ + SSize_t l = STR_LEN(scan); UV uc = *((U8*)STRING(scan)); + SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 + separate code points */ + const U8 * s = (U8*)STRING(scan); /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) { assert(data); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } if (UTF) { - const U8 * const s = (U8 *)STRING(scan); uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } - if (has_exactf_sharp_s) { - RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + if (unfolded_multi_char) { + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; } min += l - min_subtract; assert (min >= 0); @@ -3700,99 +4299,164 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - if (flags & SCF_DO_STCLASS_AND) { - /* Check whether it is compatible with what we know already! */ - int compat = 1; - if (uc >= 0x100 || - (!(data->start_class->flags & ANYOF_LOCALE) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) - { - compat = 0; + + if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) { + ssc_clear_locale(data->start_class); + } + + if (! UTF) { + + /* We punt and assume can match anything if the node begins + * with a multi-character fold. Things are complicated. For + * example, /ffi/i could match any of: + * "\N{LATIN SMALL LIGATURE FFI}" + * "\N{LATIN SMALL LIGATURE FF}I" + * "F\N{LATIN SMALL LIGATURE FI}" + * plus several other things; and making sure we have all the + * possibilities is hard. */ + if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); } - ANYOF_CLASS_ZERO(data->start_class); - ANYOF_BITMAP_ZERO(data->start_class); - if (compat) { - ANYOF_BITMAP_SET(data->start_class, uc); - CLEAR_SSC_EOS(data->start_class); + else { + + /* Any Latin1 range character can potentially match any + * other depending on the locale */ if (OP(scan) == EXACTFL) { - /* XXX This set is probably no longer necessary, and - * probably wrong as LOCALE now is on in the initial - * state */ - data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD; + _invlist_union(EXACTF_invlist, PL_Latin1, + &EXACTF_invlist); } else { - - /* Also set the other member of the fold pair. In case - * that unicode semantics is called for at runtime, use - * the full latin1 fold. (Can't do this for locale, - * because not known until runtime) */ - ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); - - /* All other (EXACTFL handled above) folds except under - * /iaa that include s, S, and sharp_s also may include - * the others */ - if (OP(scan) != EXACTFA) { - if (uc == 's' || uc == 'S') { - ANYOF_BITMAP_SET(data->start_class, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - ANYOF_BITMAP_SET(data->start_class, 's'); - ANYOF_BITMAP_SET(data->start_class, 'S'); - } + /* But otherwise, it matches at least itself. We can + * quickly tell if it has a distinct fold, and if so, + * it matches that as well */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (IS_IN_SOME_FOLD_L1(uc)) { + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, + PL_fold_latin1[uc]); } } + + /* Some characters match above-Latin1 ones under /i. This + * is true of EXACTFL ones when the locale is UTF-8 */ + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) + && (! isASCII(uc) || (OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE))) + { + add_above_Latin1_folds(pRExC_state, + (U8) uc, + &EXACTF_invlist); + } } - else if (uc >= 0x100) { + } + else { /* Pattern is UTF-8 */ + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + STRLEN foldlen = UTF8SKIP(s); + const U8* e = s + STR_LEN(scan); + SV** listp; + + /* The only code points that aren't folded in a UTF EXACTFish + * node are are the problematic ones in EXACTFL nodes */ + if (OP(scan) == EXACTFL + && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) + { + /* We need to check for the possibility that this EXACTFL + * node begins with a multi-char fold. Therefore we fold + * the first few characters of it so that we can make that + * check */ + U8 *d = folded; int i; - for (i = 0; i < 256; i++){ - if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) { - ANYOF_BITMAP_SET(data->start_class, i); + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD(*s); + s++; + } + else { + STRLEN len; + to_utf8_fold(s, d, &len); + d += len; + s += UTF8SKIP(s); } } + + /* And set up so the code below that looks in this folded + * buffer instead of the node's string */ + e = d; + foldlen = UTF8SKIP(folded); + s = folded; } - } - else if (flags & SCF_DO_STCLASS_OR) { - if (data->start_class->flags & ANYOF_LOC_FOLD) { - /* false positive possible if the class is case-folded. - Assume that the locale settings are the same... */ - if (uc < 0x100) { - ANYOF_BITMAP_SET(data->start_class, uc); - if (OP(scan) != EXACTFL) { - - /* And set the other member of the fold pair, but - * can't do that in locale because not known until - * run-time */ - ANYOF_BITMAP_SET(data->start_class, - PL_fold_latin1[uc]); - - /* All folds except under /iaa that include s, S, - * and sharp_s also may include the others */ - if (OP(scan) != EXACTFA) { - if (uc == 's' || uc == 'S') { - ANYOF_BITMAP_SET(data->start_class, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - ANYOF_BITMAP_SET(data->start_class, 's'); - ANYOF_BITMAP_SET(data->start_class, 'S'); - } + + /* When we reach here 's' points to the fold of the first + * character(s) of the node; and 'e' points to far enough along + * the folded string to be just past any possible multi-char + * fold. 'foldlen' is the length in bytes of the first + * character in 's' + * + * Unlike the non-UTF-8 case, the macro for determining if a + * string is a multi-char fold requires all the characters to + * already be folded. This is because of all the complications + * if not. Note that they are folded anyway, except in EXACTFL + * nodes. Like the non-UTF case above, we punt if the node + * begins with a multi-char fold */ + + if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); + } + else { /* Single char fold */ + + /* It matches all the things that fold to it, which are + * found in PL_utf8_foldclosures (including itself) */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) s, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE) + && isASCII(c) != isASCII(uc)) + { + continue; } + + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c); } } - CLEAR_SSC_EOS(data->start_class); } - cl_and(data->start_class, and_withp); + } + if (flags & SCF_DO_STCLASS_AND) { + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); } else if (REGNODE_VARIES(OP(scan))) { - I32 mincount, maxcount, minnext, deltanext, fl = 0; - I32 f = flags, pos_before = 0; + SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; + I32 fl = 0, f = flags; regnode * const oscan = scan; - struct regnode_charclass_class this_class; - struct regnode_charclass_class *oclass = NULL; + regnode_ssc this_class; + regnode_ssc *oclass = NULL; I32 next_is_eval = 0; switch (PL_regkind[OP(scan)]) { @@ -3813,7 +4477,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_SUBSTR) data->pos_min++; min++; - /* Fall through. */ + /* FALLTHROUGH */ case STAR: if (flags & SCF_DO_STCLASS) { mincount = 0; @@ -3822,12 +4486,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan = NEXTOPER(scan); goto do_curly; } - is_inf = is_inf_internal = 1; - scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */ + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ data->longest = &(data->longest_float); } + is_inf = is_inf_internal = 1; + scan = regnext(scan); goto optimize_curly_tail; case CURLY: if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) @@ -3848,7 +4513,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, next_is_eval = (OP(scan) == EVAL); do_curly: if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */ + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ pos_before = data->pos_min; } if (data) { @@ -3858,7 +4525,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->flags |= SF_IS_INF; } if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); oclass = data->start_class; data->start_class = &this_class; f |= SCF_DO_STCLASS_AND; @@ -3878,35 +4545,35 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* This will finish on WHILEM, setting scan, or on NULL: */ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - last, data, stopparen, recursed, NULL, - (mincount == 0 - ? (f & ~SCF_DO_SUBSTR) : f),depth+1); + last, data, stopparen, recursed_depth, NULL, + (mincount == 0 + ? (f & ~SCF_DO_SUBSTR) + : f) + ,depth+1); if (flags & SCF_DO_STCLASS) data->start_class = oclass; if (mincount == 0 || minnext == 0) { if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); } else if (flags & SCF_DO_STCLASS_AND) { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&this_class, data->start_class, - struct regnode_charclass_class); + StructCopy(&this_class, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &this_class); - cl_and(data->start_class, and_withp); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); } else if (flags & SCF_DO_STCLASS_AND) - cl_and(data->start_class, &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); flags &= ~SCF_DO_STCLASS; } if (!scan) /* It was not CURLYX, but CURLY. */ @@ -3916,7 +4583,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && (next_is_eval || !(mincount == 0 && maxcount == 1)) && (minnext == 0) && (deltanext == 0) && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) - && maxcount <= REG_INFTY/3) /* Complement check for big count */ + && maxcount <= REG_INFTY/3) /* Complement check for big + count */ { /* Fatal warnings may leak the regexp without this: */ SAVEFREESV(RExC_rx_sv); @@ -3926,14 +4594,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } min += minnext * mincount; - is_inf_internal |= deltanext == I32_MAX - || (maxcount == REG_INFTY && minnext + deltanext > 0); + is_inf_internal |= deltanext == SSize_t_MAX + || (maxcount == REG_INFTY && minnext + deltanext > 0); is_inf |= is_inf_internal; - if (is_inf) - delta = I32_MAX; - else - delta += (minnext + deltanext) * maxcount - minnext * mincount; - + if (is_inf) { + delta = SSize_t_MAX; + } else { + delta += (minnext + deltanext) * maxcount + - minnext * mincount; + } /* Try powerful optimization CURLYX => CURLYN. */ if ( OP(oscan) == CURLYX && data && data->flags & SF_IN_PAR @@ -3984,7 +4653,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && !(data->flags & SF_HAS_EVAL) && !deltanext /* atom is fixed width */ && minnext != 0 /* CURLYM can't handle zero width */ - && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */ + + /* Nor characters whose fold at run-time may be + * multi-character */ + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) ) { /* XXXX How to optimize if data == 0? */ /* Optimize to a simpler form. */ @@ -4031,7 +4703,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #endif /* Optimize again: */ study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, - NULL, stopparen, recursed, NULL, 0,depth+1); + NULL, stopparen, recursed_depth, NULL, 0,depth+1); } else oscan->flags = 0; @@ -4056,43 +4728,32 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, pars++; if (flags & SCF_DO_SUBSTR) { SV *last_str = NULL; + STRLEN last_chrs = 0; int counted = mincount != 0; - if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ -#if defined(SPARC64_GCC_WORKAROUND) - I32 b = 0; - STRLEN l = 0; - const char *s = NULL; - I32 old = 0; - - if (pos_before >= data->last_start_min) - b = pos_before; - else - b = data->last_start_min; - - l = 0; - s = SvPV_const(data->last_found, l); - old = b - data->last_start_min; - -#else - I32 b = pos_before >= data->last_start_min + if (data->last_end > 0 && mincount != 0) { /* Ends with a + string. */ + SSize_t b = pos_before >= data->last_start_min ? pos_before : data->last_start_min; STRLEN l; const char * const s = SvPV_const(data->last_found, l); - I32 old = b - data->last_start_min; -#endif + SSize_t old = b - data->last_start_min; if (UTF) old = utf8_hop((U8*)s, old) - (U8*)s; l -= old; /* Get the added string: */ last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { + SvGROW(last_str, (mincount * l) + 1); repeatcpy(SvPVX(last_str) + l, - SvPVX_const(last_str), l, mincount - 1); + SvPVX_const(last_str), l, + mincount - 1); SvCUR_set(last_str, SvCUR(last_str) * mincount); /* Add additional parts. */ SvCUR_set(data->last_found, @@ -4104,34 +4765,41 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str) - l; + mg->mg_len += last_chrs * (mincount-1); } + last_chrs *= mincount; data->last_end += l * (mincount - 1); } } else { /* start offset must point into the last copy */ data->last_start_min += minnext * (mincount - 1); - data->last_start_max += is_inf ? I32_MAX + data->last_start_max += is_inf ? SSize_t_MAX : (maxcount - 1) * (minnext + data->pos_delta); } } /* It is counted once already... */ data->pos_min += minnext * (mincount - counted); #if 0 -PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n", - counted, deltanext, I32_MAX, minnext, maxcount, mincount); -if (deltanext != I32_MAX) -PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta); +PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf + " SSize_t_MAX=%"UVuf" minnext=%"UVuf + " maxcount=%"UVuf" mincount=%"UVuf"\n", + (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, + (UV)mincount); +if (deltanext != SSize_t_MAX) +PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", + (UV)(-counted * deltanext + (minnext + deltanext) * maxcount + - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); #endif - if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta) - data->pos_delta = I32_MAX; + if (deltanext == SSize_t_MAX + || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) + data->pos_delta = SSize_t_MAX; else data->pos_delta += - counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount; if (mincount != maxcount) { /* Cannot extend fixed substrings found inside the group. */ - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); if (mincount && last_str) { SV * const sv = data->last_found; MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? @@ -4141,12 +4809,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext mg->mg_len = -1; sv_setsv(sv, last_str); data->last_end = data->pos_min; - data->last_start_min = - data->pos_min - CHR_SVLEN(last_str); + data->last_start_min = data->pos_min - last_chrs; data->last_start_max = is_inf - ? I32_MAX - : data->pos_min + data->pos_delta - - CHR_SVLEN(last_str); + ? SSize_t_MAX + : data->pos_min + data->pos_delta - last_chrs; } data->longest = &(data->longest_float); } @@ -4161,164 +4827,212 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext NEXT_OFF(oscan) += NEXT_OFF(next); } continue; - default: /* REF, and CLUMP only? */ + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", + OP(scan)); +#endif + case REF: + case CLUMP: if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; - if (flags & SCF_DO_STCLASS_OR) - cl_anything(pRExC_state, data->start_class); + if (flags & SCF_DO_STCLASS_OR) { + if (OP(scan) == CLUMP) { + /* Actually is any start char, but very few code points + * aren't start characters */ + ssc_match_all_cp(data->start_class); + } + else { + ssc_anything(data->start_class); + } + } flags &= ~SCF_DO_STCLASS; break; } } else if (OP(scan) == LNBREAK) { if (flags & SCF_DO_STCLASS) { - int value = 0; - CLEAR_SSC_EOS(data->start_class); /* No match on empty */ if (flags & SCF_DO_STCLASS_AND) { - for (value = 0; value < 256; value++) - if (!is_VERTWS_cp(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + ssc_intersection(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } - else { - for (value = 0; value < 256; value++) - if (is_VERTWS_cp(value)) - ANYOF_BITMAP_SET(data->start_class, value); + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], + FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg for + * 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; } - if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; } min++; delta++; /* Because of the 2 char string cr-lf */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += 1; data->pos_delta += 1; data->longest = &(data->longest_float); } } else if (REGNODE_SIMPLE(OP(scan))) { - int value = 0; if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min++; } min++; if (flags & SCF_DO_STCLASS) { - int loop_max = 256; - CLEAR_SSC_EOS(data->start_class); /* No match on empty */ + bool invert = 0; + SV* my_invlist = sv_2mortal(_new_invlist(0)); + U8 namedclass; + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; /* Some of the logic below assumes that switching locale on will only add false positives. */ - switch (PL_regkind[OP(scan)]) { - U8 classnum; + switch (OP(scan)) { - case SANY: default: #ifdef DEBUGGING - Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); + Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", + OP(scan)); #endif - do_default: + case CANY: + case SANY: if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_match_all_cp(data->start_class); break; + case REG_ANY: - if (OP(scan) == SANY) - goto do_default; - if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ - value = (ANYOF_BITMAP_TEST(data->start_class,'\n') - || ANYOF_CLASS_TEST_ANY_SET(data->start_class)); - cl_anything(pRExC_state, data->start_class); - } - if (flags & SCF_DO_STCLASS_AND || !value) - ANYOF_BITMAP_CLEAR(data->start_class,'\n'); + { + SV* REG_ANY_invlist = _new_invlist(2); + REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, + '\n'); + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert, hence all but \n + */ + ); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert */ + ); + ssc_clear_locale(data->start_class); + } + SvREFCNT_dec_NN(REG_ANY_invlist); + } break; + case ANYOF: if (flags & SCF_DO_STCLASS_AND) - cl_and(data->start_class, - (struct regnode_charclass_class*)scan); + ssc_and(pRExC_state, data->start_class, + (regnode_charclass *) scan); else - cl_or(pRExC_state, data->start_class, - (struct regnode_charclass_class*)scan); + ssc_or(pRExC_state, data->start_class, + (regnode_charclass *) scan); break; - case POSIXA: - loop_max = 128; - /* FALL THROUGH */ + + case NPOSIXL: + invert = 1; + /* FALLTHROUGH */ + case POSIXL: - case POSIXD: - case POSIXU: - classnum = FLAGS(scan); + namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1); - for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); - } - } + bool was_there = cBOOL( + ANYOF_POSIXL_TEST(data->start_class, + namedclass)); + ANYOF_POSIXL_ZERO(data->start_class); + if (was_there) { /* Do an AND */ + ANYOF_POSIXL_SET(data->start_class, namedclass); } + /* No individual code points can now match */ + data->start_class->invlist + = sv_2mortal(_new_invlist(0)); } else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum)); - } - else { - - /* Even if under locale, set the bits for non-locale - * in case it isn't a true locale-node. This will - * create false positives if it truly is locale */ - for (value = 0; value < loop_max; value++) { - if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); - } + int complement = namedclass + ((invert) ? -1 : 1); + + assert(flags & SCF_DO_STCLASS_OR); + + /* If the complement of this class was already there, + * the result is that they match all code points, + * (\d + \D == everything). Remove the classes from + * future consideration. Locale is not relevant in + * this case */ + if (ANYOF_POSIXL_TEST(data->start_class, complement)) { + ssc_match_all_cp(data->start_class); + ANYOF_POSIXL_CLEAR(data->start_class, namedclass); + ANYOF_POSIXL_CLEAR(data->start_class, complement); } + else { /* The usual case; just add this class to the + existing set */ + ANYOF_POSIXL_SET(data->start_class, namedclass); } } break; - case NPOSIXA: - loop_max = 128; - /* FALL THROUGH */ - case NPOSIXL: - case NPOSIXU: + + case NPOSIXA: /* For these, we always know the exact set of + what's matched */ + invert = 1; + /* FALLTHROUGH */ + case POSIXA: + if (FLAGS(scan) == _CC_ASCII) { + my_invlist = PL_XPosix_ptrs[_CC_ASCII]; + } + else { + _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)], + PL_XPosix_ptrs[_CC_ASCII], + &my_invlist); + } + goto join_posix; + case NPOSIXD: - classnum = FLAGS(scan); + case NPOSIXU: + invert = 1; + /* FALLTHROUGH */ + case POSIXD: + case POSIXU: + my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); + + /* NPOSIXD matches all upper Latin1 code points unless the + * target string being matched is UTF-8, which is + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); + } + + join_posix: + if (flags & SCF_DO_STCLASS_AND) { - if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum)); - for (value = 0; value < loop_max; value++) { - if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value)); - } - } - } + ssc_intersection(data->start_class, my_invlist, invert); + ssc_clear_locale(data->start_class); } else { - if (data->start_class->flags & ANYOF_LOCALE) { - ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1); - } - else { - - /* Even if under locale, set the bits for non-locale in - * case it isn't a true locale-node. This will create - * false positives if it truly is locale */ - for (value = 0; value < loop_max; value++) { - if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) { - ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value)); - } - } - if (PL_regkind[OP(scan)] == NPOSIXD) { - data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } + assert(flags & SCF_DO_STCLASS_OR); + ssc_union(data->start_class, my_invlist, invert); } - break; } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } @@ -4326,7 +5040,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->flags |= (OP(scan) == MEOL ? SF_BEFORE_MEOL : SF_BEFORE_SEOL); - SCAN_COMMIT(pRExC_state, data, minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); } else if ( PL_regkind[OP(scan)] == BRANCHJ @@ -4345,11 +5059,12 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext DEBUG_STUDYDATA("OPFAIL",data,depth); /*DEBUG_PARSE_MSG("opfail");*/ - regprop(RExC_rx, mysv_val, upto); - PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", - SvPV_nolen_const(mysv_val), - (IV)REG_NODE_NUM(upto), - (IV)(upto - scan) + regprop(RExC_rx, mysv_val, upto, NULL); + PerlIO_printf(Perl_debug_log, + "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(upto), + (IV)(upto - scan) ); }); OP(scan) = OPFAIL; @@ -4366,9 +5081,9 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext In this case we can't do fixed string optimisation. */ - I32 deltanext, minnext, fake = 0; + SSize_t deltanext, minnext, fake = 0; regnode *nscan; - struct regnode_charclass_class intrnl; + regnode_ssc intrnl; int f = 0; data_fake.flags = 0; @@ -4381,7 +5096,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.pos_delta = delta; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + ssc_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4390,13 +5105,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, - last, &data_fake, stopparen, recursed, NULL, f, depth+1); + last, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1); if (scan->flags) { if (deltanext) { FAIL("Variable length lookbehind not implemented"); } else if (minnext > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); } scan->flags = (U8)minnext; } @@ -4415,14 +5132,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext * *** HACK *** for now just treat as "no information". * See [perl #56690]. */ - cl_init(pRExC_state, data->start_class); + ssc_init(pRExC_state, data->start_class); } else { /* AND before and after: combine and continue */ - const int was = TEST_SSC_EOS(data->start_class); - - cl_and(data->start_class, &intrnl); - if (was) - SET_SSC_EOS(data->start_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } } } @@ -4435,9 +5148,9 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext length of the pattern, something we won't know about until after the recurse. */ - I32 deltanext, fake = 0; + SSize_t deltanext, fake = 0; regnode *nscan; - struct regnode_charclass_class intrnl; + regnode_ssc intrnl; int f = 0; /* We use SAVEFREEPV so that when the full compile is finished perl will clean up the allocated @@ -4445,8 +5158,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext have to worry about freeing them when we know they wont be used, which would be a pain. */ - I32 *minnextp; - Newx( minnextp, 1, I32 ); + SSize_t *minnextp; + Newx( minnextp, 1, SSize_t ); SAVEFREEPV(minnextp); if (data) { @@ -4454,7 +5167,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext if ((flags & SCF_DO_SUBSTR) && data->last_found) { f |= SCF_DO_SUBSTR; if (scan->flags) - SCAN_COMMIT(pRExC_state, &data_fake,minlenp); + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); data_fake.last_found=newSVsv(data->last_found); } } @@ -4466,7 +5179,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.flags |= SF_IS_INF; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(pRExC_state, &intrnl); + ssc_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } @@ -4475,14 +5188,17 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, - last, &data_fake, stopparen, recursed, NULL, f,depth+1); + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, + &deltanext, last, &data_fake, + stopparen, recursed_depth, NULL, + f,depth+1); if (scan->flags) { if (deltanext) { FAIL("Variable length lookbehind not implemented"); } else if (*minnextp > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); } scan->flags = (U8)*minnextp; } @@ -4490,11 +5206,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext *minnextp += min; if (f & SCF_DO_STCLASS_AND) { - const int was = TEST_SSC_EOS(data.start_class); - - cl_and(data->start_class, &intrnl); - if (was) - SET_SSC_EOS(data->start_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -4505,7 +5217,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { if (RExC_rx->minlen<*minnextp) RExC_rx->minlen=*minnextp; - SCAN_COMMIT(pRExC_state, &data_fake, minnextp); + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); SvREFCNT_dec_NN(data_fake.last_found); if ( data_fake.minlen_fixed != minlenp ) @@ -4549,7 +5261,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext } else if ( PL_regkind[OP(scan)] == ENDLIKE ) { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); flags &= ~SCF_DO_SUBSTR; } if (data && OP(scan)==ACCEPT) { @@ -4561,24 +5273,24 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ { if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); + scan_commit(pRExC_state, data, minlenp, is_inf); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(pRExC_state, data->start_class); + ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; } else if (OP(scan) == GPOS) { - if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) && + if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && !(delta || is_inf || (data && data->pos_delta))) { - if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR)) - RExC_rx->extflags |= RXf_ANCH_GPOS; - if (RExC_rx->gofs < (U32)min) + if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->intflags |= PREGf_ANCH_GPOS; + if (RExC_rx->gofs < (STRLEN)min) RExC_rx->gofs = min; } else { - RExC_rx->extflags |= RXf_GPOS_FLOAT; + RExC_rx->intflags |= PREGf_GPOS_FLOAT; RExC_rx->gofs = 0; } } @@ -4591,13 +5303,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext regnode *trie_node= scan; regnode *tail= regnext(scan); reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; - I32 max1 = 0, min1 = I32_MAX; - struct regnode_charclass_class accum; + SSize_t max1 = 0, min1 = SSize_t_MAX; + regnode_ssc accum; - if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ - SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */ + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } if (flags & SCF_DO_STCLASS) - cl_init_zero(pRExC_state, &accum); + ssc_init_zero(pRExC_state, &accum); if (!trie->jump) { min1= trie->minlen; @@ -4608,8 +5322,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext for ( word=1 ; word <= trie->wordcount ; word++) { - I32 deltanext=0, minnext=0, f = 0, fake; - struct regnode_charclass_class this_class; + SSize_t deltanext=0, minnext=0, f = 0, fake; + regnode_ssc this_class; data_fake.flags = 0; if (data) { @@ -4620,7 +5334,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data_fake.last_closep = &fake; data_fake.pos_delta = delta; if (flags & SCF_DO_STCLASS) { - cl_init(pRExC_state, &this_class); + ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } @@ -4632,22 +5346,21 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext nextbranch = trie_node + trie->jump[0]; scan= trie_node + trie->jump[word]; /* We go from the jump point to the branch that follows - it. Note this means we need the vestigal unused branches - even though they arent otherwise used. - */ + it. Note this means we need the vestigal unused + branches even though they arent otherwise used. */ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, (regnode *)nextbranch, &data_fake, - stopparen, recursed, NULL, f,depth+1); + stopparen, recursed_depth, NULL, f,depth+1); } if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) nextbranch= regnext((regnode*)nextbranch); - if (min1 > (I32)(minnext + trie->minlen)) + if (min1 > (SSize_t)(minnext + trie->minlen)) min1 = minnext + trie->minlen; - if (deltanext == I32_MAX) { + if (deltanext == SSize_t_MAX) { is_inf = is_inf_internal = 1; - max1 = I32_MAX; - } else if (max1 < (I32)(minnext + deltanext + trie->maxlen)) + max1 = SSize_t_MAX; + } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) max1 = minnext + deltanext + trie->maxlen; if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -4665,7 +5378,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(pRExC_state, &accum, &this_class); + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); } } if (flags & SCF_DO_SUBSTR) { @@ -4677,28 +5390,25 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(pRExC_state, data->start_class, &accum); + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); if (min1) { - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); flags &= ~SCF_DO_STCLASS; } } else if (flags & SCF_DO_STCLASS_AND) { if (min1) { - cl_and(data->start_class, &accum); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); flags &= ~SCF_DO_STCLASS; } else { /* Switch to OR mode: cache the old value of * data->start_class */ INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, - struct regnode_charclass_class); + StructCopy(data->start_class, and_withp, regnode_ssc); flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, - struct regnode_charclass_class); + StructCopy(&accum, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - SET_SSC_EOS(data->start_class); } } scan= tail; @@ -4713,14 +5423,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext delta += (trie->maxlen - trie->minlen); flags &= ~SCF_DO_STCLASS; /* xxx */ if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); data->pos_min += trie->minlen; data->pos_delta += (trie->maxlen - trie->minlen); if (trie->maxlen != trie->minlen) data->longest = &(data->longest_float); } if (trie->jump) /* no more substrings -- for now /grr*/ - flags &= ~SCF_DO_SUBSTR; + flags &= ~SCF_DO_SUBSTR; } #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ @@ -4728,10 +5439,24 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext /* Else: zero-length, ignore. */ scan = regnext(scan); } + /* If we are exiting a recursion we can unset its recursed bit + * and allow ourselves to enter it again - no danger of an + * infinite loop there. + if (stopparen > -1 && recursed) { + DEBUG_STUDYDATA("unset:", data,depth); + PAREN_UNSET( recursed, stopparen); + } + */ if (frame) { + DEBUG_STUDYDATA("frame-end:",data,depth); + DEBUG_PEEP("fend", scan, depth); + /* restore previous context */ last = frame->last; scan = frame->next; stopparen = frame->stop; + recursed_depth = frame->prev_recursed_depth; + depth = depth - 1; + frame = frame->prev; goto fake_study_recurse; } @@ -4741,9 +5466,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext DEBUG_STUDYDATA("pre-fin:",data,depth); *scanp = scan; - *deltap = is_inf_internal ? I32_MAX : delta; + *deltap = is_inf_internal ? SSize_t_MAX : delta; + if (flags & SCF_DO_SUBSTR && is_inf) - data->pos_delta = I32_MAX - data->pos_min; + data->pos_delta = SSize_t_MAX - data->pos_min; if (is_par > (I32)U8_MAX) is_par = 0; if (is_par && pars==1 && data) { @@ -4755,17 +5481,25 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext data->flags &= ~SF_IN_PAR; } if (flags & SCF_DO_STCLASS_OR) - cl_and(data->start_class, and_withp); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); if (flags & SCF_TRIE_RESTUDY) data->flags |= SCF_TRIE_RESTUDY; DEBUG_STUDYDATA("post-fin:",data,depth); - return min < stopmin ? min : stopmin; + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { + RExC_maxlen = final_minlen + delta; + } + return final_minlen; + } + /* not-reached */ } STATIC U32 -S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) +S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) { U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; @@ -4783,7 +5517,8 @@ S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) return count; } -/*XXX: todo make this not included in a non debugging perl */ +/*XXX: todo make this not included in a non debugging perl, but appears to be + * used anyway there, in 'use re' */ #ifndef PERL_IN_XSUB_RE void Perl_reginitcolors(pTHX) @@ -4848,7 +5583,7 @@ Perl_current_re_engine(pTHX) HV * const table = GvHV(PL_hintgv); SV **ptr; - if (!table) + if (!table || !(PL_hints & HINT_LOCALIZE_HH)) return &reh_regexp_engine; ptr = hv_fetchs(table, "regcomp", FALSE); if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) @@ -4929,12 +5664,11 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, Newx(dst, *plen_p * 2 + 1, U8); while (s < *plen_p) { - const UV uv = NATIVE_TO_ASCII(src[s]); - if (UNI_IS_INVARIANT(uv)) - dst[d] = (U8)UTF_TO_NATIVE(uv); + if (NATIVE_BYTE_IS_INVARIANT(src[s])) + dst[d] = src[s]; else { - dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv); - dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv); + dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); + dst[d] = UTF8_EIGHT_BIT_LO(src[s]); } if (n < num_code_blocks) { if (!do_end && pRExC_state->code_blocks[n].start == s) { @@ -4987,7 +5721,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, /* if we know we have at least two args, create an empty string, * then concatenate args to that. For no args, return an empty string */ if (!pat && pat_count != 1) { - pat = newSVpvn("", 0); + pat = newSVpvs(""); SAVEFREESV(pat); alloced = TRUE; } @@ -4998,6 +5732,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, STRLEN orig_patlen = 0; bool code = 0; SV *msv = use_delim ? delim : *svp; + if (!msv) msv = &PL_sv_undef; /* if we've got a delimiter, we go round the loop twice for each * svp slot (except the last), using the delimiter the second @@ -5016,7 +5751,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, * The code in this block is based on S_pushav() */ AV *const av = (AV*)msv; - const I32 maxarg = AvFILL(av) + 1; + const SSize_t maxarg = AvFILL(av) + 1; SV **array; if (oplist) { @@ -5026,11 +5761,11 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, } if (SvRMAGICAL(av)) { - U32 i; + SSize_t i; Newx(array, maxarg, SV*); SAVEFREEPV(array); - for (i=0; i < (U32)maxarg; i++) { + for (i=0; i < maxarg; i++) { SV ** const svp = av_fetch(av, i, FALSE); array[i] = svp ? *svp : &PL_sv_undef; } @@ -5193,6 +5928,8 @@ S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, int n = 0; STRLEN s; + PERL_UNUSED_CONTEXT; + for (s = 0; s < plen; s++) { if (n < pRExC_state->num_code_blocks && s == pRExC_state->code_blocks[n].start) @@ -5320,7 +6057,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, { Safefree(pRExC_state->code_blocks); /* use croak_sv ? */ - Perl_croak_nocontext("%s", SvPV_nolen_const(errsv)); + Perl_croak_nocontext("%"SVf, SVfARG(errsv)); } } assert(SvROK(qr_ref)); @@ -5411,20 +6148,24 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, STATIC bool -S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol) +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, + SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, + SSize_t lookbehind, SSize_t offset, SSize_t *minlen, + STRLEN longest_length, bool eol, bool meol) { /* This is the common code for setting up the floating and fixed length * string data extracted from Perl_re_op_compile() below. Returns a boolean * as to whether succeeded or not */ - I32 t,ml; + I32 t; + SSize_t ml; if (! (longest_length || (eol /* Can't have SEOL and MULTI */ && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) ) - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - || (RExC_seen & REG_SEEN_EXACTF_SHARP_S)) + /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ + || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) { return FALSE; } @@ -5442,7 +6183,7 @@ S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, S follow this item. We calculate it ahead of time as once the lookbehind offset is added in we lose the ability to correctly calculate it.*/ - ml = minlen ? *(minlen) : (I32)longest_length; + ml = minlen ? *(minlen) : (SSize_t)longest_length; *rx_end_shift = ml - offset - longest_length + (SvTAIL(sv_longest) != 0) + lookbehind; @@ -5511,7 +6252,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, char *exp; regnode *scan; I32 flags; - I32 minlen = 0; + SSize_t minlen = 0; U32 rx_flags; SV *pat; SV *code_blocksv = NULL; @@ -5522,6 +6263,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, I32 sawlookahead = 0; I32 sawplus = 0; I32 sawopen = 0; + I32 sawminmod = 0; + regex_charset initial_charset = get_regex_charset(orig_rx_flags); bool recompile = 0; bool runtime_code = 0; @@ -5543,61 +6286,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, * having to test them each time otherwise */ if (! PL_AboveLatin1) { PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); - PL_ASCII = _new_invlist_C_array(ASCII_invlist); PL_Latin1 = _new_invlist_C_array(Latin1_invlist); - - PL_L1Posix_ptrs[_CC_ALPHANUMERIC] - = _new_invlist_C_array(L1PosixAlnum_invlist); - PL_Posix_ptrs[_CC_ALPHANUMERIC] - = _new_invlist_C_array(PosixAlnum_invlist); - - PL_L1Posix_ptrs[_CC_ALPHA] - = _new_invlist_C_array(L1PosixAlpha_invlist); - PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist); - - PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist); - PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist); - - /* Cased is the same as Alpha in the ASCII range */ - PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist); - PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist); - - PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist); - PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist); - - PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); - PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist); - - PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist); - PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist); - - PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist); - PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist); - - PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist); - PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist); - - PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist); - PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist); - - PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist); - PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist); - PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist); - PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist); - - PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist); - PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist); - - PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist); - - PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist); - PL_L1Posix_ptrs[_CC_WORDCHAR] - = _new_invlist_C_array(L1PosixWord_invlist); - - PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist); - PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist); - - PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist); + PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); + PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); + PL_HasMultiCharFold = + _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); } #endif @@ -5713,6 +6406,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); RExC_uni_semantics = 0; RExC_contains_locale = 0; + RExC_contains_i = 0; pRExC_state->runtime_code_qr = NULL; DEBUG_COMPILE_r({ @@ -5734,11 +6428,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); /* return old regex if pattern hasn't changed */ - /* XXX: note in the below we have to check the flags as well as the pattern. + /* XXX: note in the below we have to check the flags as well as the + * pattern. * - * Things get a touch tricky as we have to compare the utf8 flag independently - * from the compile flags. - */ + * Things get a touch tricky as we have to compare the utf8 flag + * independently from the compile flags. */ if ( old_re && !recompile @@ -5755,10 +6449,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, rx_flags = orig_rx_flags; - if (initial_charset == REGEX_LOCALE_CHARSET) { - RExC_contains_locale = 1; + if (rx_flags & PMf_FOLD) { + RExC_contains_i = 1; } - else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ @@ -5786,6 +6480,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_sawback = 0; RExC_seen = 0; + RExC_maxlen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; @@ -5800,7 +6495,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_npar = 1; RExC_nestroot = 0; RExC_size = 0L; - RExC_emit = &RExC_emit_dummy; + RExC_emit = (regnode *) &RExC_emit_dummy; RExC_whilem_seen = 0; RExC_open_parens = NULL; RExC_close_parens = NULL; @@ -5810,6 +6505,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_paren_name_list = NULL; #endif RExC_recurse = NULL; + RExC_study_chunk_recursed = NULL; + RExC_study_chunk_recursed_bytes= 0; RExC_recurse_count = 0; pRExC_state->code_index = 0; @@ -5888,7 +6585,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, FAIL("Regexp out of space"); #ifdef DEBUGGING /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ - Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char); + Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char); #else /* bulk initialize base fields with 0. */ Zero(ri, sizeof(regexp_internal), char); @@ -5915,14 +6613,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); - bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET); + bool has_charset = (get_regex_charset(r->extflags) + != REGEX_DEPENDS_CHARSET); /* The caret is output if there are any defaults: if not all the STD * flags are set, or if no character set specifier is needed */ bool has_default = (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) || ! has_charset); - bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) + == REG_RUN_ON_COMMENT_SEEN); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ @@ -5984,12 +6684,23 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ - if (RExC_seen & REG_SEEN_RECURSE) { + /* setup various meta data about recursion, this all requires + * RExC_npar to be correctly set, and a bit later on we clear it */ + if (RExC_seen & REG_RECURSE_SEEN) { Newxz(RExC_open_parens, RExC_npar,regnode *); SAVEFREEPV(RExC_open_parens); Newxz(RExC_close_parens,RExC_npar,regnode *); SAVEFREEPV(RExC_close_parens); } + if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { + /* Note, RExC_npar is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + + ((RExC_npar & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } /* Useful during FAIL. */ #ifdef RE_TRACK_PATTERN_OFFSETS @@ -6031,8 +6742,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } reStudy: - r->minlen = minlen = sawlookahead = sawplus = sawopen = 0; + r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; Zero(r->substrs, 1, struct reg_substr_data); + if (RExC_study_chunk_recursed) + Zero(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); #ifdef TRIE_STUDY_OPT if (!restudied) { @@ -6043,10 +6757,10 @@ reStudy: DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); RExC_state = copyRExC_state; - if (seen & REG_TOP_LEVEL_BRANCHES) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; else - RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; StructCopy(&zero_scan_data, &data, scan_data_t); } #else @@ -6066,12 +6780,13 @@ reStudy: /* testing for BRANCH here tells us whether there is "must appear" data in the pattern. If there is then we can use it for optimisations */ - if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */ - I32 fake; + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. + */ + SSize_t fake; STRLEN longest_float_length, longest_fixed_length; - struct regnode_charclass_class ch_class; /* pointed to by data */ + regnode_ssc ch_class; /* pointed to by data */ int stclass_flag; - I32 last_close = 0; /* pointed to by data */ + SSize_t last_close = 0; /* pointed to by data */ regnode *first= scan; regnode *first_next= regnext(first); /* @@ -6100,12 +6815,15 @@ reStudy: * the only op that could be a regnode is PLUS, all the rest * will be regnode_1 or regnode_2. * + * (yves doesn't think this is true) */ if (OP(first) == PLUS) sawplus = 1; - else + else { + if (OP(first) == MINMOD) + sawminmod = 1; first += regarglen[OP(first)]; - + } first = NEXTOPER(first); first_next= regnext(first); } @@ -6124,22 +6842,8 @@ reStudy: else if (PL_regkind[OP(first)] == TRIE && ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) { - regnode *trie_op; /* this can happen only on restudy */ - if ( OP(first) == TRIE ) { - struct regnode_1 *trieop = (struct regnode_1 *) - PerlMemShared_calloc(1, sizeof(struct regnode_1)); - StructCopy(first,trieop,struct regnode_1); - trie_op=(regnode *)trieop; - } else { - struct regnode_charclass *trieop = (struct regnode_charclass *) - PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); - StructCopy(first,trieop,struct regnode_charclass); - trie_op=(regnode *)trieop; - } - OP(trie_op)+=2; - make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); - ri->regstclass = trie_op; + ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); } #endif else if (REGNODE_SIMPLE(OP(first))) @@ -6148,35 +6852,35 @@ reStudy: PL_regkind[OP(first)] == NBOUND) ri->regstclass = first; else if (PL_regkind[OP(first)] == BOL) { - r->extflags |= (OP(first) == MBOL - ? RXf_ANCH_MBOL + r->intflags |= (OP(first) == MBOL + ? PREGf_ANCH_MBOL : (OP(first) == SBOL - ? RXf_ANCH_SBOL - : RXf_ANCH_BOL)); + ? PREGf_ANCH_SBOL + : PREGf_ANCH_BOL)); first = NEXTOPER(first); goto again; } else if (OP(first) == GPOS) { - r->extflags |= RXf_ANCH_GPOS; + r->intflags |= PREGf_ANCH_GPOS; first = NEXTOPER(first); goto again; } else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks) + !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) { /* turn .* into ^.* with an implied $*=1 */ const int type = (OP(NEXTOPER(first)) == REG_ANY) - ? RXf_ANCH_MBOL - : RXf_ANCH_SBOL; - r->extflags |= type; - r->intflags |= PREGf_IMPLICIT; + ? PREGf_ANCH_MBOL + : PREGf_ANCH_SBOL; + r->intflags |= (type | PREGf_IMPLICIT); first = NEXTOPER(first); goto again; } - if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback) + if (sawplus && !sawminmod && !sawlookahead + && (!sawopen || !RExC_sawback) && !pRExC_state->num_code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -6218,15 +6922,17 @@ reStudy: SAVEFREESV(data.last_found); first = scan; if (!ri->regstclass) { - cl_init(pRExC_state, &ch_class); + ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; stclass_flag = SCF_DO_STCLASS_AND; } else /* XXXX Check for BOUND? */ stclass_flag = 0; data.last_closep = &last_close; - minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */ - &data, -1, NULL, NULL, + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, + scan + RExC_size, /* Up to end */ + &data, -1, 0, NULL, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), 0); @@ -6238,9 +6944,11 @@ reStudy: if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen - && !(RExC_seen & REG_SEEN_VERBARG) - && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) + && !(RExC_seen & REG_VERBARG_SEEN) + && !(RExC_seen & REG_GPOS_SEEN) + ){ r->extflags |= RXf_CHECK_ALL; + } scan_commit(pRExC_state, &data,&minlen,0); longest_float_length = CHR_SVLEN(data.longest_float); @@ -6262,7 +6970,7 @@ reStudy: { r->float_min_offset = data.offset_float_min - data.lookbehind_float; r->float_max_offset = data.offset_float_max; - if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ + if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ r->float_max_offset -= data.lookbehind_float; SvREFCNT_inc_simple_void_NN(data.longest_float); } @@ -6300,49 +7008,53 @@ reStudy: if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag - && ! TEST_SSC_EOS(data.start_class) - && !cl_is_anything(data.start_class)) + && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && !ssc_is_anything(data.start_class)) { - const U32 n = add_data(pRExC_state, 1, "f"); - OP(data.start_class) = ANYOF_SYNTHETIC; + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); - Newx(RExC_rxi->data->data[n], 1, - struct regnode_charclass_class); + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); StructCopy(data.start_class, - (struct regnode_charclass_class*)RExC_rxi->data->data[n], - struct regnode_charclass_class); + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); + data.start_class = NULL; } - /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ + /* A temporary algorithm prefers floated substr to fixed one to dig + * more info. */ if (longest_fixed_length > longest_float_length) { + r->substrs->check_ix = 0; r->check_end_shift = r->anchored_end_shift; r->check_substr = r->anchored_substr; r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; - if (r->extflags & RXf_ANCH_SINGLE) - r->extflags |= RXf_NOSCAN; + if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) + r->intflags |= PREGf_NOSCAN; } else { + r->substrs->check_ix = 1; r->check_end_shift = r->float_end_shift; r->check_substr = r->float_substr; r->check_utf8 = r->float_utf8; r->check_offset_min = r->float_min_offset; r->check_offset_max = r->float_max_offset; } - /* XXXX Currently intuiting is not compatible with ANCH_GPOS. - This should be changed ASAP! */ - if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) { + if ((r->check_substr || r->check_utf8) ) { r->extflags |= RXf_USE_INTUIT; if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->extflags |= RXf_INTUIT_TAIL; } + r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) if ( (STRLEN)minlen < longest_float_length ) minlen= longest_float_length; @@ -6352,22 +7064,23 @@ reStudy: } else { /* Several toplevels. Best we can is to set minlen. */ - I32 fake; - struct regnode_charclass_class ch_class; - I32 last_close = 0; + SSize_t fake; + regnode_ssc ch_class; + SSize_t last_close = 0; DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); scan = ri->program + 1; - cl_init(pRExC_state, &ch_class); + ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; data.last_closep = &last_close; - - minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, - &data, -1, NULL, NULL, - SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS - |(restudied ? SCF_TRIE_DOING_RESTUDY : 0), + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, + &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, + SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied + ? SCF_TRIE_DOING_RESTUDY + : 0), 0); CHECK_RESTUDY_GOTO_butfirst(NOOP); @@ -6375,51 +7088,61 @@ reStudy: r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - if (! TEST_SSC_EOS(data.start_class) - && !cl_is_anything(data.start_class)) + if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && ! ssc_is_anything(data.start_class)) { - const U32 n = add_data(pRExC_state, 1, "f"); - OP(data.start_class) = ANYOF_SYNTHETIC; + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); - Newx(RExC_rxi->data->data[n], 1, - struct regnode_charclass_class); + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); StructCopy(data.start_class, - (struct regnode_charclass_class*)RExC_rxi->data->data[n], - struct regnode_charclass_class); + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class); + regprop(r, sv, (regnode*)data.start_class, NULL); PerlIO_printf(Perl_debug_log, "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); + data.start_class = NULL; } } + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + r->maxlen = REG_INFTY; + } + else { + r->maxlen = RExC_maxlen; + } + /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", - (IV)minlen, (IV)r->minlen); + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); }); r->minlenret = minlen; if (r->minlen < minlen) r->minlen = minlen; - if (RExC_seen & REG_SEEN_GPOS) - r->extflags |= RXf_GPOS_SEEN; - if (RExC_seen & REG_SEEN_LOOKBEHIND) - r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */ + if (RExC_seen & REG_GPOS_SEEN) + r->intflags |= PREGf_GPOS_SEEN; + if (RExC_seen & REG_LOOKBEHIND_SEEN) + r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the + lookbehind */ if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; - if (RExC_seen & REG_SEEN_CANY) - r->extflags |= RXf_CANY_SEEN; - if (RExC_seen & REG_SEEN_VERBARG) + if (RExC_seen & REG_CANY_SEEN) + r->intflags |= PREGf_CANY_SEEN; + if (RExC_seen & REG_VERBARG_SEEN) { r->intflags |= PREGf_VERBARG_SEEN; r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ } - if (RExC_seen & REG_SEEN_CUTGROUP) + if (RExC_seen & REG_CUTGROUP_SEEN) r->intflags |= PREGf_CUTGROUP_SEEN; if (pm_flags & PMf_USE_RE_EVAL) r->intflags |= PREGf_USE_RE_EVAL; @@ -6428,7 +7151,20 @@ reStudy: else RXp_PAREN_NAMES(r) = NULL; + /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED + * so it can be used in pp.c */ + if (r->intflags & PREGf_ANCH) + r->extflags |= RXf_IS_ANCHORED; + + { + /* this is used to identify "special" patterns that might result + * in Perl NOT calling the regex engine and instead doing the match "itself", + * particularly special cases in split//. By having the regex compiler + * do this pattern matching at a regop level (instead of by inspecting the pattern) + * we avoid weird issues with equivalent patterns resulting in different behavior, + * AND we allow non Perl engines to get the same optimizations by the setting the + * flags appropriately - Yves */ regnode *first = ri->program + 1; U8 fop = OP(first); regnode *next = NEXTOPER(first); @@ -6438,16 +7174,28 @@ reStudy: r->extflags |= RXf_NULL; else if (PL_regkind[fop] == BOL && nop == END) r->extflags |= RXf_START_ONLY; - else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END) + else if (fop == PLUS + && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE + && OP(regnext(first)) == END) r->extflags |= RXf_WHITE; - else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END ) + else if ( r->extflags & RXf_SPLIT + && fop == EXACT + && STR_LEN(first) == 1 + && *(STRING(first)) == ' ' + && OP(regnext(first)) == END ) r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); } + + if (RExC_contains_locale) { + RXp_EXTFLAGS(r) |= RXf_TAINTED; + } + #ifdef DEBUGGING if (RExC_paren_names) { - ri->name_list_idx = add_data( pRExC_state, 1, "a" ); - ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); + ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); + ri->data->data[ri->name_list_idx] + = (void*)SvREFCNT_inc(RExC_paren_name_list); } else #endif ri->name_list_idx = 0; @@ -6462,15 +7210,17 @@ reStudy: /* assume we don't need to swap parens around before we match */ DEBUG_DUMP_r({ + DEBUG_RExC_seen(); PerlIO_printf(Perl_debug_log,"Final program:\n"); regdump(r); }); #ifdef RE_TRACK_PATTERN_OFFSETS DEBUG_OFFSETS_r(if (ri->u.offsets) { - const U32 len = ri->u.offsets[0]; - U32 i; + const STRLEN len = ri->u.offsets[0]; + STRLEN i; GET_RE_DEBUG_FLAGS_DECL; - PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); + PerlIO_printf(Perl_debug_log, + "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", @@ -6530,7 +7280,8 @@ Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, else if (flags & RXapif_NEXTKEY) return reg_named_buff_nextkey(rx, flags); else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", + (int)flags); return NULL; } } @@ -6656,7 +7407,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) { SV *ret; AV *av; - I32 length; + SSize_t length; struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; @@ -6667,11 +7418,12 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) } else if (flags & RXapif_ONE) { ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); av = MUTABLE_AV(SvRV(ret)); - length = av_len(av); + length = av_tindex(av); SvREFCNT_dec_NN(ret); return newSViv(length + 1); } else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", + (int)flags); return NULL; } } @@ -6719,19 +7471,29 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, { struct regexp *const rx = ReANY(r); char *s = NULL; - I32 i = 0; - I32 s1, t1; + SSize_t i = 0; + SSize_t s1, t1; I32 n = paren; PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - if ( ( n == RX_BUFF_IDX_CARET_PREMATCH + if ( n == RX_BUFF_IDX_CARET_PREMATCH || n == RX_BUFF_IDX_CARET_FULLMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH - ) - && !(rx->extflags & RXf_PMf_KEEPCOPY) ) - goto ret_undef; + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } if (!rx->subbeg) goto ret_undef; @@ -6768,9 +7530,9 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, } assert(s >= rx->subbeg); - assert(rx->sublen >= (s - rx->subbeg) + i ); + assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); if (i >= 0) { -#if NO_TAINT_SUPPORT +#ifdef NO_TAINT_SUPPORT sv_setpvn(sv, s, i); #else const int oldtainted = TAINT_get; @@ -6778,7 +7540,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, sv_setpvn(sv, s, i); TAINT_set(oldtainted); #endif - if ( (rx->extflags & RXf_CANY_SEEN) + if ( (rx->intflags & PREGf_CANY_SEEN) ? (RXp_MATCH_UTF8(rx) && (!i || is_utf8_string((U8*)s, i))) : (RXp_MATCH_UTF8(rx)) ) @@ -6837,13 +7599,27 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + /* Some of this code was originally in C in F */ switch (paren) { case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - case RX_BUFF_IDX_PREMATCH: /* $` */ if (rx->offs[0].start != -1) { i = rx->offs[0].start; @@ -6856,8 +7632,6 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, return 0; case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; case RX_BUFF_IDX_POSTMATCH: /* $' */ if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; @@ -6869,13 +7643,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } return 0; - case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */ - if (!(rx->extflags & RXf_PMf_KEEPCOPY)) - goto warn_undef; - /*FALLTHROUGH*/ - - /* $& / ${^MATCH}, $1, $2, ... */ - default: + default: /* $& / ${^MATCH}, $1, $2, ... */ if (paren <= (I32)rx->nparens && (s1 = rx->offs[paren].start) != -1 && (t1 = rx->offs[paren].end) != -1) @@ -6932,7 +7700,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) PERL_ARGS_ASSERT_REG_SCAN_NAME; - if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + assert (RExC_parse <= RExC_end); + if (RExC_parse == RExC_end) NOOP; + else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { /* skip IDFIRST by using do...while */ if (UTF) do { @@ -6943,7 +7713,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) RExC_parse++; } while (isWORDCHAR(*RExC_parse)); } else { - RExC_parse++; /* so the <- from the vFAIL is after the offending character */ + RExC_parse++; /* so the <- from the vFAIL is after the offending + character */ vFAIL("Group name must start with a non-digit word character"); } if ( flags ) { @@ -7023,11 +7794,12 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) /* This section of code defines the inversion list object and its methods. The * interfaces are highly subject to change, so as much as possible is static to * this file. An inversion list is here implemented as a malloc'd C UV array - * with some added info that is placed as UVs at the beginning in a header - * portion. An inversion list for Unicode is an array of code points, sorted - * by ordinal number. The zeroth element is the first code point in the list. - * The 1th element is the first element beyond that not in the list. In other - * words, the first range is + * as an SVt_INVLIST scalar. + * + * An inversion list for Unicode is an array of code points, sorted by ordinal + * number. The zeroth element is the first code point in the list. The 1th + * element is the first element beyond that not in the list. In other words, + * the first range is * invlist[0]..(invlist[1]-1) * The other ranges follow. Thus every element whose index is divisible by two * marks the beginning of a range that is in the list, and every element not @@ -7045,9 +7817,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * Taking the complement (inverting) an inversion list is quite simple, if the * first element is 0, remove it; otherwise add a 0 element at the beginning. * This implementation reserves an element at the beginning of each inversion - * list to contain 0 when the list contains 0, and contains 1 otherwise. The - * actual beginning of the list is either that element if 0, or the next one if - * 1. + * list to always contain 0; there is an additional flag in the header which + * indicates if the list begins at the 0, or is offset to begin at the next + * element. * * More about inversion lists can be found in "Unicode Demystified" * Chapter 13 by Richard Gillam, published by Addison-Wesley. @@ -7062,36 +7834,35 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * should eventually be made public */ /* The header definitions are in F */ -#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV)) -#define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH) - -#define INVLIST_INITIAL_LEN 10 PERL_STATIC_INLINE UV* -S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) +S__invlist_array_init(SV* const invlist, const bool will_have_0) { /* Returns a pointer to the first element in the inversion list's array. * This is called upon initialization of an inversion list. Where the - * array begins depends on whether the list has the code point U+0000 - * in it or not. The other parameter tells it whether the code that - * follows this call is about to put a 0 in the inversion list or not. - * The first element is either the element with 0, if 0, or the next one, - * if 1 */ + * array begins depends on whether the list has the code point U+0000 in it + * or not. The other parameter tells it whether the code that follows this + * call is about to put a 0 in the inversion list or not. The first + * element is either the element reserved for 0, if TRUE, or the element + * after it, if FALSE */ - UV* zero = get_invlist_zero_addr(invlist); + bool* offset = get_invlist_offset_addr(invlist); + UV* zero_addr = (UV *) SvPVX(invlist); PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; /* Must be empty */ - assert(! *_get_invlist_len_addr(invlist)); + assert(! _invlist_len(invlist)); + + *zero_addr = 0; /* 1^1 = 0; 1^0 = 1 */ - *zero = 1 ^ will_have_0; - return zero + *zero; + *offset = 1 ^ will_have_0; + return zero_addr + *offset; } PERL_STATIC_INLINE UV* -S_invlist_array(pTHX_ SV* const invlist) +S_invlist_array(SV* const invlist) { /* Returns the pointer to the inversion list's array. Every time the * length changes, this needs to be called in case malloc or realloc moved @@ -7101,57 +7872,47 @@ S_invlist_array(pTHX_ SV* const invlist) /* Must not be empty. If these fail, you probably didn't check for * being non-zero before trying to get the array */ - assert(*_get_invlist_len_addr(invlist)); - assert(*get_invlist_zero_addr(invlist) == 0 - || *get_invlist_zero_addr(invlist) == 1); - - /* The array begins either at the element reserved for zero if the - * list contains 0 (that element will be set to 0), or otherwise the next - * element (in which case the reserved element will be set to 1). */ - return (UV *) (get_invlist_zero_addr(invlist) - + *get_invlist_zero_addr(invlist)); + assert(_invlist_len(invlist)); + + /* The very first element always contains zero, The array begins either + * there, or if the inversion list is offset, at the element after it. + * The offset header field determines which; it contains 0 or 1 to indicate + * how much additionally to add */ + assert(0 == *(SvPVX(invlist))); + return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist)); } PERL_STATIC_INLINE void -S_invlist_set_len(pTHX_ SV* const invlist, const UV len) +S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) { - /* Sets the current number of elements stored in the inversion list */ - + /* Sets the current number of elements stored in the inversion list. + * Updates SvCUR correspondingly */ + PERL_UNUSED_CONTEXT; PERL_ARGS_ASSERT_INVLIST_SET_LEN; - *_get_invlist_len_addr(invlist) = len; - - assert(len <= SvLEN(invlist)); - - SvCUR_set(invlist, TO_INTERNAL_SIZE(len)); - /* If the list contains U+0000, that element is part of the header, - * and should not be counted as part of the array. It will contain - * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and - * subtract: - * SvCUR_set(invlist, - * TO_INTERNAL_SIZE(len - * - (*get_invlist_zero_addr(inv_list) ^ 1))); - * But, this is only valid if len is not 0. The consequences of not doing - * this is that the memory allocation code may think that 1 more UV is - * being used than actually is, and so might do an unnecessary grow. That - * seems worth not bothering to make this the precise amount. - * - * Note that when inverting, SvCUR shouldn't change */ + assert(SvTYPE(invlist) == SVt_INVLIST); + + SvCUR_set(invlist, + (len == 0) + ? 0 + : TO_INTERNAL_SIZE(len + offset)); + assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); } PERL_STATIC_INLINE IV* -S_get_invlist_previous_index_addr(pTHX_ SV* invlist) +S_get_invlist_previous_index_addr(SV* invlist) { - /* Return the address of the UV that is reserved to hold the cached index + /* Return the address of the IV that is reserved to hold the cached index * */ - PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; - return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV))); + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->prev_index); } PERL_STATIC_INLINE IV -S_invlist_previous_index(pTHX_ SV* const invlist) +S_invlist_previous_index(SV* const invlist) { /* Returns cached index of previous search */ @@ -7161,7 +7922,7 @@ S_invlist_previous_index(pTHX_ SV* const invlist) } PERL_STATIC_INLINE void -S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index) +S_invlist_set_previous_index(SV* const invlist, const IV index) { /* Caches for later retrieval */ @@ -7173,34 +7934,25 @@ S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index) } PERL_STATIC_INLINE UV -S_invlist_max(pTHX_ SV* const invlist) +S_invlist_max(SV* const invlist) { /* Returns the maximum number of elements storable in the inversion list's * array, without having to realloc() */ PERL_ARGS_ASSERT_INVLIST_MAX; + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Assumes worst case, in which the 0 element is not counted in the + * inversion list, so subtracts 1 for that */ return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ - ? _invlist_len(invlist) - : FROM_INTERNAL_SIZE(SvLEN(invlist)); + ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 + : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; } -PERL_STATIC_INLINE UV* -S_get_invlist_zero_addr(pTHX_ SV* invlist) -{ - /* Return the address of the UV that is reserved to hold 0 if the inversion - * list contains 0. This has to be the last element of the heading, as the - * list proper starts with either it if 0, or the next element if not. - * (But we force it to contain either 0 or 1) */ - - PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR; - - return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV))); -} - -#ifndef PERL_IN_XSUB_RE -SV* -Perl__new_invlist(pTHX_ IV initial_size) +#ifndef PERL_IN_XSUB_RE +SV* +Perl__new_invlist(pTHX_ IV initial_size) { /* Return a pointer to a newly constructed inversion list, with enough @@ -7210,60 +7962,81 @@ Perl__new_invlist(pTHX_ IV initial_size) SV* new_list; if (initial_size < 0) { - initial_size = INVLIST_INITIAL_LEN; + initial_size = 10; } /* Allocate the initial space */ - new_list = newSV(TO_INTERNAL_SIZE(initial_size)); - invlist_set_len(new_list, 0); + new_list = newSV_type(SVt_INVLIST); - /* Force iterinit() to be used to get iteration to work */ - *get_invlist_iter_addr(new_list) = UV_MAX; + /* First 1 is in case the zero element isn't in the list; second 1 is for + * trailing NUL */ + SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1); + invlist_set_len(new_list, 0, 0); - /* This should force a segfault if a method doesn't initialize this - * properly */ - *get_invlist_zero_addr(new_list) = UV_MAX; + /* Force iterinit() to be used to get iteration to work */ + *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX; *get_invlist_previous_index_addr(new_list) = 0; - *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID; -#if HEADER_LENGTH != 5 -# error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length -#endif return new_list; } -#endif -STATIC SV* -S__new_invlist_C_array(pTHX_ UV* list) +SV* +Perl__new_invlist_C_array(pTHX_ const UV* const list) { /* Return a pointer to a newly constructed inversion list, initialized to * point to , which has to be in the exact correct inversion list * form, including internal fields. Thus this is a dangerous routine that - * should not be used in the wrong hands */ + * should not be used in the wrong hands. The passed in 'list' contains + * several header fields at the beginning that are not part of the + * inversion list body proper */ + + const STRLEN length = (STRLEN) list[0]; + const UV version_id = list[1]; + const bool offset = cBOOL(list[2]); +#define HEADER_LENGTH 3 + /* If any of the above changes in any way, you must change HEADER_LENGTH + * (if appropriate) and regenerate INVLIST_VERSION_ID by running + * perl -E 'say int(rand 2**31-1)' + */ +#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and + data structure type, so that one being + passed in can be validated to be an + inversion list of the correct vintage. + */ - SV* invlist = newSV_type(SVt_PV); + SV* invlist = newSV_type(SVt_INVLIST); PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; - SvPV_set(invlist, (char *) list); + if (version_id != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + /* The generated array passed in includes header elements that aren't part + * of the list proper, so start it just after them */ + SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); + SvLEN_set(invlist, 0); /* Means we own the contents, and the system shouldn't touch it */ - SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist))); - if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) { - Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); - } + *(get_invlist_offset_addr(invlist)) = offset; + + /* The 'length' passed to us is the physical number of elements in the + * inversion list. But if there is an offset the logical number is one + * less than that */ + invlist_set_len(invlist, length - offset, offset); - /* Initialize the iteration pointer. - * XXX This could be done at compile time in charclass_invlists.h, but I - * (khw) am not confident that the suffixes for specifying the C constant - * UV_MAX are portable, e.g. 'ull' on a 32 bit machine that is configured - * to use 64 bits; might need a Configure probe */ + invlist_set_previous_index(invlist, 0); + + /* Initialize the iteration pointer. */ invlist_iterfinish(invlist); + SvREADONLY_on(invlist); + return invlist; } +#endif /* ifndef PERL_IN_XSUB_RE */ STATIC void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) @@ -7272,24 +8045,28 @@ S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) PERL_ARGS_ASSERT_INVLIST_EXTEND; - SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max)); + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Add one to account for the zero element at the beginning which may not + * be counted by the calling parameters */ + SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); } PERL_STATIC_INLINE void -S_invlist_trim(pTHX_ SV* const invlist) +S_invlist_trim(SV* const invlist) { PERL_ARGS_ASSERT_INVLIST_TRIM; + assert(SvTYPE(invlist) == SVt_INVLIST); + /* Change the length of the inversion list to how many entries it currently * has */ - SvPV_shrink_to_cur((SV *) invlist); } -#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) - STATIC void -S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) +S__append_range_to_invlist(pTHX_ SV* const invlist, + const UV start, const UV end) { /* Subject to change or removal. Append the range from 'start' to 'end' at * the end of the inversion list. The range must be above any existing @@ -7298,11 +8075,13 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end UV* array; UV max = invlist_max(invlist); UV len = _invlist_len(invlist); + bool offset; PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; if (len == 0) { /* Empty lists must be initialized */ - array = _invlist_array_init(invlist, start == 0); + offset = start != 0; + array = _invlist_array_init(invlist, ! offset); } else { /* Here, the existing list is non-empty. The current max entry in the @@ -7325,6 +8104,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end * value not in the set, it is extending the set, so the new first * value not in the set is one greater than the newly extended range. * */ + offset = *get_invlist_offset_addr(invlist); if (array[final_element] == start) { if (end != UV_MAX) { array[final_element] = end + 1; @@ -7332,7 +8112,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end else { /* But if the end is the maximum representable on the machine, * just let the range that this would extend to have no end */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, offset); } return; } @@ -7342,16 +8122,18 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end len += 2; /* Includes an element each for the start and end of range */ - /* If overflows the existing space, extend, which may cause the array to be - * moved */ + /* If wll overflow the existing space, extend, which may cause the array to + * be moved */ if (max < len) { invlist_extend(invlist, len); - invlist_set_len(invlist, len); /* Have to set len here to avoid assert - failure in invlist_array() */ + + /* Have to set len here to avoid assert failure in invlist_array() */ + invlist_set_len(invlist, len, offset); + array = invlist_array(invlist); } else { - invlist_set_len(invlist, len); + invlist_set_len(invlist, len, offset); } /* The next item on the list starts the range, the one after that is @@ -7363,14 +8145,14 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end else { /* But if the end is the maximum representable on the machine, just let * the range have no end */ - invlist_set_len(invlist, len - 1); + invlist_set_len(invlist, len - 1, offset); } } #ifndef PERL_IN_XSUB_RE IV -Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) +Perl__invlist_search(SV* const invlist, const UV cp) { /* Searches the inversion list for the entry that contains the input code * point . If is not in the list, -1 is returned. Otherwise, the @@ -7458,7 +8240,8 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) } void -Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch) +Perl__invlist_populate_swatch(SV* const invlist, + const UV start, const UV end, U8* swatch) { /* populates a swatch of a swash the same way swatch_get() does in utf8.c, * but is used when the swash has an inversion list. This makes this much @@ -7551,14 +8334,16 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV } void -Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output) +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** output) { /* Take the union of two inversion lists and point to it. *output * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. The first list, - * , may be NULL, in which case a copy of the second list is returned. - * If is TRUE, the union is taken of the complement - * (inversion) of instead of b itself. + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *output will be made correspondingly + * mortal. The first list, , may be NULL, in which case a copy of the + * second list is returned. If is TRUE, the union is taken + * of the complement (inversion) of instead of b itself. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -7573,8 +8358,8 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co * return the larger of the input lists, but then outside code might need * to keep track of whether to free the input list or not */ - UV* array_a; /* a's array */ - UV* array_b; + const UV* array_a; /* a's array */ + const UV* array_b; UV len_a; /* length of a's array */ UV len_b; @@ -7599,9 +8384,13 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co /* If either one is empty, the union is the other one */ if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + bool make_temp = FALSE; /* Should we mortalize the result? */ + if (*output == a) { if (a != NULL) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } } if (*output != b) { @@ -7610,18 +8399,27 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co _invlist_invert(*output); } } /* else *output already = b; */ + + if (make_temp) { + sv_2mortal(*output); + } return; } else if ((len_b = _invlist_len(b)) == 0) { + bool make_temp = FALSE; if (*output == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } /* The complement of an empty list is a list that has everything in it, * so the union with includes everything too */ if (complement_b) { if (a == *output) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } *output = _new_invlist(1); _append_range_to_invlist(*output, 0, UV_MAX); @@ -7630,6 +8428,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co *output = invlist_clone(a); } /* else *output already = a; */ + + if (make_temp) { + sv_2mortal(*output); + } return; } @@ -7642,23 +8444,17 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co if (complement_b) { /* To complement, we invert: if the first element is 0, remove it. To - * do this, we just pretend the array starts one later, and clear the - * flag as we don't have to do anything else later */ + * do this, we just pretend the array starts one later */ if (array_b[0] == 0) { array_b++; len_b--; - complement_b = FALSE; } else { - /* But if the first element is not zero, we unshift a 0 before the - * array. The data structure reserves a space for that 0 (which - * should be a '1' right now), so physical shifting is unneeded, - * but temporarily change that element to 0. Before exiting the - * routine, we must restore the element to '1' */ + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -7754,7 +8550,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co /* Set result to final length, which can change the pointer to array_u, so * re-find it */ if (len_u != _invlist_len(u)) { - invlist_set_len(u, len_u); + invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); invlist_trim(u); array_u = invlist_array(u); } @@ -7775,29 +8571,36 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co } } - /* If we've changed b, restore it */ - if (complement_b) { - array_b[0] = 1; - } - - /* We may be removing a reference to one of the inputs */ + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ if (a == *output || b == *output) { assert(! invlist_is_iterating(*output)); - SvREFCNT_dec_NN(*output); + if ((SvTEMP(*output))) { + sv_2mortal(u); + } + else { + SvREFCNT_dec_NN(*output); + } } *output = u; + return; } void -Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i) +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** i) { /* Take the intersection of two inversion lists and point to it. *i * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. - * If is TRUE, the result will be the intersection of - * and the complement (or inversion) of instead of directly. + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *i will be made correspondingly mortal. + * The first list, , may be NULL, in which case an empty list is + * returned. If is TRUE, the result will be the + * intersection of and the complement (or inversion) of instead of + * directly. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -7808,8 +8611,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * union above */ - UV* array_a; /* a's array */ - UV* array_b; + const UV* array_a; /* a's array */ + const UV* array_b; UV len_a; /* length of a's array */ UV len_b; @@ -7833,8 +8636,9 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, assert(a != b); /* Special case if either one is empty */ - len_a = _invlist_len(a); + len_a = (a == NULL) ? 0 : _invlist_len(a); if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { + bool make_temp = FALSE; if (len_a != 0 && complement_b) { @@ -7843,25 +8647,39 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * must be every possible code point. Thus the intersection is * simply 'a'. */ if (*i != a) { - *i = invlist_clone(a); - if (*i == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } + + *i = invlist_clone(a); } /* else *i is already 'a' */ + + if (make_temp) { + sv_2mortal(*i); + } return; } /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The * intersection must be empty */ if (*i == a) { - SvREFCNT_dec_NN(a); + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } } else if (*i == b) { - SvREFCNT_dec_NN(b); + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } } *i = _new_invlist(0); + if (make_temp) { + sv_2mortal(*i); + } + return; } @@ -7874,23 +8692,17 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, if (complement_b) { /* To complement, we invert: if the first element is 0, remove it. To - * do this, we just pretend the array starts one later, and clear the - * flag as we don't have to do anything else later */ + * do this, we just pretend the array starts one later */ if (array_b[0] == 0) { array_b++; len_b--; - complement_b = FALSE; } else { - /* But if the first element is not zero, we unshift a 0 before the - * array. The data structure reserves a space for that 0 (which - * should be a '1' right now), so physical shifting is unneeded, - * but temporarily change that element to 0. Before exiting the - * routine, we must restore the element to '1' */ + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -7972,7 +8784,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } /* The final length is what we've output so far plus what else is in the - * intersection. At most one of the subexpressions below will be non-zero */ + * intersection. At most one of the subexpressions below will be non-zero + * */ len_r = i_r; if (count >= 2) { len_r += (len_a - i_a) + (len_b - i_b); @@ -7981,7 +8794,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Set result to final length, which can change the pointer to array_r, so * re-find it */ if (len_r != _invlist_len(r)) { - invlist_set_len(r, len_r); + invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); invlist_trim(r); array_r = invlist_array(r); } @@ -7997,18 +8810,21 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* If we've changed b, restore it */ - if (complement_b) { - array_b[0] = 1; - } - - /* We may be removing a reference to one of the inputs */ + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ if (a == *i || b == *i) { assert(! invlist_is_iterating(*i)); - SvREFCNT_dec_NN(*i); + if (SvTEMP(*i)) { + sv_2mortal(r); + } + else { + SvREFCNT_dec_NN(*i); + } } *i = r; + return; } @@ -8055,6 +8871,35 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) return invlist; } +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, + UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + _append_range_to_invlist(invlist, element0, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; +} + #endif PERL_STATIC_INLINE SV* @@ -8070,66 +8915,19 @@ Perl__invlist_invert(pTHX_ SV* const invlist) * have a zero; removes it otherwise. As described above, the data * structure is set up so that this is very efficient */ - UV* len_pos = _get_invlist_len_addr(invlist); - PERL_ARGS_ASSERT__INVLIST_INVERT; assert(! invlist_is_iterating(invlist)); /* The inverse of matching nothing is matching everything */ - if (*len_pos == 0) { + if (_invlist_len(invlist) == 0) { _append_range_to_invlist(invlist, 0, UV_MAX); return; } - /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the - * zero element was a 0, so it is being removed, so the length decrements - * by 1; and vice-versa. SvCUR is unaffected */ - if (*get_invlist_zero_addr(invlist) ^= 1) { - (*len_pos)--; - } - else { - (*len_pos)++; - } + *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); } -void -Perl__invlist_invert_prop(pTHX_ SV* const invlist) -{ - /* Complement the input inversion list (which must be a Unicode property, - * all of which don't match above the Unicode maximum code point.) And - * Perl has chosen to not have the inversion match above that either. This - * adds a 0x110000 if the list didn't end with it, and removes it if it did - */ - - UV len; - UV* array; - - PERL_ARGS_ASSERT__INVLIST_INVERT_PROP; - - _invlist_invert(invlist); - - len = _invlist_len(invlist); - - if (len != 0) { /* If empty do nothing */ - array = invlist_array(invlist); - if (array[len - 1] != PERL_UNICODE_MAX + 1) { - /* Add 0x110000. First, grow if necessary */ - len++; - if (invlist_max(invlist) < len) { - invlist_extend(invlist, len); - array = invlist_array(invlist); - } - invlist_set_len(invlist, len); - array[len - 1] = PERL_UNICODE_MAX + 1; - } - else { /* Remove the 0x110000 */ - invlist_set_len(invlist, len - 1); - } - } - - return; -} #endif PERL_STATIC_INLINE SV* @@ -8137,44 +8935,38 @@ S_invlist_clone(pTHX_ SV* const invlist) { /* Return a new inversion list that is a copy of the input one, which is - * unchanged */ + * unchanged. The new list will not be mortal even if the old one was. */ /* Need to allocate extra space to accommodate Perl's addition of a * trailing NUL to SvPV's, since it thinks they are always strings */ SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); - STRLEN length = SvCUR(invlist); + STRLEN physical_length = SvCUR(invlist); + bool offset = *(get_invlist_offset_addr(invlist)); PERL_ARGS_ASSERT_INVLIST_CLONE; - SvCUR_set(new_invlist, length); /* This isn't done automatically */ - Copy(SvPVX(invlist), SvPVX(new_invlist), length, char); + *(get_invlist_offset_addr(new_invlist)) = offset; + invlist_set_len(new_invlist, _invlist_len(invlist), offset); + Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); return new_invlist; } -PERL_STATIC_INLINE UV* -S_get_invlist_iter_addr(pTHX_ SV* invlist) +PERL_STATIC_INLINE STRLEN* +S_get_invlist_iter_addr(SV* invlist) { /* Return the address of the UV that contains the current iteration * position */ PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; - return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV))); -} - -PERL_STATIC_INLINE UV* -S_get_invlist_version_id_addr(pTHX_ SV* invlist) -{ - /* Return the address of the UV that contains the version id. */ - - PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR; + assert(SvTYPE(invlist) == SVt_INVLIST); - return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV))); + return &(((XINVLIST*) SvANY(invlist))->iterator); } PERL_STATIC_INLINE void -S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ +S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */ { PERL_ARGS_ASSERT_INVLIST_ITERINIT; @@ -8182,7 +8974,7 @@ S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ } PERL_STATIC_INLINE void -S_invlist_iterfinish(pTHX_ SV* invlist) +S_invlist_iterfinish(SV* invlist) { /* Terminate iterator for invlist. This is to catch development errors. * Any iteration that is interrupted before completed should call this @@ -8194,11 +8986,11 @@ S_invlist_iterfinish(pTHX_ SV* invlist) PERL_ARGS_ASSERT_INVLIST_ITERFINISH; - *get_invlist_iter_addr(invlist) = UV_MAX; + *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; } STATIC bool -S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) +S_invlist_iternext(SV* invlist, UV* start, UV* end) { /* An C call on must be used to set this up. * This call sets in <*start> and <*end>, the next range in . @@ -8207,14 +8999,14 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) * <*start> and <*end> are unchanged, and the next call to this function * will start over at the beginning of the list */ - UV* pos = get_invlist_iter_addr(invlist); + STRLEN* pos = get_invlist_iter_addr(invlist); UV len = _invlist_len(invlist); UV *array; PERL_ARGS_ASSERT_INVLIST_ITERNEXT; if (*pos >= len) { - *pos = UV_MAX; /* Force iterinit() to be required next time */ + *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ return FALSE; } @@ -8233,15 +9025,15 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) } PERL_STATIC_INLINE bool -S_invlist_is_iterating(pTHX_ SV* const invlist) +S_invlist_is_iterating(SV* const invlist) { PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; - return *(get_invlist_iter_addr(invlist)) < UV_MAX; + return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; } PERL_STATIC_INLINE UV -S_invlist_highest(pTHX_ SV* const invlist) +S_invlist_highest(SV* const invlist) { /* Returns the highest code point that matches an inversion list. This API * has an ambiguity, as it returns 0 under either the highest is actually @@ -8303,51 +9095,85 @@ Perl__invlist_contents(pTHX_ SV* const invlist) } #endif -#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP +#ifndef PERL_IN_XSUB_RE void -Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header) +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, + const char * const indent, SV* const invlist) { - /* Dumps out the ranges in an inversion list. The string 'header' - * if present is output on a line before the first range */ + /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the + * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by + * the string 'indent'. The output looks like this: + [0] 0x000A .. 0x000D + [2] 0x0085 + [4] 0x2028 .. 0x2029 + [6] 0x3104 .. INFINITY + * This means that the first range of code points matched by the list are + * 0xA through 0xD; the second range contains only the single code point + * 0x85, etc. An inversion list is an array of UVs. Two array elements + * are used to define each range (except if the final range extends to + * infinity, only a single element is needed). The array index of the + * first element for the corresponding range is given in brackets. */ UV start, end; + STRLEN count = 0; PERL_ARGS_ASSERT__INVLIST_DUMP; - if (header && strlen(header)) { - PerlIO_printf(Perl_debug_log, "%s\n", header); - } if (invlist_is_iterating(invlist)) { - PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n"); + Perl_dump_indent(aTHX_ level, file, + "%sCan't dump inversion list because is in middle of iterating\n", + indent); return; } invlist_iterinit(invlist); while (invlist_iternext(invlist, &start, &end)) { if (end == UV_MAX) { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start); + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", + indent, (UV)count, start); } else if (end != start) { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", - start, end); + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", + indent, (UV)count, start, end); } else { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start); + Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", + indent, (UV)count, start); } + count += 2; + } +} + +void +Perl__load_PL_utf8_foldclosures (pTHX) +{ + assert(! PL_utf8_foldclosures); + + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* This string is just a short named one above \xff */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); } #endif -#if 0 +#ifdef PERL_ARGS_ASSERT__INVLISTEQ bool -S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) +S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) { /* Return a boolean as to if the two passed in inversion lists are * identical. The final argument, if TRUE, says to take the complement of * the second inversion list before doing the comparison */ - UV* array_a = invlist_array(a); - UV* array_b = invlist_array(b); + const UV* array_a = invlist_array(a); + const UV* array_b = invlist_array(b); UV len_a = _invlist_len(a); UV len_b = _invlist_len(b); @@ -8369,23 +9195,17 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) /* Otherwise, to complement, we invert. Here, the first element is * 0, just remove it. To do this, we just pretend the array starts - * one later, and clear the flag as we don't have to do anything - * else later */ + * one later */ array_b++; len_b--; - complement_b = FALSE; } else { - /* But if the first element is not zero, we unshift a 0 before the - * array. The data structure reserves a space for that 0 (which - * should be a '1' right now), so physical shifting is unneeded, - * but temporarily change that element to 0. Before exiting the - * routine, we must restore the element to '1' */ + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ array_b--; len_b++; - array_b[0] = 0; } } @@ -8402,27 +9222,19 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) } } - if (complement_b) { - array_b[0] = 1; - } return retval; } #endif #undef HEADER_LENGTH -#undef INVLIST_INITIAL_LENGTH #undef TO_INTERNAL_SIZE #undef FROM_INTERNAL_SIZE -#undef INVLIST_LEN_OFFSET -#undef INVLIST_ZERO_OFFSET -#undef INVLIST_ITER_OFFSET #undef INVLIST_VERSION_ID -#undef INVLIST_PREVIOUS_INDEX_OFFSET /* End of inversion list object */ STATIC void -S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) +S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) { /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' * constructs, and updates RExC_flags with them. On input, RExC_parse @@ -8482,7 +9294,6 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) } cs = REGEX_LOCALE_CHARSET; has_charset_modifier = LOCALE_PAT_MOD; - RExC_contains_locale = 1; break; case UNICODE_PAT_MOD: if (has_charset_modifier) { @@ -8536,7 +9347,8 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); } else if (has_charset_modifier == *(RExC_parse - 1)) { - vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1)); + vFAIL2("Regexp modifier \"%c\" may not appear twice", + *(RExC_parse - 1)); } else { vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); @@ -8544,12 +9356,15 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) /*NOTREACHED*/ neg_modifier: RExC_parse++; - vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1)); + vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", + *(RExC_parse - 1)); /*NOTREACHED*/ case ONCE_PAT_MOD: /* 'o' */ case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { - const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; + const I32 wflagbit = *RExC_parse == 'o' + ? WASTED_O + : WASTED_G; if (! (wastedflags & wflagbit) ) { wastedflags |= wflagbit; /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ @@ -8602,13 +9417,17 @@ S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state) RExC_flags |= posflags; RExC_flags &= ~negflags; set_regex_charset(&RExC_flags, cs); + if (RExC_flags & RXf_PMf_FOLD) { + RExC_contains_i = 1; + } return; /*NOTREACHED*/ default: fail_modifiers: - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", - RExC_parse-seqstart, seqstart); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); /*NOTREACHED*/ } @@ -8656,6 +9475,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) bool is_open = 0; I32 freeze_paren = 0; I32 after_freeze = 0; + I32 num; /* numeric backreferences */ char * parse_start = RExC_parse; /* MJD */ char * const oregcomp_parse = RExC_parse; @@ -8683,10 +9503,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) char *start_arg = NULL; unsigned char op = 0; int argok = 1; - int internal_argval = 0; /* internal_argval is only useful if !argok */ + int internal_argval = 0; /* internal_argval is only useful if + !argok */ - if (has_intervening_patws && SIZE_ONLY) { - ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated"); + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); } while ( *RExC_parse && *RExC_parse != ')' ) { if ( *RExC_parse == ':' ) { @@ -8746,14 +9568,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* [19:06] :: is then */ if ( memEQs(start_verb,verb_len,"THEN") ) { op = CUTGROUP; - RExC_seen |= REG_SEEN_CUTGROUP; + RExC_seen |= REG_CUTGROUP_SEEN; } break; } if ( ! op ) { - RExC_parse++; - vFAIL3("Unknown verb pattern '%.*s'", - verb_len, start_verb); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL2utf8f( + "Unknown verb pattern '%"UTF8f"'", + UTF8fARG(UTF, verb_len, start_verb)); } if ( argok ) { if ( start_arg && internal_argval ) { @@ -8766,8 +9589,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, op, internal_argval); if ( ! internal_argval && ! SIZE_ONLY ) { if (start_arg) { - SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); - ARG(ret) = add_data( pRExC_state, 1, "S" ); + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); RExC_rxi->data->data[ARG(ret)]=(void*)sv; ret->flags = 0; } else { @@ -8776,7 +9601,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } if (!internal_argval) - RExC_seen |= REG_SEEN_VERBARG; + RExC_seen |= REG_VERBARG_SEEN; } else if ( start_arg ) { vFAIL3("Verb pattern '%.*s' may not have an argument", verb_len, start_verb); @@ -8789,8 +9614,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) else if (*RExC_parse == '?') { /* (?...) */ bool is_logical = 0; const char * const seqstart = RExC_parse; - if (has_intervening_patws && SIZE_ONLY) { - ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated"); + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(?...)', the '(' and '?' must be adjacent"); } RExC_parse++; @@ -8806,17 +9632,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) goto named_recursion; } else if (paren == '=') { /* (?P=...) named backref */ - /* this pretty much dupes the code for \k in regatom(), if - you change this make sure you change that */ + /* this pretty much dupes the code for \k in + * regatom(), if you change this make sure you change that + * */ char* name_start = RExC_parse; U32 num = 0; SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); if (RExC_parse == name_start || *RExC_parse != ')') + /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.3s... not terminated",parse_start); if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -8841,7 +9669,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL3("Sequence (%.*s...) not recognized", + RExC_parse-seqstart, seqstart); /*NOTREACHED*/ case '<': /* (?<...) */ if (*RExC_parse == '!') @@ -8855,15 +9685,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '\'': /* (?'...') */ name_start= RExC_parse; svname = reg_scan_name(pRExC_state, - SIZE_ONLY ? /* reverse test from the others */ - REG_RSN_RETURN_NAME : - REG_RSN_RETURN_NULL); - if (RExC_parse == name_start) { - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); - /*NOTREACHED*/ - } - if (*RExC_parse != paren) + SIZE_ONLY /* reverse test from the others */ + ? REG_RSN_RETURN_NAME + : REG_RSN_RETURN_NULL); + if (RExC_parse == name_start || *RExC_parse != paren) vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); if (SIZE_ONLY) { @@ -8903,20 +9728,24 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } if ( count ) { - pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); + pv = (I32*)SvGROW(sv_dat, + SvCUR(sv_dat) + sizeof(I32)+1); SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); pv[count] = RExC_npar; SvIV_set(sv_dat, SvIVX(sv_dat) + 1); } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); - sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); + sv_setpvn(sv_dat, (char *)&(RExC_npar), + sizeof(I32)); SvIOK_on(sv_dat); SvIV_set(sv_dat, 1); } #ifdef DEBUGGING - /* Yes this does cause a memory leak in debugging Perls */ - if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) + /* Yes this does cause a memory leak in debugging Perls + * */ + if (!av_store(RExC_paren_name_list, + RExC_npar, SvREFCNT_inc(svname))) SvREFCNT_dec_NN(svname); #endif @@ -8926,9 +9755,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) paren = 1; goto capturing_parens; } - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; + /* FALLTHROUGH */ case '=': /* (?=...) */ RExC_seen_zerolen++; break; @@ -8953,29 +9783,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '@': /* (?@...) */ vFAIL2("Sequence (?%c...) not implemented", (int)paren); break; - case '#': /* (?#...) */ - /* XXX As soon as we disallow separating the '?' and '*' (by - * spaces or (?#...) comment), it is believed that this case - * will be unreachable and can be removed. See - * [perl #117327] */ - while (*RExC_parse && *RExC_parse != ')') - RExC_parse++; - if (*RExC_parse != ')') - FAIL("Sequence (?#... not terminated"); - nextchar(pRExC_state); - *flagp = TRYAGAIN; - return NULL; case '0' : /* (?0) */ case 'R' : /* (?R) */ if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); + RExC_seen |= REG_GOSTART_SEEN; *flagp |= POSTPONED; nextchar(pRExC_state); return ret; /*notreached*/ - { /* named and numeric backreferences */ - I32 num; + /* named and numeric backreferences */ case '&': /* (?&NAME) */ parse_start = RExC_parse - 1; named_recursion: @@ -8984,6 +9802,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } + if (RExC_parse == RExC_end || *RExC_parse != ')') + vFAIL("Sequence (?&... not terminated"); goto gen_recurse_regop; assert(0); /* NOT REACHED */ case '+': @@ -8998,7 +9818,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse--; /* rewind to let it be handled later */ goto parse_flags; } - /*FALLTHROUGH */ + /* FALLTHROUGH */ case '1': case '2': case '3': case '4': /* (?1) */ case '5': case '6': case '7': case '8': case '9': RExC_parse--; @@ -9043,30 +9863,34 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ARG2L_SET( ret, RExC_recurse_count++); RExC_emit++; DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret))); + "Recurse #%"UVuf" to %"IVdf"\n", + (UV)ARG(ret), (IV)ARG2L(ret))); } else { RExC_size++; } - RExC_seen |= REG_SEEN_RECURSE; + RExC_seen |= REG_RECURSE_SEEN; Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ *flagp |= POSTPONED; nextchar(pRExC_state); return ret; - } /* named and numeric backreferences */ + assert(0); /* NOT REACHED */ case '?': /* (??...) */ is_logical = 1; if (*RExC_parse != '{') { RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f( + "Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); /*NOTREACHED*/ } *flagp |= POSTPONED; paren = *RExC_parse++; - /* FALL THROUGH */ + /* FALLTHROUGH */ case '{': /* (?{...}) */ { U32 n = 0; @@ -9090,14 +9914,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { OP *o = cb->block; if (cb->src_regex) { - n = add_data(pRExC_state, 2, "rl"); + n = add_data(pRExC_state, STR_WITH_LEN("rl")); RExC_rxi->data->data[n] = (void*)SvREFCNT_inc((SV*)cb->src_regex); RExC_rxi->data->data[n+1] = (void*)o; } else { - n = add_data(pRExC_state, 1, - (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); + n = add_data(pRExC_state, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); RExC_rxi->data->data[n] = (void*)o; } } @@ -9158,7 +9982,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) (ch == '>' ? '<' : ch)); RExC_parse++; if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -9188,7 +10012,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV *sv_dat; RExC_parse++; sv_dat = reg_scan_name(pRExC_state, - SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + SIZE_ONLY + ? REG_RSN_RETURN_NULL + : REG_RSN_RETURN_DATA); parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } ret = reganode(pRExC_state,INSUBP,parno); @@ -9197,6 +10023,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { /* (?(1)...) */ char c; + char *tmp; parno = atoi(RExC_parse++); while (isDIGIT(*RExC_parse)) @@ -9204,8 +10031,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, GROUPP, parno); insert_if_check_paren: - if ((c = *nextchar(pRExC_state)) != ')') + if (*(tmp = nextchar(pRExC_state)) != ')') { + /* nextchar also skips comments, so undo its work + * and skip over the the next character. + */ + RExC_parse = tmp; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; vFAIL("Switch condition not recognized"); + } insert_if: REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); br = regbranch(pRExC_state, &flags, 1,depth+1); @@ -9217,14 +10050,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); } else - REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); + REGTAIL(pRExC_state, br, reganode(pRExC_state, + LONGJMP, 0)); c = *nextchar(pRExC_state); if (flags&HASWIDTH) *flagp |= HASWIDTH; if (c == '|') { if (is_define) vFAIL("(?(DEFINE)....) does not allow branches"); - lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ + + /* Fake one for optimizer. */ + lastbr = reganode(pRExC_state, IFTHEN, 0); + if (!regbranch(pRExC_state, &flags, 1,depth+1)) { if (flags & RESTART_UTF8) { *flagp = RESTART_UTF8; @@ -9256,7 +10093,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } else { - vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unknown switch condition (?(...))"); } } case '[': /* (?[ ... ]) */ @@ -9290,7 +10128,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY ){ if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE + if (RExC_seen & REG_RECURSE_SEEN && !RExC_open_parens[parno-1]) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, @@ -9347,7 +10185,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { ender = reganode(pRExC_state, LONGJMP,0); - REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ + + /* Append to the previous. */ + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); } if (SIZE_ONLY) RExC_extralen += 2; /* Account for LONGJMP. */ @@ -9379,7 +10219,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case 1: case 2: ender = reganode(pRExC_state, CLOSE, parno); - if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) { + if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting close paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ender))); @@ -9395,7 +10235,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '=': case '!': *flagp &= ~HASWIDTH; - /* FALL THROUGH */ + /* FALLTHROUGH */ case '>': ender = reg_node(pRExC_state, SUCCEED); break; @@ -9411,8 +10251,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("lsbr"); - regprop(RExC_rx, mysv_val1, lastbr); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, lastbr, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(lastbr), @@ -9426,20 +10266,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (have_branch && !SIZE_ONLY) { char is_nothing= 1; if (depth==1) - RExC_seen |= REG_TOP_LEVEL_BRANCHES; + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; /* Hook the tails of the branches to the closing node. */ for (br = ret; br; br = regnext(br)) { const U8 op = PL_regkind[OP(br)]; if (op == BRANCH) { REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); - if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender) + if ( OP(NEXTOPER(br)) != NOTHING + || regnext(NEXTOPER(br)) != ender) is_nothing= 0; } else if (op == BRANCHJ) { REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); /* for now we always disable this optimisation * / - if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender) + if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING + || regnext(NEXTOPER(NEXTOPER(br))) != ender) */ is_nothing= 0; } @@ -9450,8 +10292,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SV * const mysv_val1=sv_newmortal(); SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("NADA"); - regprop(RExC_rx, mysv_val1, ret); - regprop(RExC_rx, mysv_val2, ender); + regprop(RExC_rx, mysv_val1, ret, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(mysv_val1), (IV)REG_NODE_NUM(ret), @@ -9645,7 +10487,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) op = *RExC_parse; - if (op == '{' && regcurly(RExC_parse, FALSE)) { + if (op == '{' && regcurly(RExC_parse)) { maxpos = NULL; #ifdef RE_TRACK_PATTERN_OFFSETS parse_start = RExC_parse; /* MJD */ @@ -9693,23 +10535,18 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, OPFAIL); return ret; } - else if (max == 0) { /* replace {0} with a nothing node */ + else if (min == max + && RExC_parse < RExC_end + && (*RExC_parse == '?' || *RExC_parse == '+')) + { if (SIZE_ONLY) { - RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING]; - } - else { - RExC_emit = orig_emit; - } - ret = reg_node(pRExC_state, NOTHING); - - /* But the quantifier includes any '?', the non-greedy - * modifier, after the {}, [perl #118375] - * Likewise the '+', the possessive modifier. They are mutually exclusive. - */ - if (RExC_parse < RExC_end && (*RExC_parse == '?' || *RExC_parse == '+') ) { - nextchar(pRExC_state); + ckWARN2reg(RExC_parse + 1, + "Useless use of greediness modifier '%c'", + *RExC_parse); } - return ret; + /* Absorb the modifier, so later code doesn't see nor use + * it */ + nextchar(pRExC_state); } do_curly: @@ -9752,6 +10589,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ARG1_SET(ret, (U16)min); ARG2_SET(ret, (U16)max); } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; goto nest_check; } @@ -9789,6 +10628,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, STAR, ret, depth+1); ret->flags = 0; RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '*') { min = 0; @@ -9798,6 +10638,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, PLUS, ret, depth+1); ret->flags = 0; RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '+') { min = 1; @@ -9810,10 +10651,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) nest_check: if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ - ckWARN3reg(RExC_parse, - "%.*s matches null string many times", - (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), - origparse); + ckWARN2reg(RExC_parse, + "%"UTF8f" matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); (void)ReREFCNT_inc(RExC_rx_sv); } @@ -9843,8 +10686,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } STATIC bool -S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, - const bool strict /* Apply stricter parsing rules? */ +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, + UV *valuep, I32 *flagp, U32 depth, bool in_char_class, + const bool strict /* Apply stricter parsing rules? */ ) { @@ -9910,27 +10754,30 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ /* The [^\n] meaning of \N ignores spaces and comments under the /x - * modifier. The other meaning does not */ + * modifier. The other meaning does not, so use a temporary until we find + * out which we are being called with */ p = (RExC_flags & RXf_PMf_EXTENDED) - ? regwhite( pRExC_state, RExC_parse ) + ? regpatws(pRExC_state, RExC_parse, + TRUE) /* means recognize comments */ : RExC_parse; /* Disambiguate between \N meaning a named character versus \N meaning * [^\n]. The former is assumed when it can't be the latter. */ - if (*p != '{' || regcurly(p, FALSE)) { + if (*p != '{' || regcurly(p)) { RExC_parse = p; if (! node_p) { - /* no bare \N in a charclass */ + /* no bare \N allowed in a charclass */ if (in_char_class) { vFAIL("\\N in a character class must be a named character: \\N{...}"); } return FALSE; } + RExC_parse--; /* Need to back off so nextchar() doesn't skip the + current char */ nextchar(pRExC_state); *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; - RExC_parse--; Set_Node_Length(*node_p, 1); /* MJD */ return TRUE; } @@ -9949,8 +10796,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ || ! (endbrace == RExC_parse /* nothing between the {} */ - || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */ - && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below + */ + && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) + */ { if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ vFAIL("\\N{NAME} must be resolved by the lexer"); @@ -10136,7 +10985,7 @@ S_reg_recode(pTHX_ const char value, SV **encp) } PERL_STATIC_INLINE U8 -S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) +S_compute_EXACTish(RExC_state_t *pRExC_state) { U8 op; @@ -10156,7 +11005,9 @@ S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) } PERL_STATIC_INLINE void -S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point) +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, + regnode *node, I32* flagp, STRLEN len, UV code_point, + bool downgradable) { /* This knows the details about sizing an EXACTish node, setting flags for * it (by setting <*flagp>, and potentially populating it with a single @@ -10171,48 +11022,111 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 * If is zero, the function assumes that the node is to contain only * the single character given by and calculates what * should be. In pass 1, it sizes the node appropriately. In pass 2, it - * additionally will populate the node's STRING with , if - * is 0. In both cases <*flagp> is appropriately set + * additionally will populate the node's STRING with or its + * fold if folding. + * + * In both cases <*flagp> is appropriately set * * It knows that under FOLD, the Latin Sharp S and UTF characters above * 255, must be folded (the former only when the rules indicate it can - * match 'ss') */ + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ bool len_passed_in = cBOOL(len != 0); U8 character[UTF8_MAXBYTES_CASE+1]; PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + /* Don't bother to check for downgrading in PASS1, as it doesn't make any + * sizing difference, and is extra work that is thrown away */ + if (downgradable && ! PASS2) { + downgradable = FALSE; + } + if (! len_passed_in) { if (UTF) { - if (FOLD && (! LOC || code_point > 255)) { - _to_uni_fold_flags(NATIVE_TO_UNI(code_point), + if (UNI_IS_INVARIANT(code_point)) { + if (LOC || ! FOLD) { /* /l defers folding until runtime */ + *character = (U8) code_point; + } + else { /* Here is /i and not /l (toFOLD() is defined on just + ASCII, which isn't the same thing as INVARIANT on + EBCDIC, but it works there, as the extra invariants + fold to themselves) */ + *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } + } + len = 1; + } + else if (FOLD && (! LOC + || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) + { /* Folding, and ok to do so now */ + UV folded = _to_uni_fold_flags( + code_point, character, &len, - FOLD_FLAGS_FULL | ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) ? FOLD_FLAGS_NOMIX_ASCII : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } + } + else if (code_point <= MAX_UTF8_TWO_BYTE) { + + /* Not folding this cp, and can output it directly */ + *character = UTF8_TWO_BYTE_HI(code_point); + *(character + 1) = UTF8_TWO_BYTE_LO(code_point); + len = 2; } else { uvchr_to_utf8( character, code_point); len = UTF8SKIP(character); } - } - else if (! FOLD - || code_point != LATIN_SMALL_LETTER_SHARP_S - || ASCII_FOLD_RESTRICTED - || ! AT_LEAST_UNI_SEMANTICS) - { + } /* Else pattern isn't UTF8. */ + else if (! FOLD) { *character = (U8) code_point; len = 1; - } - else { + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { *character = 's'; *(character + 1) = 's'; len = 2; } + else { + *character = LATIN_SMALL_LETTER_SHARP_S; + len = 1; + } } if (SIZE_ONLY) { @@ -10236,8 +11150,29 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 { *flagp |= SIMPLE; } + + /* The OP may not be well defined in PASS1 */ + if (PASS2 && OP(node) == EXACTFL) { + RExC_contains_locale = 1; + } +} + + +/* return atoi(p), unless it's too big to sensibly be a backref, + * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ + +static I32 +S_backref_value(char *p) +{ + char *q = p; + + for (;isDIGIT(*q); q++) {} /* calculate length of num */ + if (q - p == 0 || q - p > 9) + return I32_MAX; + return atoi(p); } + /* - regatom - the lowest level @@ -10313,6 +11248,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start = RExC_parse; U8 op; int invert = 0; + U8 arg; GET_RE_DEBUG_FLAGS_DECL; @@ -10395,7 +11331,8 @@ tryagain: *flagp = RESTART_UTF8; return NULL; } - FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", + (UV) flags); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; @@ -10408,12 +11345,6 @@ tryagain: vFAIL("Internal urp"); /* Supposed to be caught earlier. */ break; - case '{': - if (!regcurly(RExC_parse, FALSE)) { - RExC_parse++; - goto defchar; - } - /* FALL THROUGH */ case '?': case '+': case '*': @@ -10434,7 +11365,6 @@ tryagain: literal text handling code. */ switch ((U8)*++RExC_parse) { - U8 arg; /* Special Escapes */ case 'A': RExC_seen_zerolen++; @@ -10443,7 +11373,7 @@ tryagain: goto finish_meta_pat; case 'G': ret = reg_node(pRExC_state, GPOS); - RExC_seen |= REG_SEEN_GPOS; + RExC_seen |= REG_GPOS_SEEN; *flagp |= SIMPLE; goto finish_meta_pat; case 'K': @@ -10454,7 +11384,7 @@ tryagain: * be necessary here to avoid cases of memory corruption, as * with: C<$_="x" x 80; s/x\K/y/> -- rgs */ - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -10468,7 +11398,7 @@ tryagain: goto finish_meta_pat; case 'C': ret = reg_node(pRExC_state, CANY); - RExC_seen |= REG_SEEN_CANY; + RExC_seen |= REG_CANY_SEEN; *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'X': @@ -10485,30 +11415,38 @@ tryagain: case 'b': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = BOUND + get_regex_charset(RExC_flags); if (op > BOUNDA) { /* /aa is same as /a */ op = BOUNDA; } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); } goto finish_meta_pat; case 'B': RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_LOOKBEHIND; + RExC_seen |= REG_LOOKBEHIND_SEEN; op = NBOUND + get_regex_charset(RExC_flags); if (op > NBOUNDA) { /* /aa is same as /a */ op = NBOUNDA; } + else if (op == NBOUNDL) { + RExC_contains_locale = 1; + } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead"); + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); } goto finish_meta_pat; @@ -10552,6 +11490,9 @@ tryagain: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } + else if (op == POSIXL) { + RExC_contains_locale = 1; + } join_posix_op_known: @@ -10565,7 +11506,7 @@ tryagain: } *flagp |= HASWIDTH|SIMPLE; - /* FALL THROUGH */ + /* FALLTHROUGH */ finish_meta_pat: nextchar(pRExC_state); @@ -10626,6 +11567,7 @@ tryagain: char ch= RExC_parse[1]; if (ch != '<' && ch != '\'' && ch != '{') { RExC_parse++; + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.2s... not terminated",parse_start); } else { /* this pretty much dupes the code for (?P=...) in reg(), if @@ -10636,10 +11578,11 @@ tryagain: SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; if (RExC_parse == name_start || *RExC_parse != ch) + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.3s... not terminated",parse_start); if (!SIZE_ONLY) { - num = add_data( pRExC_state, 1, "S" ); + num = add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void(sv_dat); } @@ -10671,10 +11614,11 @@ tryagain: case '5': case '6': case '7': case '8': case '9': { I32 num; - bool isg = *RExC_parse == 'g'; - bool isrel = 0; bool hasbrace = 0; - if (isg) { + + if (*RExC_parse == 'g') { + bool isrel = 0; + RExC_parse++; if (*RExC_parse == '{') { RExC_parse++; @@ -10688,25 +11632,40 @@ tryagain: if (isrel) RExC_parse--; RExC_parse -= 2; goto parse_named_seq; - } } - num = atoi(RExC_parse); - if (isg && num == 0) { - if (*RExC_parse == '0') { + } + + num = S_backref_value(RExC_parse); + if (num == 0) vFAIL("Reference to invalid group 0"); + else if (num == I32_MAX) { + if (isDIGIT(*RExC_parse)) + vFAIL("Reference to nonexistent group"); + else + vFAIL("Unterminated \\g... pattern"); } - else { - vFAIL("Unterminated \\g... pattern"); + + if (isrel) { + num = RExC_npar - num; + if (num < 1) + vFAIL("Reference to nonexistent or unclosed group"); } } - if (isrel) { - num = RExC_npar - num; - if (num < 1) - vFAIL("Reference to nonexistent or unclosed group"); - } - if (!isg && num > 9 && num >= RExC_npar) - /* Probably a character specified in octal, e.g. \35 */ - goto defchar; else { + num = S_backref_value(RExC_parse); + /* bare \NNN might be backref or octal - if it is larger than or equal + * RExC_npar then it is assumed to be and octal escape. + * Note RExC_npar is +1 from the actual number of parens*/ + if (num == I32_MAX || (num > 9 && num >= RExC_npar + && *RExC_parse != '8' && *RExC_parse != '9')) + { + /* Probably a character specified in octal, e.g. \35 */ + goto defchar; + } + } + + /* at this point RExC_parse definitely points to a backref + * number */ + { #ifdef RE_TRACK_PATTERN_OFFSETS char * const parse_start = RExC_parse - 1; /* MJD */ #endif @@ -10746,7 +11705,7 @@ tryagain: case '\0': if (RExC_parse >= RExC_end) FAIL("Trailing \\"); - /* FALL THROUGH */ + /* FALLTHROUGH */ default: /* Do not generate "unrecognized" warnings here, we fall back into the quick-grab loop below */ @@ -10757,10 +11716,11 @@ tryagain: case '#': if (RExC_flags & RXf_PMf_EXTENDED) { - if ( reg_skipcomment( pRExC_state ) ) + RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); + if (RExC_parse < RExC_end) goto tryagain; } - /* FALL THROUGH */ + /* FALLTHROUGH */ default: @@ -10770,25 +11730,35 @@ tryagain: defchar: { STRLEN len = 0; - UV ender; + UV ender = 0; char *p; char *s; #define MAX_NODE_STRING_SIZE 127 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; char *s0; U8 upper_parse = MAX_NODE_STRING_SIZE; - STRLEN foldlen; - U8 node_type; + U8 node_type = compute_EXACTish(pRExC_state); bool next_is_quantifier; char * oldp = NULL; + /* We can convert EXACTF nodes to EXACTFU if they contain only + * characters that match identically regardless of the target + * string's UTF8ness. The reason to do this is that EXACTF is not + * trie-able, EXACTFU is. + * + * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * contain only above-Latin1 characters (hence must be in UTF8), + * which don't participate in folds with Latin1-range characters, + * as the latter's folds aren't known until runtime. (We don't + * need to figure this out until pass 2) */ + bool maybe_exactfu = PASS2 + && (node_type == EXACTF || node_type == EXACTFL); + /* If a folding node contains only code points that don't * participate in folds, it can be changed into an EXACT node, * which allows the optimizer more things to look for */ bool maybe_exact; - ender = 0; - node_type = compute_EXACTish(pRExC_state); ret = reg_node(pRExC_state, node_type); /* In pass1, folded, we use a temporary buffer instead of the @@ -10799,10 +11769,9 @@ tryagain: reparse: - /* We do the EXACTFish to EXACT node only if folding, and not if in - * locale, as whether a character folds or not isn't known until - * runtime */ - maybe_exact = FOLD && ! LOC; + /* We do the EXACTFish to EXACT node only if folding. (And we + * don't need to figure this out until pass 2) */ + maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -10829,7 +11798,8 @@ tryagain: oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite( pRExC_state, p ); + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ switch ((U8)*p) { case '^': case '$': @@ -10867,7 +11837,8 @@ tryagain: case 's': case 'S': /* space class */ case 'v': case 'V': /* VERTWS */ case 'w': case 'W': /* word class */ - case 'X': /* eXtended Unicode "combining character sequence" */ + case 'X': /* eXtended Unicode "combining + character sequence" */ case 'z': case 'Z': /* End of line/string assertion */ --p; goto loopdone; @@ -10915,7 +11886,7 @@ tryagain: p++; break; case 'a': - ender = ASCII_TO_NATIVE('\007'); + ender = '\a'; p++; break; case 'o': @@ -10978,12 +11949,37 @@ tryagain: } case 'c': p++; - ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); + ender = grok_bslash_c(*p++, SIZE_ONLY); break; - case '0': case '1': case '2': case '3':case '4': + case '8': case '9': /* must be a backreference */ + --p; + goto loopdone; + case '1': case '2': case '3':case '4': case '5': case '6': case '7': - if (*p == '0' || - (isDIGIT(p[1]) && atoi(p) >= RExC_npar)) + /* When we parse backslash escapes there is ambiguity + * between backreferences and octal escapes. Any escape + * from \1 - \9 is a backreference, any multi-digit + * escape which does not start with 0 and which when + * evaluated as decimal could refer to an already + * parsed capture buffer is a backslash. Anything else + * is octal. + * + * Note this implies that \118 could be interpreted as + * 118 OR as "\11" . "8" depending on whether there + * were 118 capture buffers defined already in the + * pattern. */ + + /* NOTE, RExC_npar is 1 more than the actual number of + * parens we have seen so far, hence the < RExC_npar below. */ + + if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) + { /* Not to be treated as an octal constant, go + find backref */ + --p; + goto loopdone; + } + /* FALLTHROUGH */ + case '0': { I32 flags = PERL_SCAN_SILENT_ILLDIGIT; STRLEN numlen = 3; @@ -11002,11 +11998,6 @@ tryagain: form_short_octal_warning(p, numlen)); } } - else { /* Not to be treated as an octal constant, go - find backref */ - --p; - goto loopdone; - } if (PL_encoding && ender < 0x100) goto recode_encoding; break; @@ -11022,7 +12013,7 @@ tryagain: case '\0': if (p >= RExC_end) FAIL("Trailing \\"); - /* FALL THROUGH */ + /* FALLTHROUGH */ default: if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { /* Include any { following the alpha to emphasize @@ -11034,17 +12025,18 @@ tryagain: goto normal_default; } /* End of switch on '\' */ break; - default: /* A literal character */ - + case '{': + /* Currently we don't warn when the lbrace is at the start + * of a construct. This catches it in the middle of a + * literal string, or when its the first thing after + * something like "\b" */ if (! SIZE_ONLY - && RExC_flags & RXf_PMf_EXTENDED - && ckWARN_d(WARN_DEPRECATED) - && is_PATWS_non_low(p, UTF)) + && (len || (p > RExC_start && isALPHA_A(*(p -1))))) { - vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), - "Escape literal pattern white space under /x"); + ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through"); } - + /*FALLTHROUGH*/ + default: /* A literal character */ normal_default: if (UTF8_IS_START(*p) && UTF) { STRLEN numlen; @@ -11062,7 +12054,8 @@ tryagain: */ if ( RExC_flags & RXf_PMf_EXTENDED) - p = regwhite( pRExC_state, p ); + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ /* If the next thing is a quantifier, it applies to this * character only, which means that this character has to be in @@ -11077,7 +12070,10 @@ tryagain: goto loopdone; } - if (! FOLD) { + if (! FOLD /* The simple case, just append the literal */ + || (LOC /* Also don't fold for tricky chars under /l */ + && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) + { if (UTF) { const STRLEN unilen = reguni(pRExC_state, ender, s); if (unilen > 0) { @@ -11095,81 +12091,100 @@ tryagain: else { REGC((char)ender, s++); } + + /* Can get here if folding only if is one of the /l + * characters whose fold depends on the locale. The + * occurrence of any of these indicate that we can't + * simplify things */ + if (FOLD) { + maybe_exact = FALSE; + maybe_exactfu = FALSE; + } } - else /* FOLD */ + else /* FOLD */ if (! ( UTF /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU && ender == LATIN_SMALL_LETTER_SHARP_S))) { - *(s++) = (char) ender; - maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender); - } - else { /* UTF */ - - /* Prime the casefolded buffer. Locale rules, which apply - * only to code points < 256, aren't known until execution, - * so for them, just output the original character using - * utf8. If we start to fold non-UTF patterns, be sure to - * update join_exact() */ - if (LOC && ender < 256) { - if (UNI_IS_INVARIANT(ender)) { - *s = (U8) ender; - foldlen = 1; - } else { - *s = UTF8_TWO_BYTE_HI(ender); - *(s + 1) = UTF8_TWO_BYTE_LO(ender); - foldlen = 2; + /* Here, are folding and are not UTF-8 encoded; therefore + * the character must be in the range 0-255, and is not /l + * (Not /l because we already handled these under /l in + * is_PROBLEMATIC_LOCALE_FOLD_cp */ + if (IS_IN_SOME_FOLD_L1(ender)) { + maybe_exact = FALSE; + + /* See if the character's fold differs between /d and + * /u. This includes the multi-char fold SHARP S to + * 'ss' */ + if (maybe_exactfu + && (PL_fold[ender] != PL_fold_latin1[ender] + || ender == LATIN_SMALL_LETTER_SHARP_S + || (len > 0 + && isARG2_lower_or_UPPER_ARG1('s', ender) + && isARG2_lower_or_UPPER_ARG1('s', + *(s-1))))) + { + maybe_exactfu = FALSE; } } + + /* Even when folding, we store just the input character, as + * we have an array that finds its fold quickly */ + *(s++) = (char) ender; + } + else { /* FOLD and UTF */ + /* Unlike the non-fold case, we do actually have to + * calculate the results here in pass 1. This is for two + * reasons, the folded length may be longer than the + * unfolded, and we have to calculate how many EXACTish + * nodes it will take; and we may run out of room in a node + * in the middle of a potential multi-char fold, and have + * to back off accordingly. (Hence we can't use REGC for + * the simple case just below.) */ + + UV folded; + if (isASCII(ender)) { + folded = toFOLD(ender); + *(s)++ = (U8) folded; + } else { - UV folded = _to_uni_fold_flags( + STRLEN foldlen; + + folded = _to_uni_fold_flags( ender, (U8 *) s, &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) - ); + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + s += foldlen; - /* If this node only contains non-folding code points - * so far, see if this new one is also non-folding */ - if (maybe_exact) { - if (folded != ender) { - maybe_exact = FALSE; - } - else { - /* Here the fold is the original; we have - * to check further to see if anything - * folds to it */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", - "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = - _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } - if (_invlist_contains_cp(PL_utf8_foldable, - ender)) - { - maybe_exact = FALSE; - } - } + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len += foldlen - 1; + } + /* If this node only contains non-folding code points so + * far, see if this new one is also non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have to check + * further to see if anything folds to it */ + if (_invlist_contains_cp(PL_utf8_foldable, + ender)) + { + maybe_exact = FALSE; + } } - ender = folded; } - s += foldlen; - - /* The loop increments each time, as all but this - * path (and one other) through it add a single byte to the - * EXACTish node. But this one has changed len to be the - * correct final value, so subtract one to cancel out the - * increment that follows */ - len += foldlen - 1; + ender = folded; } if (next_is_quantifier) { @@ -11218,9 +12233,8 @@ tryagain: if (! UTF) { - /* These two have no multi-char folds to non-UTF characters - */ - if (ASCII_FOLD_RESTRICTED || LOC) { + /* This has no multi-char folds to non-UTF characters */ + if (ASCII_FOLD_RESTRICTED) { goto loopdone; } @@ -11251,11 +12265,7 @@ tryagain: } } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - - /* No Latin1 characters participate in multi-char - * folds under /l */ - if (LOC - || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI( + if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( *s, *(s+1)))) { break; @@ -11293,6 +12303,15 @@ tryagain: * do any better */ if (len == 0) { len = full_len; + + /* If the node ends in an 's' we make sure it stays EXACTF, + * as if it turns into an EXACTFU, it could later get + * joined with another 's' that would then wrongly match + * the sharp s */ + if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) + { + maybe_exactfu = FALSE; + } } else { /* Here, the node does contain some characters that aren't @@ -11351,14 +12370,26 @@ tryagain: if (len == 0) { OP(ret) = NOTHING; } - else{ - - /* If 'maybe_exact' is still set here, means there are no - * code points in the node that participate in folds */ - if (FOLD && maybe_exact) { - OP(ret) = EXACT; + else { + if (FOLD) { + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds; + * similarly for 'maybe_exactfu' and code points that match + * differently depending on UTF8ness of the target string + * (for /u), or depending on locale for /l */ + if (maybe_exact) { + OP(ret) = EXACT; + } + else if (maybe_exactfu) { + OP(ret) = EXACTFU; + } } - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); } RExC_parse = p - 1; @@ -11379,39 +12410,11 @@ tryagain: } STATIC char * -S_regwhite( RExC_state_t *pRExC_state, char *p ) -{ - const char *e = RExC_end; - - PERL_ARGS_ASSERT_REGWHITE; - - while (p < e) { - if (isSPACE(*p)) - ++p; - else if (*p == '#') { - bool ended = 0; - do { - if (*p++ == '\n') { - ended = 1; - break; - } - } while (p < e); - if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; - } - else - break; - } - return p; -} - -STATIC char * -S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) +S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) { /* Returns the next non-pattern-white space, non-comment character (the * latter only if 'recognize_comment is true) in the string p, which is - * ended by RExC_end. If there is no line break ending a comment, - * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */ + * ended by RExC_end. See also reg_skipcomment */ const char *e = RExC_end; PERL_ARGS_ASSERT_REGPATWS; @@ -11422,16 +12425,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) p += len; } else if (recognize_comment && *p == '#') { - bool ended = 0; - do { - p++; - if (is_LNBREAK_safe(p, e, UTF)) { - ended = 1; - break; - } - } while (p < e); - if (!ended) - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + p = reg_skipcomment(pRExC_state, p); } else break; @@ -11439,6 +12433,72 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) return p; } +STATIC void +S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) +{ + /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It + * sets up the bitmap and any flags, removing those code points from the + * inversion list, setting it to NULL should it become completely empty */ + + PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; + assert(PL_regkind[OP(node)] == ANYOF); + + ANYOF_BITMAP_ZERO(node); + if (*invlist_ptr) { + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through *invlist_ptr */ + invlist_iterinit(*invlist_ptr); + while (invlist_iternext(*invlist_ptr, &start, &end)) { + UV high; + int i; + + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; + } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } + + /* Quit if are above what we should change */ + if (start > 255) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < 255) ? end : 255; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(node, i)) { + ANYOF_BITMAP_SET(node, i); + } + } + } + invlist_iterfinish(*invlist_ptr); + + /* Done with loop; remove any code points that are in the bitmap from + * *invlist_ptr; similarly for code points above latin1 if we have a + * flag to match all of them anyways */ + if (change_invlist) { + _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + } + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + } + + /* If have completely emptied it, remove it completely */ + if (_invlist_len(*invlist_ptr) == 0) { + SvREFCNT_dec_NN(*invlist_ptr); + *invlist_ptr = NULL; + } + } +} + /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. Character classes ([:foo:]) can also be negated ([:^foo:]). Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. @@ -11557,8 +12617,9 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) } if (namedclass == OOB_NAMEDCLASS) - Simple_vFAIL3("POSIX class [:%.*s:] unknown", - t - s - 1, s + 1); + vFAIL2utf8f( + "POSIX class [:%"UTF8f":] unknown", + UTF8fARG(UTF, t - s - 1, s + 1)); /* The #defines are structured so each complement is +1 to * the normal one */ @@ -11593,7 +12654,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) } STATIC bool -S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) +S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state) { /* This applies some heuristics at the current parse position (which should * be at a '[') to see if what follows might be intended to be a [:posix:] @@ -11646,8 +12707,9 @@ S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) } STATIC regnode * -S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth, - char * const oregcomp_parse) +S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, + I32 *flagp, U32 depth, + char * const oregcomp_parse) { /* Handle the (?[...]) construct to do set operations */ @@ -11683,16 +12745,19 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__REGEX_SETS), "The regex_sets feature is experimental" REPORT_LOCATION, - (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse); + UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), + UTF8fARG(UTF, + RExC_end - RExC_start - (RExC_parse - RExC_precomp), + RExC_precomp + (RExC_parse - RExC_precomp))); while (RExC_parse < RExC_end) { SV* current = NULL; RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + TRUE); /* means recognize comments */ switch (*RExC_parse) { case '?': if (RExC_parse[1] == '[') depth++, RExC_parse++; - /* FALL THROUGH */ + /* FALLTHROUGH */ default: break; case '\\': @@ -11805,7 +12870,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f /* Skip white space */ RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + TRUE /* means recognize comments */ ); if (RExC_parse >= RExC_end) { Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); } @@ -11879,7 +12944,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f RExC_flags = save_flags; goto handle_operand; } - /* FALL THROUGH */ + /* FALLTHROUGH */ default: RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; @@ -11975,7 +13040,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f top_index -= 2; SvREFCNT_dec_NN(lparen); - /* FALL THROUGH */ + /* FALLTHROUGH */ } handle_operand: @@ -12134,10 +13199,79 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *f } #undef IS_OPERAND +STATIC void +S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) +{ + /* This hard-codes the Latin1/above-Latin1 folding rules, so that an + * innocent-looking character class, like /[ks]/i won't have to go out to + * disk to find the possible matches. + * + * This should be called only for a Latin1-range code points, cp, which is + * known to be involved in a fold with other code points above Latin1. It + * would give false results if /aa has been specified. Multi-char folds + * are outside the scope of this, and must be handled specially. + * + * XXX It would be better to generate these via regen, in case a new + * version of the Unicode standard adds new mappings, though that is not + * really likely, and may be caught by the default: case of the switch + * below. */ + + PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; + + switch (cp) { + case 'k': + case 'K': + *invlist = + add_cp_to_invlist(*invlist, KELVIN_SIGN); + break; + case 's': + case 'S': + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); + break; + case MICRO_SIGN: + *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); + *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist = add_cp_to_invlist(*invlist, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character folds from code points + * that require UTF8 to express, so they can't match unless the + * target string is in UTF-8, so no action here is necessary, as + * regexec.c properly handles the general case for UTF-8 matching + * and multi-char folds */ + break; + default: + /* Use deprecated warning to increase the chances of this being + * output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); + break; + } +} + /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ -#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len) +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ + (SvCUR(listsv) != initial_listsv_len) STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, @@ -12187,8 +13321,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ SV* properties = NULL; /* Code points that match \p{} \P{} */ - SV* posixes = NULL; /* Code points that match classes like, [:word:], - extended beyond the Latin1 range */ + SV* posixes = NULL; /* Code points that match classes like [:word:], + extended beyond the Latin1 range. These have to + be kept separate from other code points for much + of this function because their handling is + different under /i, and for most classes under + /d as well */ + SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept + separate for a while from the non-complemented + versions because of complications with /d + matching */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ AV * multi_char_matches = NULL; /* Code points that fold to more than one @@ -12215,11 +13357,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * string is in UTF-8. (Because is under /d) */ SV* depends_list = NULL; - /* inversion list of code points this node matches. For much of the - * function, it includes only those that match regardless of the utf8ness - * of the target string */ + /* Inversion list of code points this node matches regardless of things + * like locale, folding, utf8ness of the target string */ SV* cp_list = NULL; + /* Like cp_list, but code points on this list need to be checked for things + * that fold to/from them under /i */ + SV* cp_foldable_list = NULL; + + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + #ifdef EBCDIC /* In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ @@ -12227,14 +13376,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, #endif bool invert = FALSE; /* Is this class to be complemented */ - /* Is there any thing like \W or [:^digit:] that matches above the legal - * Unicode range? */ - bool runtime_posix_matches_above_Unicode = FALSE; + bool warn_super = ALWAYS_WARN_SUPER; regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; - const I32 orig_size = RExC_size; + const SSize_t orig_size = RExC_size; + bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -12255,9 +13403,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ANYOF_FLAGS(ret) = 0; RExC_emit += ANYOF_SKIP; - if (LOC) { - ANYOF_FLAGS(ret) |= ANYOF_LOCALE; - } listsv = newSVpvs_flags("# comment\n", SVs_TEMP); initial_listsv_len = SvCUR(listsv); SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ @@ -12265,7 +13410,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ @@ -12275,7 +13420,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_naughty++; if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } } @@ -12313,7 +13458,7 @@ parseit: if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (UCHARAT(RExC_parse) == ']') { @@ -12400,7 +13545,12 @@ parseit: char *e; /* We will handle any undefined properties ourselves */ - U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF; + U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF + /* And we actually would prefer to get + * the straight inversion list of the + * swash, since we will be accessing it + * anyway, to save a little time */ + |_CORE_SWASH_INIT_ACCEPT_INVLIST; if (RExC_parse >= RExC_end) vFAIL2("Empty \\%c{}", (U8)value); @@ -12409,12 +13559,12 @@ parseit: e = strchr(RExC_parse++, '}'); if (!e) vFAIL2("Missing right brace on \\%c{}", c); - while (isSPACE(UCHARAT(RExC_parse))) + while (isSPACE(*RExC_parse)) RExC_parse++; if (e == RExC_parse) vFAIL2("Empty \\%c{}", c); n = e - RExC_parse; - while (isSPACE(UCHARAT(RExC_parse + n - 1))) + while (isSPACE(*(RExC_parse + n - 1))) n--; } else { @@ -12433,7 +13583,7 @@ parseit: * that bit) */ value ^= 'P' ^ 'p'; - while (isSPACE(UCHARAT(RExC_parse))) { + while (isSPACE(*RExC_parse)) { RExC_parse++; n--; } @@ -12443,14 +13593,13 @@ parseit: * will have its name be <__NAME_i>. The design is * discussed in commit * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ - Newx(name, n + sizeof("_i__\n"), char); - - sprintf(name, "%s%.*s%s\n", - (FOLD) ? "__" : "", - (int)n, - RExC_parse, - (FOLD) ? "_i" : "" - ); + name = savepv(Perl_form(aTHX_ + "%s%.*s%s\n", + (FOLD) ? "__" : "", + (int)n, + RExC_parse, + (FOLD) ? "_i" : "" + )); /* Look up the property name, and get its swash and * inversion list, if the property is found */ @@ -12464,6 +13613,9 @@ parseit: &swash_init_flags ); if (! swash || ! (invlist = _get_swash_invlist(swash))) { + HV* curpkg = (IN_PERL_COMPILETIME) + ? PL_curstash + : CopSTASH(PL_curcop); if (swash) { SvREFCNT_dec_NN(swash); swash = NULL; @@ -12475,11 +13627,29 @@ parseit: * otherwise add it to the list for run-time look up */ if (ret_invlist) { RExC_parse = e + 1; - vFAIL3("Property '%.*s' is unknown", (int) n, name); + vFAIL2utf8f( + "Property '%"UTF8f"' is unknown", + UTF8fARG(UTF, n, name)); } - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", + + /* If the property name doesn't already have a package + * name, add the current one to it so that it can be + * referred to outside it. [perl #121777] */ + if (curpkg && ! instr(name, "::")) { + char* pkgname = HvNAME(curpkg); + if (strNE(pkgname, "main")) { + char* full_name = Perl_form(aTHX_ + "%s::%s", + pkgname, + name); + n = strlen(full_name); + Safefree(name); + name = savepvn(full_name, n); + } + } + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", (value == 'p' ? '+' : '!'), - name); + UTF8fARG(UTF, n, name)); has_user_defined_property = TRUE; /* We don't know yet, so have to assume that the @@ -12488,7 +13658,7 @@ parseit: * would cause things in to match * inappropriately, except that any \p{}, including * this one forces Unicode semantics, which means there - * is */ + * is no */ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; } else { @@ -12496,9 +13666,23 @@ parseit: /* Here, did get the swash and its inversion list. If * the swash is from a user-defined property, then this * whole character class should be regarded as such */ - has_user_defined_property = - (swash_init_flags - & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY); + if (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) + { + has_user_defined_property = TRUE; + } + else if + /* We warn on matching an above-Unicode code point + * if the match would return true, except don't + * warn for \p{All}, which has exactly one element + * = 0 */ + (_invlist_contains_cp(invlist, 0x110000) + && (! (_invlist_len(invlist) == 1 + && *invlist_array(invlist) == 0))) + { + warn_super = TRUE; + } + /* Invert if asking for the complement */ if (value == 'P') { @@ -12532,7 +13716,7 @@ parseit: case 'f': value = '\f'; break; case 'b': value = '\b'; break; case 'e': value = ASCII_TO_NATIVE('\033');break; - case 'a': value = ASCII_TO_NATIVE('\007');break; + case 'a': value = '\a'; break; case 'o': RExC_parse--; /* function expects to be pointed at the 'o' */ { @@ -12572,7 +13756,7 @@ parseit: goto recode_encoding; break; case 'c': - value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY); + value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': @@ -12644,31 +13828,8 @@ parseit: /* Here, we have the current token in 'value' */ - /* What matches in a locale is not known until runtime. This includes - * what the Posix classes (like \w, [:space:]) match. Room must be - * reserved (one time per class) to store such classes, either if Perl - * is compiled so that locale nodes always should have this space, or - * if there is such class info to be stored. The space will contain a - * bit for each named class that is to be matched against. This isn't - * needed for \p{} and pseudo-classes, as they are not affected by - * locale, and hence are dealt with separately */ - if (LOC - && ! need_class - && (ANYOF_LOCALE == ANYOF_CLASS - || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX))) - { - need_class = 1; - if (SIZE_ONLY) { - RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP; - } - else { - RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP; - ANYOF_CLASS_ZERO(ret); - } - ANYOF_FLAGS(ret) |= ANYOF_CLASS; - } - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + U8 classnum; /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a * literal, as is the character that began the false range, i.e. @@ -12679,16 +13840,19 @@ parseit: ? RExC_parse - rangebegin : 0; if (strict) { - vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin); + vFAIL2utf8f( + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); } else { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ - ckWARN4reg(RExC_parse, - "False [] range \"%*.*s\"", - w, w, rangebegin); + ckWARN2reg(RExC_parse, + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); (void)ReREFCNT_inc(RExC_rx_sv); cp_list = add_cp_to_invlist(cp_list, '-'); - cp_list = add_cp_to_invlist(cp_list, prevvalue); + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, + prevvalue); } } @@ -12696,16 +13860,94 @@ parseit: element_count += 2; /* So counts for three values */ } - if (! SIZE_ONLY) { - U8 classnum = namedclass_to_classnum(namedclass); - if (namedclass >= ANYOF_MAX) { /* If a special class */ + classnum = namedclass_to_classnum(namedclass); + + if (LOC && namedclass < ANYOF_POSIXL_MAX +#ifndef HAS_ISASCII + && classnum != _CC_ASCII +#endif + ) { + /* What the Posix classes (like \w, [:space:]) match in locale + * isn't knowable under locale until actual match time. Room + * must be reserved (one time per outer bracketed class) to + * store such classes. The space will contain a bit for each + * named class that is to be matched against. This isn't + * needed for \p{} and pseudo-classes, as they are not affected + * by locale, and hence are dealt with separately */ + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_POSIXL_ZERO(ret); + } + + /* Coverity thinks it is possible for this to be negative; both + * jhi and khw think it's not, but be safer */ + assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL) + || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0); + + /* See if it already matches the complement of this POSIX + * class */ + if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) + && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) + ? -1 + : 1))) + { + posixl_matches_all = TRUE; + break; /* No need to continue. Since it matches both + e.g., \w and \W, it matches everything, and the + bracketed class can be optimized into qr/./s */ + } + + /* Add this class to those that should be checked at runtime */ + ANYOF_POSIXL_SET(ret, namedclass); + + /* The above-Latin1 characters are not subject to locale rules. + * Just add them, in the second pass, to the + * unconditionally-matched list */ + if (! SIZE_ONLY) { + SV* scratch_list = NULL; + + /* Get the list of the above-Latin1 code points this + * matches */ + _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, + PL_XPosix_ptrs[classnum], + + /* Odd numbers are complements, like + * NDIGIT, NASCII, ... */ + namedclass % 2 != 0, + &scratch_list); + /* Checking if 'cp_list' is NULL first saves an extra + * clone. Its reference count will be decremented at the + * next union, etc, or if this is the only instance, at the + * end of the routine */ + if (! cp_list) { + cp_list = scratch_list; + } + else { + _invlist_union(cp_list, scratch_list, &cp_list); + SvREFCNT_dec_NN(scratch_list); + } + continue; /* Go get next character */ + } + } + else if (! SIZE_ONLY) { + + /* Here, not in pass1 (in that pass we skip calculating the + * contents of this class), and is /l, or is a POSIX class for + * which /l doesn't matter (or is a Unicode property, which is + * skipped here). */ + if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ - /* Here, should be \h, \H, \v, or \V. Neither /d nor - * /l make a difference in what these match. There - * would be problems if these characters had folds - * other than themselves, as cp_list is subject to - * folding. */ + /* Here, should be \h, \H, \v, or \V. None of /d, /i + * nor /l make a difference in what these match, + * therefore we just add what they match to cp_list. */ if (classnum != _CC_VERTSPACE) { assert( namedclass == ANYOF_HORIZWS || namedclass == ANYOF_NHORIZWS); @@ -12718,245 +13960,24 @@ parseit: _invlist_union_maybe_complement_2nd( cp_list, PL_XPosix_ptrs[classnum], - cBOOL(namedclass % 2), /* Complement if odd + namedclass % 2 != 0, /* Complement if odd (NHORIZWS, NVERTWS) */ &cp_list); } } - else if (classnum == _CC_ASCII) { -#ifdef HAS_ISASCII - if (LOC) { - ANYOF_CLASS_SET(ret, namedclass); - } - else -#endif /* Not isascii(); just use the hard-coded definition for it */ - _invlist_union_maybe_complement_2nd( - posixes, - PL_ASCII, - cBOOL(namedclass % 2), /* Complement if odd - (NASCII) */ - &posixes); + else { /* Garden variety class. If is NASCII, NDIGIT, ... + complement and use nposixes */ + SV** posixes_ptr = namedclass % 2 == 0 + ? &posixes + : &nposixes; + SV** source_ptr = &PL_XPosix_ptrs[classnum]; + _invlist_union_maybe_complement_2nd( + *posixes_ptr, + *source_ptr, + namedclass % 2 != 0, + posixes_ptr); } - else { /* Garden variety class */ - - /* The ascii range inversion list */ - SV* ascii_source = PL_Posix_ptrs[classnum]; - - /* The full Latin1 range inversion list */ - SV* l1_source = PL_L1Posix_ptrs[classnum]; - - /* This code is structured into two major clauses. The - * first is for classes whose complete definitions may not - * already be known. It not, the Latin1 definition - * (guaranteed to already known) is used plus code is - * generated to load the rest at run-time (only if needed). - * If the complete definition is known, it drops down to - * the second clause, where the complete definition is - * known */ - - if (classnum < _FIRST_NON_SWASH_CC) { - - /* Here, the class has a swash, which may or not - * already be loaded */ - - /* The name of the property to use to match the full - * eXtended Unicode range swash for this character - * class */ - const char *Xname = swash_property_names[classnum]; - - /* If returning the inversion list, we can't defer - * getting this until runtime */ - if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) { - PL_utf8_swash_ptrs[classnum] = - _core_swash_init("utf8", Xname, &PL_sv_undef, - 1, /* binary */ - 0, /* not tr/// */ - NULL, /* No inversion list */ - NULL /* No flags */ - ); - assert(PL_utf8_swash_ptrs[classnum]); - } - if ( ! PL_utf8_swash_ptrs[classnum]) { - if (namedclass % 2 == 0) { /* A non-complemented - class */ - /* If not /a matching, there are code points we - * don't know at compile time. Arrange for the - * unknown matches to be loaded at run-time, if - * needed */ - if (! AT_LEAST_ASCII_RESTRICTED) { - Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n", - Xname); - } - if (LOC) { /* Under locale, set run-time - lookup */ - ANYOF_CLASS_SET(ret, namedclass); - } - else { - /* Add the current class's code points to - * the running total */ - _invlist_union(posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : l1_source, - &posixes); - } - } - else { /* A complemented class */ - if (AT_LEAST_ASCII_RESTRICTED) { - /* Under /a should match everything above - * ASCII, plus the complement of the set's - * ASCII matches */ - _invlist_union_complement_2nd(posixes, - ascii_source, - &posixes); - } - else { - /* Arrange for the unknown matches to be - * loaded at run-time, if needed */ - Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n", - Xname); - runtime_posix_matches_above_Unicode = TRUE; - if (LOC) { - ANYOF_CLASS_SET(ret, namedclass); - } - else { - - /* We want to match everything in - * Latin1, except those things that - * l1_source matches */ - SV* scratch_list = NULL; - _invlist_subtract(PL_Latin1, l1_source, - &scratch_list); - - /* Add the list from this class to the - * running total */ - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, - scratch_list, - &posixes); - SvREFCNT_dec_NN(scratch_list); - } - if (DEPENDS_SEMANTICS) { - ANYOF_FLAGS(ret) - |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - } - } - goto namedclass_done; - } - - /* Here, there is a swash loaded for the class. If no - * inversion list for it yet, get it */ - if (! PL_XPosix_ptrs[classnum]) { - PL_XPosix_ptrs[classnum] - = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]); - } - } - - /* Here there is an inversion list already loaded for the - * entire class */ - - if (namedclass % 2 == 0) { /* A non-complemented class, - like ANYOF_PUNCT */ - if (! LOC) { - /* For non-locale, just add it to any existing list - * */ - _invlist_union(posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : PL_XPosix_ptrs[classnum], - &posixes); - } - else { /* Locale */ - SV* scratch_list = NULL; - - /* For above Latin1 code points, we use the full - * Unicode range */ - _invlist_intersection(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - &scratch_list); - /* And set the output to it, adding instead if - * there already is an output. Checking if - * 'posixes' is NULL first saves an extra clone. - * Its reference count will be decremented at the - * next union, etc, or if this is the only - * instance, at the end of the routine */ - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } - -#ifndef HAS_ISBLANK - if (namedclass != ANYOF_BLANK) { -#endif - /* Set this class in the node for runtime - * matching */ - ANYOF_CLASS_SET(ret, namedclass); -#ifndef HAS_ISBLANK - } - else { - /* No isblank(), use the hard-coded ASCII-range - * blanks, adding them to the running total. */ - - _invlist_union(posixes, ascii_source, &posixes); - } -#endif - } - } - else { /* A complemented class, like ANYOF_NPUNCT */ - if (! LOC) { - _invlist_union_complement_2nd( - posixes, - (AT_LEAST_ASCII_RESTRICTED) - ? ascii_source - : PL_XPosix_ptrs[classnum], - &posixes); - /* Under /d, everything in the upper half of the - * Latin1 range matches this complement */ - if (DEPENDS_SEMANTICS) { - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; - } - } - else { /* Locale */ - SV* scratch_list = NULL; - _invlist_subtract(PL_AboveLatin1, - PL_XPosix_ptrs[classnum], - &scratch_list); - if (! posixes) { - posixes = scratch_list; - } - else { - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } -#ifndef HAS_ISBLANK - if (namedclass != ANYOF_NBLANK) { -#endif - ANYOF_CLASS_SET(ret, namedclass); -#ifndef HAS_ISBLANK - } - else { - /* Get the list of all code points in Latin1 - * that are not ASCII blanks, and add them to - * the running total */ - _invlist_subtract(PL_Latin1, ascii_source, - &scratch_list); - _invlist_union(posixes, scratch_list, &posixes); - SvREFCNT_dec_NN(scratch_list); - } -#endif - } - } - } - namedclass_done: continue; /* Go get next character */ } } /* end of namedclass \blah */ @@ -12970,13 +13991,15 @@ parseit: if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (range) { if (prevvalue > value) /* b-a */ { const int w = RExC_parse - rangebegin; - Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); + vFAIL2utf8f( + "Invalid [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); range = 0; /* not a valid range */ } } @@ -13057,11 +14080,9 @@ parseit: value, foldbuf, &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) + FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED + ? FOLD_FLAGS_NOMIX_ASCII + : 0) ); /* Here, should be the first character of the @@ -13080,7 +14101,7 @@ parseit: AV* this_array; STRLEN cp_count = utf8_length(foldbuf, foldbuf + foldlen); - SV* multi_fold = sv_2mortal(newSVpvn("", 0)); + SV* multi_fold = sv_2mortal(newSVpvs("")); Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); @@ -13129,7 +14150,8 @@ parseit: /* Deal with this element of the class */ if (! SIZE_ONLY) { #ifndef EBCDIC - cp_list = _add_range_to_invlist(cp_list, prevvalue, value); + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); #else SV* this_range = _new_invlist(1); _append_range_to_invlist(this_range, prevvalue, value); @@ -13143,13 +14165,18 @@ parseit: * included. literal_endpoint==2 means both ends of the range used * a literal character, not \x{foo} */ if (literal_endpoint == 2 - && (prevvalue >= 'a' && value <= 'z') - || (prevvalue >= 'A' && value <= 'Z')) + && ((prevvalue >= 'a' && value <= 'z') + || (prevvalue >= 'A' && value <= 'Z'))) { - _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA], + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII], + &this_range); + + /* Since this above only contains ascii, the intersection of it + * with anything will still yield only ascii */ + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], &this_range); } - _invlist_union(cp_list, this_range, &cp_list); + _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); literal_endpoint = 0; #endif } @@ -13179,7 +14206,7 @@ parseit: #endif /* Look at the longest folds first */ - for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) { + for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { if (av_exists(multi_char_matches, cp_count)) { AV** this_array_ptr; @@ -13234,15 +14261,29 @@ parseit: return ret; } - /* If the character class contains only a single element, it may be - * optimizable into another node type which is smaller and runs faster. - * Check if this is the case for this class */ - if (element_count == 1 && ! ret_invlist) { + /* Here, we've gone through the entire class and dealt with multi-char + * folds. We are now in a position that we can do some checks to see if we + * can optimize this ANYOF node into a simpler one, even in Pass 1. + * Currently we only do two checks: + * 1) is in the unlikely event that the user has specified both, eg. \w and + * \W under /l, then the class matches everything. (This optimization + * is done only to make the optimizer code run later work.) + * 2) if the character class contains only a single element (including a + * single range), we see if there is an equivalent node for it. + * Other checks are possible */ + if (! ret_invlist /* Can't optimize if returning the constructed + inversion list */ + && (UNLIKELY(posixl_matches_all) || element_count == 1)) + { U8 op = END; U8 arg = 0; - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or - [:digit:] or \p{foo} */ + if (UNLIKELY(posixl_matches_all)) { + op = SANY; + } + else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like + \w or [:digit:] or \p{foo} + */ /* All named classes are mapped into POSIXish nodes, with its FLAG * argument giving which class it is */ @@ -13298,14 +14339,6 @@ parseit: if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } -#ifndef HAS_ISBLANK - if (op == POSIXL - && (namedclass == ANYOF_BLANK - || namedclass == ANYOF_NBLANK)) - { - op = POSIXA; - } -#endif join_posix: /* The odd numbered ones are the complements of the @@ -13343,6 +14376,26 @@ parseit: op = POSIXA; } } + else if (prevvalue == 'A') { + if (value == 'Z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; + op = POSIXA; + } + } + else if (prevvalue == 'a') { + if (value == 'z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; + op = POSIXA; + } + } } /* Here, we have changed away from its initial value iff we found @@ -13360,13 +14413,16 @@ parseit: /* To get locale nodes to not use the full ANYOF size would * require moving the code above that writes the portions * of it that aren't in other nodes to after this point. - * e.g. ANYOF_CLASS_SET */ + * e.g. ANYOF_POSIXL_SET */ RExC_size = orig_size; } } else { RExC_emit = (regnode *)orig_emit; if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } if (invert) { op += NPOSIXD - POSIXD; } @@ -13382,13 +14438,17 @@ parseit: *flagp |= HASWIDTH|SIMPLE; } else if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } RExC_parse = (char *) cur_parse; SvREFCNT_dec(posixes); + SvREFCNT_dec(nposixes); SvREFCNT_dec(cp_list); + SvREFCNT_dec(cp_foldable_list); return ret; } } @@ -13399,238 +14459,179 @@ parseit: /* If folding, we calculate all characters that could fold to or from the * ones already on the list */ - if (FOLD && cp_list) { - UV start, end; /* End points of code point ranges */ - - SV* fold_intersection = NULL; - - /* If the highest code point is within Latin1, we can use the - * compiled-in Alphas list, and not have to go out to disk. This - * yields two false positives, the masculine and feminine ordinal - * indicators, which are weeded out below using the - * IS_IN_SOME_FOLD_L1() macro */ - if (invlist_highest(cp_list) < 256) { - _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list, - &fold_intersection); - } - else { - - /* Here, there are non-Latin1 code points, so we will have to go - * fetch the list of all the characters that participate in folds - */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } - - /* This is a hash that for a particular fold gives all characters - * that are involved in it */ - if (! PL_utf8_foldclosures) { - - /* If we were unable to find any folds, then we likely won't be - * able to find the closures. So just create an empty list. - * Folding will effectively be restricted to the non-Unicode - * rules hard-coded into Perl. (This case happens legitimately - * during compilation of Perl itself before the Unicode tables - * are generated) */ - if (_invlist_len(PL_utf8_foldable) == 0) { - PL_utf8_foldclosures = newHV(); - } - else { - /* If the folds haven't been read in, call a fold function - * to force that */ - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - - /* This string is just a short named one above \xff */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); - assert(PL_utf8_tofold); /* Verify that worked */ - } - PL_utf8_foldclosures = - _swash_inversion_hash(PL_utf8_tofold); - } + if (cp_foldable_list) { + if (FOLD) { + UV start, end; /* End points of code point ranges */ + + SV* fold_intersection = NULL; + SV** use_list; + + /* Our calculated list will be for Unicode rules. For locale + * matching, we have to keep a separate list that is consulted at + * runtime only when the locale indicates Unicode rules. For + * non-locale, we just use to the general list */ + if (LOC) { + use_list = &only_utf8_locale_list; + } + else { + use_list = &cp_list; } /* Only the characters in this class that participate in folds need * be checked. Get the intersection of this class and all the * possible characters that are foldable. This can quickly narrow * down a large class */ - _invlist_intersection(PL_utf8_foldable, cp_list, + _invlist_intersection(PL_utf8_foldable, cp_foldable_list, &fold_intersection); - } - /* Now look at the foldable characters in this class individually */ - invlist_iterinit(fold_intersection); - while (invlist_iternext(fold_intersection, &start, &end)) { - UV j; + /* The folds for all the Latin1 characters are hard-coded into this + * program, but we have to go out to disk to get the others. */ + if (invlist_highest(cp_foldable_list) >= 256) { - /* Locale folding for Latin1 characters is deferred until runtime */ - if (LOC && start < 256) { - start = 256; + /* This is a hash that for a particular fold gives all + * characters that are involved in it */ + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } } - /* Look at every character in the range */ - for (j = start; j <= end; j++) { - - U8 foldbuf[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - SV** listp; + /* Now look at the foldable characters in this class individually */ + invlist_iterinit(fold_intersection); + while (invlist_iternext(fold_intersection, &start, &end)) { + UV j; - if (j < 256) { + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + SV** listp; - /* We have the latin1 folding rules hard-coded here so that - * an innocent-looking character class, like /[ks]/i won't - * have to go out to disk to find the possible matches. - * XXX It would be better to generate these via regen, in - * case a new version of the Unicode standard adds new - * mappings, though that is not really likely, and may be - * caught by the default: case of the switch below. */ + if (j < 256) { - if (IS_IN_SOME_FOLD_L1(j)) { + if (IS_IN_SOME_FOLD_L1(j)) { - /* ASCII is always matched; non-ASCII is matched only - * under Unicode rules */ - if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) { - cp_list = - add_cp_to_invlist(cp_list, PL_fold_latin1[j]); + /* ASCII is always matched; non-ASCII is matched + * only under Unicode rules (which could happen + * under /l if the locale is a UTF-8 one */ + if (isASCII(j) || ! DEPENDS_SEMANTICS) { + *use_list = add_cp_to_invlist(*use_list, + PL_fold_latin1[j]); + } + else { + depends_list = + add_cp_to_invlist(depends_list, + PL_fold_latin1[j]); + } } - else { - depends_list = - add_cp_to_invlist(depends_list, PL_fold_latin1[j]); + + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + { + add_above_Latin1_folds(pRExC_state, + (U8) j, + use_list); } + continue; } - if (HAS_NONLATIN1_FOLD_CLOSURE(j) - && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + /* Here is an above Latin1 character. We don't have the + * rules hard-coded for it. First, get its fold. This is + * the simple fold, as the multi-character folds have been + * handled earlier and separated out */ + _to_uni_fold_flags(j, foldbuf, &foldlen, + (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0); + + /* Single character fold of above Latin1. Add everything in + * its fold closure to the list that this node should match. + * The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) { - /* Certain Latin1 characters have matches outside - * Latin1. To get here, is one of those - * characters. None of these matches is valid for - * ASCII characters under /aa, which is why the 'if' - * just above excludes those. These matches only - * happen when the target string is utf8. The code - * below adds the single fold closures for to the - * inversion list. */ - switch (j) { - case 'k': - case 'K': - cp_list = - add_cp_to_invlist(cp_list, KELVIN_SIGN); - break; - case 's': - case 'S': - cp_list = add_cp_to_invlist(cp_list, - LATIN_SMALL_LETTER_LONG_S); - break; - case MICRO_SIGN: - cp_list = add_cp_to_invlist(cp_list, - GREEK_CAPITAL_LETTER_MU); - cp_list = add_cp_to_invlist(cp_list, - GREEK_SMALL_LETTER_MU); - break; - case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: - case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - cp_list = - add_cp_to_invlist(cp_list, ANGSTROM_SIGN); - break; - case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - cp_list = add_cp_to_invlist(cp_list, - LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); - break; - case LATIN_SMALL_LETTER_SHARP_S: - cp_list = add_cp_to_invlist(cp_list, - LATIN_CAPITAL_LETTER_SHARP_S); - break; - case 'F': case 'f': - case 'I': case 'i': - case 'L': case 'l': - case 'T': case 't': - case 'A': case 'a': - case 'H': case 'h': - case 'J': case 'j': - case 'N': case 'n': - case 'W': case 'w': - case 'Y': case 'y': - /* These all are targets of multi-character - * folds from code points that require UTF8 to - * express, so they can't match unless the - * target string is in UTF-8, so no action here - * is necessary, as regexec.c properly handles - * the general case for UTF-8 matching and - * multi-char folds */ - break; - default: - /* Use deprecated warning to increase the - * chances of this being output */ - ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); - break; - } - } - continue; - } + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j)))) + { + continue; + } - /* Here is an above Latin1 character. We don't have the rules - * hard-coded for it. First, get its fold. This is the simple - * fold, as the multi-character folds have been handled earlier - * and separated out */ - _to_uni_fold_flags(j, foldbuf, &foldlen, - ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); - - /* Single character fold of above Latin1. Add everything in - * its fold closure to the list that this node should match. - * The fold closures data structure is a hash with the keys - * being the UTF-8 of every character that is folded to, like - * 'k', and the values each an array of all code points that - * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. - * Multi-character folds are not included */ - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) foldbuf, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_len(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c = SvUV(*c_p); - - /* /aa doesn't allow folds between ASCII and non-; /l - * doesn't allow them between above and below 256 */ - if ((ASCII_FOLD_RESTRICTED - && (isASCII(c) != isASCII(j))) - || (LOC && c < 256)) { - continue; - } + /* Folds under /l which cross the 255/256 boundary + * are added to a separate list. (These are valid + * only when the locale is UTF-8.) */ + if (c < 256 && LOC) { + *use_list = add_cp_to_invlist(*use_list, c); + continue; + } - /* Folds involving non-ascii Latin1 characters - * under /d are added to a separate list */ - if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) - { - cp_list = add_cp_to_invlist(cp_list, c); - } - else { - depends_list = add_cp_to_invlist(depends_list, c); + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + /* Similarly folds involving non-ascii Latin1 + * characters under /d are added to their list */ + depends_list = add_cp_to_invlist(depends_list, + c); + } } } } } + SvREFCNT_dec_NN(fold_intersection); } - SvREFCNT_dec_NN(fold_intersection); + + /* Now that we have finished adding all the folds, there is no reason + * to keep the foldable list separate */ + _invlist_union(cp_list, cp_foldable_list, &cp_list); + SvREFCNT_dec_NN(cp_foldable_list); } /* And combine the result (if any) with any inversion list from posix * classes. The lists are kept separate up to now because we don't want to * fold the classes (folding of those is automatically handled by the swash * fetching code) */ - if (posixes) { + if (posixes || nposixes) { + if (posixes && AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, nothing above ASCII matches these */ + _invlist_intersection(posixes, + PL_XPosix_ptrs[_CC_ASCII], + &posixes); + } + if (nposixes) { + if (DEPENDS_SEMANTICS) { + /* Under /d, everything in the upper half of the Latin1 range + * matches these complements */ + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; + } + else if (AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, everything above ASCII matches these + * complements */ + _invlist_union_complement_2nd(nposixes, + PL_XPosix_ptrs[_CC_ASCII], + &nposixes); + } + if (posixes) { + _invlist_union(posixes, nposixes, &posixes); + SvREFCNT_dec_NN(nposixes); + } + else { + posixes = nposixes; + } + } if (! DEPENDS_SEMANTICS) { if (cp_list) { _invlist_union(cp_list, posixes, &cp_list); @@ -13644,10 +14645,8 @@ parseit: /* Under /d, we put into a separate list the Latin1 things that * match only when the target string is utf8 */ SV* nonascii_but_latin1_properties = NULL; - _invlist_intersection(posixes, PL_Latin1, + _invlist_intersection(posixes, PL_UpperLatin1, &nonascii_but_latin1_properties); - _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII, - &nonascii_but_latin1_properties); _invlist_subtract(posixes, nonascii_but_latin1_properties, &posixes); if (cp_list) { @@ -13681,7 +14680,6 @@ parseit: * , because having a Unicode property forces Unicode * semantics */ if (properties) { - bool warn_super = ! has_user_defined_property; if (cp_list) { /* If it matters to the final outcome, see if a non-property @@ -13692,14 +14690,8 @@ parseit: * are using above-Unicode code points indicates they should know * the issues involved */ if (warn_super) { - bool non_prop_matches_above_Unicode = - runtime_posix_matches_above_Unicode - | (invlist_highest(cp_list) > PERL_UNICODE_MAX); - if (invert) { - non_prop_matches_above_Unicode = - ! non_prop_matches_above_Unicode; - } - warn_super = ! non_prop_matches_above_Unicode; + warn_super = ! (invert + ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); } _invlist_union(properties, cp_list, &cp_list); @@ -13710,7 +14702,7 @@ parseit: } if (warn_super) { - OP(ret) = ANYOF_WARN_SUPER; + ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; } } @@ -13723,12 +14715,33 @@ parseit: * shouldn't. Therefore we can't invert folded locale now, as it won't be * folded until runtime */ + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching). We know to set the flag if we have a non-NULL list for UTF-8 + * locales, or the class matches at least one 0-255 range code point */ + if (LOC && FOLD) { + if (only_utf8_locale_list) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + else if (cp_list) { /* Look to see if there a 0-255 code point is in + the list */ + UV start, end; + invlist_iterinit(cp_list); + if (invlist_iternext(cp_list, &start, &end) && start < 256) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + invlist_iterfinish(cp_list); + } + } + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known * at compile time. Besides not inverting folded locale now, we can't * invert if there are things such as \w, which aren't known until runtime * */ - if (invert - && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS))) + if (cp_list + && invert + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) && ! depends_list && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { @@ -13758,15 +14771,6 @@ parseit: return orig_emit; } - /* If we didn't do folding, it's because some information isn't available - * until runtime; set the run-time fold flag for these. (We don't have to - * worry about properties folding, as that is taken care of by the swash - * fetching) */ - if (FOLD && LOC) - { - ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; - } - /* Some character classes are equivalent to other nodes. Such nodes take * up less room and generally fewer operations to execute than ANYOF nodes. * Above, we checked for and optimized into some such equivalents for @@ -13785,8 +14789,13 @@ parseit: if (cp_list && ! invert && ! depends_list - && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS) - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + + /* We don't optimize if we are supposed to make sure all non-Unicode + * code points raise a warning, as only ANYOF nodes have this check. + * */ + && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) { UV start, end; U8 op = END; /* The optimzation node-type */ @@ -13810,7 +14819,7 @@ parseit: && (start < 256 || UTF)) { /* Here, the list contains a single code point. Can optimize - * into an EXACT node */ + * into an EXACTish node */ value = start; @@ -13840,12 +14849,6 @@ parseit: } } else { - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "_Perl_Any_Folds", - &PL_sv_undef, 1, 0); - PL_utf8_foldable = _get_swash_invlist(swash); - SvREFCNT_dec_NN(swash); - } if (_invlist_contains_cp(PL_utf8_foldable, value)) { op = EXACT; } @@ -13885,7 +14888,9 @@ parseit: RExC_parse = (char *)cur_parse; if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); } SvREFCNT_dec_NN(cp_list); @@ -13898,55 +14903,8 @@ parseit: * for things that belong in the bitmap, put them there, and delete from * . While we are at it, see if everything above 255 is in the * list, and if so, set a flag to speed up execution */ - ANYOF_BITMAP_ZERO(ret); - if (cp_list) { - - /* This gets set if we actually need to modify things */ - bool change_invlist = FALSE; - - UV start, end; - - /* Start looking through */ - invlist_iterinit(cp_list); - while (invlist_iternext(cp_list, &start, &end)) { - UV high; - int i; - - if (end == UV_MAX && start <= 256) { - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; - } - - /* Quit if are above what we should change */ - if (start > 255) { - break; - } - change_invlist = TRUE; - - /* Set all the bits in the range, up to the max that we are doing */ - high = (end < 255) ? end : 255; - for (i = start; i <= (int) high; i++) { - if (! ANYOF_BITMAP_TEST(ret, i)) { - ANYOF_BITMAP_SET(ret, i); - prevvalue = value; - value = i; - } - } - } - invlist_iterfinish(cp_list); - - /* Done with loop; remove any code points that are in the bitmap from - * */ - if (change_invlist) { - _invlist_subtract(cp_list, PL_Latin1, &cp_list); - } - - /* If have completely emptied it, remove it completely */ - if (_invlist_len(cp_list) == 0) { - SvREFCNT_dec_NN(cp_list); - cp_list = NULL; - } - } + populate_ANYOF_from_invlist(ret, &cp_list); if (invert) { ANYOF_FLAGS(ret) |= ANYOF_INVERT; @@ -13963,6 +14921,7 @@ parseit: else { cp_list = depends_list; } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; } /* If there is a swash and more than one element, we can't use the swash in @@ -13972,82 +14931,130 @@ parseit: swash = NULL; } - if (! cp_list - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - { - ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); + set_ANYOF_arg(pRExC_state, ret, cp_list, + (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv : NULL, + only_utf8_locale_list, + swash, has_user_defined_property); + + *flagp |= HASWIDTH|SIMPLE; + + if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + + return ret; +} + +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + +STATIC void +S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, + regnode* const node, + SV* const cp_list, + SV* const runtime_defns, + SV* const only_utf8_locale_list, + SV* const swash, + const bool has_user_defined_property) +{ + /* Sets the arg field of an ANYOF-type node 'node', using information about + * the node passed-in. If there is nothing outside the node's bitmap, the + * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to + * the count returned by add_data(), having allocated and stored an array, + * av, that that count references, as follows: + * av[0] stores the character class description in its textual form. + * This is used later (regexec.c:Perl_regclass_swash()) to + * initialize the appropriate swash, and is also useful for dumping + * the regnode. This is set to &PL_sv_undef if the textual + * description is not needed at run-time (as happens if the other + * elements completely define the class) + * av[1] if &PL_sv_undef, is a placeholder to later contain the swash + * computed from av[0]. But if no further computation need be done, + * the swash is stored here now (and av[0] is &PL_sv_undef). + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead + * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. + * (Otherwise everything needed is already in av[0] and av[1]) + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ + + UV n; + + PERL_ARGS_ASSERT_SET_ANYOF_ARG; + + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); + ARG_SET(node, ANYOF_NONBITMAP_EMPTY); } else { - /* av[0] stores the character class description in its textual form: - * used later (regexec.c:Perl_regclass_swash()) to initialize the - * appropriate swash, and is also useful for dumping the regnode. - * av[1] if NULL, is a placeholder to later contain the swash computed - * from av[0]. But if no further computation need be done, the - * swash is stored there now. - * av[2] stores the cp_list inversion list for use in addition or - * instead of av[0]; used only if av[1] is NULL - * av[3] is set if any component of the class is from a user-defined - * property; used only if av[1] is NULL */ AV * const av = newAV(); SV *rv; - av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) - ? SvREFCNT_inc(listsv) : &PL_sv_undef); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + av_store(av, 0, (runtime_defns) + ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); if (swash) { + assert(cp_list); av_store(av, 1, swash); SvREFCNT_dec_NN(cp_list); } else { - av_store(av, 1, NULL); + av_store(av, 1, &PL_sv_undef); if (cp_list) { - av_store(av, 2, cp_list); - av_store(av, 3, newSVuv(has_user_defined_property)); + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); } } + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + rv = newRV_noinc(MUTABLE_SV(av)); - n = add_data(pRExC_state, 1, "s"); + n = add_data(pRExC_state, STR_WITH_LEN("s")); RExC_rxi->data->data[n] = (void*)rv; - ARG_SET(ret, n); + ARG_SET(node, n); } - - *flagp |= HASWIDTH|SIMPLE; - return ret; } -#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* reg_skipcomment() - Absorbs an /x style # comments from the input stream. - Returns true if there is more text remaining in the stream. - Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment - terminates the pattern without including a newline. + Absorbs an /x style # comment from the input stream, + returning a pointer to the first character beyond the comment, or if the + comment terminates the pattern without anything following it, this returns + one past the final character of the pattern (in other words, RExC_end) and + sets the REG_RUN_ON_COMMENT_SEEN flag. - Note its the callers responsibility to ensure that we are + Note it's the callers responsibility to ensure that we are actually in /x mode */ -STATIC bool -S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) +PERL_STATIC_INLINE char* +S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) { - bool ended = 0; - PERL_ARGS_ASSERT_REG_SKIPCOMMENT; - while (RExC_parse < RExC_end) - if (*RExC_parse++ == '\n') { - ended = 1; - break; + assert(*p = '#'); + + while (p < RExC_end) { + if (*(++p) == '\n') { + return p+1; } - if (!ended) { - /* we ran off the end of the pattern without ending - the comment, so we have to add an \n when wrapping */ - RExC_seen |= REG_SEEN_RUN_ON_COMMENT; - return 0; - } else - return 1; + } + + /* we ran off the end of the pattern without ending the comment, so we have + * to add an \n when wrapping */ + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; + return p; } /* nextchar() @@ -14085,14 +15092,12 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) continue; } if (RExC_flags & RXf_PMf_EXTENDED) { - if (isSPACE(*RExC_parse)) { - RExC_parse++; + char * p = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + if (p != RExC_parse) { + RExC_parse = p; continue; } - else if (*RExC_parse == '#') { - if ( reg_skipcomment( pRExC_state ) ) - continue; - } } return retval; } @@ -14118,7 +15123,7 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) } if (RExC_emit >= RExC_emit_bound) Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", - op, RExC_emit, RExC_emit_bound); + op, (void*)RExC_emit, (void*)RExC_emit_bound); NODE_ALIGN_FILL(ret); ptr = ret; @@ -14126,7 +15131,8 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", "reg_node", __LINE__, PL_reg_name[op], (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] @@ -14175,7 +15181,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) } if (RExC_emit >= RExC_emit_bound) Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", - op, RExC_emit, RExC_emit_bound); + op, (void*)RExC_emit, (void*)RExC_emit_bound); NODE_ALIGN_FILL(ret); ptr = ret; @@ -14183,7 +15189,8 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reganode", __LINE__, PL_reg_name[op], @@ -14202,7 +15209,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) /* - reguni - emit (if appropriate) a Unicode character */ -STATIC STRLEN +PERL_STATIC_INLINE STRLEN S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) { dVAR; @@ -14229,6 +15236,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGINSERT; + PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(depth); /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); @@ -14263,7 +15271,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) StructCopy(--src, --dst, regnode); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD 20010112 */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", "reg_insert", __LINE__, PL_reg_name[op], @@ -14282,7 +15291,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) place = opnd; /* Op node, where operand used to be. */ #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reginsert", __LINE__, PL_reg_name[op], @@ -14307,7 +15317,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) */ /* TODO: All three parms should be const */ STATIC void -S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) { dVAR; regnode *scan; @@ -14328,7 +15339,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tail" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), @@ -14366,7 +15377,8 @@ to control which is which. /* TODO: All four parms should be const */ STATIC U8 -S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) +S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) { dVAR; regnode *scan; @@ -14389,8 +15401,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN if (PL_regkind[OP(scan)] == EXACT) { - bool has_exactf_sharp_s; /* Unexamined in this routine */ - if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1)) + bool unfolded_multi_char; /* Unexamined in this routine */ + if (join_exact(pRExC_state, scan, &min, + &unfolded_multi_char, 1, val, depth+1)) return EXACT; } #endif @@ -14398,10 +15411,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, switch (OP(scan)) { case EXACT: case EXACTF: + case EXACTFA_NO_TRIE: case EXACTFA: case EXACTFU: case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); @@ -14416,7 +15429,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, DEBUG_PARSE_r({ SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); - regprop(RExC_rx, mysv, scan); + regprop(RExC_rx, mysv, scan, NULL); PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), @@ -14429,8 +15442,9 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, DEBUG_PARSE_r({ SV * const mysv_val=sv_newmortal(); DEBUG_PARSE_MSG(""); - regprop(RExC_rx, mysv_val, val); - PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n", + regprop(RExC_rx, mysv_val, val, NULL); + PerlIO_printf(Perl_debug_log, + "~ attach to %s (%"IVdf") offset to %"IVdf"\n", SvPV_nolen_const(mysv_val), (IV)REG_NODE_NUM(val), (IV)(val - scan) @@ -14451,6 +15465,30 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ #ifdef DEBUGGING + +static void +S_regdump_intflags(pTHX_ const char *lead, const U32 flags) +{ + int bit; + int set=0; + + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bitcheck_substr == r->float_substr && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored")); - if (r->extflags & RXf_NOSCAN) + if (r->intflags & PREGf_NOSCAN) PerlIO_printf(Perl_debug_log, " noscan"); if (r->extflags & RXf_CHECK_ALL) PerlIO_printf(Perl_debug_log, " isall"); @@ -14558,22 +15598,22 @@ Perl_regdump(pTHX_ const regexp *r) PerlIO_printf(Perl_debug_log, ") "); if (ri->regstclass) { - regprop(r, sv, ri->regstclass); + regprop(r, sv, ri->regstclass, NULL); PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); } - if (r->extflags & RXf_ANCH) { + if (r->intflags & PREGf_ANCH) { PerlIO_printf(Perl_debug_log, "anchored"); - if (r->extflags & RXf_ANCH_BOL) + if (r->intflags & PREGf_ANCH_BOL) PerlIO_printf(Perl_debug_log, "(BOL)"); - if (r->extflags & RXf_ANCH_MBOL) + if (r->intflags & PREGf_ANCH_MBOL) PerlIO_printf(Perl_debug_log, "(MBOL)"); - if (r->extflags & RXf_ANCH_SBOL) + if (r->intflags & PREGf_ANCH_SBOL) PerlIO_printf(Perl_debug_log, "(SBOL)"); - if (r->extflags & RXf_ANCH_GPOS) + if (r->intflags & PREGf_ANCH_GPOS) PerlIO_printf(Perl_debug_log, "(GPOS)"); PerlIO_putc(Perl_debug_log, ' '); } - if (r->extflags & RXf_GPOS_SEEN) + if (r->intflags & PREGf_GPOS_SEEN) PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); if (r->intflags & PREGf_SKIP) PerlIO_printf(Perl_debug_log, "plus "); @@ -14583,7 +15623,10 @@ Perl_regdump(pTHX_ const regexp *r) if (r->extflags & RXf_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); - DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); + DEBUG_FLAGS_r({ + regdump_extflags("r->extflags: ",r->extflags); + regdump_intflags("r->intflags: ",r->intflags); + }); #else PERL_ARGS_ASSERT_REGDUMP; PERL_UNUSED_CONTEXT; @@ -14592,21 +15635,11 @@ Perl_regdump(pTHX_ const regexp *r) } /* -- regprop - printable representation of opcode +- regprop - printable representation of opcode, with run time support */ -#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ -STMT_START { \ - if (do_sep) { \ - Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ - if (flags & ANYOF_INVERT) \ - /*make sure the invert info is in each */ \ - sv_catpvs(sv, "^"); \ - do_sep = 0; \ - } \ -} STMT_END void -Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) { #ifdef DEBUGGING dVAR; @@ -14622,10 +15655,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) || _CC_VERTSPACE != 16 #error Need to adjust order of anyofs[] #endif - "[\\w]", - "[\\W]", - "[\\d]", - "[\\D]", + "\\w", + "\\W", + "\\d", + "\\D", "[:alpha:]", "[:^alpha:]", "[:lower:]", @@ -14642,8 +15675,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^graph:]", "[:cased:]", "[:^cased:]", - "[\\s]", - "[\\S]", + "\\s", + "\\S", "[:blank:]", "[:^blank:]", "[:xdigit:]", @@ -14654,8 +15687,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^cntrl:]", "[:ascii:]", "[:^ascii:]", - "[\\v]", - "[\\V]" + "\\v", + "\\V" }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -14667,7 +15700,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ /* It would be nice to FAIL() here, but this may be called from regexec.c, and it would be hard to supply pRExC_state. */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(o), (int)REGNODE_MAX); sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[OP(o)]; @@ -14698,38 +15732,22 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); DEBUG_TRIE_COMPILE_r( - Perl_sv_catpvf(aTHX_ sv, - "", - (UV)trie->startstate, - (IV)trie->statecount-1, /* -1 because of the unused 0 element */ - (UV)trie->wordcount, - (UV)trie->minlen, - (UV)trie->maxlen, - (UV)TRIE_CHARCOUNT(trie), - (UV)trie->uniquecharcount - ) + Perl_sv_catpvf(aTHX_ sv, + "", + (UV)trie->startstate, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ); ); if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { - int i; - int rangestart = -1; - U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); sv_catpvs(sv, "["); - for (i = 0; i <= 256; i++) { - if (i < 256 && BITMAP_TEST(bitmap,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - rangestart = -1; - } - } + (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) + ? ANYOF_BITMAP(o) + : TRIE_BITMAP(trie)); sv_catpvs(sv, "]"); } @@ -14740,7 +15758,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); - else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { + else if (k == REF || k == OPEN || k == CLOSE + || k == GROUPP || OP(o)==ACCEPT) + { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { if ( k != REF || (OP(o) < NREF)) { @@ -14764,21 +15784,36 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } } } + if ( k == REF && reginfo) { + U32 n = ARG(o); /* which paren pair */ + I32 ln = prog->offs[n].start; + if (prog->lastparen < n || ln == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } } else if (k == GOSUB) - Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */ + /* Paren and offset */ + Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); else if (k == VERB) { if (!o->flags) Perl_sv_catpvf(aTHX_ sv, ":%"SVf, SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) - Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + /* 2: embedded, otherwise 1 */ + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { - int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; - if (flags & ANYOF_LOCALE) + if (flags & ANYOF_LOCALE_FLAGS) sv_catpvs(sv, "{loc}"); if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); @@ -14787,86 +15822,49 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) sv_catpvs(sv, "^"); /* output what the standard cp 0-255 bitmap matches */ - for (i = 0; i <= 256; i++) { - if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i - 1); - } - do_sep = 1; - rangestart = -1; - } - } + do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); - EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - /* output any special charclass tests (used entirely under use locale) */ - if (ANYOF_CLASS_TEST_ANY_SET(o)) - for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) - if (ANYOF_CLASS_TEST(o,i)) { + /* output any special charclass tests (used entirely under use + * locale) * */ + if (ANYOF_POSIXL_TEST_ANY_SET(o)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(o,i)) { sv_catpv(sv, anyofs[i]); do_sep = 1; } - - EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); - - if (flags & ANYOF_NON_UTF8_LATIN1_ALL) { - sv_catpvs(sv, "{non-utf8-latin1-all}"); + } } - /* output information about the unicode matching */ - if (flags & ANYOF_UNICODE_ALL) - sv_catpvs(sv, "{unicode_all}"); - else if (ANYOF_NONBITMAP(o)) - sv_catpvs(sv, "{unicode}"); - if (flags & ANYOF_NONBITMAP_NON_UTF8) - sv_catpvs(sv, "{outside bitmap}"); - - if (ANYOF_NONBITMAP(o)) { - SV *lv; /* Set if there is something outside the bit map */ - SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL); - bool byte_output = FALSE; /* If something in the bitmap has been - output */ - - if (lv && lv != &PL_sv_undef) { - if (sw) { - U8 s[UTF8_MAXBYTES_CASE+1]; - - for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */ - uvchr_to_utf8(s, i); + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) + { + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + if (flags & ANYOF_INVERT) + /*make sure the invert info is in each */ + sv_catpvs(sv, "^"); + } - if (i < 256 - && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate - things already - output as part - of the bitmap */ - && swash_fetch(sw, s, TRUE)) - { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - byte_output = TRUE; - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) { - put_byte(sv, rangestart); - } - else { - put_byte(sv, rangestart); - sv_catpvs(sv, "-"); - put_byte(sv, i-1); - } - rangestart = -1; - } - } - } + if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { + sv_catpvs(sv, "{non-utf8-latin1-all}"); + } - { + /* output information about the unicode matching */ + if (flags & ANYOF_ABOVE_LATIN1_ALL) + sv_catpvs(sv, "{unicode_all}"); + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { + SV *lv; /* Set if there is something outside the bit map. */ + bool byte_output = FALSE; /* If something in the bitmap has + been output */ + SV *only_utf8_locale; + + /* Get the stuff that wasn't in the bitmap */ + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); + if (lv && lv != &PL_sv_undef) { char *s = savesvpv(lv); char * const origs = s; @@ -14876,6 +15874,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (*s == '\n') { const char * const t = ++s; + if (flags & ANYOF_NONBITMAP_NON_UTF8) { + sv_catpvs(sv, "{outside bitmap}"); + } + else { + sv_catpvs(sv, "{utf8}"); + } + if (byte_output) { sv_catpvs(sv, " "); } @@ -14907,8 +15912,29 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) out_dump: Safefree(origs); + SvREFCNT_dec_NN(lv); + } + + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } + } + invlist_iterfinish(only_utf8_locale); } - SvREFCNT_dec_NN(lv); } } @@ -14916,11 +15942,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; - if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { - Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + if (index < C_ARRAY_LENGTH(anyofs)) { + if (*anyofs[index] != '[') { + sv_catpv(sv, "["); + } + sv_catpv(sv, anyofs[index]); + if (*anyofs[index] != '[') { + sv_catpv(sv, "]"); + } } else { - sv_catpv(sv, anyofs[index]); + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); } } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) @@ -14930,9 +15962,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) PERL_UNUSED_ARG(sv); PERL_UNUSED_ARG(o); PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); #endif /* DEBUGGING */ } + + SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ @@ -15174,7 +16209,16 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) PerlMemShared_free(aho->fail); /* do this last!!!! */ PerlMemShared_free(ri->data->data[n]); - PerlMemShared_free(ri->regstclass); + /* we should only ever get called once, so + * assert as much, and also guard the free + * which /might/ happen twice. At the least + * it will make code anlyzers happy and it + * doesn't cost much. - Yves */ + assert(ri->regstclass); + if (ri->regstclass) { + PerlMemShared_free(ri->regstclass); + ri->regstclass = 0; + } } } break; @@ -15201,7 +16245,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) } break; default: - Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]); + Perl_croak(aTHX_ "panic: regfree data code '%c'", + ri->data->what[n]); } } Safefree(ri->data->what); @@ -15304,7 +16349,6 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) so we need to copy it locally. */ RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); ret->mother_re = NULL; - ret->gofs = 0; } #endif /* PERL_IN_XSUB_RE */ @@ -15335,7 +16379,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) len = ProgLen(ri); - Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), + char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); reti->num_code_blocks = ri->num_code_blocks; @@ -15377,9 +16422,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) break; case 'f': /* This is cheating. */ - Newx(d->data[i], 1, struct regnode_charclass_class); - StructCopy(ri->data->data[i], d->data[i], - struct regnode_charclass_class); + Newx(d->data[i], 1, regnode_ssc); + StructCopy(ri->data->data[i], d->data[i], regnode_ssc); reti->regstclass = (regnode*)d->data[i]; break; case 'T': @@ -15388,18 +16432,19 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) * when the corresponding reg_ac_data struct is freed. */ reti->regstclass= ri->regstclass; - /* Fall through */ + /* FALLTHROUGH */ case 't': OP_REFCNT_LOCK; ((reg_trie_data*)ri->data->data[i])->refcount++; OP_REFCNT_UNLOCK; - /* Fall through */ + /* FALLTHROUGH */ case 'l': case 'L': d->data[i] = ri->data->data[i]; break; default: - Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]); + Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", + ri->data->what[i]); } } @@ -15439,7 +16484,8 @@ Perl_regnext(pTHX_ regnode *p) return(NULL); if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ - Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX); + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(p), (int)REGNODE_MAX); } offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); @@ -15451,7 +16497,7 @@ Perl_regnext(pTHX_ regnode *p) #endif STATIC void -S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) +S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) { va_list args; STRLEN l1 = strlen(pat1); @@ -15470,20 +16516,15 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) Copy(pat2, buf + l1, l2 , char); buf[l1 + l2] = '\n'; buf[l1 + l2 + 1] = '\0'; -#ifdef I_STDARG - /* ANSI variant takes additional second argument */ va_start(args, pat2); -#else - va_start(args); -#endif msv = vmess(buf, &args); va_end(args); message = SvPV_const(msv,l1); if (l1 > 512) l1 = 512; Copy(message, buf, l1 , char); - buf[l1-1] = '\0'; /* Overwrite \n */ - Perl_croak(aTHX_ "%s", buf); + /* l1-1 to avoid \n */ + Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); } /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ @@ -15501,7 +16542,8 @@ Perl_save_re_context(pTHX) U32 i; for (i = 1; i <= RX_NPARENS(rx); i++) { char digits[TYPE_CHARS(long)]; - const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); + const STRLEN len = my_snprintf(digits, sizeof(digits), + "%lu", (long)i); GV *const *const gvp = (GV**)hv_fetch(PL_defstash, digits, len, 0); @@ -15523,25 +16565,17 @@ S_put_byte(pTHX_ SV *sv, int c) { PERL_ARGS_ASSERT_PUT_BYTE; - /* Our definition of isPRINT() ignores locales, so only bytes that are - not part of UTF-8 are considered printable. I assume that the same - holds for UTF-EBCDIC. - Also, code point 255 is not printable in either (it's E0 in EBCDIC, - which Wikipedia says: - - EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all - ones (binary 1111 1111, hexadecimal FF). It is similar, but not - identical, to the ASCII delete (DEL) or rubout control character. ... - it is typically mapped to hexadecimal code 9F, in order to provide a - unique character mapping in both directions) - - So the old condition can be simplified to !isPRINT(c) */ if (!isPRINT(c)) { - if (c < 256) { - Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c); - } - else { - Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + switch (c) { + case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; + case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; + case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; + case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; + case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; + + default: + Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + break; } } else { @@ -15552,14 +16586,93 @@ S_put_byte(pTHX_ SV *sv, int c) } } +STATIC void +S_put_range(pTHX_ SV *sv, UV start, UV end) +{ + + /* Appends to 'sv' a displayable version of the range of code points from + * 'start' to 'end' */ + + assert(start <= end); + + PERL_ARGS_ASSERT_PUT_RANGE; + + if (end - start < 3) { /* Individual chars in short ranges */ + for (; start <= end; start++) + put_byte(sv, start); + } + else if ( end > 255 + || ! isALPHANUMERIC(start) + || ! isALPHANUMERIC(end) + || isDIGIT(start) != isDIGIT(end) + || isUPPER(start) != isUPPER(end) + || isLOWER(start) != isLOWER(end) + + /* This final test should get optimized out except on EBCDIC + * platforms, where it causes ranges that cross discontinuities + * like i/j to be shown as hex instead of the misleading, + * e.g. H-K (since that range includes more than H, I, J, K). + * */ + || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start)) + { + Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", + start, + (end < 256) ? end : 255); + } + else { /* Here, the ends of the range are both digits, or both uppercase, + or both lowercase; and there's no discontinuity in the range + (which could happen on EBCDIC platforms) */ + put_byte(sv, start); + sv_catpvs(sv, "-"); + put_byte(sv, end); + } +} + +STATIC bool +S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually + * output anything */ + + int i; + bool has_output_anything = FALSE; + + PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; + + for (i = 0; i < 256; i++) { + if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) { + + /* The character at index i should be output. Find the next + * character that should NOT be output */ + int j; + for (j = i + 1; j <= 256; j++) { + if (! BITMAP_TEST((U8 *) bitmap, j)) { + break; + } + } + + /* Everything between them is a single range that should be output + * */ + put_range(sv, i, j - 1); + has_output_anything = TRUE; + i = j; + } + } + + return has_output_anything; +} #define CLEAR_OPTSTART \ - if (optstart) STMT_START { \ - DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ - optstart=NULL; \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ + " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ } STMT_END -#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); +#define DUMPUNTIL(b,e) \ + CLEAR_OPTSTART; \ + node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); STATIC const regnode * S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, @@ -15585,6 +16698,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, last= plast; while (PL_regkind[op] != END && (!last || node < last)) { + assert(node); /* While that wasn't END last time... */ NODE_ALIGN(node); op = OP(node); @@ -15601,14 +16715,15 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else CLEAR_OPTSTART; - regprop(r, sv, node); + regprop(r, sv, node, NULL); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); if (OP(node) != OPTIMIZED) { if (next == NULL) /* Next ptr. */ PerlIO_printf(Perl_debug_log, " (0)"); - else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) + else if (PL_regkind[(U8)op] == BRANCH + && PL_regkind[OP(next)] != BRANCH ) PerlIO_printf(Perl_debug_log, " (FAIL)"); else PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); @@ -15641,7 +16756,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const reg_trie_data * const trie = (reg_trie_data*)ri->data->data[optrie]; #ifdef DEBUGGING - AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); + AV *const trie_words + = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); #endif const regnode *nextbranch= NULL; I32 word_idx; @@ -15651,18 +16767,22 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, PerlIO_printf(Perl_debug_log, "%*s%s ", (int)(2*(indent+3)), "", - elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, - PL_colors[0], PL_colors[1], - (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELLIPSES | - PERL_PV_PRETTY_LTGT + elem_ptr + ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), + SvCUR(*elem_ptr), 60, + PL_colors[0], PL_colors[1], + (SvUTF8(*elem_ptr) + ? PERL_PV_ESCAPE_UNI + : 0) + | PERL_PV_PRETTY_ELLIPSES + | PERL_PV_PRETTY_LTGT ) - : "???" + : "???" ); if (trie->jump) { U16 dist= trie->jump[word_idx+1]; PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", - (UV)((dist ? this_trie + dist : next) - start)); + (UV)((dist ? this_trie + dist : next) - start)); if (dist) { if (!nextbranch) nextbranch= this_trie + trie->jump[0]; @@ -15692,8 +16812,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else if (PL_regkind[(U8)op] == ANYOF) { /* arglen 1 + class block */ - node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS) - ? ANYOF_CLASS_SKIP : ANYOF_SKIP); + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); node = NEXTOPER(node); } else if (PL_regkind[(U8)op] == EXACT) { diff --git a/src/5019001/regexec.c b/src/5021001/regexec.c similarity index 78% rename from src/5019001/regexec.c rename to src/5021001/regexec.c index f29d0d4..5a3b4a3 100644 --- a/src/5019001/regexec.c +++ b/src/5021001/regexec.c @@ -37,16 +37,6 @@ #include "re_top.h" #endif -/* At least one required character in the target string is expressible only in - * UTF-8. */ -static const char* const non_utf8_target_but_utf8_required - = "Can't match, because target string needs to be in UTF-8\n"; - -#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ - goto target; \ -} STMT_END - /* * pregcomp and pregexec -- regsub and regerror are not used in perl * @@ -94,16 +84,28 @@ static const char* const non_utf8_target_but_utf8_required #include "inline_invlist.c" #include "unicode_constants.h" +#ifdef DEBUGGING +/* At least one required character in the target string is expressible only in + * UTF-8. */ +static const char* const non_utf8_target_but_utf8_required + = "Can't match, because target string needs to be in UTF-8\n"; +#endif + +#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ + goto target; \ +} STMT_END + #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #ifndef STATIC #define STATIC static #endif -/* Valid for non-utf8 strings: avoids the reginclass +/* Valid only for non-utf8 strings: avoids the reginclass * call if there are no complications: i.e., if everything matchable is * straight forward in the bitmap */ -#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \ : ANYOF_BITMAP_TEST(p,*(c))) /* @@ -118,6 +120,7 @@ static const char* const non_utf8_target_but_utf8_required ? reghop3((U8*)pos, off, \ (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ : (U8*)(pos + off)) + #define HOPBACKc(pos, off) \ (char*)(reginfo->is_utf8_target \ ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \ @@ -128,6 +131,24 @@ static const char* const non_utf8_target_but_utf8_required #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) +/* lim must be +ve. Returns NULL on overshoot */ +#define HOPMAYBE3(pos,off,lim) \ + (reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ + : ((U8*)pos + off <= lim) \ + ? (U8*)pos + off \ + : NULL) + +/* like HOP3, but limits the result to <= lim even for the non-utf8 case. + * off must be >=0; args should be vars rather than expressions */ +#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ + ? reghop3((U8*)(pos), off, (U8*)(lim)) \ + : (U8*)((pos + off) > lim ? lim : (pos + off))) + +#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ + ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ + : (U8*)(pos + off)) +#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ #define NEXTCHR_IS_EOS (nextchr < 0) @@ -140,11 +161,11 @@ static const char* const non_utf8_target_but_utf8_required SET_nextchr -#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \ +#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \ if (!swash_ptr) { \ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \ swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \ - 1, 0, NULL, &flags); \ + 1, 0, invlist, &flags); \ assert(swash_ptr); \ } \ } STMT_END @@ -153,28 +174,33 @@ static const char* const non_utf8_target_but_utf8_required #ifdef DEBUGGING # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ property_name, \ + invlist, \ utf8_char_in_property) \ - LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \ assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); #else # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ property_name, \ + invlist, \ utf8_char_in_property) \ - LOAD_UTF8_CHARCLASS(swash_ptr, property_name) + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) #endif #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ PL_utf8_swash_ptrs[_CC_WORDCHAR], \ - swash_property_names[_CC_WORDCHAR], \ - GREEK_SMALL_LETTER_IOTA_UTF8) + "", \ + PL_XPosix_ptrs[_CC_WORDCHAR], \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ STMT_START { \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ "_X_regular_begin", \ - GREEK_SMALL_LETTER_IOTA_UTF8); \ + NULL, \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ "_X_extend", \ + NULL, \ COMBINING_GRAVE_ACCENT_UTF8); \ } STMT_END @@ -191,14 +217,14 @@ static const char* const non_utf8_target_but_utf8_required * although it may be done at run time beause of the REF possibility - more * investigation required. -- demerphq */ -#define JUMPABLE(rn) ( \ - OP(rn) == OPEN || \ +#define JUMPABLE(rn) ( \ + OP(rn) == OPEN || \ (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ - OP(rn) == EVAL || \ - OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ - OP(rn) == PLUS || OP(rn) == MINMOD || \ - OP(rn) == KEEPS || \ - (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ + OP(rn) == EVAL || \ + OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ + OP(rn) == PLUS || OP(rn) == MINMOD || \ + OP(rn) == KEEPS || \ + (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ ) #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) @@ -208,13 +234,13 @@ static const char* const non_utf8_target_but_utf8_required /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -224,7 +250,7 @@ static const char* const non_utf8_target_but_utf8_required Search for mandatory following text node; for lookahead, the text must follow but for lookbehind (rn->flags != 0) we skip to the next step. */ -#define FIND_NEXT_IMPT(rn) STMT_START { \ +#define FIND_NEXT_IMPT(rn) STMT_START { \ while (JUMPABLE(rn)) { \ const OPCODE type = OP(rn); \ if (type == SUSPEND || PL_regkind[type] == CURLY) \ @@ -275,8 +301,9 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) PERL_ARGS_ASSERT_REGCPPUSH; if (paren_elems_to_push < 0) - Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0", - paren_elems_to_push); + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u", + (int)paren_elems_to_push, (int)maxopenparen, + (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS); if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf @@ -297,8 +324,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) ); for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ - SSPUSHINT(rex->offs[p].end); - SSPUSHINT(rex->offs[p].start); + SSPUSHIV(rex->offs[p].end); + SSPUSHIV(rex->offs[p].start); SSPUSHINT(rex->offs[p].start_tmp); DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", @@ -370,10 +397,10 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) ); paren = *maxopenparen_p; for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { - I32 tmps; + SSize_t tmps; rex->offs[paren].start_tmp = SSPOPINT; - rex->offs[paren].start = SSPOPINT; - tmps = SSPOPINT; + rex->offs[paren].start = SSPOPIV; + tmps = SSPOPIV; if (paren <= rex->lastparen) rex->offs[paren].end = tmps; DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, @@ -485,7 +512,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) } else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { return isFOO_lc(classnum, - TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1))); + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); } if (classnum < _FIRST_NON_SWASH_CC) { @@ -493,8 +520,11 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) /* Initialize the swash unless done already */ if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", - swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags); + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) @@ -527,7 +557,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) */ I32 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, - char *strbeg, I32 minend, SV *screamer, U32 nosave) + char *strbeg, SSize_t minend, SV *screamer, U32 nosave) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -544,68 +574,70 @@ Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, } #endif -/* - * Need to implement the following flags for reg_anch: - * - * USE_INTUIT_NOML - Useful to call re_intuit_start() first - * USE_INTUIT_ML - * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer - * INTUIT_AUTORITATIVE_ML - * INTUIT_ONCE_NOML - Intuit can match in one location only. - * INTUIT_ONCE_ML - * - * Another flag for this function: SECOND_TIME (so that float substrs - * with giant delta may be not rechecked). - */ - -/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ -/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend. - Otherwise, only SvCUR(sv) is used to get strbeg. */ - -/* XXXX We assume that strpos is strbeg unless sv. */ - -/* XXXX Some places assume that there is a fixed substring. - An update may be needed if optimizer marks as "INTUITable" - RExen without fixed substrings. Similarly, it is assumed that - lengths of all the strings are no more than minlen, thus they - cannot come from lookahead. - (Or minlen should take into account lookahead.) - NOTE: Some of this comment is not correct. minlen does now take account - of lookahead/behind. Further research is required. -- demerphq - -*/ -/* A failure to find a constant substring means that there is no need to make - an expensive call to REx engine, thus we celebrate a failure. Similarly, - finding a substring too deep into the string means that fewer calls to - regtry() should be needed. - - REx compiler's optimizer found 4 possible hints: - a) Anchored substring; - b) Fixed substring; - c) Whether we are anchored (beginning-of-line or \G); - d) First node (of those at offset 0) which may distinguish positions; - We use a)b)d) and multiline-part of c), and try to find a position in the - string which does not contradict any of them. - */ - -/* Most of decisions we do here should have been done at compile time. - The nodes of the REx which we used for the search should have been - deleted from the finite automaton. */ - -/* args: - * rx: the regex to match against - * sv: the SV being matched: only used for utf8 flag; the string - * itself is accessed via the pointers below. Note that on - * something like an overloaded SV, SvPOK(sv) may be false - * and the string pointers may point to something unrelated to - * the SV itself. - * strbeg: real beginning of string - * strpos: the point in the string at which to begin matching - * strend: pointer to the byte following the last char of the string - * flags currently unused; set to 0 - * data: currently unused; set to NULL +/* re_intuit_start(): + * + * Based on some optimiser hints, try to find the earliest position in the + * string where the regex could match. + * + * rx: the regex to match against + * sv: the SV being matched: only used for utf8 flag; the string + * itself is accessed via the pointers below. Note that on + * something like an overloaded SV, SvPOK(sv) may be false + * and the string pointers may point to something unrelated to + * the SV itself. + * strbeg: real beginning of string + * strpos: the point in the string at which to begin matching + * strend: pointer to the byte following the last char of the string + * flags currently unused; set to 0 + * data: currently unused; set to NULL + * + * The basic idea of re_intuit_start() is to use some known information + * about the pattern, namely: + * + * a) the longest known anchored substring (i.e. one that's at a + * constant offset from the beginning of the pattern; but not + * necessarily at a fixed offset from the beginning of the + * string); + * b) the longest floating substring (i.e. one that's not at a constant + * offset from the beginning of the pattern); + * c) Whether the pattern is anchored to the string; either + * an absolute anchor: /^../, or anchored to \n: /^.../m, + * or anchored to pos(): /\G/; + * d) A start class: a real or synthetic character class which + * represents which characters are legal at the start of the pattern; + * + * to either quickly reject the match, or to find the earliest position + * within the string at which the pattern might match, thus avoiding + * running the full NFA engine at those earlier locations, only to + * eventually fail and retry further along. + * + * Returns NULL if the pattern can't match, or returns the address within + * the string which is the earliest place the match could occur. + * + * The longest of the anchored and floating substrings is called 'check' + * and is checked first. The other is called 'other' and is checked + * second. The 'other' substring may not be present. For example, + * + * /(abc|xyz)ABC\d{0,3}DEFG/ + * + * will have + * + * check substr (float) = "DEFG", offset 6..9 chars + * other substr (anchored) = "ABC", offset 3..3 chars + * stclass = [ax] + * + * Be aware that during the course of this function, sometimes 'anchored' + * refers to a substring being anchored relative to the start of the + * pattern, and sometimes to the pattern itself being anchored relative to + * the string. For example: + * + * /\dabc/: "abc" is anchored to the pattern; + * /^\dabc/: "abc" is anchored to the pattern and the string; + * /\d+abc/: "abc" is anchored to neither the pattern nor the string; + * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, + * but the pattern is anchored to the string. */ char * @@ -620,34 +652,60 @@ Perl_re_intuit_start(pTHX_ { dVAR; struct regexp *const prog = ReANY(rx); - I32 start_shift = 0; + SSize_t start_shift = prog->check_offset_min; /* Should be nonnegative! */ - I32 end_shift = 0; - char *s; + SSize_t end_shift = 0; + /* current lowest pos in string where the regex can start matching */ + char *rx_origin = strpos; SV *check; - char *t; const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ - I32 ml_anch; - char *other_last = NULL; /* other substr checked before this */ + U8 other_ix = 1 - prog->substrs->check_ix; + bool ml_anch = 0; + char *other_last = strpos;/* latest pos 'other' substr already checked to */ char *check_at = NULL; /* check substr found at this pos */ - char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/ const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; RXi_GET_DECL(prog,progi); regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ regmatch_info *const reginfo = ®info_buf; -#ifdef DEBUGGING - const char * const i_strpos = strpos; -#endif GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_INTUIT_START; PERL_UNUSED_ARG(flags); PERL_UNUSED_ARG(data); - /* CHR_DIST() would be more correct here but it makes things slow. */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: trying to determine minimum start position...\n")); + + /* for now, assume that all substr offsets are positive. If at some point + * in the future someone wants to do clever things with look-behind and + * -ve offsets, they'll need to fix up any code in this function + * which uses these offsets. See the thread beginning + * <20140113145929.GF27210@iabyn.com> + */ + assert(prog->substrs->data[0].min_offset >= 0); + assert(prog->substrs->data[0].max_offset >= 0); + assert(prog->substrs->data[1].min_offset >= 0); + assert(prog->substrs->data[1].max_offset >= 0); + assert(prog->substrs->data[2].min_offset >= 0); + assert(prog->substrs->data[2].max_offset >= 0); + + /* for now, assume that if both present, that the floating substring + * doesn't start before the anchored substring. + * If you break this assumption (e.g. doing better optimisations + * with lookahead/behind), then you'll need to audit the code in this + * function carefully first + */ + assert( + ! ( (prog->anchored_utf8 || prog->anchored_substr) + && (prog->float_utf8 || prog->float_substr)) + || (prog->float_min_offset >= prog->anchored_offset)); + + /* byte rather than char calculation for efficiency. It fails + * to quickly reject some cases that can't match, but will reject + * them later after doing full char arithmetic */ if (prog->minlen > strend - strpos) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "String too short... [re_intuit_start]\n")); + " String too short...\n")); goto fail; } @@ -672,551 +730,702 @@ Perl_re_intuit_start(pTHX_ } check = prog->check_substr; } - if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */ - ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE) - || ( (prog->extflags & RXf_ANCH_BOL) - && !multiline ) ); /* Check after \n? */ - - if (!ml_anch) { - if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */ - && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ - && (strpos != strbeg)) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); - goto fail; + + /* dump the various substring data */ + DEBUG_OPTIMISE_MORE_r({ + int i; + for (i=0; i<=2; i++) { + SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr + : prog->substrs->data[i].substr); + if (!sv) + continue; + + PerlIO_printf(Perl_debug_log, + " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf + " useful=%"IVdf" utf8=%d [%s]\n", + i, + (IV)prog->substrs->data[i].min_offset, + (IV)prog->substrs->data[i].max_offset, + (IV)prog->substrs->data[i].end_shift, + BmUSEFUL(sv), + utf8_target ? 1 : 0, + SvPEEK(sv)); } - if (prog->check_offset_min == prog->check_offset_max - && !(prog->extflags & RXf_CANY_SEEN) - && ! multiline) /* /m can cause \n's to match that aren't - accounted for in the string max length. - See [perl #115242] */ - { - /* Substring at constant offset from beg-of-str... */ - I32 slen; + }); - s = HOP3c(strpos, prog->check_offset_min, strend); + if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ - if (SvTAIL(check)) { - slen = SvCUR(check); /* >= 1 */ + /* ml_anch: check after \n? + * + * A note about IMPLICIT: on an un-anchored pattern beginning + * with /.*.../, these flags will have been added by the + * compiler: + * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL + * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL + */ + ml_anch = (prog->intflags & PREGf_ANCH_MBOL) + && !(prog->intflags & PREGf_IMPLICIT); - if ( strend - s > slen || strend - s < slen - 1 - || (strend - s == slen && strend[-1] != '\n')) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n")); - goto fail_finish; + if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { + /* we are only allowed to match at BOS or \G */ + + /* trivially reject if there's a BOS anchor and we're not at BOS. + * + * Note that we don't try to do a similar quick reject for + * \G, since generally the caller will have calculated strpos + * based on pos() and gofs, so the string is already correctly + * anchored by definition; and handling the exceptions would + * be too fiddly (e.g. REXEC_IGNOREPOS). + */ + if ( strpos != strbeg + && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Not at start...\n")); + goto fail; + } + + /* in the presence of an anchor, the anchored (relative to the + * start of the regex) substr must also be anchored relative + * to strpos. So quickly reject if substr isn't found there. + * This works for \G too, because the caller will already have + * subtracted gofs from pos, and gofs is the offset from the + * \G to the start of the regex. For example, in /.abc\Gdef/, + * where substr="abcdef", pos()=3, gofs=4, offset_min=1: + * caller will have set strpos=pos()-4; we look for the substr + * at position pos()-4+1, which lines up with the "a" */ + + if (prog->check_offset_min == prog->check_offset_max + && !(prog->intflags & PREGf_CANY_SEEN)) + { + /* Substring at constant offset from beg-of-str... */ + SSize_t slen = SvCUR(check); + char *s = HOP3c(strpos, prog->check_offset_min, strend); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Looking for check substr at fixed offset %"IVdf"...\n", + (IV)prog->check_offset_min)); + + if (SvTAIL(check)) { + /* In this case, the regex is anchored at the end too. + * Unless it's a multiline match, the lengths must match + * exactly, give or take a \n. NB: slen >= 1 since + * the last char of check is \n */ + if (!multiline + && ( strend - s > slen + || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n'))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too long...\n")); + goto fail_finish; + } + /* Now should match s[0..slen-2] */ + slen--; } - /* Now should match s[0..slen-2] */ - slen--; if (slen && (*SvPVX_const(check) != *s - || (slen > 1 - && memNE(SvPVX_const(check), s, slen)))) { - report_neq: - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n")); + || (slen > 1 && memNE(SvPVX_const(check), s, slen)))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String not equal...\n")); goto fail_finish; } + + check_at = s; + goto success_at_start; } - else if (*SvPVX_const(check) != *s - || ((slen = SvCUR(check)) > 1 - && memNE(SvPVX_const(check), s, slen))) - goto report_neq; - check_at = s; - goto success_at_start; - } - } - /* Match is anchored, but substr is not anchored wrt beg-of-str. */ - s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->check_end_shift; - - if (!ml_anch) { - const I32 end = prog->check_offset_max + CHR_SVLEN(check) - - (SvTAIL(check) != 0); - const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; - - if (end_shift < eshift) - end_shift = eshift; } } - else { /* Can match at random position */ - ml_anch = 0; - s = strpos; - start_shift = prog->check_offset_min; /* okay to underestimate on CC */ - end_shift = prog->check_end_shift; - /* end shift should be non negative here */ - } + end_shift = prog->check_end_shift; -#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */ +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", (IV)end_shift, RX_PRECOMP(prog)); #endif restart: - /* Find a possible match in the region s..strend by looking for - the "check" substring in the region corrected by start/end_shift. */ + + /* This is the (re)entry point of the main loop in this function. + * The goal of this loop is to: + * 1) find the "check" substring in the region rx_origin..strend + * (adjusted by start_shift / end_shift). If not found, reject + * immediately. + * 2) If it exists, look for the "other" substr too if defined; for + * example, if the check substr maps to the anchored substr, then + * check the floating substr, and vice-versa. If not found, go + * back to (1) with rx_origin suitably incremented. + * 3) If we find an rx_origin position that doesn't contradict + * either of the substrings, then check the possible additional + * constraints on rx_origin of /^.../m or a known start class. + * If these fail, then depending on which constraints fail, jump + * back to here, or to various other re-entry points further along + * that skip some of the first steps. + * 4) If we pass all those tests, update the BmUSEFUL() count on the + * substring. If the start position was determined to be at the + * beginning of the string - so, not rejected, but not optimised, + * since we have to run regmatch from position 0 - decrement the + * BmUSEFUL() count. Otherwise increment it. + */ + + + /* first, look for the 'check' substring */ { - I32 srch_start_shift = start_shift; - I32 srch_end_shift = end_shift; U8* start_point; U8* end_point; - if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { - srch_end_shift -= ((strbeg - s) - srch_start_shift); - srch_start_shift = strbeg - s; - } - DEBUG_OPTIMISE_MORE_r({ - PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n", - (IV)prog->check_offset_min, - (IV)srch_start_shift, - (IV)srch_end_shift, - (IV)prog->check_end_shift); - }); - if (prog->extflags & RXf_CANY_SEEN) { - start_point= (U8*)(s + srch_start_shift); - end_point= (U8*)(strend - srch_end_shift); + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, + " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf + " Start shift: %"IVdf" End shift %"IVdf + " Real end Shift: %"IVdf"\n", + (IV)(rx_origin - strpos), + (IV)prog->check_offset_min, + (IV)start_shift, + (IV)end_shift, + (IV)prog->check_end_shift); + }); + + if (prog->intflags & PREGf_CANY_SEEN) { + start_point= (U8*)(rx_origin + start_shift); + end_point= (U8*)(strend - end_shift); + if (start_point > end_point) + goto fail_finish; } else { - start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend); - end_point= HOP3(strend, -srch_end_shift, strbeg); + end_point = HOP3(strend, -end_shift, strbeg); + start_point = HOPMAYBE3(rx_origin, start_shift, end_point); + if (!start_point) + goto fail_finish; + } + + + /* If the regex is absolutely anchored to either the start of the + * string (BOL,SBOL) or to pos() (ANCH_GPOS), then + * check_offset_max represents an upper bound on the string where + * the substr could start. For the ANCH_GPOS case, we assume that + * the caller of intuit will have already set strpos to + * pos()-gofs, so in this case strpos + offset_max will still be + * an upper bound on the substr. + */ + if (!ml_anch + && prog->intflags & PREGf_ANCH + && prog->check_offset_max != SSize_t_MAX) + { + SSize_t len = SvCUR(check) - !!SvTAIL(check); + const char * const anchor = + (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); + + /* do a bytes rather than chars comparison. It's conservative; + * so it skips doing the HOP if the result can't possibly end + * up earlier than the old value of end_point. + */ + if ((char*)end_point - anchor > prog->check_offset_max) { + end_point = HOP3lim((U8*)anchor, + prog->check_offset_max, + end_point -len) + + len; + } } + DEBUG_OPTIMISE_MORE_r({ - PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n", + PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n", (int)(end_point - start_point), (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), start_point); }); - s = fbm_instr( start_point, end_point, + check_at = fbm_instr( start_point, end_point, check, multiline ? FBMrf_MULTILINE : 0); - } - /* Update the count-of-usability, remove useless subpatterns, - unshift s. */ - - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(check), RE_SV_DUMPLEN(check), 30); - PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s", - (s ? "Found" : "Did not find"), - (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) - ? "anchored" : "floating"), - quoted, - RE_SV_TAIL(check), - (s ? " at offset " : "...\n") ); - }); - if (!s) - goto fail_finish; - /* Finish the diagnostic message */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) ); + /* Update the count-of-usability, remove useless subpatterns, + unshift s. */ - /* XXX dmq: first branch is for positive lookbehind... - Our check string is offset from the beginning of the pattern. - So we need to do any stclass tests offset forward from that - point. I think. :-( - */ + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(check), RE_SV_DUMPLEN(check), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s", + (check_at ? "Found" : "Did not find"), + (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) + ? "anchored" : "floating"), + quoted, + RE_SV_TAIL(check), + (check_at ? " at offset " : "...\n") ); + }); + if (!check_at) + goto fail_finish; + /* Finish the diagnostic message */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) ); + /* set rx_origin to the minimum position where the regex could start + * matching, given the constraint of the just-matched check substring. + * But don't set it lower than previously. + */ - check_at=s; + if (check_at - rx_origin > prog->check_offset_max) + rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); + } - /* Got a candidate. Check MBOL anchoring, and the *other* substr. - Start with the other substr. - XXXX no SCREAM optimization yet - and a very coarse implementation - XXXX /ttx+/ results in anchored="ttx", floating="x". floating will - *always* match. Probably should be marked during compile... - Probably it is right to do no SCREAM here... - */ + /* now look for the 'other' substring if defined */ - if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8) - : (prog->float_substr && prog->anchored_substr)) + if (utf8_target ? prog->substrs->data[other_ix].utf8_substr + : prog->substrs->data[other_ix].substr) { /* Take into account the "other" substring. */ - /* XXXX May be hopelessly wrong for UTF... */ - if (!other_last) - other_last = strpos; - if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) { - do_other_anchored: - { - char * const last = HOP3c(s, -start_shift, strbeg); - char *last1, *last2; - char * const saved_s = s; - SV* must; - - t = s - prog->check_offset_max; - if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!utf8_target - || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos)) - && t > strpos))) - NOOP; - else - t = strpos; - t = HOP3c(t, prog->anchored_offset, strend); - if (t < other_last) /* These positions already checked */ - t = other_last; - last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); - if (last < last1) - last1 = last; - /* XXXX It is not documented what units *_offsets are in. - We assume bytes, but this is clearly wrong. - Meaning this code needs to be carefully reviewed for errors. - dmq. - */ + char *last, *last1; + char *s; + SV* must; + struct reg_substr_datum *other; + + do_other_substr: + other = &prog->substrs->data[other_ix]; + + /* if "other" is anchored: + * we've previously found a floating substr starting at check_at. + * This means that the regex origin must lie somewhere + * between min (rx_origin): HOP3(check_at, -check_offset_max) + * and max: HOP3(check_at, -check_offset_min) + * (except that min will be >= strpos) + * So the fixed substr must lie somewhere between + * HOP3(min, anchored_offset) + * HOP3(max, anchored_offset) + SvCUR(substr) + */ - /* On end-of-str: see comment below. */ - must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr; - if (must == &PL_sv_undef) { - s = (char*)NULL; - DEBUG_r(must = prog->anchored_utf8); /* for debug */ - } - else - s = fbm_instr( - (unsigned char*)t, - HOP3(HOP3(last1, prog->anchored_offset, strend) - + SvCUR(must), -(SvTAIL(must)!=0), strbeg), - must, - multiline ? FBMrf_MULTILINE : 0 - ); - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(must), RE_SV_DUMPLEN(must), 30); - PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s", - (s ? "Found" : "Contradicts"), - quoted, RE_SV_TAIL(must)); - }); + /* if "other" is floating + * Calculate last1, the absolute latest point where the + * floating substr could start in the string, ignoring any + * constraints from the earlier fixed match. It is calculated + * as follows: + * + * strend - prog->minlen (in chars) is the absolute latest + * position within the string where the origin of the regex + * could appear. The latest start point for the floating + * substr is float_min_offset(*) on from the start of the + * regex. last1 simply combines thee two offsets. + * + * (*) You might think the latest start point should be + * float_max_offset from the regex origin, and technically + * you'd be correct. However, consider + * /a\d{2,4}bcd\w/ + * Here, float min, max are 3,5 and minlen is 7. + * This can match either + * /a\d\dbcd\w/ + * /a\d\d\dbcd\w/ + * /a\d\d\d\dbcd\w/ + * In the first case, the regex matches minlen chars; in the + * second, minlen+1, in the third, minlen+2. + * In the first case, the floating offset is 3 (which equals + * float_min), in the second, 4, and in the third, 5 (which + * equals float_max). In all cases, the floating string bcd + * can never start more than 4 chars from the end of the + * string, which equals minlen - float_min. As the substring + * starts to match more than float_min from the start of the + * regex, it makes the regex match more than minlen chars, + * and the two cancel each other out. So we can always use + * float_min - minlen, rather than float_max - minlen for the + * latest position in the string. + * + * Note that -minlen + float_min_offset is equivalent (AFAIKT) + * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift + */ + assert(prog->minlen >= other->min_offset); + last1 = HOP3c(strend, + other->min_offset - prog->minlen, strbeg); - if (!s) { - if (last1 >= last2) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", giving up...\n")); - goto fail_finish; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", trying floating at offset %ld...\n", - (long)(HOP3c(saved_s, 1, strend) - i_strpos))); - other_last = HOP3c(last1, prog->anchored_offset+1, strend); - s = HOP3c(last, 1, strend); - goto restart; - } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", - (long)(s - i_strpos))); - t = HOP3c(s, -prog->anchored_offset, strbeg); - other_last = HOP3c(s, 1, strend); - s = saved_s; - if (t == strpos) - goto try_at_start; - goto try_at_offset; - } - } + if (other_ix) {/* i.e. if (other-is-float) */ + /* last is the latest point where the floating substr could + * start, *given* any constraints from the earlier fixed + * match. This constraint is that the floating string starts + * <= float_max_offset chars from the regex origin (rx_origin). + * If this value is less than last1, use it instead. + */ + assert(rx_origin <= last1); + last = + /* this condition handles the offset==infinity case, and + * is a short-cut otherwise. Although it's comparing a + * byte offset to a char length, it does so in a safe way, + * since 1 char always occupies 1 or more bytes, + * so if a string range is (last1 - rx_origin) bytes, + * it will be less than or equal to (last1 - rx_origin) + * chars; meaning it errs towards doing the accurate HOP3 + * rather than just using last1 as a short-cut */ + (last1 - rx_origin) < other->max_offset + ? last1 + : (char*)HOP3lim(rx_origin, other->max_offset, last1); } - else { /* Take into account the floating substring. */ - char *last, *last1; - char * const saved_s = s; - SV* must; - - t = HOP3c(s, -start_shift, strbeg); - last1 = last = - HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg); - if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset) - last = HOP3c(t, prog->float_max_offset, strend); - s = HOP3c(t, prog->float_min_offset, strend); - if (s < other_last) - s = other_last; - /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ - must = utf8_target ? prog->float_utf8 : prog->float_substr; - /* fbm_instr() takes into account exact value of end-of-str - if the check is SvTAIL(ed). Since false positives are OK, - and end-of-str is not later than strend we are OK. */ - if (must == &PL_sv_undef) { - s = (char*)NULL; - DEBUG_r(must = prog->float_utf8); /* for debug message */ - } - else - s = fbm_instr((unsigned char*)s, - (unsigned char*)last + SvCUR(must) - - (SvTAIL(must)!=0), - must, multiline ? FBMrf_MULTILINE : 0); - DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), - SvPVX_const(must), RE_SV_DUMPLEN(must), 30); - PerlIO_printf(Perl_debug_log, "%s floating substr %s%s", - (s ? "Found" : "Contradicts"), - quoted, RE_SV_TAIL(must)); - }); - if (!s) { - if (last1 == last) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", giving up...\n")); - goto fail_finish; - } + else { + assert(strpos + start_shift <= check_at); + last = HOP4c(check_at, other->min_offset - start_shift, + strbeg, strend); + } + + s = HOP3c(rx_origin, other->min_offset, strend); + if (s < other_last) /* These positions already checked */ + s = other_last; + + must = utf8_target ? other->utf8_substr : other->substr; + assert(SvPOK(must)); + s = fbm_instr( + (unsigned char*)s, + (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0), + must, + multiline ? FBMrf_MULTILINE : 0 + ); + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s", + s ? "Found" : "Contradicts", + other_ix ? "floating" : "anchored", + quoted, RE_SV_TAIL(must)); + }); + + + if (!s) { + /* last1 is latest possible substr location. If we didn't + * find it before there, we never will */ + if (last >= last1) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - ", trying anchored starting at offset %ld...\n", - (long)(saved_s + 1 - i_strpos))); - other_last = last; - s = HOP3c(t, 1, strend); - goto restart; + ", giving up...\n")); + goto fail_finish; + } + + /* try to find the check substr again at a later + * position. Maybe next time we'll find the "other" substr + * in range too */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", trying %s at offset %ld...\n", + (other_ix ? "floating" : "anchored"), + (long)(HOP3c(check_at, 1, strend) - strpos))); + + other_last = HOP3c(last, 1, strend) /* highest failure */; + rx_origin = + other_ix /* i.e. if other-is-float */ + ? HOP3c(rx_origin, 1, strend) + : HOP4c(last, 1 - other->min_offset, strbeg, strend); + goto restart; + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + + if (other_ix) { /* if (other-is-float) */ + /* other_last is set to s, not s+1, since its possible for + * a floating substr to fail first time, then succeed + * second time at the same floating position; e.g.: + * "-AB--AABZ" =~ /\wAB\d*Z/ + * The first time round, anchored and float match at + * "-(AB)--AAB(Z)" then fail on the initial \w character + * class. Second time round, they match at "-AB--A(AB)(Z)". + */ + other_last = s; } else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", - (long)(s - i_strpos))); - other_last = s; /* Fix this later. --Hugo */ - s = saved_s; - if (t == strpos) - goto try_at_start; - goto try_at_offset; + rx_origin = HOP3c(s, -other->min_offset, strbeg); + other_last = HOP3c(s, 1, strend); } } } + else { + DEBUG_OPTIMISE_MORE_r( + PerlIO_printf(Perl_debug_log, + " Check-only match: offset min:%"IVdf" max:%"IVdf + " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf + " strend-strpos:%"IVdf"\n", + (IV)prog->check_offset_min, + (IV)prog->check_offset_max, + (IV)(check_at-strpos), + (IV)(rx_origin-strpos), + (IV)(rx_origin-check_at), + (IV)(strend-strpos) + ) + ); + } + postprocess_substr_matches: - t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos); + /* handle the extra constraint of /^.../m if present */ - DEBUG_OPTIMISE_MORE_r( - PerlIO_printf(Perl_debug_log, - "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n", - (IV)prog->check_offset_min, - (IV)prog->check_offset_max, - (IV)(s-strpos), - (IV)(t-strpos), - (IV)(t-s), - (IV)(strend-strpos) - ) - ); + if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { + char *s; - if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!utf8_target - || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos))) - && t > strpos))) - { - /* Fixed substring is found far enough so that the match - cannot start at strpos. */ - try_at_offset: - if (ml_anch && t[-1] != '\n') { - /* Eventually fbm_*() should handle this, but often - anchored_offset is not 0, so this check will not be wasted. */ - /* XXXX In the code below we prefer to look for "^" even in - presence of anchored substrings. And we search even - beyond the found float position. These pessimizations - are historical artefacts only. */ - find_anchor: - while (t < strend - prog->minlen) { - if (*t == '\n') { - if (t < check_at - prog->check_offset_min) { - if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { - /* Since we moved from the found position, - we definitely contradict the found anchored - substr. Due to the above check we do not - contradict "check" substr. - Thus we can arrive here only if check substr - is float. Redo checking for "other"=="fixed". - */ - strpos = t + 1; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset))); - goto do_other_anchored; - } - /* We don't contradict the found floating substring. */ - /* XXXX Why not check for STCLASS? */ - s = t + 1; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(s - i_strpos))); - goto set_useful; - } - /* Position contradicts check-string */ - /* XXXX probably better to look for check-string - than for "\n", so one should lower the limit for t? */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos))); - other_last = strpos = s = t + 1; - goto restart; - } - t++; - } - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n", - PL_colors[0], PL_colors[1])); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for /^/m anchor")); + + /* we have failed the constraint of a \n before rx_origin. + * Find the next \n, if any, even if it's beyond the current + * anchored and/or floating substrings. Whether we should be + * scanning ahead for the next \n or the next substr is debatable. + * On the one hand you'd expect rare substrings to appear less + * often than \n's. On the other hand, searching for \n means + * we're effectively flipping been check_substr and "\n" on each + * iteration as the current "rarest" string candidate, which + * means for example that we'll quickly reject the whole string if + * hasn't got a \n, rather than trying every substr position + * first + */ + + s = HOP3c(strend, - prog->minlen, strpos); + if (s <= rx_origin || + ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Did not find /%s^%s/m...\n", + PL_colors[0], PL_colors[1])); goto fail_finish; } - else { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n", - PL_colors[0], PL_colors[1])); - } - s = t; - set_useful: - ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ - } - else { - /* The found string does not prohibit matching at strpos, - - no optimization of calling REx engine can be performed, - unless it was an MBOL and we are not after MBOL, - or a future STCLASS check will fail this. */ - try_at_start: - /* Even in this situation we may use MBOL flag if strpos is offset - wrt the start of the string. */ - if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n' - /* May be due to an implicit anchor of m{.*foo} */ - && !(prog->intflags & PREGf_IMPLICIT)) + + /* earliest possible origin is 1 char after the \n. + * (since *rx_origin == '\n', it's safe to ++ here rather than + * HOP(rx_origin, 1)) */ + rx_origin++; + + if (prog->substrs->check_ix == 0 /* check is anchored */ + || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) { - t = strpos; - goto find_anchor; + /* Position contradicts check-string; either because + * check was anchored (and thus has no wiggle room), + * or check was float and rx_origin is above the float range */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + goto restart; } - DEBUG_EXECUTE_r( if (ml_anch) - PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n", - (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]); - ); - success_at_start: - if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */ - && (utf8_target ? ( - prog->check_utf8 /* Could be deleted already */ - && --BmUSEFUL(prog->check_utf8) < 0 - && (prog->check_utf8 == prog->float_utf8) - ) : ( - prog->check_substr /* Could be deleted already */ - && --BmUSEFUL(prog->check_substr) < 0 - && (prog->check_substr == prog->float_substr) - ))) - { - /* If flags & SOMETHING - do not do it many times on the same match */ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); - /* XXX Does the destruction order has to change with utf8_target? */ - SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); - SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); - prog->check_substr = prog->check_utf8 = NULL; /* disable */ - prog->float_substr = prog->float_utf8 = NULL; /* clear */ - check = NULL; /* abort */ - s = strpos; - /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag - see http://bugs.activestate.com/show_bug.cgi?id=87173 */ - if (prog->intflags & PREGf_IMPLICIT) - prog->extflags &= ~RXf_ANCH_MBOL; - /* XXXX This is a remnant of the old implementation. It - looks wasteful, since now INTUIT can use many - other heuristics. */ - prog->extflags &= ~RXf_USE_INTUIT; - /* XXXX What other flags might need to be cleared in this branch? */ + + /* if we get here, the check substr must have been float, + * is in range, and we may or may not have had an anchored + * "other" substr which still contradicts */ + assert(prog->substrs->check_ix); /* check is float */ + + if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { + /* whoops, the anchored "other" substr exists, so we still + * contradict. On the other hand, the float "check" substr + * didn't contradict, so just retry the anchored "other" + * substr */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos), + (long)(rx_origin - strpos + prog->anchored_offset))); + goto do_other_substr; } - else - s = strpos; + + /* success: we don't contradict the found floating substring + * (and there's no anchored substr). */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " (multiline anchor test skipped)\n")); + } + + success_at_start: + + + /* if we have a starting character class, then test that extra constraint. + * (trie stclasses are too expensive to use here, we are better off to + * leave it to regmatch itself) */ - /* Last resort... */ - /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */ - /* trie stclasses are too expensive to use here, we are better off to - leave it to regmatch itself */ if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { - /* minlen == 0 is possible if regstclass is \b or \B, - and the fixed substr is ''$. - Since minlen is already taken into account, s+1 is before strend; - accidentally, minlen >= 1 guaranties no false positives at s + 1 - even for \b or \B. But (minlen? 1 : 0) below assumes that - regstclass does not come from lookahead... */ - /* If regstclass takes bytelength more than 1: If charlength==1, OK. - This leaves EXACTF-ish only, which are dealt with in find_byclass(). */ const U8* const str = (U8*)STRING(progi->regstclass); + + /* XXX this value could be pre-computed */ const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT - ? CHR_DIST(str+STR_LEN(progi->regstclass), str) + ? (reginfo->is_utf8_pat + ? utf8_distance(str + STR_LEN(progi->regstclass), str) + : STR_LEN(progi->regstclass)) : 1); char * endpos; + char *s; + /* latest pos that a matching float substr constrains rx start to */ + char *rx_max_float = NULL; + + /* if the current rx_origin is anchored, either by satisfying an + * anchored substring constraint, or a /^.../m constraint, then we + * can reject the current origin if the start class isn't found + * at the current position. If we have a float-only match, then + * rx_origin is constrained to a range; so look for the start class + * in that range. if neither, then look for the start class in the + * whole rest of the string */ + + /* XXX DAPM it's not clear what the minlen test is for, and why + * it's not used in the floating case. Nothing in the test suite + * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. + * Here are some old comments, which may or may not be correct: + * + * minlen == 0 is possible if regstclass is \b or \B, + * and the fixed substr is ''$. + * Since minlen is already taken into account, rx_origin+1 is + * before strend; accidentally, minlen >= 1 guaranties no false + * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : + * 0) below assumes that regstclass does not come from lookahead... + * If regstclass takes bytelength more than 1: If charlength==1, OK. + * This leaves EXACTF-ish only, which are dealt with in + * find_byclass(). + */ + if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) - endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend); - else if (prog->float_substr || prog->float_utf8) - endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend); + endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend); + else if (prog->float_substr || prog->float_utf8) { + rx_max_float = HOP3c(check_at, -start_shift, strbeg); + endpos= HOP3c(rx_max_float, cl_l, strend); + } else endpos= strend; - if (checked_upto < s) - checked_upto = s; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for class: start_shift: %"IVdf" check_at: %"IVdf + " rx_origin: %"IVdf" endpos: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), + (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); - t = s; - s = find_byclass(prog, progi->regstclass, checked_upto, endpos, + s = find_byclass(prog, progi->regstclass, rx_origin, endpos, reginfo); - if (s) { - checked_upto = s; - } else { -#ifdef DEBUGGING - const char *what = NULL; -#endif + if (!s) { if (endpos == strend) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Could not match STCLASS...\n") ); + " Could not match STCLASS...\n") ); goto fail; } DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "This position contradicts STCLASS...\n") ); - if ((prog->extflags & RXf_ANCH) && !ml_anch) + " This position contradicts STCLASS...\n") ); + if ((prog->intflags & PREGf_ANCH) && !ml_anch + && !(prog->intflags & PREGf_IMPLICIT)) goto fail; - checked_upto = HOPBACKc(endpos, start_shift); - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + /* Contradict one of substrings */ if (prog->anchored_substr || prog->anchored_utf8) { - if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) { - DEBUG_EXECUTE_r( what = "anchored" ); - hop_and_restart: - s = HOP3c(t, 1, strend); - if (s + start_shift + end_shift > strend) { - /* XXXX Should be taken into account earlier? */ + if (prog->substrs->check_ix == 1) { /* check is float */ + /* Have both, check_string is floating */ + assert(rx_origin + start_shift <= check_at); + if (rx_origin + start_shift != check_at) { + /* not at latest position float substr could match: + * Recheck anchored substring, but not floating. + * The condition above is in bytes rather than + * chars for efficiency. It's conservative, in + * that it errs on the side of doing 'goto + * do_other_substr', where a more accurate + * char-based calculation will be done */ DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Could not match STCLASS...\n") ); - goto fail; + " Looking for anchored substr starting at offset %ld...\n", + (long)(other_last - strpos)) ); + goto do_other_substr; } - if (!check) - goto giveup; + } + } + else { + /* float-only */ + + if (ml_anch) { + /* In the presence of ml_anch, we might be able to + * find another \n without breaking the current float + * constraint. */ + + /* strictly speaking this should be HOP3c(..., 1, ...), + * but since we goto a block of code that's going to + * search for the next \n if any, its safe here */ + rx_origin++; DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for %s substr starting at offset %ld...\n", - what, (long)(s + start_shift - i_strpos)) ); - goto restart; + " Looking for /%s^%s/m starting at offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos)) ); + goto postprocess_substr_matches; } - /* Have both, check_string is floating */ - if (t + start_shift >= check_at) /* Contradicts floating=check */ - goto retry_floating_check; - /* Recheck anchored substring, but not floating... */ - s = check_at; - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for anchored substr starting at offset %ld...\n", - (long)(other_last - i_strpos)) ); - goto do_other_anchored; - } - /* Another way we could have checked stclass at the - current position only: */ - if (ml_anch) { - s = t = t + 1; - if (!check) - goto giveup; - DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "Looking for /%s^%s/m starting at offset %ld...\n", - PL_colors[0], PL_colors[1], (long)(t - i_strpos)) ); - goto try_at_offset; + + /* strictly speaking this can never be true; but might + * be if we ever allow intuit without substrings */ + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + goto fail; + + rx_origin = rx_max_float; } - if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ + + /* at this point, any matching substrings have been + * contradicted. Start again... */ + + rx_origin = HOP3c(rx_origin, 1, strend); + + /* uses bytes rather than char calculations for efficiency. + * It's conservative: it errs on the side of doing 'goto restart', + * where there is code that does a proper char-based test */ + if (rx_origin + start_shift + end_shift > strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); goto fail; - /* Check is floating substring. */ - retry_floating_check: - t = check_at - start_shift; - DEBUG_EXECUTE_r( what = "floating" ); - goto hop_and_restart; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for %s substr starting at offset %ld...\n", + (prog->substrs->check_ix ? "floating" : "anchored"), + (long)(rx_origin + start_shift - strpos)) ); + goto restart; } - if (t != s) { + + /* Success !!! */ + + if (rx_origin != s) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "By STCLASS: moving %ld --> %ld\n", - (long)(t - i_strpos), (long)(s - i_strpos)) + " By STCLASS: moving %ld --> %ld\n", + (long)(rx_origin - strpos), (long)(s - strpos)) ); } else { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - "Does not contradict STCLASS...\n"); + " Does not contradict STCLASS...\n"); ); } } - giveup: - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n", - PL_colors[4], (check ? "Guessed" : "Giving up"), - PL_colors[5], (long)(s - i_strpos)) ); - return s; + + /* Decide whether using the substrings helped */ + + if (rx_origin != strpos) { + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n")); + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + } + else { + /* The found rx_origin position does not prohibit matching at + * strpos, so calling intuit didn't gain us anything. Decrement + * the BmUSEFUL() count on the check substring, and if we reach + * zero, free it. */ + if (!(prog->intflags & PREGf_NAUGHTY) + && (utf8_target ? ( + prog->check_utf8 /* Could be deleted already */ + && --BmUSEFUL(prog->check_utf8) < 0 + && (prog->check_utf8 == prog->float_utf8) + ) : ( + prog->check_substr /* Could be deleted already */ + && --BmUSEFUL(prog->check_substr) < 0 + && (prog->check_substr == prog->float_substr) + ))) + { + /* If flags & SOMETHING - do not do it many times on the same match */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n")); + /* XXX Does the destruction order has to change with utf8_target? */ + SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); + SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); + prog->check_substr = prog->check_utf8 = NULL; /* disable */ + prog->float_substr = prog->float_utf8 = NULL; /* clear */ + check = NULL; /* abort */ + /* XXXX This is a remnant of the old implementation. It + looks wasteful, since now INTUIT can use many + other heuristics. */ + prog->extflags &= ~RXf_USE_INTUIT; + } + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) ); + + return rx_origin; fail_finish: /* Substring not found */ if (prog->check_substr || prog->check_utf8) /* could be removed already */ @@ -1227,46 +1436,57 @@ Perl_re_intuit_start(pTHX_ return NULL; } + #define DECL_TRIE_TYPE(scan) \ - const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ trie_type = ((scan->flags == EXACT) \ ? (utf8_target ? trie_utf8 : trie_plain) \ - : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + : (scan->flags == EXACTFA) \ + ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ -STMT_START { \ +STMT_START { \ STRLEN skiplen; \ + U8 flags = FOLD_FLAGS_FULL; \ switch (trie_type) { \ + case trie_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALLTHROUGH */ \ case trie_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ - uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \ + uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ len = UTF8SKIP(uc); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ + case trie_latin_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALLTHROUGH */ \ case trie_latin_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ case trie_utf8: \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ break; \ case trie_plain: \ uvc = (UV)*uc; \ @@ -1337,28 +1557,28 @@ REXEC_FBC_SCAN( \ tmp = 1; \ ) -#define REXEC_FBC_TRYIT \ +#define REXEC_FBC_TRYIT \ if ((reginfo->intuit || regtry(reginfo, &s))) \ goto got_it #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ - if (utf8_target) { \ + if (utf8_target) { \ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ } \ else { \ REXEC_FBC_CLASS_SCAN(CoNd); \ } -#define DUMP_EXEC_POS(li,s,doutf8) \ +#define DUMP_EXEC_POS(li,s,doutf8) \ dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ startpos, doutf8) -#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ +#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ tmp = TEST_NON_UTF8(tmp); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1367,18 +1587,19 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ ); \ -#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ +#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ if (s == reginfo->strbeg) { \ tmp = '\n'; \ } \ else { \ U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \ + tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ + 0, UTF8_ALLOW_DEFAULT); \ } \ tmp = TeSt1_UtF8; \ - LOAD_UTF8_CHARCLASS_ALNUM(); \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ REXEC_FBC_UTF8_SCAN( \ - if (tmp == ! (TeSt2_UtF8)) { \ + if (tmp == ! (TeSt2_UtF8)) { \ tmp = !tmp; \ IF_SUCCESS; \ } \ @@ -1413,9 +1634,9 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ * one, and compare it with the wordness of this one. If they differ, we have * a boundary. At the beginning of the string, pretend that the previous * character was a new-line */ -#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ +#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ if (utf8_target) { \ - UTF8_CODE \ + UTF8_CODE \ } \ else { /* Not utf8 */ \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ @@ -1430,7 +1651,7 @@ if ((reginfo->intuit || regtry(reginfo, &s))) \ } \ ); \ } \ - if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ + if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \ goto got_it; /* We know what class REx starts with. Try to find this position... */ @@ -1469,11 +1690,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* We know what class it must start with. */ switch (OP(c)) { case ANYOF: - case ANYOF_SYNTHETIC: - case ANYOF_WARN_SUPER: if (utf8_target) { REXEC_FBC_UTF8_CLASS_SCAN( - reginclass(prog, c, (U8*)s, utf8_target)); + reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); } else { REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); @@ -1488,6 +1707,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ case EXACTFA: if (is_utf8_pat || utf8_target) { utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; @@ -1497,10 +1719,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, folder = foldEQ_latin1; /* /a, except the sharp s one which */ goto do_exactf_non_utf8; /* isn't dealt with by these */ - case EXACTF: + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); if (utf8_target) { - - /* regcomp.c already folded this if pattern is in UTF-8 */ utf8_fold_flags = 0; goto do_exactf_utf8; } @@ -1509,8 +1730,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto do_exactf_non_utf8; case EXACTFL: - if (is_utf8_pat || utf8_target) { - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { + utf8_fold_flags = FOLDEQ_LOCALE; goto do_exactf_utf8; } fold_array = PL_fold_locale; @@ -1523,7 +1744,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } goto do_exactf_utf8; - case EXACTFU_TRICKYFOLD: case EXACTFU: if (is_utf8_pat || utf8_target) { utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -1536,7 +1756,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, fold_array = PL_fold_latin1; folder = foldEQ_latin1; - /* FALL THROUGH */ + /* FALLTHROUGH */ do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there are no glitches with fold-length differences @@ -1556,7 +1776,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * characters, and there are only 2 availabe, we know without * trying that it will fail; so don't start a match past the * required minimum number from the far end */ - e = HOP3c(strend, -((I32)ln), s); + e = HOP3c(strend, -((SSize_t)ln), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1602,7 +1822,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * only 2 are left, it's guaranteed to fail, so don't start a * match that would require us to go beyond the end of the string */ - e = HOP3c(strend, -((I32)lnc), s); + e = HOP3c(strend, -((SSize_t)lnc), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1628,15 +1848,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; } case BOUNDL: - RXp_MATCH_TAINTED_on(prog); FBC_BOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case NBOUNDL: - RXp_MATCH_TAINTED_on(prog); FBC_NBOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case BOUND: @@ -1683,7 +1901,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* FALLTHROUGH */ case POSIXL: - RXp_MATCH_TAINTED_on(prog); REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); break; @@ -1701,9 +1918,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case NPOSIXA: if (utf8_target) { /* The complement of something that matches only ASCII matches all - * UTF-8 variant code points, plus everything in ASCII that isn't - * in the class */ - REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s) + * non-ASCII, plus everything in ASCII that isn't in the class. */ + REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s) || ! _generic_isCC_A(*s, FLAGS(c))); break; } @@ -1747,7 +1963,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, classnum))) || (UTF8_IS_DOWNGRADEABLE_START(*s) && to_complement ^ cBOOL( - _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)), + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), classnum)))) { if (tmp && (reginfo->intuit || regtry(reginfo, &s))) @@ -1766,7 +1983,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, macros */ case _CC_ENUM_SPACE: /* XXX would require separate code if we revert the change of \v matching this */ - /* FALL THROUGH */ + /* FALLTHROUGH */ case _CC_ENUM_PSXSPC: REXEC_FBC_UTF8_CLASS_SCAN( @@ -1805,8 +2022,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = - _core_swash_init("utf8", swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } /* This is a copy of the loop above for swash classes, though using the @@ -2041,20 +2260,169 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; default: Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); - break; } return 0; got_it: return s; } +/* set RX_SAVED_COPY, RX_SUBBEG etc. + * flags have same meanings as with regexec_flags() */ + +static void +S_reg_set_capture_string(pTHX_ REGEXP * const rx, + char *strbeg, + char *strend, + SV *sv, + U32 flags, + bool utf8_target) +{ + struct regexp *const prog = ReANY(rx); + + if (flags & REXEC_COPY_STR) { +#ifdef PERL_ANY_COW + if (SvCANCOW(sv)) { + if (DEBUG_C_TEST) { + PerlIO_printf(Perl_debug_log, + "Copy on write: regexp capture, type %d\n", + (int) SvTYPE(sv)); + } + /* Create a new COW SV to share the match string and store + * in saved_copy, unless the current COW SV in saved_copy + * is valid and suitable for our purpose */ + if (( prog->saved_copy + && SvIsCOW(prog->saved_copy) + && SvPOKp(prog->saved_copy) + && SvIsCOW(sv) + && SvPOKp(sv) + && SvPVX(sv) == SvPVX(prog->saved_copy))) + { + /* just reuse saved_copy SV */ + if (RXp_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RXp_MATCH_COPIED_off(prog); + } + } + else { + /* create new COW SV to share string */ + RX_MATCH_COPY_FREE(rx); + prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); + } + prog->subbeg = (char *)SvPVX_const(prog->saved_copy); + assert (SvPOKp(prog->saved_copy)); + prog->sublen = strend - strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + } else +#endif + { + SSize_t min = 0; + SSize_t max = strend - strbeg; + SSize_t sublen; + + if ( (flags & REXEC_COPY_SKIP_POST) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_RIGHT) + ) { /* don't copy $' part of string */ + U32 n = 0; + max = -1; + /* calculate the right-most part of the string covered + * by a capture. Due to look-ahead, this may be to + * the right of $&, so we have to scan all captures */ + while (n <= prog->lastparen) { + if (prog->offs[n].end > max) + max = prog->offs[n].end; + n++; + } + if (max == -1) + max = (PL_sawampersand & SAWAMPERSAND_LEFT) + ? prog->offs[0].start + : 0; + assert(max >= 0 && max <= strend - strbeg); + } + + if ( (flags & REXEC_COPY_SKIP_PRE) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_LEFT) + ) { /* don't copy $` part of string */ + U32 n = 0; + min = max; + /* calculate the left-most part of the string covered + * by a capture. Due to look-behind, this may be to + * the left of $&, so we have to scan all captures */ + while (min && n <= prog->lastparen) { + if ( prog->offs[n].start != -1 + && prog->offs[n].start < min) + { + min = prog->offs[n].start; + } + n++; + } + if ((PL_sawampersand & SAWAMPERSAND_RIGHT) + && min > prog->offs[0].end + ) + min = prog->offs[0].end; + + } + + assert(min >= 0 && min <= max && min <= strend - strbeg); + sublen = max - min; + + if (RX_MATCH_COPIED(rx)) { + if (sublen > prog->sublen) + prog->subbeg = + (char*)saferealloc(prog->subbeg, sublen+1); + } + else + prog->subbeg = (char*)safemalloc(sublen+1); + Copy(strbeg + min, prog->subbeg, sublen, char); + prog->subbeg[sublen] = '\0'; + prog->suboffset = min; + prog->sublen = sublen; + RX_MATCH_COPIED_on(rx); + } + prog->subcoffset = prog->suboffset; + if (prog->suboffset && utf8_target) { + /* Convert byte offset to chars. + * XXX ideally should only compute this if @-/@+ + * has been seen, a la PL_sawampersand ??? */ + + /* If there's a direct correspondence between the + * string which we're matching and the original SV, + * then we can use the utf8 len cache associated with + * the SV. In particular, it means that under //g, + * sv_pos_b2u() will use the previously cached + * position to speed up working out the new length of + * subcoffset, rather than counting from the start of + * the string each time. This stops + * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; + * from going quadratic */ + if (SvPOKp(sv) && SvPVX(sv) == strbeg) + prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, + SV_GMAGIC|SV_CONST_RETURN); + else + prog->subcoffset = utf8_length((U8*)strbeg, + (U8*)(strbeg+prog->suboffset)); + } + } + else { + RX_MATCH_COPY_FREE(rx); + prog->subbeg = strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + prog->sublen = strend - strbeg; + } +} + + + /* - regexec_flags - match a regexp against a string */ I32 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, - char *strbeg, I32 minend, SV *sv, void *data, U32 flags) + char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -2062,21 +2430,17 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* sv: SV being matched: only used for utf8 flag, pos() etc; string * itself is accessed via the pointers above */ /* data: May be used for some additional optimizations. - Currently its only used, with a U32 cast, for transmitting - the ganch offset when doing a /g match. This will change */ -/* nosave: For optimizations. */ + Currently unused. */ +/* flags: For optimizations. See REXEC_* in regexp.h */ { dVAR; struct regexp *const prog = ReANY(rx); char *s; regnode *c; - char *startpos = stringarg; - I32 minlen; /* must match at least this many chars */ - I32 dontbother = 0; /* how many characters not to try at end */ - I32 end_shift = 0; /* Same for the end. */ /* CC */ - I32 scream_pos = -1; /* Internal iterator of scream. */ - char *scream_olds = NULL; + char *startpos; + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ const bool utf8_target = cBOOL(DO_UTF8(sv)); I32 multiline; RXi_GET_DECL(prog,progi); @@ -2090,16 +2454,69 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, PERL_UNUSED_ARG(data); /* Be paranoid... */ - if (prog == NULL || startpos == NULL) { + if (prog == NULL || stringarg == NULL) { Perl_croak(aTHX_ "NULL regexp parameter"); - return 0; } DEBUG_EXECUTE_r( - debug_start_match(rx, utf8_target, startpos, strend, + debug_start_match(rx, utf8_target, stringarg, strend, "Matching"); ); + startpos = stringarg; + + if (prog->intflags & PREGf_GPOS_SEEN) { + MAGIC *mg; + + /* set reginfo->ganch, the position where \G can match */ + + reginfo->ganch = + (flags & REXEC_IGNOREPOS) + ? stringarg /* use start pos rather than pos() */ + : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) + /* Defined pos(): */ + ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) + : strbeg; /* pos() not defined; use start of string */ + + DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, + "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg))); + + /* in the presence of \G, we may need to start looking earlier in + * the string than the suggested start point of stringarg: + * if prog->gofs is set, then that's a known, fixed minimum + * offset, such as + * /..\G/: gofs = 2 + * /ab|c\G/: gofs = 1 + * or if the minimum offset isn't known, then we have to go back + * to the start of the string, e.g. /w+\G/ + */ + + if (prog->intflags & PREGf_ANCH_GPOS) { + startpos = reginfo->ganch - prog->gofs; + if (startpos < + ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) + { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "fail: ganch-gofs before earliest possible start\n")); + return 0; + } + } + else if (prog->gofs) { + if (startpos - prog->gofs < strbeg) + startpos = strbeg; + else + startpos -= prog->gofs; + } + else if (prog->intflags & PREGf_GPOS_FLOAT) + startpos = strbeg; + } + + minlen = prog->minlen; + if ((startpos + minlen) > strend || startpos < strbeg) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "Regex match can't succeed, so not even tried\n")); + return 0; + } /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), * which will call destuctors to reset PL_regmatch_state, free higher @@ -2108,10 +2525,54 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, oldsave = PL_savestack_ix; + s = startpos; + + if ((prog->extflags & RXf_USE_INTUIT) + && !(flags & REXEC_CHECKED)) + { + s = re_intuit_start(rx, sv, strbeg, startpos, strend, + flags, NULL); + if (!s) + return 0; + + if (prog->extflags & RXf_CHECK_ALL) { + /* we can match based purely on the result of INTUIT. + * Set up captures etc just for $& and $-[0] + * (an intuit-only match wont have $1,$2,..) */ + assert(!prog->nparens); + + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (s < stringarg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + + /* match via INTUIT shouldn't have any captures. + * Let @-, @+, $^N know */ + prog->lastparen = prog->lastcloseparen = 0; + RX_MATCH_UTF8_set(rx, utf8_target); + prog->offs[0].start = s - strbeg; + prog->offs[0].end = utf8_target + ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + : s - strbeg + prog->minlenret; + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, strend, + sv, flags, utf8_target); + + return 1; + } + } + multiline = prog->extflags & RXf_PMf_MULTILINE; - minlen = prog->minlen; - if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { + if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too short [regexec_flags]...\n")); goto phooey; @@ -2134,7 +2595,20 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, reginfo->poscache_maxiter = 0; /* not yet started a countdown */ reginfo->strend = strend; /* see how far we have to get to not match where we matched before */ - reginfo->till = startpos+minend; + reginfo->till = stringarg + minend; + + if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { + /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after + S_cleanup_regmatch_info_aux has executed (registered by + SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies + magic belonging to this SV. + Not newSVsv, either, as it does not COW. + */ + assert(!IS_PADGV(sv)); + reginfo->sv = newSV(0); + SvSetSV_nosteal(reginfo->sv, sv); + SAVEFREESV(reginfo->sv); + } /* reserve next 2 or 3 slots in PL_regmatch_state: * slot N+0: may currently be in use: skip it @@ -2187,43 +2661,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } /* If there is a "must appear" string, look for it. */ - s = startpos; - if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */ - MAGIC *mg; - if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */ - reginfo->ganch = startpos + prog->gofs; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs)); - } else if (sv && SvTYPE(sv) >= SVt_PVMG - && SvMAGIC(sv) - && (mg = mg_find(sv, PERL_MAGIC_regex_global)) - && mg->mg_len >= 0) { - reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */ - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len)); - - if (prog->extflags & RXf_ANCH_GPOS) { - if (s > reginfo->ganch) - goto phooey; - s = reginfo->ganch - prog->gofs; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs)); - if (s < strbeg) - goto phooey; - } - } - else if (data) { - reginfo->ganch = strbeg + PTR2UV(data); - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data))); - - } else { /* pos() not defined */ - reginfo->ganch = strbeg; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS: reginfo->ganch = strbeg\n")); - } - } if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { /* We have to be careful. If the previous successful match was from this regex we don't want a subsequent partially @@ -2242,27 +2680,13 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, PTR2UV(prog->offs) )); } - if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) { - re_scream_pos_data d; - - d.scream_olds = &scream_olds; - d.scream_pos = &scream_pos; - s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d); - if (!s) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n")); - goto phooey; /* not present */ - } - } - - /* Simplest case: anchored match need be tried only once. */ /* [unless only anchor is BOL and multiline is set] */ - if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) { - if (s == startpos && regtry(reginfo, &startpos)) + if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { + if (s == startpos && regtry(reginfo, &s)) goto got_it; - else if (multiline || (prog->intflags & PREGf_IMPLICIT) - || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */ + else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */ { char *end; @@ -2336,14 +2760,15 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } /* end search for newline */ } /* end anchored/multiline check string search */ goto phooey; - } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) + } else if (prog->intflags & PREGf_ANCH_GPOS) { - /* the warning about reginfo->ganch being used without initialization - is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN - and we only enter this block when the same bit is set. */ - char *tmp_s = reginfo->ganch - prog->gofs; - - if (tmp_s >= strbeg && regtry(reginfo, &tmp_s)) + /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ + assert(prog->intflags & PREGf_GPOS_SEEN); + /* For anchored \G, the only position it can match from is + * (ganch-gofs); we already set startpos to this above; if intuit + * moved us on from there, we can't possibly succeed */ + assert(startpos == reginfo->ganch - prog->gofs); + if (s == startpos && regtry(reginfo, &s)) goto got_it; goto phooey; } @@ -2399,8 +2824,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, || ((prog->float_substr != NULL || prog->float_utf8 != NULL) && prog->float_max_offset < strend - s)) { SV *must; - I32 back_max; - I32 back_min; + SSize_t back_max; + SSize_t back_min; char *last; char *last1; /* Last position checked before */ #ifdef DEBUGGING @@ -2445,7 +2870,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, last = strend; } else { last = HOP3c(strend, /* Cannot start after this */ - -(I32)(CHR_SVLEN(must) + -(SSize_t)(CHR_SVLEN(must) - (SvTAIL(must) != 0) + back_min), strbeg); } if (s > reginfo->strbeg) @@ -2455,11 +2880,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* XXXX check_substr already used to find "s", can optimize if check_substr==must. */ - scream_pos = -1; - dontbother = end_shift; + dontbother = 0; strend = HOPc(strend, -dontbother); while ( (s <= last) && - (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), + (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), (unsigned char*)strend, must, multiline ? FBMrf_MULTILINE : 0)) ) { DEBUG_EXECUTE_r( did_match = 1 ); @@ -2512,7 +2936,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, c); + regprop(prog, prop, c, reginfo); { RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), s,strend-s,60); @@ -2649,6 +3073,18 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, goto phooey; got_it: + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (prog->offs[0].start < stringarg - strbeg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + DEBUG_BUFFERS_r( if (swap) PerlIO_printf(Perl_debug_log, @@ -2671,123 +3107,10 @@ got_it: RX_MATCH_UTF8_set(rx, utf8_target); /* make sure $`, $&, $', and $digit will work later */ - if ( !(flags & REXEC_NOT_FIRST) ) { - if (flags & REXEC_COPY_STR) { -#ifdef PERL_ANY_COW - if (SvCANCOW(sv)) { - if (DEBUG_C_TEST) { - PerlIO_printf(Perl_debug_log, - "Copy on write: regexp capture, type %d\n", - (int) SvTYPE(sv)); - } - RX_MATCH_COPY_FREE(rx); - prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); - prog->subbeg = (char *)SvPVX_const(prog->saved_copy); - assert (SvPOKp(prog->saved_copy)); - prog->sublen = reginfo->strend - strbeg; - prog->suboffset = 0; - prog->subcoffset = 0; - } else -#endif - { - I32 min = 0; - I32 max = reginfo->strend - strbeg; - I32 sublen; - - if ( (flags & REXEC_COPY_SKIP_POST) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ - && !(PL_sawampersand & SAWAMPERSAND_RIGHT) - ) { /* don't copy $' part of string */ - U32 n = 0; - max = -1; - /* calculate the right-most part of the string covered - * by a capture. Due to look-ahead, this may be to - * the right of $&, so we have to scan all captures */ - while (n <= prog->lastparen) { - if (prog->offs[n].end > max) - max = prog->offs[n].end; - n++; - } - if (max == -1) - max = (PL_sawampersand & SAWAMPERSAND_LEFT) - ? prog->offs[0].start - : 0; - assert(max >= 0 && max <= reginfo->strend - strbeg); - } - - if ( (flags & REXEC_COPY_SKIP_PRE) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ - && !(PL_sawampersand & SAWAMPERSAND_LEFT) - ) { /* don't copy $` part of string */ - U32 n = 0; - min = max; - /* calculate the left-most part of the string covered - * by a capture. Due to look-behind, this may be to - * the left of $&, so we have to scan all captures */ - while (min && n <= prog->lastparen) { - if ( prog->offs[n].start != -1 - && prog->offs[n].start < min) - { - min = prog->offs[n].start; - } - n++; - } - if ((PL_sawampersand & SAWAMPERSAND_RIGHT) - && min > prog->offs[0].end - ) - min = prog->offs[0].end; - - } - - assert(min >= 0 && min <= max - && min <= reginfo->strend - strbeg); - sublen = max - min; - - if (RX_MATCH_COPIED(rx)) { - if (sublen > prog->sublen) - prog->subbeg = - (char*)saferealloc(prog->subbeg, sublen+1); - } - else - prog->subbeg = (char*)safemalloc(sublen+1); - Copy(strbeg + min, prog->subbeg, sublen, char); - prog->subbeg[sublen] = '\0'; - prog->suboffset = min; - prog->sublen = sublen; - RX_MATCH_COPIED_on(rx); - } - prog->subcoffset = prog->suboffset; - if (prog->suboffset && utf8_target) { - /* Convert byte offset to chars. - * XXX ideally should only compute this if @-/@+ - * has been seen, a la PL_sawampersand ??? */ - - /* If there's a direct correspondence between the - * string which we're matching and the original SV, - * then we can use the utf8 len cache associated with - * the SV. In particular, it means that under //g, - * sv_pos_b2u() will use the previously cached - * position to speed up working out the new length of - * subcoffset, rather than counting from the start of - * the string each time. This stops - * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; - * from going quadratic */ - if (SvPOKp(sv) && SvPVX(sv) == strbeg) - sv_pos_b2u(sv, &(prog->subcoffset)); - else - prog->subcoffset = utf8_length((U8*)strbeg, - (U8*)(strbeg+prog->suboffset)); - } - } - else { - RX_MATCH_COPY_FREE(rx); - prog->subbeg = strbeg; - prog->suboffset = 0; - prog->subcoffset = 0; - /* use reginfo->strend, as strend may have been modified */ - prog->sublen = reginfo->strend - strbeg; - } - } + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, reginfo->strend, + sv, flags, utf8_target); return 1; @@ -2818,7 +3141,7 @@ phooey: /* Set which rex is pointed to by PL_reg_curpm, handling ref counting. * Do inc before dec, in case old and new rex are the same */ -#define SET_reg_curpm(Re2) \ +#define SET_reg_curpm(Re2) \ if (reginfo->info_aux_eval) { \ (void)ReREFCNT_inc(Re2); \ ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ @@ -2836,7 +3159,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) CHECKPOINT lastcp; REGEXP *const rx = reginfo->prog; regexp *const prog = ReANY(rx); - I32 result; + SSize_t result; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -3083,11 +3406,11 @@ regmatch(), slabs allocated since entry are freed. #define DEBUG_STATE_pp(pp) \ DEBUG_STATE_r({ \ - DUMP_EXEC_POS(locinput, scan, utf8_target); \ + DUMP_EXEC_POS(locinput, scan, utf8_target); \ PerlIO_printf(Perl_debug_log, \ " %*s"pp" %s%s%s%s%s\n", \ depth*2, "", \ - PL_reg_name[st->resume_state], \ + PL_reg_name[st->resume_state], \ ((st==yes_state||st==mark_state) ? "[" : ""), \ ((st==yes_state) ? "Y" : ""), \ ((st==mark_state) ? "M" : ""), \ @@ -3203,7 +3526,7 @@ S_dump_exec_pos(pTHX_ const char *locinput, * or 0 if non of the buffers matched. */ STATIC I32 -S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) +S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan) { I32 n; RXi_GET_DECL(rex,rexi); @@ -3291,6 +3614,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, dVAR; U8 *pat = (U8*)STRING(text_node); + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; if (OP(text_node) == EXACT) { @@ -3310,136 +3634,186 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, c2 = c1 = valid_utf8_to_uvchr(pat, NULL); } } - else /* an EXACTFish node */ - if ((is_utf8_pat - && is_MULTI_CHAR_FOLD_utf8_safe(pat, - pat + STR_LEN(text_node))) - || (!is_utf8_pat - && is_MULTI_CHAR_FOLD_latin1_safe(pat, - pat + STR_LEN(text_node)))) - { - /* Multi-character folds require more context to sort out. Also - * PL_utf8_foldclosures used below doesn't handle them, so have to be - * handled outside this routine */ - use_chrtest_void = TRUE; - } - else { /* an EXACTFish node which doesn't begin with a multi-char fold */ - c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; - if (c1 > 256) { - /* Load the folds hash, if not already done */ - SV** listp; - if (! PL_utf8_foldclosures) { - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - - /* Force loading this by folding an above-Latin1 char */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); - assert(PL_utf8_tofold); /* Verify that worked */ + else { /* an EXACTFish node */ + U8 *pat_end = pat + STR_LEN(text_node); + + /* An EXACTFL node has at least some characters unfolded, because what + * they match is not known until now. So, now is the time to fold + * the first few of them, as many as are needed to determine 'c1' and + * 'c2' later in the routine. If the pattern isn't UTF-8, we only need + * to fold if in a UTF-8 locale, and then only the Sharp S; everything + * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we + * need to fold as many characters as a single character can fold to, + * so that later we can check if the first ones are such a multi-char + * fold. But, in such a pattern only locale-problematic characters + * aren't folded, so we can skip this completely if the first character + * in the node isn't one of the tricky ones */ + if (OP(text_node) == EXACTFL) { + + if (! is_utf8_pat) { + if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) + { + folded[0] = folded[1] = 's'; + pat = folded; + pat_end = folded + 2; } - PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); - } - - /* The fold closures data structure is a hash with the keys being - * the UTF-8 of every character that is folded to, like 'k', and - * the values each an array of all code points that fold to its - * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are - * not included */ - if ((! (listp = hv_fetch(PL_utf8_foldclosures, - (char *) pat, - UTF8SKIP(pat), - FALSE)))) - { - /* Not found in the hash, therefore there are no folds - * containing it, so there is only a single character that - * could match */ - c2 = c1; } - else { /* Does participate in folds */ - AV* list = (AV*) *listp; - if (av_len(list) != 1) { + else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { + U8 *s = pat; + U8 *d = folded; + int i; - /* If there aren't exactly two folds to this, it is outside - * the scope of this function */ - use_chrtest_void = TRUE; - } - else { /* There are two. Get them */ - SV** c_p = av_fetch(list, 0, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD_LC(*s); + s++; + } + else { + STRLEN len; + _to_utf8_fold_flags(s, + d, + &len, + FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); + d += len; + s += UTF8SKIP(s); } - c1 = SvUV(*c_p); + } + + pat = folded; + pat_end = d; + } + } + + if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) + || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) + { + /* Multi-character folds require more context to sort out. Also + * PL_utf8_foldclosures used below doesn't handle them, so have to + * be handled outside this routine */ + use_chrtest_void = TRUE; + } + else { /* an EXACTFish node which doesn't begin with a multi-char fold */ + c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; + if (c1 > 255) { + /* Load the folds hash, if not already done */ + SV** listp; + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + + /* The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((! (listp = hv_fetch(PL_utf8_foldclosures, + (char *) pat, + UTF8SKIP(pat), + FALSE)))) + { + /* Not found in the hash, therefore there are no folds + * containing it, so there is only a single character that + * could match */ + c2 = c1; + } + else { /* Does participate in folds */ + AV* list = (AV*) *listp; + if (av_tindex(list) != 1) { - c_p = av_fetch(list, 1, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + /* If there aren't exactly two folds to this, it is + * outside the scope of this function */ + use_chrtest_void = TRUE; } - c2 = SvUV(*c_p); - - /* Folds that cross the 255/256 boundary are forbidden if - * EXACTFL, or EXACTFA and one is ASCIII. Since the - * pattern character is above 256, and its only other match - * is below 256, the only legal match will be to itself. - * We have thrown away the original, so have to compute - * which is the one above 255 */ - if ((c1 < 256) != (c2 < 256)) { - if (OP(text_node) == EXACTFL - || (OP(text_node) == EXACTFA - && (isASCII(c1) || isASCII(c2)))) - { - if (c1 < 256) { - c1 = c2; - } - else { - c2 = c1; + else { /* There are two. Get them */ + SV** c_p = av_fetch(list, 0, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c1 = SvUV(*c_p); + + c_p = av_fetch(list, 1, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c2 = SvUV(*c_p); + + /* Folds that cross the 255/256 boundary are forbidden + * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and + * one is ASCIII. Since the pattern character is above + * 255, and its only other match is below 256, the only + * legal match will be to itself. We have thrown away + * the original, so have to compute which is the one + * above 255. */ + if ((c1 < 256) != (c2 < 256)) { + if ((OP(text_node) == EXACTFL + && ! IN_UTF8_CTYPE_LOCALE) + || ((OP(text_node) == EXACTFA + || OP(text_node) == EXACTFA_NO_TRIE) + && (isASCII(c1) || isASCII(c2)))) + { + if (c1 < 256) { + c1 = c2; + } + else { + c2 = c1; + } } } } } } - } - else /* Here, c1 is < 255 */ - if (utf8_target - && HAS_NONLATIN1_FOLD_CLOSURE(c1) - && OP(text_node) != EXACTFL - && (OP(text_node) != EXACTFA || ! isASCII(c1))) - { - /* Here, there could be something above Latin1 in the target which - * folds to this character in the pattern. All such cases except - * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters - * involved in their folds, so are outside the scope of this - * function */ - if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { - c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; - } - else { - use_chrtest_void = TRUE; + else /* Here, c1 is <= 255 */ + if (utf8_target + && HAS_NONLATIN1_FOLD_CLOSURE(c1) + && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) + && ((OP(text_node) != EXACTFA + && OP(text_node) != EXACTFA_NO_TRIE) + || ! isASCII(c1))) + { + /* Here, there could be something above Latin1 in the target + * which folds to this character in the pattern. All such + * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more + * than two characters involved in their folds, so are outside + * the scope of this function */ + if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + } + else { + use_chrtest_void = TRUE; + } } - } - else { /* Here nothing above Latin1 can fold to the pattern character */ - switch (OP(text_node)) { + else { /* Here nothing above Latin1 can fold to the pattern + character */ + switch (OP(text_node)) { - case EXACTFL: /* /l rules */ - c2 = PL_fold_locale[c1]; - break; + case EXACTFL: /* /l rules */ + c2 = PL_fold_locale[c1]; + break; - case EXACTF: - if (! utf8_target) { /* /d rules */ - c2 = PL_fold[c1]; + case EXACTF: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + if (! utf8_target) { /* /d rules */ + c2 = PL_fold[c1]; + break; + } + /* FALLTHROUGH */ + /* /u rules for all these. This happens to work for + * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA_NO_TRIE: /* This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: + c2 = PL_fold_latin1[c1]; break; - } - /* FALLTHROUGH */ - /* /u rules for all these. This happens to work for - * EXACTFA as nothing in Latin1 folds to ASCII */ - case EXACTFA: - case EXACTFU_TRICKYFOLD: - case EXACTFU_SS: - case EXACTFU: - c2 = PL_fold_latin1[c1]; - break; - default: - Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); - assert(0); /* NOTREACHED */ + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); + assert(0); /* NOTREACHED */ + } } } } @@ -3481,7 +3855,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } /* returns -1 on failure, $+[0] on success */ -STATIC I32 +STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { #if PERL_VERSION < 9 && !defined(PERL_CORE) @@ -3499,7 +3873,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) regnode *scan; regnode *next; U32 n = 0; /* general value; init to avoid compiler warning */ - I32 ln = 0; /* len or last; init to avoid compiler warning */ + SSize_t ln = 0; /* len or last; init to avoid compiler warning */ char *locinput = startpos; char *pushinput; /* where to continue after a PUSH */ I32 nextchr; /* is always set to UCHARAT(locinput) */ @@ -3527,7 +3901,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) during a successful match */ U32 lastopen = 0; /* last open we saw */ bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; - SV* const oreplsv = GvSV(PL_replgv); + SV* const oreplsv = GvSVn(PL_replgv); /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop * iteration, and are not preserved or restored by state pushes/pops @@ -3557,6 +3931,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) GET_RE_DEBUG_FLAGS_DECL; #endif + /* protect against undef(*^R) */ + SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ multicall_oldcatch = 0; multicall_cv = NULL; @@ -3582,7 +3959,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SV * const prop = sv_newmortal(); regnode *rnext=regnext(scan); DUMP_EXEC_POS( locinput, scan, utf8_target ); - regprop(rex, prop, scan); + regprop(rex, prop, scan, reginfo); PerlIO_printf(Perl_debug_log, "%3"IVdf":%*s%s(%"IVdf")\n", @@ -3605,7 +3982,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); switch (state_num) { - case BOL: /* /^../ */ + case BOL: /* /^../ */ + case SBOL: /* /^../s */ if (locinput == reginfo->strbeg) break; sayNO; @@ -3618,11 +3996,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } sayNO; - case SBOL: /* /^../s */ - if (locinput == reginfo->strbeg) - break; - sayNO; - case GPOS: /* \G */ if (locinput == reginfo->ganch) break; @@ -3640,16 +4013,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO_SILENT; assert(0); /*NOTREACHED*/ - case EOL: /* /..$/ */ - goto seol; - case MEOL: /* /..$/m */ if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; break; + case EOL: /* /..$/ */ + /* FALLTHROUGH */ case SEOL: /* /..$/s */ - seol: if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; if (reginfo->strend - locinput > 1) @@ -3693,7 +4064,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO_SILENT; assert(0); /* NOTREACHED */ } - /* FALL THROUGH */ + /* FALLTHROUGH */ case TRIE: /* (ab|cd) */ /* the basic plan of execution of the trie is: * At the beginning, run though all the states, and @@ -3947,7 +4318,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (chars) { if (utf8_target) { - uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, uniflags); uc += len; } @@ -3960,7 +4331,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (foldlen) { if (!--chars) break; - uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, uniflags); uscan += len; foldlen -= len; @@ -4051,7 +4422,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) l++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { sayNO; } l += 2; @@ -4074,7 +4446,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) { + if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { sayNO; } s += 2; @@ -4104,27 +4477,31 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const char * s; U32 fold_utf8_flags; - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; - fold_utf8_flags = FOLDEQ_UTF8_LOCALE; + fold_utf8_flags = FOLDEQ_LOCALE; goto do_exactf; case EXACTFU_SS: /* /\x{df}/iu */ - case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */ case EXACTFU: /* /abc/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; goto do_exactf; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ case EXACTFA: /* /abc/iaa */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; - case EXACTF: /* /abc/i */ + case EXACTF: /* /abc/i This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); folder = foldEQ; fold_array = PL_fold; fold_utf8_flags = 0; @@ -4133,7 +4510,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) { + if (utf8_target + || is_utf8_pat + || state_num == EXACTFU_SS + || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) + { /* Either target or the pattern are utf8, or has the issue where * the fold lengths may differ. */ const char * const l = locinput; @@ -4168,8 +4549,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * have to set the FLAGS fields of these */ case BOUNDL: /* /\b/l */ case NBOUNDL: /* /\B/l */ - RX_MATCH_TAINTED_on(reginfo->prog); - /* FALL THROUGH */ case BOUND: /* /\b/ */ case BOUNDU: /* /\b/u */ case BOUNDA: /* /\b/a */ @@ -4187,7 +4566,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 * const r = reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); - ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); + ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, + 0, uniflags); } if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { ln = isWORDCHAR_uni(ln); @@ -4200,7 +4580,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } else { - ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln)); + ln = isWORDCHAR_LC_uvchr(ln); n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); } } @@ -4239,7 +4619,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; default: Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan)); - break; } } /* Note requires that all BOUNDs be lower than all NBOUNDs in @@ -4249,11 +4628,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; case ANYOF: /* /[abc]/ */ - case ANYOF_WARN_SUPER: if (NEXTCHR_IS_EOS) sayNO; if (utf8_target) { - if (!reginclass(rex, scan, (U8*)locinput, utf8_target)) + if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + utf8_target)) sayNO; locinput += UTF8SKIP(locinput); } @@ -4275,10 +4654,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (NEXTCHR_IS_EOS) sayNO; - /* The locale hasn't influenced the outcome before this, so defer - * tainting until now */ - RX_MATCH_TAINTED_on(reginfo->prog); - /* Use isFOO_lc() for characters within Latin1. (Note that * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else * wouldn't be invariant) */ @@ -4289,7 +4664,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), - (U8) TWO_BYTE_UTF8_TO_UNI(nextchr, + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)))))) { sayNO; @@ -4370,7 +4745,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr, + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)), FLAGS(scan))))) { @@ -4388,8 +4763,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8", - swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } if (! (to_complement ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], @@ -4651,11 +5027,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 *fold_array; UV utf8_fold_flags; - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; type = REFFL; - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + utf8_fold_flags = FOLDEQ_LOCALE; goto do_nref; case NREFFA: /* /\g{name}/iaa */ @@ -4696,10 +5071,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto do_nref_ref_common; case REFFL: /* /\1/il */ - RX_MATCH_TAINTED_on(reginfo->prog); folder = foldEQ_locale; fold_array = PL_fold_locale; - utf8_fold_flags = FOLDEQ_UTF8_LOCALE; + utf8_fold_flags = FOLDEQ_LOCALE; goto do_ref; case REFFA: /* /\1/iaa */ @@ -4739,8 +5113,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s = reginfo->strbeg + ln; if (type != REF /* REF can do byte comparison */ - && (utf8_target || type == REFFU)) - { /* XXX handle REFFL better */ + && (utf8_target || type == REFFU || type == REFFL)) + { char * limit = reginfo->strend; /* This call case insensitively compares the entire buffer @@ -4814,7 +5188,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) startpoint = rei->program+1; ST.close_paren = 0; } + + /* Save all the positions seen so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + + /* and then jump to the code we share with EVAL */ goto eval_recurse_doit; + assert(0); /* NOTREACHED */ case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ @@ -4926,8 +5307,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) rex->offs[0].end = locinput - reginfo->strbeg; if (reginfo->info_aux_eval->pos_magic) - reginfo->info_aux_eval->pos_magic->mg_len - = locinput - reginfo->strbeg; + MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, + reginfo->sv, reginfo->strbeg, + locinput - reginfo->strbeg); if (sv_yes_mark) { SV *sv_mrk = get_sv("REGMARK", 1); @@ -4965,20 +5347,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) else { /* /(??{}) */ /* if its overloaded, let the regex compiler handle * it; otherwise extract regex, or stringify */ + if (SvGMAGICAL(ret)) + ret = sv_mortalcopy(ret); if (!SvAMAGIC(ret)) { SV *sv = ret; if (SvROK(sv)) sv = SvRV(sv); if (SvTYPE(sv) == SVt_REGEXP) re_sv = (REGEXP*) sv; - else if (SvSMAGICAL(sv)) { - MAGIC *mg = mg_find(sv, PERL_MAGIC_qr); + else if (SvSMAGICAL(ret)) { + MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); if (mg) re_sv = (REGEXP *) mg->mg_obj; } - /* force any magic, undef warnings here */ - if (!re_sv) { + /* force any undef warnings here */ + if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { ret = sv_mortalcopy(ret); (void) SvPV_force_nolen(ret); } @@ -4993,6 +5377,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) PL_op = oop; PL_curcop = ocurcop; S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); + PL_curpm = PL_reg_curpm; if (logical != 2) break; @@ -5031,17 +5416,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) pm_flags); if (!(SvFLAGS(ret) - & (SVs_TEMP | SVs_PADTMP | SVf_READONLY - | SVs_GMG))) { + & (SVs_TEMP | SVs_GMG | SVf_ROK)) + && (!SvPADTMP(ret) || SvREADONLY(ret))) { /* This isn't a first class regexp. Instead, it's caching a regexp onto an existing, Perl visible scalar. */ sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); } - /* safe to do now that any $1 etc has been - * interpolated into the new pattern string and - * compiled */ - S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); } SAVEFREESV(re_sv); re = ReANY(re_sv); @@ -5051,6 +5432,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) re->sublen = rex->sublen; re->suboffset = rex->suboffset; re->subcoffset = rex->subcoffset; + re->lastparen = 0; + re->lastcloseparen = 0; rei = RXi_GET(re); DEBUG_EXECUTE_r( debug_start_match(re_sv, utf8_target, locinput, @@ -5058,18 +5441,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) ); startpoint = rei->program + 1; ST.close_paren = 0; /* only used for GOSUB */ - - eval_recurse_doit: /* Share code with GOSUB below this line */ - /* run the pattern returned from (??{...}) */ - - /* Save *all* the positions. */ + /* Save all the seen positions so far. */ ST.cp = regcppush(rex, 0, maxopenparen); REGCP_SET(ST.lastcp); - - re->lastparen = 0; - re->lastcloseparen = 0; - + /* and set maxopenparen to 0, since we are starting a "fresh" match */ maxopenparen = 0; + /* run the pattern returned from (??{...}) */ + + eval_recurse_doit: /* Share code with GOSUB below this line + * At this point we expect the stack context to be + * set up correctly */ /* invalidate the S-L poscache. We're now executing a * different set of WHILEM ops (and their associated @@ -5081,6 +5462,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * pattern again */ reginfo->poscache_maxiter = 0; + /* the new regexp might have a different is_utf8_pat than we do */ is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); ST.prev_rex = rex_sv; @@ -5105,7 +5487,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SET_reg_curpm(rex_sv); rex = ReANY(rex_sv); rexi = RXi_GET(rex); - regcpblow(ST.cp); + { + /* preserve $^R across LEAVE's. See Bug 121070. */ + SV *save_sv= GvSV(PL_replgv); + SvREFCNT_inc(save_sv); + regcpblow(ST.cp); /* LEAVE in disguise */ + sv_setsv(GvSV(PL_replgv), save_sv); + SvREFCNT_dec(save_sv); + } cur_eval = ST.prev_eval; cur_curlyx = ST.prev_curlyx; @@ -5455,10 +5844,10 @@ NULL if (reginfo->poscache_iter-- == 0) { /* initialise cache */ - const I32 size = (reginfo->poscache_maxiter + 7)/8; + const SSize_t size = (reginfo->poscache_maxiter + 7)/8; regmatch_info_aux *const aux = reginfo->info_aux; if (aux->poscache) { - if ((I32)reginfo->poscache_size < size) { + if ((SSize_t)reginfo->poscache_size < size) { Renew(aux->poscache, size, char); reginfo->poscache_size = size; } @@ -5476,7 +5865,7 @@ NULL if (reginfo->poscache_iter < 0) { /* have we already failed at this position? */ - I32 offset, mask; + SSize_t offset, mask; reginfo->poscache_iter = -1; /* stop eventual underflow */ offset = (scan->flags & 0xf) - 1 @@ -5537,7 +5926,7 @@ NULL assert(0); /* NOTREACHED */ case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ - /* FALL THROUGH */ + /* FALLTHROUGH */ case WHILEM_A_pre_fail: /* just failed to match even minimal A */ REGCP_UNWIND(ST.lastcp); regcppop(rex, &maxopenparen); @@ -5614,7 +6003,7 @@ NULL if (next == scan) next = NULL; scan = NEXTOPER(scan); - /* FALL THROUGH */ + /* FALLTHROUGH */ case BRANCH: /* /(...|A|...)/ */ scan = NEXTOPER(scan); /* scan now points to inner node */ @@ -5801,7 +6190,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n", + "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", (int)(REPORT_CODE_OFF+(depth*2)),"", valid_utf8_to_uvchr((U8 *) locinput, NULL), valid_utf8_to_uvchr(ST.c1_utf8, NULL), @@ -5815,7 +6204,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n", + "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", (int)(REPORT_CODE_OFF+(depth*2)),"", (int) nextchr, ST.c1, ST.c2) ); @@ -6169,7 +6558,7 @@ NULL assert(0); /* NOTREACHED */ } } - /* FALL THROUGH */ + /* FALLTHROUGH */ case CURLY_B_max_fail: /* failed to find B in a greedy match */ @@ -6290,7 +6679,7 @@ NULL case IFMATCH_A_fail: /* body of (?...A) failed */ ST.wanted = !ST.wanted; - /* FALL THROUGH */ + /* FALLTHROUGH */ case IFMATCH_A: /* body of (?...A) succeeded */ if (ST.logical) { @@ -6454,7 +6843,7 @@ NULL /* push a state that backtracks on success */ st->u.yes.prev_yes_state = yes_state; yes_state = st; - /* FALL THROUGH */ + /* FALLTHROUGH */ push_state: /* push a new regex state, then continue at scan */ { @@ -6554,6 +6943,10 @@ yes: * When popping the save stack, all these locals would be undone; * bypass this by setting the outermost saved $^R to the latest * value */ + /* I dont know if this is needed or works properly now. + * see code related to PL_replgv elsewhere in this file. + * Yves + */ if (oreplsv != GvSV(PL_replgv)) sv_setsv(oreplsv, GvSV(PL_replgv)); } @@ -6609,6 +7002,8 @@ no_silent: sv_commit = &PL_sv_yes; sv_yes_mark = &PL_sv_no; } + assert(sv_err); + assert(sv_mrk); sv_setsv(sv_err, sv_commit); sv_setsv(sv_mrk, sv_yes_mark); } @@ -6756,7 +7151,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple loop */ - c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); while (scan < loceol && UCHARAT(scan) == c) { scan++; } @@ -6783,21 +7178,23 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + /* FALLTHROUGH */ case EXACTFA: utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; case EXACTFL: - RXp_MATCH_TAINTED_on(prog); - utf8_flags = FOLDEQ_UTF8_LOCALE; + utf8_flags = FOLDEQ_LOCALE; goto do_exactf; - case EXACTF: - utf8_flags = 0; - goto do_exactf; + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + utf8_flags = 0; + goto do_exactf; case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -6861,11 +7258,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, break; } case ANYOF: - case ANYOF_WARN_SUPER: if (utf8_target) { while (hardcount < max && scan < loceol - && reginclass(prog, p, (U8*)scan, utf8_target)) + && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; @@ -6883,7 +7279,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* FALLTHROUGH */ case POSIXL: - RXp_MATCH_TAINTED_on(prog); if (! utf8_target) { while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan))) @@ -6925,7 +7320,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, to_complement = 1; goto utf8_posix; } - /* FALL THROUGH */ + /* FALLTHROUGH */ case NPOSIXA: if (! utf8_target) { @@ -6936,10 +7331,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, else { /* The complement of something that matches only ASCII matches all - * UTF-8 variant code points, plus everything in ASCII that isn't - * in the class. */ + * non-ASCII, plus everything in ASCII that isn't in the class. */ while (hardcount < max && scan < loceol - && (! UTF8_IS_INVARIANT(*scan) + && (! isASCII_utf8(scan) || ! _generic_isCC_A((U8) *scan, FLAGS(p)))) { scan += UTF8SKIP(scan); @@ -6983,8 +7377,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan, - *(scan + 1)), + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), classnum)))) { break; @@ -7007,7 +7401,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, case _CC_ENUM_SPACE: /* XXX would require separate code if we revert the change of \v matching this */ - /* FALL THROUGH */ + /* FALLTHROUGH */ case _CC_ENUM_PSXSPC: while (hardcount < max && scan < loceol @@ -7066,8 +7460,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, if (! PL_utf8_swash_ptrs[classnum]) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; PL_utf8_swash_ptrs[classnum] = _core_swash_init( - "utf8", swash_property_names[classnum], - &PL_sv_undef, 1, 0, NULL, &flags); + "utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); } while (hardcount < max && scan < loceol @@ -7135,7 +7531,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, GET_RE_DEBUG_FLAGS_DECL; DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, p); + regprop(prog, prop, p, reginfo); PerlIO_printf(Perl_debug_log, "%*s %s can match %"IVdf" times out of %"IVdf"...\n", REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); @@ -7161,31 +7557,39 @@ Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, *altsvp = NULL; } - return newSVsv(core_regclass_swash(prog, node, doinit, listsvp)); + return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL)); } -#endif -STATIC SV * -S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp) +SV * +Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, + const regnode* node, + bool doinit, + SV** listsvp, + SV** only_utf8_locale_ptr) { - /* Returns the swash for the input 'node' in the regex 'prog'. - * If is true, will attempt to create the swash if not already + /* For internal core use only. + * Returns the swash for the input 'node' in the regex 'prog'. + * If is 'true', will attempt to create the swash if not already * done. - * If is non-null, will return the swash initialization string in - * it. + * If is non-null, will return the printable contents of the + * swash. This can be used to get debugging information even before the + * swash exists, by calling this function with 'doinit' set to false, in + * which case the components that will be used to eventually create the + * swash are returned (in a printable form). * Tied intimately to how regcomp.c sets up the data structure */ dVAR; SV *sw = NULL; - SV *si = NULL; + SV *si = NULL; /* Input swash initialization string */ SV* invlist = NULL; RXi_GET_DECL(prog,progi); const struct reg_data * const data = prog ? progi->data : NULL; - PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH; + PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; - assert(ANYOF_NONBITMAP(node)); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); if (data && data->count) { const U32 n = ARG(node); @@ -7198,26 +7602,40 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit si = *ary; /* ary[0] = the string to initialize the swash with */ - /* Elements 2 and 3 are either both present or both absent. [2] is - * any inversion list generated at compile time; [3] indicates if + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if * that inversion list has any user-defined properties in it. */ - if (av_len(av) >= 2) { - invlist = ary[2]; - if (SvUV(ary[3])) { - swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + if (av_tindex(av) >= 2) { + if (only_utf8_locale_ptr + && ary[2] + && ary[2] != &PL_sv_undef) + { + *only_utf8_locale_ptr = ary[2]; + } + else { + assert(only_utf8_locale_ptr); + *only_utf8_locale_ptr = NULL; + } + + if (av_tindex(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } + } + else { + invlist = NULL; } - } - else { - invlist = NULL; } /* Element [1] is reserved for the set-up swash. If already there, * return it; if not, create it and store it there */ - if (SvROK(ary[1])) { + if (ary[1] && SvROK(ary[1])) { sw = ary[1]; } - else if (si && doinit) { - + else if (doinit && ((si && si != &PL_sv_undef) + || (invlist && invlist != &PL_sv_undef))) { + assert(si); sw = _core_swash_init("utf8", /* the utf8 package */ "", /* nameless */ si, @@ -7230,16 +7648,18 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit } } + /* If requested, return a printable version of what this swash matches */ if (listsvp) { - SV* matches_string = newSVpvn("", 0); + SV* matches_string = newSVpvs(""); - /* Use the swash, if any, which has to have incorporated into it all - * possibilities */ + /* The swash should be used, if possible, to get the data, as it + * contains the resolved data. But this function can be called at + * compile-time, before everything gets resolved, in which case we + * return the currently best available information, which is the string + * that will eventually be used to do that resolving, 'si' */ if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) && (si && si != &PL_sv_undef)) { - - /* If no swash, use the input initialization string, if available */ sv_catsv(matches_string, si); } @@ -7253,12 +7673,14 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit return sw; } +#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ /* - reginclass - determine if a character falls into a character class n is the ANYOF regnode p is the target string + p_end points to one byte beyond the end of the target string utf8_target tells whether p is in UTF-8. Returns true if matched; false otherwise. @@ -7270,7 +7692,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit */ STATIC bool -S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target) +S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) { dVAR; const char flags = ANYOF_FLAGS(n); @@ -7283,7 +7705,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_IS_INVARIANT() works even if not in UTF-8 */ if (! UTF8_IS_INVARIANT(c) && utf8_target) { STRLEN c_len = 0; - c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len, + c = utf8n_to_uvchr(p, p_end - p, &c_len, (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for @@ -7296,21 +7718,19 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const if (c < 256) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; - else if (flags & ANYOF_NON_UTF8_LATIN1_ALL + else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL && ! utf8_target && ! isASCII(c)) { match = TRUE; } - else if (flags & ANYOF_LOCALE) { - RXp_MATCH_TAINTED_on(prog); - - if ((flags & ANYOF_LOC_FOLD) - && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) - { - match = TRUE; + else if (flags & ANYOF_LOCALE_FLAGS) { + if (flags & ANYOF_LOC_FOLD) { + if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { + match = TRUE; + } } - else if (ANYOF_CLASS_TEST_ANY_SET(n)) { + if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { /* The data structure is arranged so bits 0, 2, 4, ... are set * if the class includes the Posix character class given by @@ -7344,8 +7764,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const int count = 0; int to_complement = 0; + while (count < ANYOF_MAX) { - if (ANYOF_CLASS_TEST(n, count) + if (ANYOF_POSIXL_TEST(n, count) && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) { match = TRUE; @@ -7358,60 +7779,63 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } } + /* If the bitmap didn't (or couldn't) match, and something outside the - * bitmap could match, try that. Locale nodes specify completely the - * behavior of code points in the bit map (otherwise, a utf8 target would - * cause them to be treated as Unicode and not locale), except in - * the very unlikely event when this node is a synthetic start class, which - * could be a combination of locale and non-locale nodes. So allow locale - * to match for the synthetic start class, which will give a false - * positive that will be resolved when the match is done again as not part - * of the synthetic start class */ + * bitmap could match, try that. */ if (!match) { - if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { + if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { match = TRUE; /* Everything above 255 matches */ } - else if (ANYOF_NONBITMAP(n) - && ((flags & ANYOF_NONBITMAP_NON_UTF8) - || (utf8_target - && (c >=256 - || (! (flags & ANYOF_LOCALE)) - || OP(n) == ANYOF_SYNTHETIC)))) + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && (flags & ANYOF_UTF8)) + || ((flags & ANYOF_LOC_FOLD) + && IN_UTF8_CTYPE_LOCALE + && ARG(n) != ANYOF_NONBITMAP_EMPTY)) { - SV * const sw = core_regclass_swash(prog, n, TRUE, 0); + SV* only_utf8_locale = NULL; + SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale); if (sw) { + U8 utf8_buffer[2]; U8 * utf8_p; if (utf8_target) { utf8_p = (U8 *) p; } else { /* Convert to utf8 */ - STRLEN len = 1; - utf8_p = bytes_to_utf8(p, &len); + utf8_p = utf8_buffer; + append_utf8_from_native_byte(*p, &utf8_p); + utf8_p = utf8_buffer; } if (swash_fetch(sw, utf8_p, TRUE)) { match = TRUE; } - - /* If we allocated a string above, free it */ - if (! utf8_target) Safefree(utf8_p); + } + if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { + match = _invlist_contains_cp(only_utf8_locale, c); } } if (UNICODE_IS_SUPER(c) - && OP(n) == ANYOF_WARN_SUPER + && (flags & ANYOF_WARN_SUPER) && ckWARN_d(WARN_NON_UNICODE)) { Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), - "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c); + "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c); } } +#if ANYOF_INVERT != 1 + /* Depending on compiler optimization cBOOL takes time, so if don't have to + * use it, don't */ +# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, +#endif + /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ - return cBOOL(flags & ANYOF_INVERT) ^ match; + return (flags & ANYOF_INVERT) ^ match; } STATIC U8 * -S_reghop3(U8 *s, I32 off, const U8* lim) +S_reghop3(U8 *s, SSize_t off, const U8* lim) { /* return the position 'off' UTF-8 characters away from 's', forward if * 'off' >= 0, backwards if negative. But don't go outside of position @@ -7440,13 +7864,8 @@ S_reghop3(U8 *s, I32 off, const U8* lim) return s; } -#ifdef XXX_dmq -/* there are a bunch of places where we use two reghop3's that should - be replaced with this routine. but since thats not done yet - we ifdef it out - dmq -*/ STATIC U8 * -S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) +S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) { dVAR; @@ -7470,10 +7889,12 @@ S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) } return s; } -#endif + +/* like reghop3, but returns NULL on overrun, rather than returning last + * char pos */ STATIC U8 * -S_reghopmaybe3(U8* s, I32 off, const U8* lim) +S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) { dVAR; @@ -7534,19 +7955,14 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) DEFSV_set(reginfo->sv); } - if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv) - && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) { + if (!(mg = mg_find_mglob(reginfo->sv))) { /* prepare for quick setting of pos */ -#ifdef PERL_OLD_COPY_ON_WRITE - if (SvIsCOW(reginfo->sv)) - sv_force_normal_flags(reginfo->sv, 0); -#endif - mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global, - &PL_vtbl_mglob, NULL, 0); + mg = sv_magicext_mglob(reginfo->sv); mg->mg_len = -1; } eval_state->pos_magic = mg; eval_state->pos = mg->mg_len; + eval_state->pos_flags = mg->mg_flags; } else eval_state->pos_magic = NULL; @@ -7563,7 +7979,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) /* this regexp is also owned by the new PL_reg_curpm, which will try to free it. */ av_push(PL_regex_padav, repointer); - PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav); + PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); PL_regex_pad = AvARRAY(PL_regex_padav); } #endif @@ -7621,7 +8037,12 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg) RXp_MATCH_COPIED_on(rex); } if (eval_state->pos_magic) + { eval_state->pos_magic->mg_len = eval_state->pos; + eval_state->pos_magic->mg_flags = + (eval_state->pos_magic->mg_flags & ~MGf_BYTES) + | (eval_state->pos_flags & MGf_BYTES); + } PL_curpm = eval_state->curpm; } diff --git a/src/5021002/dquote_static.c b/src/5021002/dquote_static.c new file mode 100644 index 0000000..dd47f14 --- /dev/null +++ b/src/5021002/dquote_static.c @@ -0,0 +1,328 @@ +/* dquote_static.c + * + * This file contains static functions that are related to + * parsing double-quotish expressions, but are used in more than + * one file. + * + * It is currently #included by regcomp.c and toke.c. +*/ + +#define PERL_IN_DQUOTE_STATIC_C +#include "embed.h" + +/* + - regcurly - a little FSA that accepts {\d+,?\d*} + Pulled from regcomp.c. + */ +PERL_STATIC_INLINE I32 +S_regcurly(const char *s) +{ + PERL_ARGS_ASSERT_REGCURLY; + + if (*s++ != '{') + return FALSE; + if (!isDIGIT(*s)) + return FALSE; + while (isDIGIT(*s)) + s++; + if (*s == ',') { + s++; + while (isDIGIT(*s)) + s++; + } + + return *s == '}'; +} + +/* XXX Add documentation after final interface and behavior is decided */ +/* May want to show context for error, so would pass Perl_bslash_c(pTHX_ const char* current, const char* start, const bool output_warning) + U8 source = *current; +*/ + +STATIC char +S_grok_bslash_c(pTHX_ const char source, const bool output_warning) +{ + + U8 result; + + if (! isPRINT_A(source)) { + Perl_croak(aTHX_ "%s", + "Character following \"\\c\" must be printable ASCII"); + } + else if (source == '{') { + assert(isPRINT_A(toCTRL('{'))); + + /* diag_listed_as: Use "%s" instead of "%s" */ + Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", toCTRL('{')); + } + + result = toCTRL(source); + if (output_warning && isPRINT_A(result)) { + U8 clearer[3]; + U8 i = 0; + if (! isWORDCHAR(result)) { + clearer[i++] = '\\'; + } + clearer[i++] = result; + clearer[i++] = '\0'; + + Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), + "\"\\c%c\" is more clearly written simply as \"%s\"", + source, + clearer); + } + + return result; +} + +STATIC bool +S_grok_bslash_o(pTHX_ char **s, UV *uv, const char** error_msg, + const bool output_warning, const bool strict, + const bool silence_non_portable, + const bool UTF) +{ + +/* Documentation to be supplied when interface nailed down finally + * This returns FALSE if there is an error which the caller need not recover + * from; , otherwise TRUE. In either case the caller should look at *len + * On input: + * s is the address of a pointer to a NULL terminated string that begins + * with 'o', and the previous character was a backslash. At exit, *s + * will be advanced to the byte just after those absorbed by this + * function. Hence the caller can continue parsing from there. In + * the case of an error, this routine has generally positioned *s to + * point just to the right of the first bad spot, so that a message + * that has a "<--" to mark the spot will be correctly positioned. + * uv points to a UV that will hold the output value, valid only if the + * return from the function is TRUE + * error_msg is a pointer that will be set to an internal buffer giving an + * error message upon failure (the return is FALSE). Untouched if + * function succeeds + * output_warning says whether to output any warning messages, or suppress + * them + * strict is true if this should fail instead of warn if there are + * non-octal digits within the braces + * silence_non_portable is true if to suppress warnings about the code + * point returned being too large to fit on all platforms. + * UTF is true iff the string *s is encoded in UTF-8. + */ + char* e; + STRLEN numbers_len; + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + /* XXX Until the message is improved in grok_oct, handle errors + * ourselves */ + | PERL_SCAN_SILENT_ILLDIGIT; + + PERL_ARGS_ASSERT_GROK_BSLASH_O; + + + assert(**s == 'o'); + (*s)++; + + if (**s != '{') { + *error_msg = "Missing braces on \\o{}"; + return FALSE; + } + + e = strchr(*s, '}'); + if (!e) { + (*s)++; /* Move past the '{' */ + while (isOCTAL(**s)) { /* Position beyond the legal digits */ + (*s)++; + } + *error_msg = "Missing right brace on \\o{"; + return FALSE; + } + + (*s)++; /* Point to expected first digit (could be first byte of utf8 + sequence if not a digit) */ + numbers_len = e - *s; + if (numbers_len == 0) { + (*s)++; /* Move past the } */ + *error_msg = "Number with no digits"; + return FALSE; + } + + if (silence_non_portable) { + flags |= PERL_SCAN_SILENT_NON_PORTABLE; + } + + *uv = grok_oct(*s, &numbers_len, &flags, NULL); + /* Note that if has non-octal, will ignore everything starting with that up + * to the '}' */ + + if (numbers_len != (STRLEN) (e - *s)) { + if (strict) { + *s += numbers_len; + *s += (UTF) ? UTF8SKIP(*s) : (STRLEN) 1; + *error_msg = "Non-octal character"; + return FALSE; + } + else if (output_warning) { + Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT), + /* diag_listed_as: Non-octal character '%c'. Resolved as "%s" */ + "Non-octal character '%c'. Resolved as \"\\o{%.*s}\"", + *(*s + numbers_len), + (int) numbers_len, + *s); + } + } + + /* Return past the '}' */ + *s = e + 1; + + return TRUE; +} + +PERL_STATIC_INLINE bool +S_grok_bslash_x(pTHX_ char **s, UV *uv, const char** error_msg, + const bool output_warning, const bool strict, + const bool silence_non_portable, + const bool UTF) +{ + +/* Documentation to be supplied when interface nailed down finally + * This returns FALSE if there is an error which the caller need not recover + * from; , otherwise TRUE. In either case the caller should look at *len + * On input: + * s is the address of a pointer to a NULL terminated string that begins + * with 'x', and the previous character was a backslash. At exit, *s + * will be advanced to the byte just after those absorbed by this + * function. Hence the caller can continue parsing from there. In + * the case of an error, this routine has generally positioned *s to + * point just to the right of the first bad spot, so that a message + * that has a "<--" to mark the spot will be correctly positioned. + * uv points to a UV that will hold the output value, valid only if the + * return from the function is TRUE + * error_msg is a pointer that will be set to an internal buffer giving an + * error message upon failure (the return is FALSE). Untouched if + * function succeeds + * output_warning says whether to output any warning messages, or suppress + * them + * strict is true if anything out of the ordinary should cause this to + * fail instead of warn or be silent. For example, it requires + * exactly 2 digits following the \x (when there are no braces). + * 3 digits could be a mistake, so is forbidden in this mode. + * silence_non_portable is true if to suppress warnings about the code + * point returned being too large to fit on all platforms. + * UTF is true iff the string *s is encoded in UTF-8. + */ + char* e; + STRLEN numbers_len; + I32 flags = PERL_SCAN_DISALLOW_PREFIX; + + PERL_ARGS_ASSERT_GROK_BSLASH_X; + + PERL_UNUSED_ARG(output_warning); + + assert(**s == 'x'); + (*s)++; + + if (strict) { + flags |= PERL_SCAN_SILENT_ILLDIGIT; + } + + if (**s != '{') { + STRLEN len = (strict) ? 3 : 2; + + *uv = grok_hex(*s, &len, &flags, NULL); + *s += len; + if (strict && len != 2) { + if (len < 2) { + *s += (UTF) ? UTF8SKIP(*s) : 1; + *error_msg = "Non-hex character"; + } + else { + *error_msg = "Use \\x{...} for more than two hex characters"; + } + return FALSE; + } + return TRUE; + } + + e = strchr(*s, '}'); + if (!e) { + (*s)++; /* Move past the '{' */ + while (isXDIGIT(**s)) { /* Position beyond the legal digits */ + (*s)++; + } + /* XXX The corresponding message above for \o is just '\\o{'; other + * messages for other constructs include the '}', so are inconsistent. + */ + *error_msg = "Missing right brace on \\x{}"; + return FALSE; + } + + (*s)++; /* Point to expected first digit (could be first byte of utf8 + sequence if not a digit) */ + numbers_len = e - *s; + if (numbers_len == 0) { + if (strict) { + (*s)++; /* Move past the } */ + *error_msg = "Number with no digits"; + return FALSE; + } + return TRUE; + } + + flags |= PERL_SCAN_ALLOW_UNDERSCORES; + if (silence_non_portable) { + flags |= PERL_SCAN_SILENT_NON_PORTABLE; + } + + *uv = grok_hex(*s, &numbers_len, &flags, NULL); + /* Note that if has non-hex, will ignore everything starting with that up + * to the '}' */ + + if (strict && numbers_len != (STRLEN) (e - *s)) { + *s += numbers_len; + *s += (UTF) ? UTF8SKIP(*s) : 1; + *error_msg = "Non-hex character"; + return FALSE; + } + + /* Return past the '}' */ + *s = e + 1; + + return TRUE; +} + +STATIC char* +S_form_short_octal_warning(pTHX_ + const char * const s, /* Points to first non-octal */ + const STRLEN len /* Length of octals string, so + (s-len) points to first + octal */ +) { + /* Return a character string consisting of a warning message for when a + * string constant in octal is weird, like "\078". */ + + const char * sans_leading_zeros = s - len; + + PERL_ARGS_ASSERT_FORM_SHORT_OCTAL_WARNING; + + assert(*s == '8' || *s == '9'); + + /* Remove the leading zeros, retaining one zero so won't be zero length */ + while (*sans_leading_zeros == '0') sans_leading_zeros++; + if (sans_leading_zeros == s) { + sans_leading_zeros--; + } + + return Perl_form(aTHX_ + "'%.*s' resolved to '\\o{%.*s}%c'", + (int) (len + 2), s - len - 1, + (int) (s - sans_leading_zeros), sans_leading_zeros, + *s); +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * ex: set ts=8 sts=4 sw=4 et: + */ diff --git a/src/5021002/inline_invlist.c b/src/5021002/inline_invlist.c new file mode 100644 index 0000000..6cdeff4 --- /dev/null +++ b/src/5021002/inline_invlist.c @@ -0,0 +1,66 @@ +/* inline_invlist.c + * + * Copyright (C) 2012 by Larry Wall and others + * + * You may distribute under the terms of either the GNU General Public + * License or the Artistic License, as specified in the README file. + */ + +#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) + +/* An element is in an inversion list iff its index is even numbered: 0, 2, 4, + * etc */ +#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) +#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) + +/* This converts to/from our UVs to what the SV code is expecting: bytes. */ +#define TO_INTERNAL_SIZE(x) ((x) * sizeof(UV)) +#define FROM_INTERNAL_SIZE(x) ((x)/ sizeof(UV)) + +PERL_STATIC_INLINE bool* +S_get_invlist_offset_addr(SV* invlist) +{ + /* Return the address of the field that says whether the inversion list is + * offset (it contains 1) or not (contains 0) */ + PERL_ARGS_ASSERT_GET_INVLIST_OFFSET_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->is_offset); +} + +PERL_STATIC_INLINE UV +S__invlist_len(SV* const invlist) +{ + /* Returns the current number of elements stored in the inversion list's + * array */ + + PERL_ARGS_ASSERT__INVLIST_LEN; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return (SvCUR(invlist) == 0) + ? 0 + : FROM_INTERNAL_SIZE(SvCUR(invlist)) - *get_invlist_offset_addr(invlist); +} + +PERL_STATIC_INLINE bool +S__invlist_contains_cp(SV* const invlist, const UV cp) +{ + /* Does contain code point as part of the set? */ + + IV index = _invlist_search(invlist, cp); + + PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP; + + return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index); +} + +# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGEXEC_C) + +/* These symbols are only needed later in regcomp.c */ +# undef TO_INTERNAL_SIZE +# undef FROM_INTERNAL_SIZE +# endif + +#endif diff --git a/src/5021002/orig/dquote_static.c b/src/5021002/orig/dquote_static.c new file mode 100644 index 0000000..802d83b --- /dev/null +++ b/src/5021002/orig/dquote_static.c @@ -0,0 +1,328 @@ +/* dquote_static.c + * + * This file contains static functions that are related to + * parsing double-quotish expressions, but are used in more than + * one file. + * + * It is currently #included by regcomp.c and toke.c. +*/ + +#define PERL_IN_DQUOTE_STATIC_C +#include "embed.h" + +/* + - regcurly - a little FSA that accepts {\d+,?\d*} + Pulled from regcomp.c. + */ +PERL_STATIC_INLINE I32 +S_regcurly(const char *s) +{ + PERL_ARGS_ASSERT_REGCURLY; + + if (*s++ != '{') + return FALSE; + if (!isDIGIT(*s)) + return FALSE; + while (isDIGIT(*s)) + s++; + if (*s == ',') { + s++; + while (isDIGIT(*s)) + s++; + } + + return *s == '}'; +} + +/* XXX Add documentation after final interface and behavior is decided */ +/* May want to show context for error, so would pass Perl_bslash_c(pTHX_ const char* current, const char* start, const bool output_warning) + U8 source = *current; +*/ + +STATIC char +S_grok_bslash_c(pTHX_ const char source, const bool output_warning) +{ + + U8 result; + + if (! isPRINT_A(source)) { + Perl_croak(aTHX_ "%s", + "Character following \"\\c\" must be printable ASCII"); + } + else if (source == '{') { + assert(isPRINT_A(toCTRL('{'))); + + /* diag_listed_as: Use "%s" instead of "%s" */ + Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", toCTRL('{')); + } + + result = toCTRL(source); + if (output_warning && isPRINT_A(result)) { + U8 clearer[3]; + U8 i = 0; + if (! isWORDCHAR(result)) { + clearer[i++] = '\\'; + } + clearer[i++] = result; + clearer[i++] = '\0'; + + Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), + "\"\\c%c\" is more clearly written simply as \"%s\"", + source, + clearer); + } + + return result; +} + +STATIC bool +S_grok_bslash_o(pTHX_ char **s, UV *uv, const char** error_msg, + const bool output_warning, const bool strict, + const bool silence_non_portable, + const bool UTF) +{ + +/* Documentation to be supplied when interface nailed down finally + * This returns FALSE if there is an error which the caller need not recover + * from; , otherwise TRUE. In either case the caller should look at *len + * On input: + * s is the address of a pointer to a NULL terminated string that begins + * with 'o', and the previous character was a backslash. At exit, *s + * will be advanced to the byte just after those absorbed by this + * function. Hence the caller can continue parsing from there. In + * the case of an error, this routine has generally positioned *s to + * point just to the right of the first bad spot, so that a message + * that has a "<--" to mark the spot will be correctly positioned. + * uv points to a UV that will hold the output value, valid only if the + * return from the function is TRUE + * error_msg is a pointer that will be set to an internal buffer giving an + * error message upon failure (the return is FALSE). Untouched if + * function succeeds + * output_warning says whether to output any warning messages, or suppress + * them + * strict is true if this should fail instead of warn if there are + * non-octal digits within the braces + * silence_non_portable is true if to suppress warnings about the code + * point returned being too large to fit on all platforms. + * UTF is true iff the string *s is encoded in UTF-8. + */ + char* e; + STRLEN numbers_len; + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + /* XXX Until the message is improved in grok_oct, handle errors + * ourselves */ + | PERL_SCAN_SILENT_ILLDIGIT; + + PERL_ARGS_ASSERT_GROK_BSLASH_O; + + + assert(**s == 'o'); + (*s)++; + + if (**s != '{') { + *error_msg = "Missing braces on \\o{}"; + return FALSE; + } + + e = strchr(*s, '}'); + if (!e) { + (*s)++; /* Move past the '{' */ + while (isOCTAL(**s)) { /* Position beyond the legal digits */ + (*s)++; + } + *error_msg = "Missing right brace on \\o{"; + return FALSE; + } + + (*s)++; /* Point to expected first digit (could be first byte of utf8 + sequence if not a digit) */ + numbers_len = e - *s; + if (numbers_len == 0) { + (*s)++; /* Move past the } */ + *error_msg = "Number with no digits"; + return FALSE; + } + + if (silence_non_portable) { + flags |= PERL_SCAN_SILENT_NON_PORTABLE; + } + + *uv = grok_oct(*s, &numbers_len, &flags, NULL); + /* Note that if has non-octal, will ignore everything starting with that up + * to the '}' */ + + if (numbers_len != (STRLEN) (e - *s)) { + if (strict) { + *s += numbers_len; + *s += (UTF) ? UTF8SKIP(*s) : (STRLEN) 1; + *error_msg = "Non-octal character"; + return FALSE; + } + else if (output_warning) { + Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT), + /* diag_listed_as: Non-octal character '%c'. Resolved as "%s" */ + "Non-octal character '%c'. Resolved as \"\\o{%.*s}\"", + *(*s + numbers_len), + (int) numbers_len, + *s); + } + } + + /* Return past the '}' */ + *s = e + 1; + + return TRUE; +} + +PERL_STATIC_INLINE bool +S_grok_bslash_x(pTHX_ char **s, UV *uv, const char** error_msg, + const bool output_warning, const bool strict, + const bool silence_non_portable, + const bool UTF) +{ + +/* Documentation to be supplied when interface nailed down finally + * This returns FALSE if there is an error which the caller need not recover + * from; , otherwise TRUE. In either case the caller should look at *len + * On input: + * s is the address of a pointer to a NULL terminated string that begins + * with 'x', and the previous character was a backslash. At exit, *s + * will be advanced to the byte just after those absorbed by this + * function. Hence the caller can continue parsing from there. In + * the case of an error, this routine has generally positioned *s to + * point just to the right of the first bad spot, so that a message + * that has a "<--" to mark the spot will be correctly positioned. + * uv points to a UV that will hold the output value, valid only if the + * return from the function is TRUE + * error_msg is a pointer that will be set to an internal buffer giving an + * error message upon failure (the return is FALSE). Untouched if + * function succeeds + * output_warning says whether to output any warning messages, or suppress + * them + * strict is true if anything out of the ordinary should cause this to + * fail instead of warn or be silent. For example, it requires + * exactly 2 digits following the \x (when there are no braces). + * 3 digits could be a mistake, so is forbidden in this mode. + * silence_non_portable is true if to suppress warnings about the code + * point returned being too large to fit on all platforms. + * UTF is true iff the string *s is encoded in UTF-8. + */ + char* e; + STRLEN numbers_len; + I32 flags = PERL_SCAN_DISALLOW_PREFIX; + + PERL_ARGS_ASSERT_GROK_BSLASH_X; + + PERL_UNUSED_ARG(output_warning); + + assert(**s == 'x'); + (*s)++; + + if (strict) { + flags |= PERL_SCAN_SILENT_ILLDIGIT; + } + + if (**s != '{') { + STRLEN len = (strict) ? 3 : 2; + + *uv = grok_hex(*s, &len, &flags, NULL); + *s += len; + if (strict && len != 2) { + if (len < 2) { + *s += (UTF) ? UTF8SKIP(*s) : 1; + *error_msg = "Non-hex character"; + } + else { + *error_msg = "Use \\x{...} for more than two hex characters"; + } + return FALSE; + } + return TRUE; + } + + e = strchr(*s, '}'); + if (!e) { + (*s)++; /* Move past the '{' */ + while (isXDIGIT(**s)) { /* Position beyond the legal digits */ + (*s)++; + } + /* XXX The corresponding message above for \o is just '\\o{'; other + * messages for other constructs include the '}', so are inconsistent. + */ + *error_msg = "Missing right brace on \\x{}"; + return FALSE; + } + + (*s)++; /* Point to expected first digit (could be first byte of utf8 + sequence if not a digit) */ + numbers_len = e - *s; + if (numbers_len == 0) { + if (strict) { + (*s)++; /* Move past the } */ + *error_msg = "Number with no digits"; + return FALSE; + } + return TRUE; + } + + flags |= PERL_SCAN_ALLOW_UNDERSCORES; + if (silence_non_portable) { + flags |= PERL_SCAN_SILENT_NON_PORTABLE; + } + + *uv = grok_hex(*s, &numbers_len, &flags, NULL); + /* Note that if has non-hex, will ignore everything starting with that up + * to the '}' */ + + if (strict && numbers_len != (STRLEN) (e - *s)) { + *s += numbers_len; + *s += (UTF) ? UTF8SKIP(*s) : 1; + *error_msg = "Non-hex character"; + return FALSE; + } + + /* Return past the '}' */ + *s = e + 1; + + return TRUE; +} + +STATIC char* +S_form_short_octal_warning(pTHX_ + const char * const s, /* Points to first non-octal */ + const STRLEN len /* Length of octals string, so + (s-len) points to first + octal */ +) { + /* Return a character string consisting of a warning message for when a + * string constant in octal is weird, like "\078". */ + + const char * sans_leading_zeros = s - len; + + PERL_ARGS_ASSERT_FORM_SHORT_OCTAL_WARNING; + + assert(*s == '8' || *s == '9'); + + /* Remove the leading zeros, retaining one zero so won't be zero length */ + while (*sans_leading_zeros == '0') sans_leading_zeros++; + if (sans_leading_zeros == s) { + sans_leading_zeros--; + } + + return Perl_form(aTHX_ + "'%.*s' resolved to '\\o{%.*s}%c'", + (int) (len + 2), s - len - 1, + (int) (s - sans_leading_zeros), sans_leading_zeros, + *s); +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * ex: set ts=8 sts=4 sw=4 et: + */ diff --git a/src/5021002/orig/inline_invlist.c b/src/5021002/orig/inline_invlist.c new file mode 100644 index 0000000..1875a05 --- /dev/null +++ b/src/5021002/orig/inline_invlist.c @@ -0,0 +1,66 @@ +/* inline_invlist.c + * + * Copyright (C) 2012 by Larry Wall and others + * + * You may distribute under the terms of either the GNU General Public + * License or the Artistic License, as specified in the README file. + */ + +#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) + +/* An element is in an inversion list iff its index is even numbered: 0, 2, 4, + * etc */ +#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) +#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) + +/* This converts to/from our UVs to what the SV code is expecting: bytes. */ +#define TO_INTERNAL_SIZE(x) ((x) * sizeof(UV)) +#define FROM_INTERNAL_SIZE(x) ((x)/ sizeof(UV)) + +PERL_STATIC_INLINE bool* +S_get_invlist_offset_addr(SV* invlist) +{ + /* Return the address of the field that says whether the inversion list is + * offset (it contains 1) or not (contains 0) */ + PERL_ARGS_ASSERT_GET_INVLIST_OFFSET_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->is_offset); +} + +PERL_STATIC_INLINE UV +S__invlist_len(SV* const invlist) +{ + /* Returns the current number of elements stored in the inversion list's + * array */ + + PERL_ARGS_ASSERT__INVLIST_LEN; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return (SvCUR(invlist) == 0) + ? 0 + : FROM_INTERNAL_SIZE(SvCUR(invlist)) - *get_invlist_offset_addr(invlist); +} + +PERL_STATIC_INLINE bool +S__invlist_contains_cp(SV* const invlist, const UV cp) +{ + /* Does contain code point as part of the set? */ + + IV index = _invlist_search(invlist, cp); + + PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP; + + return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index); +} + +# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGEXEC_C) + +/* These symbols are only needed later in regcomp.c */ +# undef TO_INTERNAL_SIZE +# undef FROM_INTERNAL_SIZE +# endif + +#endif diff --git a/src/5021002/orig/regcomp.c b/src/5021002/orig/regcomp.c new file mode 100644 index 0000000..3d4d348 --- /dev/null +++ b/src/5021002/orig/regcomp.c @@ -0,0 +1,16880 @@ +/* regcomp.c + */ + +/* + * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee + * + * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] + */ + +/* This file contains functions for compiling a regular expression. See + * also regexec.c which funnily enough, contains functions for executing + * a regular expression. + * + * This file is also copied at build time to ext/re/re_comp.c, where + * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. + * This causes the main functions to be compiled under new names and with + * debugging support added, which makes "use re 'debug'" work. + */ + +/* NOTE: this is derived from Henry Spencer's regexp code, and should not + * confused with the original package (see point 3 below). Thanks, Henry! + */ + +/* Additional note: this code is very heavily munged from Henry's version + * in places. In some spots I've traded clarity for efficiency, so don't + * blame Henry for some of the lack of readability. + */ + +/* The names of the functions have been changed from regcomp and + * regexec to pregcomp and pregexec in order to avoid conflicts + * with the POSIX routines of the same names. +*/ + +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +/* + * pregcomp and pregexec -- regsub and regerror are not used in perl + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. + * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * + * + **** Alterations to Henry's code are... + **** + **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + **** by Larry Wall and others + **** + **** You may distribute under the terms of either the GNU General Public + **** License or the Artistic License, as specified in the README file. + + * + * Beware that some of this code is subtly aware of the way operator + * precedence is structured in regular expressions. Serious changes in + * regular-expression syntax might require a total rethink. + */ +#include "EXTERN.h" +#define PERL_IN_REGCOMP_C +#include "perl.h" + +#ifndef PERL_IN_XSUB_RE +# include "INTERN.h" +#endif + +#define REG_COMP_C +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +EXTERN_C const struct regexp_engine my_reg_engine; +#else +# include "regcomp.h" +#endif + +#include "dquote_static.c" +#include "charclass_invlists.h" +#include "inline_invlist.c" +#include "unicode_constants.h" + +#define HAS_NONLATIN1_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) +#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) + +#ifndef STATIC +#define STATIC static +#endif + + +struct RExC_state_t { + U32 flags; /* RXf_* are we folding, multilining? */ + U32 pm_flags; /* PMf_* stuff from the calling PMOP */ + char *precomp; /* uncompiled string. */ + REGEXP *rx_sv; /* The SV that is the regexp. */ + regexp *rx; /* perl core regexp structure */ + regexp_internal *rxi; /* internal data for regexp object + pprivate field */ + char *start; /* Start of input for compile */ + char *end; /* End of input for compile */ + char *parse; /* Input-scan pointer. */ + SSize_t whilem_seen; /* number of WHILEM in this expr */ + regnode *emit_start; /* Start of emitted-code area */ + regnode *emit_bound; /* First regnode outside of the + allocated space */ + regnode *emit; /* Code-emit pointer; if = &emit_dummy, + implies compiling, so don't emit */ + regnode_ssc emit_dummy; /* placeholder for emit to point to; + large enough for the largest + non-EXACTish node, so can use it as + scratch in pass1 */ + I32 naughty; /* How bad is this pattern? */ + I32 sawback; /* Did we see \1, ...? */ + U32 seen; + SSize_t size; /* Code size. */ + I32 npar; /* Capture buffer count, (OPEN) plus + one. ("par" 0 is the whole + pattern)*/ + I32 nestroot; /* root parens we are in - used by + accept */ + I32 extralen; + I32 seen_zerolen; + regnode **open_parens; /* pointers to open parens */ + regnode **close_parens; /* pointers to close parens */ + regnode *opend; /* END node in program */ + I32 utf8; /* whether the pattern is utf8 or not */ + I32 orig_utf8; /* whether the pattern was originally in utf8 */ + /* XXX use this for future optimisation of case + * where pattern must be upgraded to utf8. */ + I32 uni_semantics; /* If a d charset modifier should use unicode + rules, even if the pattern is not in + utf8 */ + HV *paren_names; /* Paren names */ + + regnode **recurse; /* Recurse regops */ + I32 recurse_count; /* Number of recurse regops */ + U8 *study_chunk_recursed; /* bitmap of which parens we have moved + through */ + U32 study_chunk_recursed_bytes; /* bytes in bitmap */ + I32 in_lookbehind; + I32 contains_locale; + I32 contains_i; + I32 override_recoding; + I32 in_multi_char_class; + struct reg_code_block *code_blocks; /* positions of literal (?{}) + within pattern */ + int num_code_blocks; /* size of code_blocks[] */ + int code_index; /* next code_blocks[] slot */ + SSize_t maxlen; /* mininum possible number of chars in string to match */ +#ifdef ADD_TO_REGEXEC + char *starttry; /* -Dr: where regtry was called. */ +#define RExC_starttry (pRExC_state->starttry) +#endif + SV *runtime_code_qr; /* qr with the runtime code blocks */ +#ifdef DEBUGGING + const char *lastparse; + I32 lastnum; + AV *paren_name_list; /* idx -> name */ +#define RExC_lastparse (pRExC_state->lastparse) +#define RExC_lastnum (pRExC_state->lastnum) +#define RExC_paren_name_list (pRExC_state->paren_name_list) +#endif +}; + +#define RExC_flags (pRExC_state->flags) +#define RExC_pm_flags (pRExC_state->pm_flags) +#define RExC_precomp (pRExC_state->precomp) +#define RExC_rx_sv (pRExC_state->rx_sv) +#define RExC_rx (pRExC_state->rx) +#define RExC_rxi (pRExC_state->rxi) +#define RExC_start (pRExC_state->start) +#define RExC_end (pRExC_state->end) +#define RExC_parse (pRExC_state->parse) +#define RExC_whilem_seen (pRExC_state->whilem_seen) +#ifdef RE_TRACK_PATTERN_OFFSETS +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the + others */ +#endif +#define RExC_emit (pRExC_state->emit) +#define RExC_emit_dummy (pRExC_state->emit_dummy) +#define RExC_emit_start (pRExC_state->emit_start) +#define RExC_emit_bound (pRExC_state->emit_bound) +#define RExC_naughty (pRExC_state->naughty) +#define RExC_sawback (pRExC_state->sawback) +#define RExC_seen (pRExC_state->seen) +#define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) +#define RExC_npar (pRExC_state->npar) +#define RExC_nestroot (pRExC_state->nestroot) +#define RExC_extralen (pRExC_state->extralen) +#define RExC_seen_zerolen (pRExC_state->seen_zerolen) +#define RExC_utf8 (pRExC_state->utf8) +#define RExC_uni_semantics (pRExC_state->uni_semantics) +#define RExC_orig_utf8 (pRExC_state->orig_utf8) +#define RExC_open_parens (pRExC_state->open_parens) +#define RExC_close_parens (pRExC_state->close_parens) +#define RExC_opend (pRExC_state->opend) +#define RExC_paren_names (pRExC_state->paren_names) +#define RExC_recurse (pRExC_state->recurse) +#define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) +#define RExC_study_chunk_recursed_bytes \ + (pRExC_state->study_chunk_recursed_bytes) +#define RExC_in_lookbehind (pRExC_state->in_lookbehind) +#define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_contains_i (pRExC_state->contains_i) +#define RExC_override_recoding (pRExC_state->override_recoding) +#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) + + +#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') +#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ + ((*s) == '{' && regcurly(s))) + +/* + * Flags to be passed up and down. + */ +#define WORST 0 /* Worst case. */ +#define HASWIDTH 0x01 /* Known to match non-null strings. */ + +/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single + * character. (There needs to be a case: in the switch statement in regexec.c + * for any node marked SIMPLE.) Note that this is not the same thing as + * REGNODE_SIMPLE */ +#define SIMPLE 0x02 +#define SPSTART 0x04 /* Starts with * or + */ +#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ +#define TRYAGAIN 0x10 /* Weeded out a declaration. */ +#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */ + +#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) + +/* whether trie related optimizations are enabled */ +#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION +#define TRIE_STUDY_OPT +#define FULL_TRIE_STUDY +#define TRIE_STCLASS +#endif + + + +#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] +#define PBITVAL(paren) (1 << ((paren) & 7)) +#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) +#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) +#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) + +#define REQUIRE_UTF8 STMT_START { \ + if (!UTF) { \ + *flagp = RESTART_UTF8; \ + return NULL; \ + } \ + } STMT_END + +/* This converts the named class defined in regcomp.h to its equivalent class + * number defined in handy.h. */ +#define namedclass_to_classnum(class) ((int) ((class) / 2)) +#define classnum_to_namedclass(classnum) ((classnum) * 2) + +#define _invlist_union_complement_2nd(a, b, output) \ + _invlist_union_maybe_complement_2nd(a, b, TRUE, output) +#define _invlist_intersection_complement_2nd(a, b, output) \ + _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) + +/* About scan_data_t. + + During optimisation we recurse through the regexp program performing + various inplace (keyhole style) optimisations. In addition study_chunk + and scan_commit populate this data structure with information about + what strings MUST appear in the pattern. We look for the longest + string that must appear at a fixed location, and we look for the + longest string that may appear at a floating location. So for instance + in the pattern: + + /FOO[xX]A.*B[xX]BAR/ + + Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating + strings (because they follow a .* construct). study_chunk will identify + both FOO and BAR as being the longest fixed and floating strings respectively. + + The strings can be composites, for instance + + /(f)(o)(o)/ + + will result in a composite fixed substring 'foo'. + + For each string some basic information is maintained: + + - offset or min_offset + This is the position the string must appear at, or not before. + It also implicitly (when combined with minlenp) tells us how many + characters must match before the string we are searching for. + Likewise when combined with minlenp and the length of the string it + tells us how many characters must appear after the string we have + found. + + - max_offset + Only used for floating strings. This is the rightmost point that + the string can appear at. If set to SSize_t_MAX it indicates that the + string can occur infinitely far to the right. + + - minlenp + A pointer to the minimum number of characters of the pattern that the + string was found inside. This is important as in the case of positive + lookahead or positive lookbehind we can have multiple patterns + involved. Consider + + /(?=FOO).*F/ + + The minimum length of the pattern overall is 3, the minimum length + of the lookahead part is 3, but the minimum length of the part that + will actually match is 1. So 'FOO's minimum length is 3, but the + minimum length for the F is 1. This is important as the minimum length + is used to determine offsets in front of and behind the string being + looked for. Since strings can be composites this is the length of the + pattern at the time it was committed with a scan_commit. Note that + the length is calculated by study_chunk, so that the minimum lengths + are not known until the full pattern has been compiled, thus the + pointer to the value. + + - lookbehind + + In the case of lookbehind the string being searched for can be + offset past the start point of the final matching string. + If this value was just blithely removed from the min_offset it would + invalidate some of the calculations for how many chars must match + before or after (as they are derived from min_offset and minlen and + the length of the string being searched for). + When the final pattern is compiled and the data is moved from the + scan_data_t structure into the regexp structure the information + about lookbehind is factored in, with the information that would + have been lost precalculated in the end_shift field for the + associated string. + + The fields pos_min and pos_delta are used to store the minimum offset + and the delta to the maximum offset at the current point in the pattern. + +*/ + +typedef struct scan_data_t { + /*I32 len_min; unused */ + /*I32 len_delta; unused */ + SSize_t pos_min; + SSize_t pos_delta; + SV *last_found; + SSize_t last_end; /* min value, <0 unless valid. */ + SSize_t last_start_min; + SSize_t last_start_max; + SV **longest; /* Either &l_fixed, or &l_float. */ + SV *longest_fixed; /* longest fixed string found in pattern */ + SSize_t offset_fixed; /* offset where it starts */ + SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ + I32 lookbehind_fixed; /* is the position of the string modfied by LB */ + SV *longest_float; /* longest floating string found in pattern */ + SSize_t offset_float_min; /* earliest point in string it can appear */ + SSize_t offset_float_max; /* latest point in string it can appear */ + SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ + SSize_t lookbehind_float; /* is the pos of the string modified by LB */ + I32 flags; + I32 whilem_c; + SSize_t *last_closep; + regnode_ssc *start_class; +} scan_data_t; + +/* The below is perhaps overboard, but this allows us to save a test at the + * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' + * and 'a' differ by a single bit; the same with the upper and lower case of + * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; + * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and + * then inverts it to form a mask, with just a single 0, in the bit position + * where the upper- and lowercase differ. XXX There are about 40 other + * instances in the Perl core where this micro-optimization could be used. + * Should decide if maintenance cost is worse, before changing those + * + * Returns a boolean as to whether or not 'v' is either a lowercase or + * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a + * compile-time constant, the generated code is better than some optimizing + * compilers figure out, amounting to a mask and test. The results are + * meaningless if 'c' is not one of [A-Za-z] */ +#define isARG2_lower_or_UPPER_ARG1(c, v) \ + (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) + +/* + * Forward declarations for pregcomp()'s friends. + */ + +static const scan_data_t zero_scan_data = + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0}; + +#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) +#define SF_BEFORE_SEOL 0x0001 +#define SF_BEFORE_MEOL 0x0002 +#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) +#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) + +#define SF_FIX_SHIFT_EOL (+2) +#define SF_FL_SHIFT_EOL (+4) + +#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) +#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) + +#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) +#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ +#define SF_IS_INF 0x0040 +#define SF_HAS_PAR 0x0080 +#define SF_IN_PAR 0x0100 +#define SF_HAS_EVAL 0x0200 +#define SCF_DO_SUBSTR 0x0400 +#define SCF_DO_STCLASS_AND 0x0800 +#define SCF_DO_STCLASS_OR 0x1000 +#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) +#define SCF_WHILEM_VISITED_POS 0x2000 + +#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ +#define SCF_SEEN_ACCEPT 0x8000 +#define SCF_TRIE_DOING_RESTUDY 0x10000 + +#define UTF cBOOL(RExC_utf8) + +/* The enums for all these are ordered so things work out correctly */ +#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ + == REGEX_DEPENDS_CHARSET) +#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ + >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_MORE_RESTRICTED_CHARSET) + +#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) + +/* For programs that want to be strictly Unicode compatible by dying if any + * attempt is made to match a non-Unicode code point against a Unicode + * property. */ +#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) + +#define OOB_NAMEDCLASS -1 + +/* There is no code point that is out-of-bounds, so this is problematic. But + * its only current use is to initialize a variable that is always set before + * looked at. */ +#define OOB_UNICODE 0xDEADBEEF + +#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) +#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) + + +/* length of regex to show in messages that don't mark a position within */ +#define RegexLengthToShowInErrorMessages 127 + +/* + * If MARKER[12] are adjusted, be sure to adjust the constants at the top + * of t/op/regmesg.t, the tests in t/op/re_tests, and those in + * op/pragma/warn/regcomp. + */ +#define MARKER1 "<-- HERE" /* marker as it appears in the description */ +#define MARKER2 " <-- HERE " /* marker as it appears within the regex */ + +#define REPORT_LOCATION " in regex; marked by " MARKER1 \ + " in m/%"UTF8f MARKER2 "%"UTF8f"/" + +#define REPORT_LOCATION_ARGS(offset) \ + UTF8fARG(UTF, offset, RExC_precomp), \ + UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * arg. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define _FAIL(code) STMT_START { \ + const char *ellipses = ""; \ + IV len = RExC_end - RExC_precomp; \ + \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + code; \ +} STMT_END + +#define FAIL(msg) _FAIL( \ + Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ + msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +#define FAIL2(msg,arg) _FAIL( \ + Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ + arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +/* + * Simple_vFAIL -- like FAIL, but marks the current location in the scan + */ +#define Simple_vFAIL(m) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() + */ +#define vFAIL(m) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL(m); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts two arguments. + */ +#define Simple_vFAIL2(m,a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). + */ +#define vFAIL2(m,a1) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL2(m, a1); \ +} STMT_END + + +/* + * Like Simple_vFAIL(), but accepts three arguments. + */ +#define Simple_vFAIL3(m, a1, a2) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). + */ +#define vFAIL3(m,a1,a2) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL3(m, a1, a2); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts four arguments. + */ +#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vFAIL4(m,a1,a2,a3) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL4(m, a1, a2, a3); \ +} STMT_END + +/* A specialized version of vFAIL2 that works with UTF8f */ +#define vFAIL2utf8f(m, a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + +/* m is not necessarily a "literal string", in this macro */ +#define reg_warn_non_literal_string(loc, m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARNreg(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN_dep(loc, m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARNdep(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARNregdep(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN2reg_d(loc,m, a1) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN2reg(loc, m, a1) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN3(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN4(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + +/* Allow for side effects in s */ +#define REGC(c,s) STMT_START { \ + if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ +} STMT_END + +/* Macros for recording node offsets. 20001227 mjd@plover.com + * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in + * element 2*n-1 of the array. Element #2n holds the byte length node #n. + * Element 0 holds the number n. + * Position is 1 indexed. + */ +#ifndef RE_TRACK_PATTERN_OFFSETS +#define Set_Node_Offset_To_R(node,byte) +#define Set_Node_Offset(node,byte) +#define Set_Cur_Node_Offset +#define Set_Node_Length_To_R(node,len) +#define Set_Node_Length(node,len) +#define Set_Node_Cur_Length(node,start) +#define Node_Offset(n) +#define Node_Length(n) +#define Set_Node_Offset_Length(node,offset,len) +#define ProgLen(ri) ri->u.proglen +#define SetProgLen(ri,x) ri->u.proglen = x +#else +#define ProgLen(ri) ri->u.offsets[0] +#define SetProgLen(ri,x) ri->u.offsets[0] = x +#define Set_Node_Offset_To_R(node,byte) STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ + __LINE__, (int)(node), (int)(byte))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", \ + (int)(node)); \ + } else { \ + RExC_offsets[2*(node)-1] = (byte); \ + } \ + } \ +} STMT_END + +#define Set_Node_Offset(node,byte) \ + Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) +#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) + +#define Set_Node_Length_To_R(node,len) STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ + __LINE__, (int)(node), (int)(len))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Length macro", \ + (int)(node)); \ + } else { \ + RExC_offsets[2*(node)] = (len); \ + } \ + } \ +} STMT_END + +#define Set_Node_Length(node,len) \ + Set_Node_Length_To_R((node)-RExC_emit_start, len) +#define Set_Node_Cur_Length(node, start) \ + Set_Node_Length(node, RExC_parse - start) + +/* Get offsets and lengths */ +#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) +#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) + +#define Set_Node_Offset_Length(node,offset,len) STMT_START { \ + Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ + Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ +} STMT_END +#endif + +#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS +#define EXPERIMENTAL_INPLACESCAN +#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ + +#define DEBUG_RExC_seen() \ + DEBUG_OPTIMISE_MORE_r({ \ + PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ + \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ + \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ + \ + if (RExC_seen & REG_GPOS_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ + \ + if (RExC_seen & REG_CANY_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ + \ + if (RExC_seen & REG_RECURSE_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ + \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ + \ + if (RExC_seen & REG_VERBARG_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ + \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ + \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ + \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ + \ + if (RExC_seen & REG_GOSTART_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ + PerlIO_printf(Perl_debug_log,"\n"); \ + }); + +#define DEBUG_STUDYDATA(str,data,depth) \ +DEBUG_OPTIMISE_MORE_r(if(data){ \ + PerlIO_printf(Perl_debug_log, \ + "%*s" str "Pos:%"IVdf"/%"IVdf \ + " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ + (int)(depth)*2, "", \ + (IV)((data)->pos_min), \ + (IV)((data)->pos_delta), \ + (UV)((data)->flags), \ + (IV)((data)->whilem_c), \ + (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ + is_inf ? "INF " : "" \ + ); \ + if ((data)->last_found) \ + PerlIO_printf(Perl_debug_log, \ + "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ + " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ + SvPVX_const((data)->last_found), \ + (IV)((data)->last_end), \ + (IV)((data)->last_start_min), \ + (IV)((data)->last_start_max), \ + ((data)->longest && \ + (data)->longest==&((data)->longest_fixed)) ? "*" : "", \ + SvPVX_const((data)->longest_fixed), \ + (IV)((data)->offset_fixed), \ + ((data)->longest && \ + (data)->longest==&((data)->longest_float)) ? "*" : "", \ + SvPVX_const((data)->longest_float), \ + (IV)((data)->offset_float_min), \ + (IV)((data)->offset_float_max) \ + ); \ + PerlIO_printf(Perl_debug_log,"\n"); \ +}); + +/* Mark that we cannot extend a found fixed substring at this point. + Update the longest found anchored substring and the longest found + floating substrings if needed. */ + +STATIC void +S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, + SSize_t *minlenp, int is_inf) +{ + const STRLEN l = CHR_SVLEN(data->last_found); + const STRLEN old_l = CHR_SVLEN(*data->longest); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_SCAN_COMMIT; + + if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { + SvSetMagicSV(*data->longest, data->last_found); + if (*data->longest == data->longest_fixed) { + data->offset_fixed = l ? data->last_start_min : data->pos_min; + if (data->flags & SF_BEFORE_EOL) + data->flags + |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); + else + data->flags &= ~SF_FIX_BEFORE_EOL; + data->minlen_fixed=minlenp; + data->lookbehind_fixed=0; + } + else { /* *data->longest == data->longest_float */ + data->offset_float_min = l ? data->last_start_min : data->pos_min; + data->offset_float_max = (l + ? data->last_start_max + : (data->pos_delta == SSize_t_MAX + ? SSize_t_MAX + : data->pos_min + data->pos_delta)); + if (is_inf + || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) + data->offset_float_max = SSize_t_MAX; + if (data->flags & SF_BEFORE_EOL) + data->flags + |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); + else + data->flags &= ~SF_FL_BEFORE_EOL; + data->minlen_float=minlenp; + data->lookbehind_float=0; + } + } + SvCUR_set(data->last_found, 0); + { + SV * const sv = data->last_found; + if (SvUTF8(sv) && SvMAGICAL(sv)) { + MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); + if (mg) + mg->mg_len = 0; + } + } + data->last_end = -1; + data->flags &= ~SF_BEFORE_EOL; + DEBUG_STUDYDATA("commit: ",data,0); +} + +/* An SSC is just a regnode_charclass_posix with an extra field: the inversion + * list that describes which code points it matches */ + +STATIC void +S_ssc_anything(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to match an empty string or any code point */ + + PERL_ARGS_ASSERT_SSC_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ + _append_range_to_invlist(ssc->invlist, 0, UV_MAX); + ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ +} + +STATIC int +S_ssc_is_anything(const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' can match the empty string and any code + * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys + * us anything: if the function returns TRUE, 'ssc' hasn't been restricted + * in any way, so there's no point in using it */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { + return FALSE; + } + + /* See if the list consists solely of the range 0 - Infinity */ + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (ret) { + return TRUE; + } + + /* If e.g., both \w and \W are set, matches everything */ + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { + return TRUE; + } + } + } + + return FALSE; +} + +STATIC void +S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* Initializes the SSC 'ssc'. This includes setting it to match an empty + * string, any code point, or any posix class under locale */ + + PERL_ARGS_ASSERT_SSC_INIT; + + Zero(ssc, 1, regnode_ssc); + set_ANYOF_SYNTHETIC(ssc); + ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); + ssc_anything(ssc); + + /* If any portion of the regex is to operate under locale rules, + * initialization includes it. The reason this isn't done for all regexes + * is that the optimizer was written under the assumption that locale was + * all-or-nothing. Given the complexity and lack of documentation in the + * optimizer, and that there are inadequate test cases for locale, many + * parts of it may not work properly, it is safest to avoid locale unless + * necessary. */ + if (RExC_contains_locale) { + ANYOF_POSIXL_SETALL(ssc); + } + else { + ANYOF_POSIXL_ZERO(ssc); + } +} + +STATIC int +S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, + const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only + * to the list of code points matched, and locale posix classes; hence does + * not check its flags) */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (! ret) { + return FALSE; + } + + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { + return FALSE; + } + + return TRUE; +} + +STATIC SV* +S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, + const regnode_charclass* const node) +{ + /* Returns a mortal inversion list defining which code points are matched + * by 'node', which is of type ANYOF. Handles complementing the result if + * appropriate. If some code points aren't knowable at this time, the + * returned list must, and will, contain every code point that is a + * possibility. */ + + SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; + unsigned int i; + const U32 n = ARG(node); + bool new_node_has_latin1 = FALSE; + + PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; + + /* Look at the data structure created by S_set_ANYOF_arg() */ + if (n != ANYOF_NONBITMAP_EMPTY) { + SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + assert(RExC_rxi->data->what[n] == 's'); + + if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ + invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]))); + } + else if (ary[0] && ary[0] != &PL_sv_undef) { + + /* Here, no compile-time swash, and there are things that won't be + * known until runtime -- we have to assume it could be anything */ + return _add_range_to_invlist(invlist, 0, UV_MAX); + } + else if (ary[3] && ary[3] != &PL_sv_undef) { + + /* Here no compile-time swash, and no run-time only data. Use the + * node's inversion list */ + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; + } + } + + /* An ANYOF node contains a bitmap for the first 256 code points, and an + * inversion list for the others, but if there are code points that should + * match only conditionally on the target string being UTF-8, those are + * placed in the inversion list, and not the bitmap. Since there are + * circumstances under which they could match, they are included in the + * SSC. But if the ANYOF node is to be inverted, we have to exclude them + * here, so that when we invert below, the end result actually does include + * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here + * before we add the unconditionally matched code points */ + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_intersection_complement_2nd(invlist, + PL_UpperLatin1, + &invlist); + } + + /* Add in the points from the bit map */ + for (i = 0; i < 256; i++) { + if (ANYOF_BITMAP_TEST(node, i)) { + invlist = add_cp_to_invlist(invlist, i); + new_node_has_latin1 = TRUE; + } + } + + /* If this can match all upper Latin1 code points, have to add them + * as well */ + if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + _invlist_union(invlist, PL_UpperLatin1, &invlist); + } + + /* Similarly for these */ + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + } + + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_invert(invlist); + } + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + + /* Under /li, any 0-255 could fold to any other 0-255, depending on the + * locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { + _invlist_union_maybe_complement_2nd(invlist, + only_utf8_locale_invlist, + ANYOF_FLAGS(node) & ANYOF_INVERT, + &invlist); + } + + return invlist; +} + +/* These two functions currently do the exact same thing */ +#define ssc_init_zero ssc_init + +#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) +#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) + +/* 'AND' a given class with another one. Can create false positives. 'ssc' + * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ + +STATIC void +S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *and_with) +{ + /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either + * another SSC or a regular ANYOF class. Can create false positives. */ + + SV* anded_cp_list; + U8 anded_flags; + + PERL_ARGS_ASSERT_SSC_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(and_with)) { + anded_cp_list = ((regnode_ssc *)and_with)->invlist; + anded_flags = ANYOF_FLAGS(and_with); + + /* XXX This is a kludge around what appears to be deficiencies in the + * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, + * there are paths through the optimizer where it doesn't get weeded + * out when it should. And if we don't make some extra provision for + * it like the code just below, it doesn't get added when it should. + * This solution is to add it only when AND'ing, which is here, and + * only when what is being AND'ed is the pristine, original node + * matching anything. Thus it is like adding it to ssc_anything() but + * only when the result is to be AND'ed. Probably the same solution + * could be adopted for the same problem we have with /l matching, + * which is solved differently in S_ssc_init(), and that would lead to + * fewer false positives than that solution has. But if this solution + * creates bugs, the consequences are only that a warning isn't raised + * that should be; while the consequences for having /l bugs is + * incorrect matches */ + if (ssc_is_anything((regnode_ssc *)and_with)) { + anded_flags |= ANYOF_WARN_SUPER; + } + } + else { + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } + + ANYOF_FLAGS(ssc) &= anded_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'and-with'; P2, its posix classes. + * 'and_with' may be inverted. When not inverted, we have the situation of + * computing: + * (C1 | P1) & (C2 | P2) + * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) + * <= ((C1 & C2) | P1 | P2) + * Alternatively, the last few steps could be: + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) + * <= (C1 | C2 | (P1 & P2)) + * We favor the second approach if either P1 or P2 is non-empty. This is + * because these components are a barrier to doing optimizations, as what + * they match cannot be known until the moment of matching as they are + * dependent on the current locale, 'AND"ing them likely will reduce or + * eliminate them. + * But we can do better if we know that C1,P1 are in their initial state (a + * frequent occurrence), each matching everything: + * () & (C2 | P2) = C2 | P2 + * Similarly, if C2,P2 are in their initial state (again a frequent + * occurrence), the result is a no-op + * (C1 | P1) & () = C1 | P1 + * + * Inverted, we have + * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) + * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) + * <= (C1 & ~C2) | (P1 & ~P2) + * */ + + if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(and_with)) + { + unsigned int i; + + ssc_intersection(ssc, + anded_cp_list, + FALSE /* Has already been inverted */ + ); + + /* If either P1 or P2 is empty, the intersection will be also; can skip + * the loop */ + if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { + ANYOF_POSIXL_ZERO(ssc); + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + + /* Note that the Posix class component P from 'and_with' actually + * looks like: + * P = Pa | Pb | ... | Pn + * where each component is one posix class, such as in [\w\s]. + * Thus + * ~P = ~(Pa | Pb | ... | Pn) + * = ~Pa & ~Pb & ... & ~Pn + * <= ~Pa | ~Pb | ... | ~Pn + * The last is something we can easily calculate, but unfortunately + * is likely to have many false positives. We could do better + * in some (but certainly not all) instances if two classes in + * P have known relationships. For example + * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: + * So + * :lower: & :print: = :lower: + * And similarly for classes that must be disjoint. For example, + * since \s and \w can have no elements in common based on rules in + * the POSIX standard, + * \w & ^\S = nothing + * Unfortunately, some vendor locales do not meet the Posix + * standard, in particular almost everything by Microsoft. + * The loop below just changes e.g., \w into \W and vice versa */ + + regnode_charclass_posixl temp; + int add = 1; /* To calculate the index of the complement */ + + ANYOF_POSIXL_ZERO(&temp); + for (i = 0; i < ANYOF_MAX; i++) { + assert(i % 2 != 0 + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); + + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { + ANYOF_POSIXL_SET(&temp, i + add); + } + add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ + } + ANYOF_POSIXL_AND(&temp, ssc); + + } /* else ssc already has no posixes */ + } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC + in its initial state */ + else if (! is_ANYOF_SYNTHETIC(and_with) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) + { + /* But if 'ssc' is in its initial state, the result is just 'and_with'; + * copy it over 'ssc' */ + if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { + if (is_ANYOF_SYNTHETIC(and_with)) { + StructCopy(and_with, ssc, regnode_ssc); + } + else { + ssc->invlist = anded_cp_list; + ANYOF_POSIXL_ZERO(ssc); + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); + } + } + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + { + /* One or the other of P1, P2 is non-empty. */ + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } + ssc_union(ssc, anded_cp_list, FALSE); + } + else { /* P1 = P2 = empty */ + ssc_intersection(ssc, anded_cp_list, FALSE); + } + } +} + +STATIC void +S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *or_with) +{ + /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either + * another SSC or a regular ANYOF class. Can create false positives if + * 'or_with' is to be inverted. */ + + SV* ored_cp_list; + U8 ored_flags; + + PERL_ARGS_ASSERT_SSC_OR; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(or_with)) { + ored_cp_list = ((regnode_ssc*) or_with)->invlist; + ored_flags = ANYOF_FLAGS(or_with); + } + else { + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); + ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + } + + ANYOF_FLAGS(ssc) |= ored_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'or-with'; P2, its posix classes. + * 'or_with' may be inverted. When not inverted, we have the simple + * situation of computing: + * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) + * If P1|P2 yields a situation with both a class and its complement are + * set, like having both \w and \W, this matches all code points, and we + * can delete these from the P component of the ssc going forward. XXX We + * might be able to delete all the P components, but I (khw) am not certain + * about this, and it is better to be safe. + * + * Inverted, we have + * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) + * <= (C1 | P1) | ~C2 + * <= (C1 | ~C2) | P1 + * (which results in actually simpler code than the non-inverted case) + * */ + + if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(or_with)) + { + /* We ignore P2, leaving P1 going forward */ + } /* else Not inverted */ + else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + unsigned int i; + for (i = 0; i < ANYOF_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) + { + ssc_match_all_cp(ssc); + ANYOF_POSIXL_CLEAR(ssc, i); + ANYOF_POSIXL_CLEAR(ssc, i+1); + } + } + } + } + + ssc_union(ssc, + ored_cp_list, + FALSE /* Already has been inverted */ + ); +} + +PERL_STATIC_INLINE void +S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_UNION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_union_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_intersection(pTHX_ regnode_ssc *ssc, + SV* const invlist, + const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_INTERSECTION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_intersection_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) +{ + PERL_ARGS_ASSERT_SSC_ADD_RANGE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); +} + +PERL_STATIC_INLINE void +S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) +{ + /* AND just the single code point 'cp' into the SSC 'ssc' */ + + SV* cp_list = _new_invlist(2); + + PERL_ARGS_ASSERT_SSC_CP_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + cp_list = add_cp_to_invlist(cp_list, cp); + ssc_intersection(ssc, cp_list, + FALSE /* Not inverted */ + ); + SvREFCNT_dec_NN(cp_list); +} + +PERL_STATIC_INLINE void +S_ssc_clear_locale(regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to not match any locale things */ + PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ANYOF_POSIXL_ZERO(ssc); + ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; +} + +STATIC void +S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* The inversion list in the SSC is marked mortal; now we need a more + * permanent copy, which is stored the same way that is done in a regular + * ANYOF node, with the first 256 code points in a bit map */ + + SV* invlist = invlist_clone(ssc->invlist); + + PERL_ARGS_ASSERT_SSC_FINALIZE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* The code in this file assumes that all but these flags aren't relevant + * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the + * time we reach here */ + assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + + populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); + + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); + + /* Make sure is clone-safe */ + ssc->invlist = NULL; + + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; + } + + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); +} + +#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] +#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) +#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) +#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ + ? (TRIE_LIST_CUR( idx ) - 1) \ + : 0 ) + + +#ifdef DEBUGGING +/* + dump_trie(trie,widecharmap,revcharmap) + dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) + dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) + + These routines dump out a trie in a somewhat readable format. + The _interim_ variants are used for debugging the interim + tables that are used to generate the final compressed + representation which is what dump_trie expects. + + Part of the reason for their existence is to provide a form + of documentation as to how the different representations function. + +*/ + +/* + Dumps the final compressed table form of the trie to Perl_debug_log. + Used for debugging make_trie(). +*/ + +STATIC void +S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, + AV *revcharmap, U32 depth) +{ + U32 state; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + U16 word; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE; + + PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", + (int)depth * 2 + 2,"", + "Match","Base","Ofs" ); + + for( state = 0 ; state < trie->uniquecharcount ; state++ ) { + SV ** const tmp = av_fetch( revcharmap, state, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + } + } + PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------", + (int)depth * 2 + 2,""); + + for( state = 0 ; state < trie->uniquecharcount ; state++ ) + PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); + PerlIO_printf( Perl_debug_log, "\n"); + + for( state = 1 ; state < trie->statecount ; state++ ) { + const U32 base = trie->states[ state ].trans.base; + + PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", + (int)depth * 2 + 2,"", (UV)state); + + if ( trie->states[ state ].wordnum ) { + PerlIO_printf( Perl_debug_log, " W%4X", + trie->states[ state ].wordnum ); + } else { + PerlIO_printf( Perl_debug_log, "%6s", "" ); + } + + PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base ); + + if ( base ) { + U32 ofs = 0; + + while( ( base + ofs < trie->uniquecharcount ) || + ( base + ofs - trie->uniquecharcount < trie->lasttrans + && trie->trans[ base + ofs - trie->uniquecharcount ].check + != state)) + ofs++; + + PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); + + for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { + if ( ( base + ofs >= trie->uniquecharcount ) + && ( base + ofs - trie->uniquecharcount + < trie->lasttrans ) + && trie->trans[ base + ofs + - trie->uniquecharcount ].check == state ) + { + PerlIO_printf( Perl_debug_log, "%*"UVXf, + colwidth, + (UV)trie->trans[ base + ofs + - trie->uniquecharcount ].next ); + } else { + PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); + } + } + + PerlIO_printf( Perl_debug_log, "]"); + + } + PerlIO_printf( Perl_debug_log, "\n" ); + } + PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", + (int)depth*2, ""); + for (word=1; word <= trie->wordcount; word++) { + PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", + (int)word, (int)(trie->wordinfo[word].prev), + (int)(trie->wordinfo[word].len)); + } + PerlIO_printf(Perl_debug_log, "\n" ); +} +/* + Dumps a fully constructed but uncompressed trie in list form. + List tries normally only are used for construction when the number of + possible chars (trie->uniquecharcount) is very high. + Used for debugging make_trie(). +*/ +STATIC void +S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, + HV *widecharmap, AV *revcharmap, U32 next_alloc, + U32 depth) +{ + U32 state; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; + + /* print out the table precompression. */ + PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", + (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", + "------:-----+-----------------\n" ); + + for( state=1 ; state < next_alloc ; state ++ ) { + U16 charid; + + PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", + (int)depth * 2 + 2,"", (UV)state ); + if ( ! trie->states[ state ].wordnum ) { + PerlIO_printf( Perl_debug_log, "%5s| ",""); + } else { + PerlIO_printf( Perl_debug_log, "W%4x| ", + trie->states[ state ].wordnum + ); + } + for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { + SV ** const tmp = av_fetch( revcharmap, + TRIE_LIST_ITEM(state,charid).forid, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), + colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + | PERL_PV_ESCAPE_FIRSTCHAR + ) , + TRIE_LIST_ITEM(state,charid).forid, + (UV)TRIE_LIST_ITEM(state,charid).newstate + ); + if (!(charid % 10)) + PerlIO_printf(Perl_debug_log, "\n%*s| ", + (int)((depth * 2) + 14), ""); + } + } + PerlIO_printf( Perl_debug_log, "\n"); + } +} + +/* + Dumps a fully constructed but uncompressed trie in table form. + This is the normal DFA style state transition table, with a few + twists to facilitate compression later. + Used for debugging make_trie(). +*/ +STATIC void +S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, + HV *widecharmap, AV *revcharmap, U32 next_alloc, + U32 depth) +{ + U32 state; + U16 charid; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; + + /* + print out the table precompression so that we can do a visual check + that they are identical. + */ + + PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); + + for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { + SV ** const tmp = av_fetch( revcharmap, charid, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + } + } + + PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" ); + + for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { + PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------"); + } + + PerlIO_printf( Perl_debug_log, "\n" ); + + for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { + + PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", + (int)depth * 2 + 2,"", + (UV)TRIE_NODENUM( state ) ); + + for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { + UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); + if (v) + PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v ); + else + PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); + } + if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { + PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", + (UV)trie->trans[ state ].check ); + } else { + PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", + (UV)trie->trans[ state ].check, + trie->states[ TRIE_NODENUM( state ) ].wordnum ); + } + } +} + +#endif + + +/* make_trie(startbranch,first,last,tail,word_count,flags,depth) + startbranch: the first branch in the whole branch sequence + first : start branch of sequence of branch-exact nodes. + May be the same as startbranch + last : Thing following the last branch. + May be the same as tail. + tail : item following the branch sequence + count : words in the sequence + flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/ + depth : indent depth + +Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. + +A trie is an N'ary tree where the branches are determined by digital +decomposition of the key. IE, at the root node you look up the 1st character and +follow that branch repeat until you find the end of the branches. Nodes can be +marked as "accepting" meaning they represent a complete word. Eg: + + /he|she|his|hers/ + +would convert into the following structure. Numbers represent states, letters +following numbers represent valid transitions on the letter from that state, if +the number is in square brackets it represents an accepting state, otherwise it +will be in parenthesis. + + +-h->+-e->[3]-+-r->(8)-+-s->[9] + | | + | (2) + | | + (1) +-i->(6)-+-s->[7] + | + +-s->(3)-+-h->(4)-+-e->[5] + + Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) + +This shows that when matching against the string 'hers' we will begin at state 1 +read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, +then read 'r' and go to state 8 followed by 's' which takes us to state 9 which +is also accepting. Thus we know that we can match both 'he' and 'hers' with a +single traverse. We store a mapping from accepting to state to which word was +matched, and then when we have multiple possibilities we try to complete the +rest of the regex in the order in which they occured in the alternation. + +The only prior NFA like behaviour that would be changed by the TRIE support is +the silent ignoring of duplicate alternations which are of the form: + + / (DUPE|DUPE) X? (?{ ... }) Y /x + +Thus EVAL blocks following a trie may be called a different number of times with +and without the optimisation. With the optimisations dupes will be silently +ignored. This inconsistent behaviour of EVAL type nodes is well established as +the following demonstrates: + + 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ + +which prints out 'word' three times, but + + 'words'=~/(word|word|word)(?{ print $1 })S/ + +which doesnt print it out at all. This is due to other optimisations kicking in. + +Example of what happens on a structural level: + +The regexp /(ac|ad|ab)+/ will produce the following debug output: + + 1: CURLYM[1] {1,32767}(18) + 5: BRANCH(8) + 6: EXACT (16) + 8: BRANCH(11) + 9: EXACT (16) + 11: BRANCH(14) + 12: EXACT (16) + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +This would be optimizable with startbranch=5, first=5, last=16, tail=16 +and should turn into: + + 1: CURLYM[1] {1,32767}(18) + 5: TRIE(16) + [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] + + + + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +Cases where tail != last would be like /(?foo|bar)baz/: + + 1: BRANCH(4) + 2: EXACT (8) + 4: BRANCH(7) + 5: EXACT (8) + 7: TAIL(8) + 8: EXACT (10) + 10: END(0) + +which would be optimizable with startbranch=1, first=1, last=7, tail=8 +and would end up looking like: + + 1: TRIE(8) + [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] + + + 7: TAIL(8) + 8: EXACT (10) + 10: END(0) + + d = uvchr_to_utf8_flags(d, uv, 0); + +is the recommended Unicode-aware way of saying + + *(d++) = uv; +*/ + +#define TRIE_STORE_REVCHAR(val) \ + STMT_START { \ + if (UTF) { \ + SV *zlopp = newSV(7); /* XXX: optimize me */ \ + unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ + unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ + SvCUR_set(zlopp, kapow - flrbbbbb); \ + SvPOK_on(zlopp); \ + SvUTF8_on(zlopp); \ + av_push(revcharmap, zlopp); \ + } else { \ + char ooooff = (char)val; \ + av_push(revcharmap, newSVpvn(&ooooff, 1)); \ + } \ + } STMT_END + +/* This gets the next character from the input, folding it if not already + * folded. */ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need \ + * folding */ \ + uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* This folder implies Unicode rules, which in the range expressible \ + * by not UTF is the lower case, with the two exceptions, one of \ + * which should have been taken care of before calling this */ \ + assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ + uvc = toLOWER_L1(*uc); \ + if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ + len = 1; \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ +} STMT_END + + + +#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ + if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ + U32 ging = TRIE_LIST_LEN( state ) *= 2; \ + Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ + } \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ + TRIE_LIST_CUR( state )++; \ +} STMT_END + +#define TRIE_LIST_NEW(state) STMT_START { \ + Newxz( trie->states[ state ].trans.list, \ + 4, reg_trie_trans_le ); \ + TRIE_LIST_CUR( state ) = 1; \ + TRIE_LIST_LEN( state ) = 4; \ +} STMT_END + +#define TRIE_HANDLE_WORD(state) STMT_START { \ + U16 dupe= trie->states[ state ].wordnum; \ + regnode * const noper_next = regnext( noper ); \ + \ + DEBUG_r({ \ + /* store the word for dumping */ \ + SV* tmp; \ + if (OP(noper) != NOTHING) \ + tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ + else \ + tmp = newSVpvn_utf8( "", 0, UTF ); \ + av_push( trie_words, tmp ); \ + }); \ + \ + curword++; \ + trie->wordinfo[curword].prev = 0; \ + trie->wordinfo[curword].len = wordlen; \ + trie->wordinfo[curword].accept = state; \ + \ + if ( noper_next < tail ) { \ + if (!trie->jump) \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ + trie->jump[curword] = (U16)(noper_next - convert); \ + if (!jumper) \ + jumper = noper_next; \ + if (!nextbranch) \ + nextbranch= regnext(cur); \ + } \ + \ + if ( dupe ) { \ + /* It's a dupe. Pre-insert into the wordinfo[].prev */\ + /* chain, so that when the bits of chain are later */\ + /* linked together, the dups appear in the chain */\ + trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ + trie->wordinfo[dupe].prev = curword; \ + } else { \ + /* we haven't inserted this word yet. */ \ + trie->states[ state ].wordnum = curword; \ + } \ +} STMT_END + + +#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ + ( ( base + charid >= ucharcount \ + && base + charid < ubound \ + && state == trie->trans[ base - ucharcount + charid ].check \ + && trie->trans[ base - ucharcount + charid ].next ) \ + ? trie->trans[ base - ucharcount + charid ].next \ + : ( state==1 ? special : 0 ) \ + ) + +#define MADE_TRIE 1 +#define MADE_JUMP_TRIE 2 +#define MADE_EXACT_TRIE 4 + +STATIC I32 +S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + regnode *first, regnode *last, regnode *tail, + U32 word_count, U32 flags, U32 depth) +{ + /* first pass, loop through and scan words */ + reg_trie_data *trie; + HV *widecharmap = NULL; + AV *revcharmap = newAV(); + regnode *cur; + STRLEN len = 0; + UV uvc = 0; + U16 curword = 0; + U32 next_alloc = 0; + regnode *jumper = NULL; + regnode *nextbranch = NULL; + regnode *convert = NULL; + U32 *prev_states; /* temp array mapping each state to previous one */ + /* we just use folder as a flag in utf8 */ + const U8 * folder = NULL; + +#ifdef DEBUGGING + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu")); + AV *trie_words = NULL; + /* along with revcharmap, this only used during construction but both are + * useful during debugging so we store them in the struct when debugging. + */ +#else + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); + STRLEN trie_charcount=0; +#endif + SV *re_trie_maxbuff; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + switch (flags) { + case EXACT: break; + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: folder = PL_fold_latin1; break; + case EXACTF: folder = PL_fold; break; + default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); + } + + trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); + trie->refcount = 1; + trie->startstate = 1; + trie->wordcount = word_count; + RExC_rxi->data->data[ data_slot ] = (void*)trie; + trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); + if (flags == EXACT) + trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); + trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( + trie->wordcount+1, sizeof(reg_trie_wordinfo)); + + DEBUG_r({ + trie_words = newAV(); + }); + + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + assert(re_trie_maxbuff); + if (!SvIOK(re_trie_maxbuff)) { + sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); + } + DEBUG_TRIE_COMPILE_r({ + PerlIO_printf( Perl_debug_log, + "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + (int)depth * 2 + 2, "", + REG_NODE_NUM(startbranch),REG_NODE_NUM(first), + REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); + }); + + /* Find the node we are going to overwrite */ + if ( first == startbranch && OP( last ) != BRANCH ) { + /* whole branch chain */ + convert = first; + } else { + /* branch sub-chain */ + convert = NEXTOPER( first ); + } + + /* -- First loop and Setup -- + + We first traverse the branches and scan each word to determine if it + contains widechars, and how many unique chars there are, this is + important as we have to build a table with at least as many columns as we + have unique chars. + + We use an array of integers to represent the character codes 0..255 + (trie->charmap) and we use a an HV* to store Unicode characters. We use + the native representation of the character value as the key and IV's for + the coded index. + + *TODO* If we keep track of how many times each character is used we can + remap the columns so that the table compression later on is more + efficient in terms of memory by ensuring the most common value is in the + middle and the least common are on the outside. IMO this would be better + than a most to least common mapping as theres a decent chance the most + common letter will share a node with the least common, meaning the node + will not be compressible. With a middle is most common approach the worst + case is when we have the least common nodes twice. + + */ + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + regnode *noper = NEXTOPER( cur ); + const U8 *uc = (U8*)STRING( noper ); + const U8 *e = uc + STR_LEN( noper ); + int foldlen = 0; + U32 wordlen = 0; /* required init */ + STRLEN minchars = 0; + STRLEN maxchars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the + bitmap?*/ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + trie->minlen= STR_LEN(noper); + } else { + trie->minlen= 0; + continue; + } + } + + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ + TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte + regardless of encoding */ + if (OP( noper ) == EXACTFU_SS) { + /* false positives are ok, so just set this */ + TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); + } + } + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ + TRIE_CHARCOUNT(trie)++; + TRIE_READ_CHAR; + + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; + } + else { + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because for + * non-UTF, the latin1_safe macro is smart enough to account + * for all the unfolded characters, and because for UTF, the + * string will already have been folded earlier in the + * compilation process */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); + } + } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } + } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ + if ( uvc < 256 ) { + if ( folder ) { + U8 folded= folder[ (U8) uvc ]; + if ( !trie->charmap[ folded ] ) { + trie->charmap[ folded ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( folded ); + } + } + if ( !trie->charmap[ uvc ] ) { + trie->charmap[ uvc ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( uvc ); + } + if ( set_bit ) { + /* store the codepoint in the bitmap, and its folded + * equivalent. */ + TRIE_BITMAP_SET(trie, uvc); + + /* store the folded codepoint */ + if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); + + if ( !UTF ) { + /* store first byte of utf8 representation of + variant codepoints */ + if (! UVCHR_IS_INVARIANT(uvc)) { + TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); + } + } + set_bit = 0; /* We've done our bit :-) */ + } + } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + + SV** svpp; + if ( !widecharmap ) + widecharmap = newHV(); + + svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); + + if ( !svpp ) + Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc ); + + if ( !SvTRUE( *svpp ) ) { + sv_setiv( *svpp, ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR(uvc); + } + } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ + if( cur == first ) { + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; + } + } /* end first pass */ + DEBUG_TRIE_COMPILE_r( + PerlIO_printf( Perl_debug_log, + "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + (int)depth * 2 + 2,"", + ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, + (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, + (int)trie->minlen, (int)trie->maxlen ) + ); + + /* + We now know what we are dealing with in terms of unique chars and + string sizes so we can calculate how much memory a naive + representation using a flat table will take. If it's over a reasonable + limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory + conservative but potentially much slower representation using an array + of lists. + + At the end we convert both representations into the same compressed + form that will be used in regexec.c for matching with. The latter + is a form that cannot be used to construct with but has memory + properties similar to the list form and access properties similar + to the table form making it both suitable for fast searches and + small enough that its feasable to store for the duration of a program. + + See the comment in the code where the compressed table is produced + inplace from the flat tabe representation for an explanation of how + the compression works. + + */ + + + Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); + prev_states[1] = 0; + + if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) + > SvIV(re_trie_maxbuff) ) + { + /* + Second Pass -- Array Of Lists Representation + + Each state will be represented by a list of charid:state records + (reg_trie_trans_le) the first such element holds the CUR and LEN + points of the allocated array. (See defines above). + + We build the initial structure using the lists, and then convert + it into the compressed table form which allows faster lookups + (but cant be modified once converted). + */ + + STRLEN transcount = 1; + + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + "%*sCompiling trie using list compiler\n", + (int)depth * 2 + 2, "")); + + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); + TRIE_LIST_NEW(1); + next_alloc = 2; + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + + regnode *noper = NEXTOPER( cur ); + U8 *uc = (U8*)STRING( noper ); + const U8 *e = uc + STR_LEN( noper ); + U32 state = 1; /* required init */ + U16 charid = 0; /* sanity init */ + U32 wordlen = 0; /* required init */ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } + } + + if (OP(noper) != NOTHING) { + for ( ; uc < e ; uc += len ) { + + TRIE_READ_CHAR; + + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV** const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); + if ( !svpp ) { + charid = 0; + } else { + charid=(U16)SvIV( *svpp ); + } + } + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ + if ( charid ) { + + U16 check; + U32 newstate = 0; + + charid--; + if ( !trie->states[ state ].trans.list ) { + TRIE_LIST_NEW( state ); + } + for ( check = 1; + check <= TRIE_LIST_USED( state ); + check++ ) + { + if ( TRIE_LIST_ITEM( state, check ).forid + == charid ) + { + newstate = TRIE_LIST_ITEM( state, check ).newstate; + break; + } + } + if ( ! newstate ) { + newstate = next_alloc++; + prev_states[newstate] = state; + TRIE_LIST_PUSH( state, charid, newstate ); + transcount++; + } + state = newstate; + } else { + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); + } + } + } + TRIE_HANDLE_WORD(state); + + } /* end second pass */ + + /* next alloc is the NEXT state to be allocated */ + trie->statecount = next_alloc; + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, + next_alloc + * sizeof(reg_trie_state) ); + + /* and now dump it out before we compress it */ + DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, + revcharmap, next_alloc, + depth+1) + ); + + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); + { + U32 state; + U32 tp = 0; + U32 zp = 0; + + + for( state=1 ; state < next_alloc ; state ++ ) { + U32 base=0; + + /* + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp) + ); + */ + + if (trie->states[state].trans.list) { + U16 minid=TRIE_LIST_ITEM( state, 1).forid; + U16 maxid=minid; + U16 idx; + + for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { + const U16 forid = TRIE_LIST_ITEM( state, idx).forid; + if ( forid < minid ) { + minid=forid; + } else if ( forid > maxid ) { + maxid=forid; + } + } + if ( transcount < tp + maxid - minid + 1) { + transcount *= 2; + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, + transcount + * sizeof(reg_trie_trans) ); + Zero( trie->trans + (transcount / 2), + transcount / 2, + reg_trie_trans ); + } + base = trie->uniquecharcount + tp - minid; + if ( maxid == minid ) { + U32 set = 0; + for ( ; zp < tp ; zp++ ) { + if ( ! trie->trans[ zp ].next ) { + base = trie->uniquecharcount + zp - minid; + trie->trans[ zp ].next = TRIE_LIST_ITEM( state, + 1).newstate; + trie->trans[ zp ].check = state; + set = 1; + break; + } + } + if ( !set ) { + trie->trans[ tp ].next = TRIE_LIST_ITEM( state, + 1).newstate; + trie->trans[ tp ].check = state; + tp++; + zp = tp; + } + } else { + for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { + const U32 tid = base + - trie->uniquecharcount + + TRIE_LIST_ITEM( state, idx ).forid; + trie->trans[ tid ].next = TRIE_LIST_ITEM( state, + idx ).newstate; + trie->trans[ tid ].check = state; + } + tp += ( maxid - minid + 1 ); + } + Safefree(trie->states[ state ].trans.list); + } + /* + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, " base: %d\n",base); + ); + */ + trie->states[ state ].trans.base=base; + } + trie->lasttrans = tp + 1; + } + } else { + /* + Second Pass -- Flat Table Representation. + + we dont use the 0 slot of either trans[] or states[] so we add 1 to + each. We know that we will need Charcount+1 trans at most to store + the data (one row per char at worst case) So we preallocate both + structures assuming worst case. + + We then construct the trie using only the .next slots of the entry + structs. + + We use the .check field of the first entry of the node temporarily + to make compression both faster and easier by keeping track of how + many non zero fields are in the node. + + Since trans are numbered from 1 any 0 pointer in the table is a FAIL + transition. + + There are two terms at use here: state as a TRIE_NODEIDX() which is + a number representing the first entry of the node, and state as a + TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) + and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) + if there are 2 entrys per node. eg: + + A B A B + 1. 2 4 1. 3 7 + 2. 0 3 3. 0 5 + 3. 0 0 5. 0 0 + 4. 0 0 7. 0 0 + + The table is internally in the right hand, idx form. However as we + also have to deal with the states array which is indexed by nodenum + we have to use TRIE_NODENUM() to convert. + + */ + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + "%*sCompiling trie using table compiler\n", + (int)depth * 2 + 2, "")); + + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) + * trie->uniquecharcount + 1, + sizeof(reg_trie_trans) ); + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); + next_alloc = trie->uniquecharcount + 1; + + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + + regnode *noper = NEXTOPER( cur ); + const U8 *uc = (U8*)STRING( noper ); + const U8 *e = uc + STR_LEN( noper ); + + U32 state = 1; /* required init */ + + U16 charid = 0; /* sanity init */ + U32 accept_state = 0; /* sanity init */ + + U32 wordlen = 0; /* required init */ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } + } + + if ( OP(noper) != NOTHING ) { + for ( ; uc < e ; uc += len ) { + + TRIE_READ_CHAR; + + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV* const * const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); + charid = svpp ? (U16)SvIV(*svpp) : 0; + } + if ( charid ) { + charid--; + if ( !trie->trans[ state + charid ].next ) { + trie->trans[ state + charid ].next = next_alloc; + trie->trans[ state ].check++; + prev_states[TRIE_NODENUM(next_alloc)] + = TRIE_NODENUM(state); + next_alloc += trie->uniquecharcount; + } + state = trie->trans[ state + charid ].next; + } else { + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); + } + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ + } + } + accept_state = TRIE_NODENUM( state ); + TRIE_HANDLE_WORD(accept_state); + + } /* end second pass */ + + /* and now dump it out before we compress it */ + DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, + revcharmap, + next_alloc, depth+1)); + + { + /* + * Inplace compress the table.* + + For sparse data sets the table constructed by the trie algorithm will + be mostly 0/FAIL transitions or to put it another way mostly empty. + (Note that leaf nodes will not contain any transitions.) + + This algorithm compresses the tables by eliminating most such + transitions, at the cost of a modest bit of extra work during lookup: + + - Each states[] entry contains a .base field which indicates the + index in the state[] array wheres its transition data is stored. + + - If .base is 0 there are no valid transitions from that node. + + - If .base is nonzero then charid is added to it to find an entry in + the trans array. + + -If trans[states[state].base+charid].check!=state then the + transition is taken to be a 0/Fail transition. Thus if there are fail + transitions at the front of the node then the .base offset will point + somewhere inside the previous nodes data (or maybe even into a node + even earlier), but the .check field determines if the transition is + valid. + + XXX - wrong maybe? + The following process inplace converts the table to the compressed + table: We first do not compress the root node 1,and mark all its + .check pointers as 1 and set its .base pointer as 1 as well. This + allows us to do a DFA construction from the compressed table later, + and ensures that any .base pointers we calculate later are greater + than 0. + + - We set 'pos' to indicate the first entry of the second node. + + - We then iterate over the columns of the node, finding the first and + last used entry at l and m. We then copy l..m into pos..(pos+m-l), + and set the .check pointers accordingly, and advance pos + appropriately and repreat for the next node. Note that when we copy + the next pointers we have to convert them from the original + NODEIDX form to NODENUM form as the former is not valid post + compression. + + - If a node has no transitions used we mark its base as 0 and do not + advance the pos pointer. + + - If a node only has one transition we use a second pointer into the + structure to fill in allocated fail transitions from other states. + This pointer is independent of the main pointer and scans forward + looking for null transitions that are allocated to a state. When it + finds one it writes the single transition into the "hole". If the + pointer doesnt find one the single transition is appended as normal. + + - Once compressed we can Renew/realloc the structures to release the + excess space. + + See "Table-Compression Methods" in sec 3.9 of the Red Dragon, + specifically Fig 3.47 and the associated pseudocode. + + demq + */ + const U32 laststate = TRIE_NODENUM( next_alloc ); + U32 state, charid; + U32 pos = 0, zp=0; + trie->statecount = laststate; + + for ( state = 1 ; state < laststate ; state++ ) { + U8 flag = 0; + const U32 stateidx = TRIE_NODEIDX( state ); + const U32 o_used = trie->trans[ stateidx ].check; + U32 used = trie->trans[ stateidx ].check; + trie->trans[ stateidx ].check = 0; + + for ( charid = 0; + used && charid < trie->uniquecharcount; + charid++ ) + { + if ( flag || trie->trans[ stateidx + charid ].next ) { + if ( trie->trans[ stateidx + charid ].next ) { + if (o_used == 1) { + for ( ; zp < pos ; zp++ ) { + if ( ! trie->trans[ zp ].next ) { + break; + } + } + trie->states[ state ].trans.base + = zp + + trie->uniquecharcount + - charid ; + trie->trans[ zp ].next + = SAFE_TRIE_NODENUM( trie->trans[ stateidx + + charid ].next ); + trie->trans[ zp ].check = state; + if ( ++zp > pos ) pos = zp; + break; + } + used--; + } + if ( !flag ) { + flag = 1; + trie->states[ state ].trans.base + = pos + trie->uniquecharcount - charid ; + } + trie->trans[ pos ].next + = SAFE_TRIE_NODENUM( + trie->trans[ stateidx + charid ].next ); + trie->trans[ pos ].check = state; + pos++; + } + } + } + trie->lasttrans = pos + 1; + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, laststate + * sizeof(reg_trie_state) ); + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, + "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + (int)depth * 2 + 2,"", + (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + + 1 ), + (IV)next_alloc, + (IV)pos, + ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); + ); + + } /* end table compress */ + } + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf(Perl_debug_log, + "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", + (int)depth * 2 + 2, "", + (UV)trie->statecount, + (UV)trie->lasttrans) + ); + /* resize the trans array to remove unused space */ + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, trie->lasttrans + * sizeof(reg_trie_trans) ); + + { /* Modify the program and insert the new TRIE node */ + U8 nodetype =(U8)(flags & 0xFF); + char *str=NULL; + +#ifdef DEBUGGING + regnode *optimize = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + + U32 mjd_offset = 0; + U32 mjd_nodelen = 0; +#endif /* RE_TRACK_PATTERN_OFFSETS */ +#endif /* DEBUGGING */ + /* + This means we convert either the first branch or the first Exact, + depending on whether the thing following (in 'last') is a branch + or not and whther first is the startbranch (ie is it a sub part of + the alternation or is it the whole thing.) + Assuming its a sub part we convert the EXACT otherwise we convert + the whole branch sequence, including the first. + */ + /* Find the node we are going to overwrite */ + if ( first != startbranch || OP( last ) == BRANCH ) { + /* branch sub-chain */ + NEXT_OFF( first ) = (U16)(last - first); +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_r({ + mjd_offset= Node_Offset((convert)); + mjd_nodelen= Node_Length((convert)); + }); +#endif + /* whole branch chain */ + } +#ifdef RE_TRACK_PATTERN_OFFSETS + else { + DEBUG_r({ + const regnode *nop = NEXTOPER( convert ); + mjd_offset= Node_Offset((nop)); + mjd_nodelen= Node_Length((nop)); + }); + } + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", + (int)depth * 2 + 2, "", + (UV)mjd_offset, (UV)mjd_nodelen) + ); +#endif + /* But first we check to see if there is a common prefix we can + split out as an EXACT and put in front of the TRIE node. */ + trie->startstate= 1; + if ( trie->bitmap && !widecharmap && !trie->jump ) { + U32 state; + for ( state = 1 ; state < trie->statecount-1 ; state++ ) { + U32 ofs = 0; + I32 idx = -1; + U32 count = 0; + const U32 base = trie->states[ state ].trans.base; + + if ( trie->states[state].wordnum ) + count = 1; + + for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { + if ( ( base + ofs >= trie->uniquecharcount ) && + ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && + trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + { + if ( ++count > 1 ) { + SV **tmp = av_fetch( revcharmap, ofs, 0); + const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); + if ( state == 1 ) break; + if ( count == 2 ) { + Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*sNew Start State=%"UVuf" Class: [", + (int)depth * 2 + 2, "", + (UV)state)); + if (idx >= 0) { + SV ** const tmp = av_fetch( revcharmap, idx, 0); + const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); + + TRIE_BITMAP_SET(trie,*ch); + if ( folder ) + TRIE_BITMAP_SET(trie, folder[ *ch ]); + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, "%s", (char*)ch) + ); + } + } + TRIE_BITMAP_SET(trie,*ch); + if ( folder ) + TRIE_BITMAP_SET(trie,folder[ *ch ]); + DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch)); + } + idx = ofs; + } + } + if ( count == 1 ) { + SV **tmp = av_fetch( revcharmap, idx, 0); + STRLEN len; + char *ch = SvPV( *tmp, len ); + DEBUG_OPTIMISE_r({ + SV *sv=sv_newmortal(); + PerlIO_printf( Perl_debug_log, + "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", + (int)depth * 2 + 2, "", + (UV)state, (UV)idx, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + }); + if ( state==1 ) { + OP( convert ) = nodetype; + str=STRING(convert); + STR_LEN(convert)=0; + } + STR_LEN(convert) += len; + while (len--) + *str++ = *ch++; + } else { +#ifdef DEBUGGING + if (state>1) + DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); +#endif + break; + } + } + trie->prefixlen = (state-1); + if (str) { + regnode *n = convert+NODE_SZ_STR(convert); + NEXT_OFF(convert) = NODE_SZ_STR(convert); + trie->startstate = state; + trie->minlen -= (state - 1); + trie->maxlen -= (state - 1); +#ifdef DEBUGGING + /* At least the UNICOS C compiler choked on this + * being argument to DEBUG_r(), so let's just have + * it right here. */ + if ( +#ifdef PERL_EXT_RE_BUILD + 1 +#else + DEBUG_r_TEST +#endif + ) { + regnode *fix = convert; + U32 word = trie->wordcount; + mjd_nodelen++; + Set_Node_Offset_Length(convert, mjd_offset, state - 1); + while( ++fix < n ) { + Set_Node_Offset_Length(fix, 0, 0); + } + while (word--) { + SV ** const tmp = av_fetch( trie_words, word, 0 ); + if (tmp) { + if ( STR_LEN(convert) <= SvCUR(*tmp) ) + sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); + else + sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); + } + } + } +#endif + if (trie->maxlen) { + convert = n; + } else { + NEXT_OFF(convert) = (U16)(tail - convert); + DEBUG_r(optimize= n); + } + } + } + if (!jumper) + jumper = last; + if ( trie->maxlen ) { + NEXT_OFF( convert ) = (U16)(tail - convert); + ARG_SET( convert, data_slot ); + /* Store the offset to the first unabsorbed branch in + jump[0], which is otherwise unused by the jump logic. + We use this when dumping a trie and during optimisation. */ + if (trie->jump) + trie->jump[0] = (U16)(nextbranch - convert); + + /* If the start state is not accepting (meaning there is no empty string/NOTHING) + * and there is a bitmap + * and the first "jump target" node we found leaves enough room + * then convert the TRIE node into a TRIEC node, with the bitmap + * embedded inline in the opcode - this is hypothetically faster. + */ + if ( !trie->states[trie->startstate].wordnum + && trie->bitmap + && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) + { + OP( convert ) = TRIEC; + Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); + PerlMemShared_free(trie->bitmap); + trie->bitmap= NULL; + } else + OP( convert ) = TRIE; + + /* store the type in the flags */ + convert->flags = nodetype; + DEBUG_r({ + optimize = convert + + NODE_STEP_REGNODE + + regarglen[ OP( convert ) ]; + }); + /* XXX We really should free up the resource in trie now, + as we won't use them - (which resources?) dmq */ + } + /* needed for dumping*/ + DEBUG_r(if (optimize) { + regnode *opt = convert; + + while ( ++opt < optimize) { + Set_Node_Offset_Length(opt,0,0); + } + /* + Try to clean up some of the debris left after the + optimisation. + */ + while( optimize < jumper ) { + mjd_nodelen += Node_Length((optimize)); + OP( optimize ) = OPTIMIZED; + Set_Node_Offset_Length(optimize,0,0); + optimize++; + } + Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen); + }); + } /* end node insert */ + + /* Finish populating the prev field of the wordinfo array. Walk back + * from each accept state until we find another accept state, and if + * so, point the first word's .prev field at the second word. If the + * second already has a .prev field set, stop now. This will be the + * case either if we've already processed that word's accept state, + * or that state had multiple words, and the overspill words were + * already linked up earlier. + */ + { + U16 word; + U32 state; + U16 prev; + + for (word=1; word <= trie->wordcount; word++) { + prev = 0; + if (trie->wordinfo[word].prev) + continue; + state = trie->wordinfo[word].accept; + while (state) { + state = prev_states[state]; + if (!state) + break; + prev = trie->states[state].wordnum; + if (prev) + break; + } + trie->wordinfo[word].prev = prev; + } + Safefree(prev_states); + } + + + /* and now dump out the compressed format */ + DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); + + RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; +#ifdef DEBUGGING + RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; + RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; +#else + SvREFCNT_dec_NN(revcharmap); +#endif + return trie->jump + ? MADE_JUMP_TRIE + : trie->startstate>1 + ? MADE_EXACT_TRIE + : MADE_TRIE; +} + +STATIC regnode * +S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) +{ +/* The Trie is constructed and compressed now so we can build a fail array if + * it's needed + + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and + 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, + Ullman 1985/88 + ISBN 0-201-10088-6 + + We find the fail state for each state in the trie, this state is the longest + proper suffix of the current state's 'word' that is also a proper prefix of + another word in our trie. State 1 represents the word '' and is thus the + default fail state. This allows the DFA not to have to restart after its + tried and failed a word at a given point, it simply continues as though it + had been matching the other word in the first place. + Consider + 'abcdgu'=~/abcdefg|cdgu/ + When we get to 'd' we are still matching the first word, we would encounter + 'g' which would fail, which would bring us to the state representing 'd' in + the second word where we would try 'g' and succeed, proceeding to match + 'cdgu'. + */ + /* add a fail transition */ + const U32 trie_offset = ARG(source); + reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; + U32 *q; + const U32 ucharcount = trie->uniquecharcount; + const U32 numstates = trie->statecount; + const U32 ubound = trie->lasttrans + ucharcount; + U32 q_read = 0; + U32 q_write = 0; + U32 charid; + U32 base = trie->states[ 1 ].trans.base; + U32 *fail; + reg_ac_data *aho; + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); + regnode *stclass; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; + PERL_UNUSED_CONTEXT; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + if ( OP(source) == TRIE ) { + struct regnode_1 *op = (struct regnode_1 *) + PerlMemShared_calloc(1, sizeof(struct regnode_1)); + StructCopy(source,op,struct regnode_1); + stclass = (regnode *)op; + } else { + struct regnode_charclass *op = (struct regnode_charclass *) + PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); + StructCopy(source,op,struct regnode_charclass); + stclass = (regnode *)op; + } + OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */ + + ARG_SET( stclass, data_slot ); + aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); + RExC_rxi->data->data[ data_slot ] = (void*)aho; + aho->trie=trie_offset; + aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); + Copy( trie->states, aho->states, numstates, reg_trie_state ); + Newxz( q, numstates, U32); + aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); + aho->refcount = 1; + fail = aho->fail; + /* initialize fail[0..1] to be 1 so that we always have + a valid final fail state */ + fail[ 0 ] = fail[ 1 ] = 1; + + for ( charid = 0; charid < ucharcount ; charid++ ) { + const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); + if ( newstate ) { + q[ q_write ] = newstate; + /* set to point at the root */ + fail[ q[ q_write++ ] ]=1; + } + } + while ( q_read < q_write) { + const U32 cur = q[ q_read++ % numstates ]; + base = trie->states[ cur ].trans.base; + + for ( charid = 0 ; charid < ucharcount ; charid++ ) { + const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); + if (ch_state) { + U32 fail_state = cur; + U32 fail_base; + do { + fail_state = fail[ fail_state ]; + fail_base = aho->states[ fail_state ].trans.base; + } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); + + fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); + fail[ ch_state ] = fail_state; + if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) + { + aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; + } + q[ q_write++ % numstates] = ch_state; + } + } + } + /* restore fail[0..1] to 0 so that we "fall out" of the AC loop + when we fail in state 1, this allows us to use the + charclass scan to find a valid start char. This is based on the principle + that theres a good chance the string being searched contains lots of stuff + that cant be a start char. + */ + fail[ 0 ] = fail[ 1 ] = 0; + DEBUG_TRIE_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%*sStclass Failtable (%"UVuf" states): 0", + (int)(depth * 2), "", (UV)numstates + ); + for( q_read=1; q_read%3d: %s (%d)\n", \ + (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ + Next ? (REG_NODE_NUM(Next)) : 0 ); \ + }}); + + +/* The below joins as many adjacent EXACTish nodes as possible into a single + * one. The regop may be changed if the node(s) contain certain sequences that + * require special handling. The joining is only done if: + * 1) there is room in the current conglomerated node to entirely contain the + * next one. + * 2) they are the exact same node type + * + * The adjacent nodes actually may be separated by NOTHING-kind nodes, and + * these get optimized out + * + * If a node is to match under /i (folded), the number of characters it matches + * can be different than its character length if it contains a multi-character + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. + * + * And *unfolded_multi_char is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when whether the fold is valid or + * not won't be known until runtime; namely for EXACTF nodes that contain LATIN + * SMALL LETTER SHARP S, as only if the target string being matched against + * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose + * folding rules depend on the locale in force at runtime. (Multi-char folds + * whose components are all above the Latin1 range are not run-time locale + * dependent, and have already been folded by the time this function is + * called.) + * + * This is as good a place as any to discuss the design of handling these + * multi-character fold sequences. It's been wrong in Perl for a very long + * time. There are three code points in Unicode whose multi-character folds + * were long ago discovered to mess things up. The previous designs for + * dealing with these involved assigning a special node for them. This + * approach doesn't always work, as evidenced by this example: + * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches + * Both sides fold to "sss", but if the pattern is parsed to create a node that + * would match just the \xDF, it won't be able to handle the case where a + * successful match would have to cross the node's boundary. The new approach + * that hopefully generally solves the problem generates an EXACTFU_SS node + * that is "sss" in this case. + * + * It turns out that there are problems with all multi-character folds, and not + * just these three. Now the code is general, for all such cases. The + * approach taken is: + * 1) This routine examines each EXACTFish node that could contain multi- + * character folded sequences. Since a single character can fold into + * such a sequence, the minimum match length for this node is less than + * the number of characters in the node. This routine returns in + * *min_subtract how many characters to subtract from the the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. + * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS + * is used for an EXACTFU node that contains at least one "ss" sequence in + * it. For non-UTF-8 patterns and strings, this is the only case where + * there is a possible fold length change. That means that a regular + * EXACTFU node without UTF-8 involvement doesn't have to concern itself + * with length changes, and so can be processed faster. regexec.c takes + * advantage of this. Generally, an EXACTFish node that is in UTF-8 is + * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't + * known until runtime). This saves effort in regex matching. However, + * the pre-folding isn't done for non-UTF8 patterns because the fold of + * the MICRO SIGN requires UTF-8, and we don't want to slow things down by + * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, + * again, EXACTFL) nodes fold to isn't known until runtime. The fold + * possibilities for the non-UTF8 patterns are quite simple, except for + * the sharp s. All the ones that don't involve a UTF-8 target string are + * members of a fold-pair, and arrays are set up for all of them so that + * the other member of the pair can be found quickly. Code elsewhere in + * this file makes sure that in EXACTFU nodes, the sharp s gets folded to + * 'ss', even if the pattern isn't UTF-8. This avoids the issues + * described in the next item. + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes + * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL + * nodes, violate the assumption, and they are the only instances where it + * is violated. I'm reluctant to try to change the assumption, as the + * code involved is impenetrable to me (khw), so instead the code here + * punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid + * assumption. Thus, there is no optimization based on string lengths for + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFA where it never does. In an EXACTFA node in + * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) + * + * Similarly, the code that generates tries doesn't currently handle + * not-already-folded multi-char folds, and it looks like a pain to change + * that. Therefore, trie generation of EXACTFA nodes with the sharp s + * doesn't work. Instead, such an EXACTFA is turned into a new regnode, + * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people + * using /iaa matching will be doing so almost entirely with ASCII + * strings, so this should rarely be encountered in practice */ + +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ + if (PL_regkind[OP(scan)] == EXACT) \ + join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) + +STATIC U32 +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, + UV *min_subtract, bool *unfolded_multi_char, + U32 flags,regnode *val, U32 depth) +{ + /* Merge several consecutive EXACTish nodes into one. */ + regnode *n = regnext(scan); + U32 stringok = 1; + regnode *next = scan + NODE_SZ_STR(scan); + U32 merged = 0; + U32 stopnow = 0; +#ifdef DEBUGGING + regnode *stop = scan; + GET_RE_DEBUG_FLAGS_DECL; +#else + PERL_UNUSED_ARG(depth); +#endif + + PERL_ARGS_ASSERT_JOIN_EXACT; +#ifndef EXPERIMENTAL_INPLACESCAN + PERL_UNUSED_ARG(flags); + PERL_UNUSED_ARG(val); +#endif + DEBUG_PEEP("join",scan,depth); + + /* Look through the subsequent nodes in the chain. Skip NOTHING, merge + * EXACT ones that are mergeable to the current one. */ + while (n + && (PL_regkind[OP(n)] == NOTHING + || (stringok && OP(n) == OP(scan))) + && NEXT_OFF(n) + && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) + { + + if (OP(n) == TAIL || n > next) + stringok = 0; + if (PL_regkind[OP(n)] == NOTHING) { + DEBUG_PEEP("skip:",n,depth); + NEXT_OFF(scan) += NEXT_OFF(n); + next = n + NODE_STEP_REGNODE; +#ifdef DEBUGGING + if (stringok) + stop = n; +#endif + n = regnext(n); + } + else if (stringok) { + const unsigned int oldl = STR_LEN(scan); + regnode * const nnext = regnext(n); + + /* XXX I (khw) kind of doubt that this works on platforms (should + * Perl ever run on one) where U8_MAX is above 255 because of lots + * of other assumptions */ + /* Don't join if the sum can't fit into a single node */ + if (oldl + STR_LEN(n) > U8_MAX) + break; + + DEBUG_PEEP("merg",n,depth); + merged++; + + NEXT_OFF(scan) += NEXT_OFF(n); + STR_LEN(scan) += STR_LEN(n); + next = n + NODE_SZ_STR(n); + /* Now we can overwrite *n : */ + Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); +#ifdef DEBUGGING + stop = next - 1; +#endif + n = nnext; + if (stopnow) break; + } + +#ifdef EXPERIMENTAL_INPLACESCAN + if (flags && !NEXT_OFF(n)) { + DEBUG_PEEP("atch", val, depth); + if (reg_off_by_arg[OP(n)]) { + ARG_SET(n, val - n); + } + else { + NEXT_OFF(n) = val - n; + } + stopnow = 1; + } +#endif + } + + *min_subtract = 0; + *unfolded_multi_char = FALSE; + + /* Here, all the adjacent mergeable EXACTish nodes have been merged. We + * can now analyze for sequences of problematic code points. (Prior to + * this final joining, sequences could have been split over boundaries, and + * hence missed). The sequences only happen in folding, hence for any + * non-EXACT EXACTish node */ + if (OP(scan) != EXACT) { + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ + + /* One pass is made over the node's string looking for all the + * possibilities. To avoid some tests in the loop, there are two main + * cases, for UTF-8 patterns (which can't have EXACTF nodes) and + * non-UTF-8 */ + if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *unfolded_multi_char = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ + + /* Examine the string for a multi-character fold sequence. UTF-8 + * patterns have all characters pre-folded by the time this code is + * executed */ + while (s < s_end - 1) /* Can stop 1 before the end, as minimum + length sequence we are looking for is 2 */ + { + int count = 0; /* How many characters in a multi-char fold */ + int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); + if (! len) { /* Not a multi-char fold: get next char */ + s += UTF8SKIP(s); + continue; + } + + /* Nodes with 'ss' require special handling, except for + * EXACTFA-ish for which there is no multi-char fold to this */ + if (len == 2 && *s == 's' && *(s+1) == 's' + && OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE) + { + count = 2; + if (OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; + } + s += 2; + } + else { /* Here is a generic multi-char fold. */ + U8* multi_end = s + len; + + /* Count how many characters are in it. In the case of + * /aa, no folds which contain ASCII code points are + * allowed, so check for those, and skip if found. */ + if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { + count = utf8_length(s, multi_end); + s = multi_end; + } + else { + while (s < multi_end) { + if (isASCII(*s)) { + s++; + goto next_iteration; + } + else { + s += UTF8SKIP(s); + } + count++; + } + } + } + + /* The delta is how long the sequence is minus 1 (1 is how long + * the character that folds to the sequence is) */ + total_count_delta += count - 1; + next_iteration: ; + } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); + } + else if (OP(scan) == EXACTFA) { + + /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char + * fold to the ASCII range (and there are no existing ones in the + * upper latin1 range). But, as outlined in the comments preceding + * this function, we need to flag any occurrences of the sharp s. + * This character forbids trie formation (because of added + * complexity) */ + while (s < s_end) { + if (*s == LATIN_SMALL_LETTER_SHARP_S) { + OP(scan) = EXACTFA_NO_TRIE; + *unfolded_multi_char = TRUE; + break; + } + s++; + continue; + } + } + else { + + /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; + + while (s < upper) { + int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); + if (! len) { /* Not a multi-char fold. */ + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) + { + *unfolded_multi_char = TRUE; + } + s++; + continue; + } + + if (len == 2 + && isARG2_lower_or_UPPER_ARG1('s', *s) + && isARG2_lower_or_UPPER_ARG1('s', *(s+1))) + { + + /* EXACTF nodes need to know that the minimum length + * changed so that a sharp s in the string can match this + * ss in the pattern, but they remain EXACTF nodes, as they + * won't match this unless the target string is is UTF-8, + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; + } + } + + *min_subtract += len - 1; + s += len; + } + } + } + +#ifdef DEBUGGING + /* Allow dumping but overwriting the collection of skipped + * ops and/or strings with fake optimized ops */ + n = scan + NODE_SZ_STR(scan); + while (n <= stop) { + OP(n) = OPTIMIZED; + FLAGS(n) = 0; + NEXT_OFF(n) = 0; + n++; + } +#endif + DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)}); + return stopnow; +} + +/* REx optimizer. Converts nodes into quicker variants "in place". + Finds fixed substrings. */ + +/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set + to the position after last scanned or to NULL. */ + +#define INIT_AND_WITHP \ + assert(!and_withp); \ + Newx(and_withp,1, regnode_ssc); \ + SAVEFREEPV(and_withp) + +/* this is a chain of data about sub patterns we are processing that + need to be handled separately/specially in study_chunk. Its so + we can simulate recursion without losing state. */ +struct scan_frame; +typedef struct scan_frame { + regnode *last; /* last node to process in this frame */ + regnode *next; /* next node to process when last is reached */ + struct scan_frame *prev; /*previous frame*/ + U32 prev_recursed_depth; + I32 stop; /* what stopparen do we use */ +} scan_frame; + + +STATIC SSize_t +S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, + SSize_t *minlenp, SSize_t *deltap, + regnode *last, + scan_data_t *data, + I32 stopparen, + U32 recursed_depth, + regnode_ssc *and_withp, + U32 flags, U32 depth) + /* scanp: Start here (read-write). */ + /* deltap: Write maxlen-minlen here. */ + /* last: Stop before this one. */ + /* data: string data about the pattern */ + /* stopparen: treat close N as END */ + /* recursed: which subroutines have we recursed into */ + /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ +{ + /* There must be at least this number of characters to match */ + SSize_t min = 0; + I32 pars = 0, code; + regnode *scan = *scanp, *next; + SSize_t delta = 0; + int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); + int is_inf_internal = 0; /* The studied chunk is infinite */ + I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; + scan_data_t data_fake; + SV *re_trie_maxbuff = NULL; + regnode *first_non_open = scan; + SSize_t stopmin = SSize_t_MAX; + scan_frame *frame = NULL; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_STUDY_CHUNK; + +#ifdef DEBUGGING + StructCopy(&zero_scan_data, &data_fake, scan_data_t); +#endif + if ( depth == 0 ) { + while (first_non_open && OP(first_non_open) == OPEN) + first_non_open=regnext(first_non_open); + } + + + fake_study_recurse: + while ( scan && OP(scan) != END && scan < last ){ + UV min_subtract = 0; /* How mmany chars to subtract from the minimum + node length to get a real minimum (because + the folded version may be shorter) */ + bool unfolded_multi_char = FALSE; + /* Peephole optimizer: */ + DEBUG_OPTIMISE_MORE_r( + { + PerlIO_printf(Perl_debug_log, + "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ", + ((int) depth*2), "", (long)stopparen, + (unsigned long)depth, (unsigned long)recursed_depth); + if (recursed_depth) { + U32 i; + U32 j; + for ( j = 0 ; j < recursed_depth ; j++ ) { + PerlIO_printf(Perl_debug_log,"["); + for ( i = 0 ; i < (U32)RExC_npar ; i++ ) + PerlIO_printf(Perl_debug_log,"%d", + PAREN_TEST(RExC_study_chunk_recursed + + (j * RExC_study_chunk_recursed_bytes), i) + ? 1 : 0 + ); + PerlIO_printf(Perl_debug_log,"]"); + } + } + PerlIO_printf(Perl_debug_log,"\n"); + } + ); + DEBUG_STUDYDATA("Peep:", data, depth); + DEBUG_PEEP("Peep", scan, depth); + + + /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ + * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled + * by a different invocation of reg() -- Yves + */ + JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); + + /* Follow the next-chain of the current node and optimize + away all the NOTHINGs from it. */ + if (OP(scan) != CURLYX) { + const int max = (reg_off_by_arg[OP(scan)] + ? I32_MAX + /* I32 may be smaller than U16 on CRAYs! */ + : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); + int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); + int noff; + regnode *n = scan; + + /* Skip NOTHING and LONGJMP. */ + while ((n = regnext(n)) + && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) + || ((OP(n) == LONGJMP) && (noff = ARG(n)))) + && off + noff < max) + off += noff; + if (reg_off_by_arg[OP(scan)]) + ARG(scan) = off; + else + NEXT_OFF(scan) = off; + } + + + + /* The principal pseudo-switch. Cannot be a switch, since we + look into several different things. */ + if (OP(scan) == BRANCH || OP(scan) == BRANCHJ + || OP(scan) == IFTHEN) { + next = regnext(scan); + code = OP(scan); + /* demq: the op(next)==code check is to see if we have + * "branch-branch" AFAICT */ + + if (OP(next) == code || code == IFTHEN) { + /* NOTE - There is similar code to this block below for + * handling TRIE nodes on a re-study. If you change stuff here + * check there too. */ + SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; + regnode_ssc accum; + regnode * const startbranch=scan; + + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); + + while (OP(scan) == code) { + SSize_t deltanext, minnext, fake; + I32 f = 0; + regnode_ssc this_class; + + num++; + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + + data_fake.pos_delta = delta; + next = regnext(scan); + scan = NEXTOPER(scan); + if (code != BRANCH) + scan = NEXTOPER(scan); + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f = SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + /* we suppose the run is continuous, last=next...*/ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f,depth+1); + if (min1 > minnext) + min1 = minnext; + if (deltanext == SSize_t_MAX) { + is_inf = is_inf_internal = 1; + max1 = SSize_t_MAX; + } else if (max1 < minnext + deltanext) + max1 = minnext + deltanext; + scan = next; + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > minnext) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); + } + if (code == IFTHEN && num < 2) /* Empty ELSE branch */ + min1 = 0; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) + data->pos_delta = SSize_t_MAX; + else + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->longest = &(data->longest_float); + } + min += min1; + if (delta == SSize_t_MAX + || SSize_t_MAX - delta - (max1 - min1) < 0) + delta = SSize_t_MAX; + else + delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + } + } + + if (PERL_ENABLE_TRIE_OPTIMISATION && + OP( startbranch ) == BRANCH ) + { + /* demq. + + Assuming this was/is a branch we are dealing with: 'scan' + now points at the item that follows the branch sequence, + whatever it is. We now start at the beginning of the + sequence and look for subsequences of + + BRANCH->EXACT=>x1 + BRANCH->EXACT=>x2 + tail + + which would be constructed from a pattern like + /A|LIST|OF|WORDS/ + + If we can find such a subsequence we need to turn the first + element into a trie and then add the subsequent branch exact + strings to the trie. + + We have two cases + + 1. patterns where the whole set of branches can be + converted. + + 2. patterns where only a subset can be converted. + + In case 1 we can replace the whole set with a single regop + for the trie. In case 2 we need to keep the start and end + branches so + + 'BRANCH EXACT; BRANCH EXACT; BRANCH X' + becomes BRANCH TRIE; BRANCH X; + + There is an additional case, that being where there is a + common prefix, which gets split out into an EXACT like node + preceding the TRIE node. + + If x(1..n)==tail then we can do a simple trie, if not we make + a "jump" trie, such that when we match the appropriate word + we "jump" to the appropriate tail node. Essentially we turn + a nested if into a case structure of sorts. + + */ + + int made=0; + if (!re_trie_maxbuff) { + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + if (!SvIOK(re_trie_maxbuff)) + sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); + } + if ( SvIV(re_trie_maxbuff)>=0 ) { + regnode *cur; + regnode *first = (regnode *)NULL; + regnode *last = (regnode *)NULL; + regnode *tail = scan; + U8 trietype = 0; + U32 count=0; + +#ifdef DEBUGGING + SV * const mysv = sv_newmortal(); /* for dumping */ +#endif + /* var tail is used because there may be a TAIL + regop in the way. Ie, the exacts will point to the + thing following the TAIL, but the last branch will + point at the TAIL. So we advance tail. If we + have nested (?:) we may have to move through several + tails. + */ + + while ( OP( tail ) == TAIL ) { + /* this is the TAIL generated by (?:) */ + tail = regnext( tail ); + } + + + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, tail, NULL); + PerlIO_printf( Perl_debug_log, "%*s%s%s\n", + (int)depth * 2 + 2, "", + "Looking for TRIE'able sequences. Tail node is: ", + SvPV_nolen_const( mysv ) + ); + }); + + /* + + Step through the branches + cur represents each branch, + noper is the first thing to be matched as part + of that branch + noper_next is the regnext() of that node. + + We normally handle a case like this + /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also + support building with NOJUMPTRIE, which restricts + the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is + a possible optimization target. If we are building + under NOJUMPTRIE then we require that noper_next is + the same as scan (our current position in the regex + program). + + Once we have two or more consecutive such branches + we can create a trie of the EXACT's contents and + stitch it in place into the program. + + If the sequence represents all of the branches in + the alternation we replace the entire thing with a + single TRIE node. + + Otherwise when it is a subsequence we need to + stitch it in place and replace only the relevant + branches. This means the first branch has to remain + as it is used by the alternation logic, and its + next pointer, and needs to be repointed at the item + on the branch chain following the last branch we + have optimized away. + + This could be either a BRANCH, in which case the + subsequence is internal, or it could be the item + following the branch sequence in which case the + subsequence is at the end (which does not + necessarily mean the first node is the start of the + alternation). + + TRIE_TYPE(X) is a define which maps the optype to a + trietype. + + optype | trietype + ----------------+----------- + NOTHING | NOTHING + EXACT | EXACT + EXACTFU | EXACTFU + EXACTFU_SS | EXACTFU + EXACTFA | EXACTFA + + + */ +#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ + ( EXACT == (X) ) ? EXACT : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ + ( EXACTFA == (X) ) ? EXACTFA : \ + 0 ) + + /* dont use tail as the end marker for this traverse */ + for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { + regnode * const noper = NEXTOPER( cur ); + U8 noper_type = OP( noper ); + U8 noper_trietype = TRIE_TYPE( noper_type ); +#if defined(DEBUGGING) || defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0; +#endif + + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur, NULL); + PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", + (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); + + regprop(RExC_rx, mysv, noper, NULL); + PerlIO_printf( Perl_debug_log, " -> %s", + SvPV_nolen_const(mysv)); + + if ( noper_next ) { + regprop(RExC_rx, mysv, noper_next, NULL); + PerlIO_printf( Perl_debug_log,"\t=> %s\t", + SvPV_nolen_const(mysv)); + } + PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", + REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), + PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] + ); + }); + + /* Is noper a trieable nodetype that can be merged + * with the current trie (if there is one)? */ + if ( noper_trietype + && + ( + ( noper_trietype == NOTHING) + || ( trietype == NOTHING ) + || ( trietype == noper_trietype ) + ) +#ifdef NOJUMPTRIE + && noper_next == tail +#endif + && count < U16_MAX) + { + /* Handle mergable triable node Either we are + * the first node in a new trieable sequence, + * in which case we do some bookkeeping, + * otherwise we update the end pointer. */ + if ( !first ) { + first = cur; + if ( noper_trietype == NOTHING ) { +#if !defined(DEBUGGING) && !defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; +#endif + + if ( noper_next_trietype ) { + trietype = noper_next_trietype; + } else if (noper_next_type) { + /* a NOTHING regop is 1 regop wide. + * We need at least two for a trie + * so we can't merge this in */ + first = NULL; + } + } else { + trietype = noper_trietype; + } + } else { + if ( trietype == NOTHING ) + trietype = noper_trietype; + last = cur; + } + if (first) + count++; + } /* end handle mergable triable node */ + else { + /* handle unmergable node - + * noper may either be a triable node which can + * not be tried together with the current trie, + * or a non triable node */ + if ( last ) { + /* If last is set and trietype is not + * NOTHING then we have found at least two + * triable branch sequences in a row of a + * similar trietype so we can turn them + * into a trie. If/when we allow NOTHING to + * start a trie sequence this condition + * will be required, and it isn't expensive + * so we leave it in for now. */ + if ( trietype && trietype != NOTHING ) + make_trie( pRExC_state, + startbranch, first, cur, tail, + count, trietype, depth+1 ); + last = NULL; /* note: we clear/update + first, trietype etc below, + so we dont do it here */ + } + if ( noper_trietype +#ifdef NOJUMPTRIE + && noper_next == tail +#endif + ){ + /* noper is triable, so we can start a new + * trie sequence */ + count = 1; + first = cur; + trietype = noper_trietype; + } else if (first) { + /* if we already saw a first but the + * current node is not triable then we have + * to reset the first information. */ + count = 0; + first = NULL; + trietype = 0; + } + } /* end handle unmergable node */ + } /* loop over branches */ + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur, NULL); + PerlIO_printf( Perl_debug_log, + "%*s- %s (%d) \n", + (int)depth * 2 + 2, + "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + + }); + if ( last && trietype ) { + if ( trietype != NOTHING ) { + /* the last branch of the sequence was part of + * a trie, so we have to construct it here + * outside of the loop */ + made= make_trie( pRExC_state, startbranch, + first, scan, tail, count, + trietype, depth+1 ); +#ifdef TRIE_STUDY_OPT + if ( ((made == MADE_EXACT_TRIE && + startbranch == first) + || ( first_non_open == first )) && + depth==0 ) { + flags |= SCF_TRIE_RESTUDY; + if ( startbranch == first + && scan == tail ) + { + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; + } + } +#endif + } else { + /* at this point we know whatever we have is a + * NOTHING sequence/branch AND if 'startbranch' + * is 'first' then we can turn the whole thing + * into a NOTHING + */ + if ( startbranch == first ) { + regnode *opt; + /* the entire thing is a NOTHING sequence, + * something like this: (?:|) So we can + * turn it into a plain NOTHING op. */ + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur, NULL); + PerlIO_printf( Perl_debug_log, + "%*s- %s (%d) \n", (int)depth * 2 + 2, + "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + + }); + OP(startbranch)= NOTHING; + NEXT_OFF(startbranch)= tail - startbranch; + for ( opt= startbranch + 1; opt < tail ; opt++ ) + OP(opt)= OPTIMIZED; + } + } + } /* end if ( last) */ + } /* TRIE_MAXBUF is non zero */ + + } /* do trie */ + + } + else if ( code == BRANCHJ ) { /* single branch is optimized. */ + scan = NEXTOPER(NEXTOPER(scan)); + } else /* single branch is optimized. */ + scan = NEXTOPER(scan); + continue; + } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) { + scan_frame *newframe = NULL; + I32 paren; + regnode *start; + regnode *end; + U32 my_recursed_depth= recursed_depth; + + if (OP(scan) != SUSPEND) { + /* set the pointer */ + if (OP(scan) == GOSUB) { + paren = ARG(scan); + RExC_recurse[ARG2L(scan)] = scan; + start = RExC_open_parens[paren-1]; + end = RExC_close_parens[paren-1]; + } else { + paren = 0; + start = RExC_rxi->program + 1; + end = RExC_opend; + } + if (!recursed_depth + || + !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) + ) { + if (!recursed_depth) { + Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); + } else { + Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed_bytes, U8); + } + /* we havent recursed into this paren yet, so recurse into it */ + DEBUG_STUDYDATA("set:", data,depth); + PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); + my_recursed_depth= recursed_depth + 1; + Newx(newframe,1,scan_frame); + } else { + DEBUG_STUDYDATA("inf:", data,depth); + /* some form of infinite recursion, assume infinite length + * */ + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; + } + } else { + Newx(newframe,1,scan_frame); + paren = stopparen; + start = scan+2; + end = regnext(scan); + } + if (newframe) { + assert(start); + assert(end); + SAVEFREEPV(newframe); + newframe->next = regnext(scan); + newframe->last = last; + newframe->stop = stopparen; + newframe->prev = frame; + newframe->prev_recursed_depth = recursed_depth; + + DEBUG_STUDYDATA("frame-new:",data,depth); + DEBUG_PEEP("fnew", scan, depth); + + frame = newframe; + scan = start; + stopparen = paren; + last = end; + depth = depth + 1; + recursed_depth= my_recursed_depth; + + continue; + } + } + else if (OP(scan) == EXACT) { + SSize_t l = STR_LEN(scan); + UV uc; + if (UTF) { + const U8 * const s = (U8*)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + l, NULL); + l = utf8_length(s, s + l); + } else { + uc = *((U8*)STRING(scan)); + } + min += l; + if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ + /* The code below prefers earlier match for fixed + offset, later match for variable offset. */ + if (data->last_end == -1) { /* Update the start info. */ + data->last_start_min = data->pos_min; + data->last_start_max = is_inf + ? SSize_t_MAX : data->pos_min + data->pos_delta; + } + sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); + if (UTF) + SvUTF8_on(data->last_found); + { + SV * const sv = data->last_found; + MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += utf8_length((U8*)STRING(scan), + (U8*)STRING(scan)+STR_LEN(scan)); + } + data->last_end = data->pos_min + l; + data->pos_min += l; /* As in the first entry. */ + data->flags &= ~SF_BEFORE_EOL; + } + + /* ANDing the code point leaves at most it, and not in locale, and + * can't match null string */ + if (flags & SCF_DO_STCLASS_AND) { + ssc_cp_and(data->start_class, uc); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ssc_clear_locale(data->start_class); + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_add_cp(data->start_class, uc); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + } + else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is + EXACTFish */ + SSize_t l = STR_LEN(scan); + UV uc = *((U8*)STRING(scan)); + SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 + separate code points */ + const U8 * s = (U8*)STRING(scan); + + /* Search for fixed substrings supports EXACT only. */ + if (flags & SCF_DO_SUBSTR) { + assert(data); + scan_commit(pRExC_state, data, minlenp, is_inf); + } + if (UTF) { + uc = utf8_to_uvchr_buf(s, s + l, NULL); + l = utf8_length(s, s + l); + } + if (unfolded_multi_char) { + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; + } + min += l - min_subtract; + assert (min >= 0); + delta += min_subtract; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += l - min_subtract; + if (data->pos_min < 0) { + data->pos_min = 0; + } + data->pos_delta += min_subtract; + if (min_subtract) { + data->longest = &(data->longest_float); + } + } + + if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) { + ssc_clear_locale(data->start_class); + } + + if (! UTF) { + + /* We punt and assume can match anything if the node begins + * with a multi-character fold. Things are complicated. For + * example, /ffi/i could match any of: + * "\N{LATIN SMALL LIGATURE FFI}" + * "\N{LATIN SMALL LIGATURE FF}I" + * "F\N{LATIN SMALL LIGATURE FI}" + * plus several other things; and making sure we have all the + * possibilities is hard. */ + if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); + } + else { + + /* Any Latin1 range character can potentially match any + * other depending on the locale */ + if (OP(scan) == EXACTFL) { + _invlist_union(EXACTF_invlist, PL_Latin1, + &EXACTF_invlist); + } + else { + /* But otherwise, it matches at least itself. We can + * quickly tell if it has a distinct fold, and if so, + * it matches that as well */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (IS_IN_SOME_FOLD_L1(uc)) { + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, + PL_fold_latin1[uc]); + } + } + + /* Some characters match above-Latin1 ones under /i. This + * is true of EXACTFL ones when the locale is UTF-8 */ + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) + && (! isASCII(uc) || (OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE))) + { + add_above_Latin1_folds(pRExC_state, + (U8) uc, + &EXACTF_invlist); + } + } + } + else { /* Pattern is UTF-8 */ + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + STRLEN foldlen = UTF8SKIP(s); + const U8* e = s + STR_LEN(scan); + SV** listp; + + /* The only code points that aren't folded in a UTF EXACTFish + * node are are the problematic ones in EXACTFL nodes */ + if (OP(scan) == EXACTFL + && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) + { + /* We need to check for the possibility that this EXACTFL + * node begins with a multi-char fold. Therefore we fold + * the first few characters of it so that we can make that + * check */ + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD(*s); + s++; + } + else { + STRLEN len; + to_utf8_fold(s, d, &len); + d += len; + s += UTF8SKIP(s); + } + } + + /* And set up so the code below that looks in this folded + * buffer instead of the node's string */ + e = d; + foldlen = UTF8SKIP(folded); + s = folded; + } + + /* When we reach here 's' points to the fold of the first + * character(s) of the node; and 'e' points to far enough along + * the folded string to be just past any possible multi-char + * fold. 'foldlen' is the length in bytes of the first + * character in 's' + * + * Unlike the non-UTF-8 case, the macro for determining if a + * string is a multi-char fold requires all the characters to + * already be folded. This is because of all the complications + * if not. Note that they are folded anyway, except in EXACTFL + * nodes. Like the non-UTF case above, we punt if the node + * begins with a multi-char fold */ + + if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); + } + else { /* Single char fold */ + + /* It matches all the things that fold to it, which are + * found in PL_utf8_foldclosures (including itself) */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) s, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE) + && isASCII(c) != isASCII(uc)) + { + continue; + } + + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c); + } + } + } + } + if (flags & SCF_DO_STCLASS_AND) { + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); + } + else if (REGNODE_VARIES(OP(scan))) { + SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; + I32 fl = 0, f = flags; + regnode * const oscan = scan; + regnode_ssc this_class; + regnode_ssc *oclass = NULL; + I32 next_is_eval = 0; + + switch (PL_regkind[OP(scan)]) { + case WHILEM: /* End of (?:...)* . */ + scan = NEXTOPER(scan); + goto finish; + case PLUS: + if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { + next = NEXTOPER(scan); + if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { + mincount = 1; + maxcount = REG_INFTY; + next = regnext(scan); + scan = NEXTOPER(scan); + goto do_curly; + } + } + if (flags & SCF_DO_SUBSTR) + data->pos_min++; + min++; + /* FALLTHROUGH */ + case STAR: + if (flags & SCF_DO_STCLASS) { + mincount = 0; + maxcount = REG_INFTY; + next = regnext(scan); + scan = NEXTOPER(scan); + goto do_curly; + } + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + scan = regnext(scan); + goto optimize_curly_tail; + case CURLY: + if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) + && (scan->flags == stopparen)) + { + mincount = 1; + maxcount = 1; + } else { + mincount = ARG1(scan); + maxcount = ARG2(scan); + } + next = regnext(scan); + if (OP(scan) == CURLYX) { + I32 lp = (data ? *(data->last_closep) : 0); + scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); + } + scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; + next_is_eval = (OP(scan) == EVAL); + do_curly: + if (flags & SCF_DO_SUBSTR) { + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ + pos_before = data->pos_min; + } + if (data) { + fl = data->flags; + data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); + if (is_inf) + data->flags |= SF_IS_INF; + } + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + oclass = data->start_class; + data->start_class = &this_class; + f |= SCF_DO_STCLASS_AND; + f &= ~SCF_DO_STCLASS_OR; + } + /* Exclude from super-linear cache processing any {n,m} + regops for which the combination of input pos and regex + pos is not enough information to determine if a match + will be possible. + + For example, in the regex /foo(bar\s*){4,8}baz/ with the + regex pos at the \s*, the prospects for a match depend not + only on the input position but also on how many (bar\s*) + repeats into the {4,8} we are. */ + if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) + f &= ~SCF_WHILEM_VISITED_POS; + + /* This will finish on WHILEM, setting scan, or on NULL: */ + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, + last, data, stopparen, recursed_depth, NULL, + (mincount == 0 + ? (f & ~SCF_DO_SUBSTR) + : f) + ,depth+1); + + if (flags & SCF_DO_STCLASS) + data->start_class = oclass; + if (mincount == 0 || minnext == 0) { + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + } + else if (flags & SCF_DO_STCLASS_AND) { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&this_class, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + } + } else { /* Non-zero len */ + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + } + else if (flags & SCF_DO_STCLASS_AND) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + flags &= ~SCF_DO_STCLASS; + } + if (!scan) /* It was not CURLYX, but CURLY. */ + scan = next; + if (!(flags & SCF_TRIE_DOING_RESTUDY) + /* ? quantifier ok, except for (?{ ... }) */ + && (next_is_eval || !(mincount == 0 && maxcount == 1)) + && (minnext == 0) && (deltanext == 0) + && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) + && maxcount <= REG_INFTY/3) /* Complement check for big + count */ + { + /* Fatal warnings may leak the regexp without this: */ + SAVEFREESV(RExC_rx_sv); + ckWARNreg(RExC_parse, + "Quantifier unexpected on zero-length expression"); + (void)ReREFCNT_inc(RExC_rx_sv); + } + + min += minnext * mincount; + is_inf_internal |= deltanext == SSize_t_MAX + || (maxcount == REG_INFTY && minnext + deltanext > 0); + is_inf |= is_inf_internal; + if (is_inf) { + delta = SSize_t_MAX; + } else { + delta += (minnext + deltanext) * maxcount + - minnext * mincount; + } + /* Try powerful optimization CURLYX => CURLYN. */ + if ( OP(oscan) == CURLYX && data + && data->flags & SF_IN_PAR + && !(data->flags & SF_HAS_EVAL) + && !deltanext && minnext == 1 ) { + /* Try to optimize to CURLYN. */ + regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; + regnode * const nxt1 = nxt; +#ifdef DEBUGGING + regnode *nxt2; +#endif + + /* Skip open. */ + nxt = regnext(nxt); + if (!REGNODE_SIMPLE(OP(nxt)) + && !(PL_regkind[OP(nxt)] == EXACT + && STR_LEN(nxt) == 1)) + goto nogo; +#ifdef DEBUGGING + nxt2 = nxt; +#endif + nxt = regnext(nxt); + if (OP(nxt) != CLOSE) + goto nogo; + if (RExC_open_parens) { + RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ + } + /* Now we know that nxt2 is the only contents: */ + oscan->flags = (U8)ARG(nxt); + OP(oscan) = CURLYN; + OP(nxt1) = NOTHING; /* was OPEN. */ + +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ +#endif + } + nogo: + + /* Try optimization CURLYX => CURLYM. */ + if ( OP(oscan) == CURLYX && data + && !(data->flags & SF_HAS_PAR) + && !(data->flags & SF_HAS_EVAL) + && !deltanext /* atom is fixed width */ + && minnext != 0 /* CURLYM can't handle zero width */ + + /* Nor characters whose fold at run-time may be + * multi-character */ + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) + ) { + /* XXXX How to optimize if data == 0? */ + /* Optimize to a simpler form. */ + regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ + regnode *nxt2; + + OP(oscan) = CURLYM; + while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ + && (OP(nxt2) != WHILEM)) + nxt = nxt2; + OP(nxt2) = SUCCEED; /* Whas WHILEM */ + /* Need to optimize away parenths. */ + if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { + /* Set the parenth number. */ + regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ + + oscan->flags = (U8)ARG(nxt); + if (RExC_open_parens) { + RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ + } + OP(nxt1) = OPTIMIZED; /* was OPEN. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ +#endif +#if 0 + while ( nxt1 && (OP(nxt1) != WHILEM)) { + regnode *nnxt = regnext(nxt1); + if (nnxt == nxt) { + if (reg_off_by_arg[OP(nxt1)]) + ARG_SET(nxt1, nxt2 - nxt1); + else if (nxt2 - nxt1 < U16_MAX) + NEXT_OFF(nxt1) = nxt2 - nxt1; + else + OP(nxt) = NOTHING; /* Cannot beautify */ + } + nxt1 = nnxt; + } +#endif + /* Optimize again: */ + study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, + NULL, stopparen, recursed_depth, NULL, 0,depth+1); + } + else + oscan->flags = 0; + } + else if ((OP(oscan) == CURLYX) + && (flags & SCF_WHILEM_VISITED_POS) + /* See the comment on a similar expression above. + However, this time it's not a subexpression + we care about, but the expression itself. */ + && (maxcount == REG_INFTY) + && data && ++data->whilem_c < 16) { + /* This stays as CURLYX, we can put the count/of pair. */ + /* Find WHILEM (as in regexec.c) */ + regnode *nxt = oscan + NEXT_OFF(oscan); + + if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ + nxt += ARG(nxt); + PREVOPER(nxt)->flags = (U8)(data->whilem_c + | (RExC_whilem_seen << 4)); /* On WHILEM */ + } + if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (flags & SCF_DO_SUBSTR) { + SV *last_str = NULL; + STRLEN last_chrs = 0; + int counted = mincount != 0; + + if (data->last_end > 0 && mincount != 0) { /* Ends with a + string. */ + SSize_t b = pos_before >= data->last_start_min + ? pos_before : data->last_start_min; + STRLEN l; + const char * const s = SvPV_const(data->last_found, l); + SSize_t old = b - data->last_start_min; + + if (UTF) + old = utf8_hop((U8*)s, old) - (U8*)s; + l -= old; + /* Get the added string: */ + last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; + if (deltanext == 0 && pos_before == b) { + /* What was added is a constant string */ + if (mincount > 1) { + + SvGROW(last_str, (mincount * l) + 1); + repeatcpy(SvPVX(last_str) + l, + SvPVX_const(last_str), l, + mincount - 1); + SvCUR_set(last_str, SvCUR(last_str) * mincount); + /* Add additional parts. */ + SvCUR_set(data->last_found, + SvCUR(data->last_found) - l); + sv_catsv(data->last_found, last_str); + { + SV * sv = data->last_found; + MAGIC *mg = + SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += last_chrs * (mincount-1); + } + last_chrs *= mincount; + data->last_end += l * (mincount - 1); + } + } else { + /* start offset must point into the last copy */ + data->last_start_min += minnext * (mincount - 1); + data->last_start_max += is_inf ? SSize_t_MAX + : (maxcount - 1) * (minnext + data->pos_delta); + } + } + /* It is counted once already... */ + data->pos_min += minnext * (mincount - counted); +#if 0 +PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf + " SSize_t_MAX=%"UVuf" minnext=%"UVuf + " maxcount=%"UVuf" mincount=%"UVuf"\n", + (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, + (UV)mincount); +if (deltanext != SSize_t_MAX) +PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", + (UV)(-counted * deltanext + (minnext + deltanext) * maxcount + - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); +#endif + if (deltanext == SSize_t_MAX + || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) + data->pos_delta = SSize_t_MAX; + else + data->pos_delta += - counted * deltanext + + (minnext + deltanext) * maxcount - minnext * mincount; + if (mincount != maxcount) { + /* Cannot extend fixed substrings found inside + the group. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + if (mincount && last_str) { + SV * const sv = data->last_found; + MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + + if (mg) + mg->mg_len = -1; + sv_setsv(sv, last_str); + data->last_end = data->pos_min; + data->last_start_min = data->pos_min - last_chrs; + data->last_start_max = is_inf + ? SSize_t_MAX + : data->pos_min + data->pos_delta - last_chrs; + } + data->longest = &(data->longest_float); + } + SvREFCNT_dec(last_str); + } + if (data && (fl & SF_HAS_EVAL)) + data->flags |= SF_HAS_EVAL; + optimize_curly_tail: + if (OP(oscan) != CURLYX) { + while (PL_regkind[OP(next = regnext(oscan))] == NOTHING + && NEXT_OFF(next)) + NEXT_OFF(oscan) += NEXT_OFF(next); + } + continue; + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", + OP(scan)); +#endif + case REF: + case CLUMP: + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) { + if (OP(scan) == CLUMP) { + /* Actually is any start char, but very few code points + * aren't start characters */ + ssc_match_all_cp(data->start_class); + } + else { + ssc_anything(data->start_class); + } + } + flags &= ~SCF_DO_STCLASS; + break; + } + } + else if (OP(scan) == LNBREAK) { + if (flags & SCF_DO_STCLASS) { + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], + FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg for + * 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + } + min++; + delta++; /* Because of the 2 char string cr-lf */ + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min += 1; + data->pos_delta += 1; + data->longest = &(data->longest_float); + } + } + else if (REGNODE_SIMPLE(OP(scan))) { + + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min++; + } + min++; + if (flags & SCF_DO_STCLASS) { + bool invert = 0; + SV* my_invlist = sv_2mortal(_new_invlist(0)); + U8 namedclass; + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + + /* Some of the logic below assumes that switching + locale on will only add false positives. */ + switch (OP(scan)) { + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", + OP(scan)); +#endif + case CANY: + case SANY: + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_match_all_cp(data->start_class); + break; + + case REG_ANY: + { + SV* REG_ANY_invlist = _new_invlist(2); + REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, + '\n'); + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert, hence all but \n + */ + ); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert */ + ); + ssc_clear_locale(data->start_class); + } + SvREFCNT_dec_NN(REG_ANY_invlist); + } + break; + + case ANYOF: + if (flags & SCF_DO_STCLASS_AND) + ssc_and(pRExC_state, data->start_class, + (regnode_charclass *) scan); + else + ssc_or(pRExC_state, data->start_class, + (regnode_charclass *) scan); + break; + + case NPOSIXL: + invert = 1; + /* FALLTHROUGH */ + + case POSIXL: + namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; + if (flags & SCF_DO_STCLASS_AND) { + bool was_there = cBOOL( + ANYOF_POSIXL_TEST(data->start_class, + namedclass)); + ANYOF_POSIXL_ZERO(data->start_class); + if (was_there) { /* Do an AND */ + ANYOF_POSIXL_SET(data->start_class, namedclass); + } + /* No individual code points can now match */ + data->start_class->invlist + = sv_2mortal(_new_invlist(0)); + } + else { + int complement = namedclass + ((invert) ? -1 : 1); + + assert(flags & SCF_DO_STCLASS_OR); + + /* If the complement of this class was already there, + * the result is that they match all code points, + * (\d + \D == everything). Remove the classes from + * future consideration. Locale is not relevant in + * this case */ + if (ANYOF_POSIXL_TEST(data->start_class, complement)) { + ssc_match_all_cp(data->start_class); + ANYOF_POSIXL_CLEAR(data->start_class, namedclass); + ANYOF_POSIXL_CLEAR(data->start_class, complement); + } + else { /* The usual case; just add this class to the + existing set */ + ANYOF_POSIXL_SET(data->start_class, namedclass); + } + } + break; + + case NPOSIXA: /* For these, we always know the exact set of + what's matched */ + invert = 1; + /* FALLTHROUGH */ + case POSIXA: + if (FLAGS(scan) == _CC_ASCII) { + my_invlist = PL_XPosix_ptrs[_CC_ASCII]; + } + else { + _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)], + PL_XPosix_ptrs[_CC_ASCII], + &my_invlist); + } + goto join_posix; + + case NPOSIXD: + case NPOSIXU: + invert = 1; + /* FALLTHROUGH */ + case POSIXD: + case POSIXU: + my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); + + /* NPOSIXD matches all upper Latin1 code points unless the + * target string being matched is UTF-8, which is + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); + } + + join_posix: + + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, my_invlist, invert); + ssc_clear_locale(data->start_class); + } + else { + assert(flags & SCF_DO_STCLASS_OR); + ssc_union(data->start_class, my_invlist, invert); + } + } + if (flags & SCF_DO_STCLASS_OR) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { + data->flags |= (OP(scan) == MEOL + ? SF_BEFORE_MEOL + : SF_BEFORE_SEOL); + scan_commit(pRExC_state, data, minlenp, is_inf); + + } + else if ( PL_regkind[OP(scan)] == BRANCHJ + /* Lookbehind, or need to calculate parens/evals/stclass: */ + && (scan->flags || data || (flags & SCF_DO_STCLASS)) + && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { + if ( OP(scan) == UNLESSM && + scan->flags == 0 && + OP(NEXTOPER(NEXTOPER(scan))) == NOTHING && + OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED + ) { + regnode *opt; + regnode *upto= regnext(scan); + DEBUG_PARSE_r({ + SV * const mysv_val=sv_newmortal(); + DEBUG_STUDYDATA("OPFAIL",data,depth); + + /*DEBUG_PARSE_MSG("opfail");*/ + regprop(RExC_rx, mysv_val, upto, NULL); + PerlIO_printf(Perl_debug_log, + "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(upto), + (IV)(upto - scan) + ); + }); + OP(scan) = OPFAIL; + NEXT_OFF(scan) = upto - scan; + for (opt= scan + 1; opt < upto ; opt++) + OP(opt) = OPTIMIZED; + scan= upto; + continue; + } + if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY + || OP(scan) == UNLESSM ) + { + /* Negative Lookahead/lookbehind + In this case we can't do fixed string optimisation. + */ + + SSize_t deltanext, minnext, fake = 0; + regnode *nscan; + regnode_ssc intrnl; + int f = 0; + + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + data_fake.pos_delta = delta; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + ssc_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, + last, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1); + if (scan->flags) { + if (deltanext) { + FAIL("Variable length lookbehind not implemented"); + } + else if (minnext > (I32)U8_MAX) { + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); + } + scan->flags = (U8)minnext; + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (f & SCF_DO_STCLASS_AND) { + if (flags & SCF_DO_STCLASS_OR) { + /* OR before, AND after: ideally we would recurse with + * data_fake to get the AND applied by study of the + * remainder of the pattern, and then derecurse; + * *** HACK *** for now just treat as "no information". + * See [perl #56690]. + */ + ssc_init(pRExC_state, data->start_class); + } else { + /* AND before and after: combine and continue */ + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); + } + } + } +#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY + else { + /* Positive Lookahead/lookbehind + In this case we can do fixed string optimisation, + but we must be careful about it. Note in the case of + lookbehind the positions will be offset by the minimum + length of the pattern, something we won't know about + until after the recurse. + */ + SSize_t deltanext, fake = 0; + regnode *nscan; + regnode_ssc intrnl; + int f = 0; + /* We use SAVEFREEPV so that when the full compile + is finished perl will clean up the allocated + minlens when it's all done. This way we don't + have to worry about freeing them when we know + they wont be used, which would be a pain. + */ + SSize_t *minnextp; + Newx( minnextp, 1, SSize_t ); + SAVEFREEPV(minnextp); + + if (data) { + StructCopy(data, &data_fake, scan_data_t); + if ((flags & SCF_DO_SUBSTR) && data->last_found) { + f |= SCF_DO_SUBSTR; + if (scan->flags) + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); + data_fake.last_found=newSVsv(data->last_found); + } + } + else + data_fake.last_closep = &fake; + data_fake.flags = 0; + data_fake.pos_delta = delta; + if (is_inf) + data_fake.flags |= SF_IS_INF; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + ssc_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, + &deltanext, last, &data_fake, + stopparen, recursed_depth, NULL, + f,depth+1); + if (scan->flags) { + if (deltanext) { + FAIL("Variable length lookbehind not implemented"); + } + else if (*minnextp > (I32)U8_MAX) { + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); + } + scan->flags = (U8)*minnextp; + } + + *minnextp += min; + + if (f & SCF_DO_STCLASS_AND) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { + if (RExC_rx->minlen<*minnextp) + RExC_rx->minlen=*minnextp; + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); + SvREFCNT_dec_NN(data_fake.last_found); + + if ( data_fake.minlen_fixed != minlenp ) + { + data->offset_fixed= data_fake.offset_fixed; + data->minlen_fixed= data_fake.minlen_fixed; + data->lookbehind_fixed+= scan->flags; + } + if ( data_fake.minlen_float != minlenp ) + { + data->minlen_float= data_fake.minlen_float; + data->offset_float_min=data_fake.offset_float_min; + data->offset_float_max=data_fake.offset_float_max; + data->lookbehind_float+= scan->flags; + } + } + } + } +#endif + } + else if (OP(scan) == OPEN) { + if (stopparen != (I32)ARG(scan)) + pars++; + } + else if (OP(scan) == CLOSE) { + if (stopparen == (I32)ARG(scan)) { + break; + } + if ((I32)ARG(scan) == is_par) { + next = regnext(scan); + + if ( next && (OP(next) != WHILEM) && next < last) + is_par = 0; /* Disable optimization */ + } + if (data) + *(data->last_closep) = ARG(scan); + } + else if (OP(scan) == EVAL) { + if (data) + data->flags |= SF_HAS_EVAL; + } + else if ( PL_regkind[OP(scan)] == ENDLIKE ) { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + flags &= ~SCF_DO_SUBSTR; + } + if (data && OP(scan)==ACCEPT) { + data->flags |= SCF_SEEN_ACCEPT; + if (stopmin > min) + stopmin = min; + } + } + else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ + { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; + } + else if (OP(scan) == GPOS) { + if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && + !(delta || is_inf || (data && data->pos_delta))) + { + if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->intflags |= PREGf_ANCH_GPOS; + if (RExC_rx->gofs < (STRLEN)min) + RExC_rx->gofs = min; + } else { + RExC_rx->intflags |= PREGf_GPOS_FLOAT; + RExC_rx->gofs = 0; + } + } +#ifdef TRIE_STUDY_OPT +#ifdef FULL_TRIE_STUDY + else if (PL_regkind[OP(scan)] == TRIE) { + /* NOTE - There is similar code to this block above for handling + BRANCH nodes on the initial study. If you change stuff here + check there too. */ + regnode *trie_node= scan; + regnode *tail= regnext(scan); + reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; + SSize_t max1 = 0, min1 = SSize_t_MAX; + regnode_ssc accum; + + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); + + if (!trie->jump) { + min1= trie->minlen; + max1= trie->maxlen; + } else { + const regnode *nextbranch= NULL; + U32 word; + + for ( word=1 ; word <= trie->wordcount ; word++) + { + SSize_t deltanext=0, minnext=0, f = 0, fake; + regnode_ssc this_class; + + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + data_fake.pos_delta = delta; + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f = SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + if (trie->jump[word]) { + if (!nextbranch) + nextbranch = trie_node + trie->jump[0]; + scan= trie_node + trie->jump[word]; + /* We go from the jump point to the branch that follows + it. Note this means we need the vestigal unused + branches even though they arent otherwise used. */ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, (regnode *)nextbranch, &data_fake, + stopparen, recursed_depth, NULL, f,depth+1); + } + if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) + nextbranch= regnext((regnode*)nextbranch); + + if (min1 > (SSize_t)(minnext + trie->minlen)) + min1 = minnext + trie->minlen; + if (deltanext == SSize_t_MAX) { + is_inf = is_inf_internal = 1; + max1 = SSize_t_MAX; + } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) + max1 = minnext + deltanext + trie->maxlen; + + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > min + min1) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); + } + } + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->longest = &(data->longest_float); + } + min += min1; + delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + } + } + scan= tail; + continue; + } +#else + else if (PL_regkind[OP(scan)] == TRIE) { + reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; + U8*bang=NULL; + + min += trie->minlen; + delta += (trie->maxlen - trie->minlen); + flags &= ~SCF_DO_STCLASS; /* xxx */ + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min += trie->minlen; + data->pos_delta += (trie->maxlen - trie->minlen); + if (trie->maxlen != trie->minlen) + data->longest = &(data->longest_float); + } + if (trie->jump) /* no more substrings -- for now /grr*/ + flags &= ~SCF_DO_SUBSTR; + } +#endif /* old or new */ +#endif /* TRIE_STUDY_OPT */ + + /* Else: zero-length, ignore. */ + scan = regnext(scan); + } + /* If we are exiting a recursion we can unset its recursed bit + * and allow ourselves to enter it again - no danger of an + * infinite loop there. + if (stopparen > -1 && recursed) { + DEBUG_STUDYDATA("unset:", data,depth); + PAREN_UNSET( recursed, stopparen); + } + */ + if (frame) { + DEBUG_STUDYDATA("frame-end:",data,depth); + DEBUG_PEEP("fend", scan, depth); + /* restore previous context */ + last = frame->last; + scan = frame->next; + stopparen = frame->stop; + recursed_depth = frame->prev_recursed_depth; + depth = depth - 1; + + frame = frame->prev; + goto fake_study_recurse; + } + + finish: + assert(!frame); + DEBUG_STUDYDATA("pre-fin:",data,depth); + + *scanp = scan; + *deltap = is_inf_internal ? SSize_t_MAX : delta; + + if (flags & SCF_DO_SUBSTR && is_inf) + data->pos_delta = SSize_t_MAX - data->pos_min; + if (is_par > (I32)U8_MAX) + is_par = 0; + if (is_par && pars==1 && data) { + data->flags |= SF_IN_PAR; + data->flags &= ~SF_HAS_PAR; + } + else if (pars && data) { + data->flags |= SF_HAS_PAR; + data->flags &= ~SF_IN_PAR; + } + if (flags & SCF_DO_STCLASS_OR) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + if (flags & SCF_TRIE_RESTUDY) + data->flags |= SCF_TRIE_RESTUDY; + + DEBUG_STUDYDATA("post-fin:",data,depth); + + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { + RExC_maxlen = final_minlen + delta; + } + return final_minlen; + } + /* not-reached */ +} + +STATIC U32 +S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) +{ + U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; + + PERL_ARGS_ASSERT_ADD_DATA; + + Renewc(RExC_rxi->data, + sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), + char, struct reg_data); + if(count) + Renew(RExC_rxi->data->what, count + n, U8); + else + Newx(RExC_rxi->data->what, n, U8); + RExC_rxi->data->count = count + n; + Copy(s, RExC_rxi->data->what + count, n, U8); + return count; +} + +/*XXX: todo make this not included in a non debugging perl, but appears to be + * used anyway there, in 'use re' */ +#ifndef PERL_IN_XSUB_RE +void +Perl_reginitcolors(pTHX) +{ + const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); + if (s) { + char *t = savepv(s); + int i = 0; + PL_colors[0] = t; + while (++i < 6) { + t = strchr(t, '\t'); + if (t) { + *t = '\0'; + PL_colors[i] = ++t; + } + else + PL_colors[i] = t = (char *)""; + } + } else { + int i = 0; + while (i < 6) + PL_colors[i++] = (char *)""; + } + PL_colorset = 1; +} +#endif + + +#ifdef TRIE_STUDY_OPT +#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \ + STMT_START { \ + if ( \ + (data.flags & SCF_TRIE_RESTUDY) \ + && ! restudied++ \ + ) { \ + dOsomething; \ + goto reStudy; \ + } \ + } STMT_END +#else +#define CHECK_RESTUDY_GOTO_butfirst +#endif + +/* + * pregcomp - compile a regular expression into internal code + * + * Decides which engine's compiler to call based on the hint currently in + * scope + */ + +#ifndef PERL_IN_XSUB_RE + +/* return the currently in-scope regex engine (or the default if none) */ + +regexp_engine const * +Perl_current_re_engine(pTHX) +{ + if (IN_PERL_COMPILETIME) { + HV * const table = GvHV(PL_hintgv); + SV **ptr; + + if (!table || !(PL_hints & HINT_LOCALIZE_HH)) + return &PL_core_reg_engine; + ptr = hv_fetchs(table, "regcomp", FALSE); + if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) + return &PL_core_reg_engine; + return INT2PTR(regexp_engine*,SvIV(*ptr)); + } + else { + SV *ptr; + if (!PL_curcop->cop_hints_hash) + return &PL_core_reg_engine; + ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); + if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) + return &PL_core_reg_engine; + return INT2PTR(regexp_engine*,SvIV(ptr)); + } +} + + +REGEXP * +Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) +{ + regexp_engine const *eng = current_re_engine(); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_PREGCOMP; + + /* Dispatch a request to compile a regexp to correct regexp engine. */ + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", + PTR2UV(eng)); + }); + return CALLREGCOMP_ENG(eng, pattern, flags); +} +#endif + +/* public(ish) entry point for the perl core's own regex compiling code. + * It's actually a wrapper for Perl_re_op_compile that only takes an SV + * pattern rather than a list of OPs, and uses the internal engine rather + * than the current one */ + +REGEXP * +Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) +{ + SV *pat = pattern; /* defeat constness! */ + PERL_ARGS_ASSERT_RE_COMPILE; + return Perl_re_op_compile(aTHX_ &pat, 1, NULL, +#ifdef PERL_IN_XSUB_RE + &my_reg_engine, +#else + &PL_core_reg_engine, +#endif + NULL, NULL, rx_flags, 0); +} + + +/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code + * blocks, recalculate the indices. Update pat_p and plen_p in-place to + * point to the realloced string and length. + * + * This is essentially a copy of Perl_bytes_to_utf8() with the code index + * stuff added */ + +static void +S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, + char **pat_p, STRLEN *plen_p, int num_code_blocks) +{ + U8 *const src = (U8*)*pat_p; + U8 *dst; + int n=0; + STRLEN s = 0, d = 0; + bool do_end = 0; + GET_RE_DEBUG_FLAGS_DECL; + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); + + Newx(dst, *plen_p * 2 + 1, U8); + + while (s < *plen_p) { + if (NATIVE_BYTE_IS_INVARIANT(src[s])) + dst[d] = src[s]; + else { + dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); + dst[d] = UTF8_EIGHT_BIT_LO(src[s]); + } + if (n < num_code_blocks) { + if (!do_end && pRExC_state->code_blocks[n].start == s) { + pRExC_state->code_blocks[n].start = d; + assert(dst[d] == '('); + do_end = 1; + } + else if (do_end && pRExC_state->code_blocks[n].end == s) { + pRExC_state->code_blocks[n].end = d; + assert(dst[d] == ')'); + do_end = 0; + n++; + } + } + s++; + d++; + } + dst[d] = '\0'; + *plen_p = d; + *pat_p = (char*) dst; + SAVEFREEPV(*pat_p); + RExC_orig_utf8 = RExC_utf8 = 1; +} + + + +/* S_concat_pat(): concatenate a list of args to the pattern string pat, + * while recording any code block indices, and handling overloading, + * nested qr// objects etc. If pat is null, it will allocate a new + * string, or just return the first arg, if there's only one. + * + * Returns the malloced/updated pat. + * patternp and pat_count is the array of SVs to be concatted; + * oplist is the optional list of ops that generated the SVs; + * recompile_p is a pointer to a boolean that will be set if + * the regex will need to be recompiled. + * delim, if non-null is an SV that will be inserted between each element + */ + +static SV* +S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, + SV *pat, SV ** const patternp, int pat_count, + OP *oplist, bool *recompile_p, SV *delim) +{ + SV **svp; + int n = 0; + bool use_delim = FALSE; + bool alloced = FALSE; + + /* if we know we have at least two args, create an empty string, + * then concatenate args to that. For no args, return an empty string */ + if (!pat && pat_count != 1) { + pat = newSVpvs(""); + SAVEFREESV(pat); + alloced = TRUE; + } + + for (svp = patternp; svp < patternp + pat_count; svp++) { + SV *sv; + SV *rx = NULL; + STRLEN orig_patlen = 0; + bool code = 0; + SV *msv = use_delim ? delim : *svp; + if (!msv) msv = &PL_sv_undef; + + /* if we've got a delimiter, we go round the loop twice for each + * svp slot (except the last), using the delimiter the second + * time round */ + if (use_delim) { + svp--; + use_delim = FALSE; + } + else if (delim) + use_delim = TRUE; + + if (SvTYPE(msv) == SVt_PVAV) { + /* we've encountered an interpolated array within + * the pattern, e.g. /...@a..../. Expand the list of elements, + * then recursively append elements. + * The code in this block is based on S_pushav() */ + + AV *const av = (AV*)msv; + const SSize_t maxarg = AvFILL(av) + 1; + SV **array; + + if (oplist) { + assert(oplist->op_type == OP_PADAV + || oplist->op_type == OP_RV2AV); + oplist = OP_SIBLING(oplist); + } + + if (SvRMAGICAL(av)) { + SSize_t i; + + Newx(array, maxarg, SV*); + SAVEFREEPV(array); + for (i=0; i < maxarg; i++) { + SV ** const svp = av_fetch(av, i, FALSE); + array[i] = svp ? *svp : &PL_sv_undef; + } + } + else + array = AvARRAY(av); + + pat = S_concat_pat(aTHX_ pRExC_state, pat, + array, maxarg, NULL, recompile_p, + /* $" */ + GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV)))); + + continue; + } + + + /* we make the assumption here that each op in the list of + * op_siblings maps to one SV pushed onto the stack, + * except for code blocks, with have both an OP_NULL and + * and OP_CONST. + * This allows us to match up the list of SVs against the + * list of OPs to find the next code block. + * + * Note that PUSHMARK PADSV PADSV .. + * is optimised to + * PADRANGE PADSV PADSV .. + * so the alignment still works. */ + + if (oplist) { + if (oplist->op_type == OP_NULL + && (oplist->op_flags & OPf_SPECIAL)) + { + assert(n < pRExC_state->num_code_blocks); + pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0; + pRExC_state->code_blocks[n].block = oplist; + pRExC_state->code_blocks[n].src_regex = NULL; + n++; + code = 1; + oplist = OP_SIBLING(oplist); /* skip CONST */ + assert(oplist); + } + oplist = OP_SIBLING(oplist);; + } + + /* apply magic and QR overloading to arg */ + + SvGETMAGIC(msv); + if (SvROK(msv) && SvAMAGIC(msv)) { + SV *sv = AMG_CALLunary(msv, regexp_amg); + if (sv) { + if (SvROK(sv)) + sv = SvRV(sv); + if (SvTYPE(sv) != SVt_REGEXP) + Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP"); + msv = sv; + } + } + + /* try concatenation overload ... */ + if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) && + (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) + { + sv_setsv(pat, sv); + /* overloading involved: all bets are off over literal + * code. Pretend we haven't seen it */ + pRExC_state->num_code_blocks -= n; + n = 0; + } + else { + /* ... or failing that, try "" overload */ + while (SvAMAGIC(msv) + && (sv = AMG_CALLunary(msv, string_amg)) + && sv != msv + && !( SvROK(msv) + && SvROK(sv) + && SvRV(msv) == SvRV(sv)) + ) { + msv = sv; + SvGETMAGIC(msv); + } + if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) + msv = SvRV(msv); + + if (pat) { + /* this is a partially unrolled + * sv_catsv_nomg(pat, msv); + * that allows us to adjust code block indices if + * needed */ + STRLEN dlen; + char *dst = SvPV_force_nomg(pat, dlen); + orig_patlen = dlen; + if (SvUTF8(msv) && !SvUTF8(pat)) { + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); + sv_setpvn(pat, dst, dlen); + SvUTF8_on(pat); + } + sv_catsv_nomg(pat, msv); + rx = msv; + } + else + pat = msv; + + if (code) + pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; + } + + /* extract any code blocks within any embedded qr//'s */ + if (rx && SvTYPE(rx) == SVt_REGEXP + && RX_ENGINE((REGEXP*)rx)->op_comp) + { + + RXi_GET_DECL(ReANY((REGEXP *)rx), ri); + if (ri->num_code_blocks) { + int i; + /* the presence of an embedded qr// with code means + * we should always recompile: the text of the + * qr// may not have changed, but it may be a + * different closure than last time */ + *recompile_p = 1; + Renew(pRExC_state->code_blocks, + pRExC_state->num_code_blocks + ri->num_code_blocks, + struct reg_code_block); + pRExC_state->num_code_blocks += ri->num_code_blocks; + + for (i=0; i < ri->num_code_blocks; i++) { + struct reg_code_block *src, *dst; + STRLEN offset = orig_patlen + + ReANY((REGEXP *)rx)->pre_prefix; + assert(n < pRExC_state->num_code_blocks); + src = &ri->code_blocks[i]; + dst = &pRExC_state->code_blocks[n]; + dst->start = src->start + offset; + dst->end = src->end + offset; + dst->block = src->block; + dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) + src->src_regex + ? src->src_regex + : (REGEXP*)rx); + n++; + } + } + } + } + /* avoid calling magic multiple times on a single element e.g. =~ $qr */ + if (alloced) + SvSETMAGIC(pat); + + return pat; +} + + + +/* see if there are any run-time code blocks in the pattern. + * False positives are allowed */ + +static bool +S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + int n = 0; + STRLEN s; + + PERL_UNUSED_CONTEXT; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + s = pRExC_state->code_blocks[n].end; + n++; + continue; + } + /* TODO ideally should handle [..], (#..), /#.../x to reduce false + * positives here */ + if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' && + (pat[s+2] == '{' + || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{')) + ) + return 1; + } + return 0; +} + +/* Handle run-time code blocks. We will already have compiled any direct + * or indirect literal code blocks. Now, take the pattern 'pat' and make a + * copy of it, but with any literal code blocks blanked out and + * appropriate chars escaped; then feed it into + * + * eval "qr'modified_pattern'" + * + * For example, + * + * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno + * + * becomes + * + * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno' + * + * After eval_sv()-ing that, grab any new code blocks from the returned qr + * and merge them with any code blocks of the original regexp. + * + * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; + * instead, just save the qr and return FALSE; this tells our caller that + * the original pattern needs upgrading to utf8. + */ + +static bool +S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + SV *qr; + + GET_RE_DEBUG_FLAGS_DECL; + + if (pRExC_state->runtime_code_qr) { + /* this is the second time we've been called; this should + * only happen if the main pattern got upgraded to utf8 + * during compilation; re-use the qr we compiled first time + * round (which should be utf8 too) + */ + qr = pRExC_state->runtime_code_qr; + pRExC_state->runtime_code_qr = NULL; + assert(RExC_utf8 && SvUTF8(qr)); + } + else { + int n = 0; + STRLEN s; + char *p, *newpat; + int newlen = plen + 6; /* allow for "qr''x\0" extra chars */ + SV *sv, *qr_ref; + dSP; + + /* determine how many extra chars we need for ' and \ escaping */ + for (s = 0; s < plen; s++) { + if (pat[s] == '\'' || pat[s] == '\\') + newlen++; + } + + Newx(newpat, newlen, char); + p = newpat; + *p++ = 'q'; *p++ = 'r'; *p++ = '\''; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + /* blank out literal code block */ + assert(pat[s] == '('); + while (s <= pRExC_state->code_blocks[n].end) { + *p++ = '_'; + s++; + } + s--; + n++; + continue; + } + if (pat[s] == '\'' || pat[s] == '\\') + *p++ = '\\'; + *p++ = pat[s]; + } + *p++ = '\''; + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) + *p++ = 'x'; + *p++ = '\0'; + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%sre-parsing pattern for runtime code:%s %s\n", + PL_colors[4],PL_colors[5],newpat); + }); + + sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); + Safefree(newpat); + + ENTER; + SAVETMPS; + save_re_context(); + PUSHSTACKi(PERLSI_REQUIRE); + /* G_RE_REPARSING causes the toker to collapse \\ into \ when + * parsing qr''; normally only q'' does this. It also alters + * hints handling */ + eval_sv(sv, G_SCALAR|G_RE_REPARSING); + SvREFCNT_dec_NN(sv); + SPAGAIN; + qr_ref = POPs; + PUTBACK; + { + SV * const errsv = ERRSV; + if (SvTRUE_NN(errsv)) + { + Safefree(pRExC_state->code_blocks); + /* use croak_sv ? */ + Perl_croak_nocontext("%"SVf, SVfARG(errsv)); + } + } + assert(SvROK(qr_ref)); + qr = SvRV(qr_ref); + assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); + /* the leaving below frees the tmp qr_ref. + * Give qr a life of its own */ + SvREFCNT_inc(qr); + POPSTACK; + FREETMPS; + LEAVE; + + } + + if (!RExC_utf8 && SvUTF8(qr)) { + /* first time through; the pattern got upgraded; save the + * qr for the next time through */ + assert(!pRExC_state->runtime_code_qr); + pRExC_state->runtime_code_qr = qr; + return 0; + } + + + /* extract any code blocks within the returned qr// */ + + + /* merge the main (r1) and run-time (r2) code blocks into one */ + { + RXi_GET_DECL(ReANY((REGEXP *)qr), r2); + struct reg_code_block *new_block, *dst; + RExC_state_t * const r1 = pRExC_state; /* convenient alias */ + int i1 = 0, i2 = 0; + + if (!r2->num_code_blocks) /* we guessed wrong */ + { + SvREFCNT_dec_NN(qr); + return 1; + } + + Newx(new_block, + r1->num_code_blocks + r2->num_code_blocks, + struct reg_code_block); + dst = new_block; + + while ( i1 < r1->num_code_blocks + || i2 < r2->num_code_blocks) + { + struct reg_code_block *src; + bool is_qr = 0; + + if (i1 == r1->num_code_blocks) { + src = &r2->code_blocks[i2++]; + is_qr = 1; + } + else if (i2 == r2->num_code_blocks) + src = &r1->code_blocks[i1++]; + else if ( r1->code_blocks[i1].start + < r2->code_blocks[i2].start) + { + src = &r1->code_blocks[i1++]; + assert(src->end < r2->code_blocks[i2].start); + } + else { + assert( r1->code_blocks[i1].start + > r2->code_blocks[i2].start); + src = &r2->code_blocks[i2++]; + is_qr = 1; + assert(src->end < r1->code_blocks[i1].start); + } + + assert(pat[src->start] == '('); + assert(pat[src->end] == ')'); + dst->start = src->start; + dst->end = src->end; + dst->block = src->block; + dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) + : src->src_regex; + dst++; + } + r1->num_code_blocks += r2->num_code_blocks; + Safefree(r1->code_blocks); + r1->code_blocks = new_block; + } + + SvREFCNT_dec_NN(qr); + return 1; +} + + +STATIC bool +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, + SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, + SSize_t lookbehind, SSize_t offset, SSize_t *minlen, + STRLEN longest_length, bool eol, bool meol) +{ + /* This is the common code for setting up the floating and fixed length + * string data extracted from Perl_re_op_compile() below. Returns a boolean + * as to whether succeeded or not */ + + I32 t; + SSize_t ml; + + if (! (longest_length + || (eol /* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) + ) + /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ + || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) + { + return FALSE; + } + + /* copy the information about the longest from the reg_scan_data + over to the program. */ + if (SvUTF8(sv_longest)) { + *rx_utf8 = sv_longest; + *rx_substr = NULL; + } else { + *rx_substr = sv_longest; + *rx_utf8 = NULL; + } + /* end_shift is how many chars that must be matched that + follow this item. We calculate it ahead of time as once the + lookbehind offset is added in we lose the ability to correctly + calculate it.*/ + ml = minlen ? *(minlen) : (SSize_t)longest_length; + *rx_end_shift = ml - offset + - longest_length + (SvTAIL(sv_longest) != 0) + + lookbehind; + + t = (eol/* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))); + fbm_compile(sv_longest, t ? FBMcf_TAIL : 0); + + return TRUE; +} + +/* + * Perl_re_op_compile - the perl internal RE engine's function to compile a + * regular expression into internal code. + * The pattern may be passed either as: + * a list of SVs (patternp plus pat_count) + * a list of OPs (expr) + * If both are passed, the SV list is used, but the OP list indicates + * which SVs are actually pre-compiled code blocks + * + * The SVs in the list have magic and qr overloading applied to them (and + * the list may be modified in-place with replacement SVs in the latter + * case). + * + * If the pattern hasn't changed from old_re, then old_re will be + * returned. + * + * eng is the current engine. If that engine has an op_comp method, then + * handle directly (i.e. we assume that op_comp was us); otherwise, just + * do the initial concatenation of arguments and pass on to the external + * engine. + * + * If is_bare_re is not null, set it to a boolean indicating whether the + * arg list reduced (after overloading) to a single bare regex which has + * been returned (i.e. /$qr/). + * + * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. + * + * pm_flags contains the PMf_* flags, typically based on those from the + * pm_flags field of the related PMOP. Currently we're only interested in + * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL. + * + * We can't allocate space until we know how big the compiled form will be, + * but we can't compile it (and thus know how big it is) until we've got a + * place to put the code. So we cheat: we compile it twice, once with code + * generation turned off and size counting turned on, and once "for real". + * This also means that we don't allocate space until we are sure that the + * thing really will compile successfully, and we never have to move the + * code and thus invalidate pointers into it. (Note that it has to be in + * one piece because free() must be able to free it all.) [NB: not true in perl] + * + * Beware that the optimization-preparation code in here knows about some + * of the structure of the compiled regexp. [I'll say.] + */ + +REGEXP * +Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, + OP *expr, const regexp_engine* eng, REGEXP *old_re, + bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags) +{ + REGEXP *rx; + struct regexp *r; + regexp_internal *ri; + STRLEN plen; + char *exp; + regnode *scan; + I32 flags; + SSize_t minlen = 0; + U32 rx_flags; + SV *pat; + SV *code_blocksv = NULL; + SV** new_patternp = patternp; + + /* these are all flags - maybe they should be turned + * into a single int with different bit masks */ + I32 sawlookahead = 0; + I32 sawplus = 0; + I32 sawopen = 0; + I32 sawminmod = 0; + + regex_charset initial_charset = get_regex_charset(orig_rx_flags); + bool recompile = 0; + bool runtime_code = 0; + scan_data_t data; + RExC_state_t RExC_state; + RExC_state_t * const pRExC_state = &RExC_state; +#ifdef TRIE_STUDY_OPT + int restudied = 0; + RExC_state_t copyRExC_state; +#endif + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_OP_COMPILE; + + DEBUG_r(if (!PL_colorset) reginitcolors()); + +#ifndef PERL_IN_XSUB_RE + /* Initialize these here instead of as-needed, as is quick and avoids + * having to test them each time otherwise */ + if (! PL_AboveLatin1) { + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); + PL_Latin1 = _new_invlist_C_array(Latin1_invlist); + PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); + PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); + PL_HasMultiCharFold = + _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); + } +#endif + + pRExC_state->code_blocks = NULL; + pRExC_state->num_code_blocks = 0; + + if (is_bare_re) + *is_bare_re = FALSE; + + if (expr && (expr->op_type == OP_LIST || + (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { + /* allocate code_blocks if needed */ + OP *o; + int ncode = 0; + + for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) + if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) + ncode++; /* count of DO blocks */ + if (ncode) { + pRExC_state->num_code_blocks = ncode; + Newx(pRExC_state->code_blocks, ncode, struct reg_code_block); + } + } + + if (!pat_count) { + /* compile-time pattern with just OP_CONSTs and DO blocks */ + + int n; + OP *o; + + /* find how many CONSTs there are */ + assert(expr); + n = 0; + if (expr->op_type == OP_CONST) + n = 1; + else + for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) { + if (o->op_type == OP_CONST) + n++; + } + + /* fake up an SV array */ + + assert(!new_patternp); + Newx(new_patternp, n, SV*); + SAVEFREEPV(new_patternp); + pat_count = n; + + n = 0; + if (expr->op_type == OP_CONST) + new_patternp[n] = cSVOPx_sv(expr); + else + for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) { + if (o->op_type == OP_CONST) + new_patternp[n++] = cSVOPo_sv; + } + + } + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "Assembling pattern from %d elements%s\n", pat_count, + orig_rx_flags & RXf_SPLIT ? " for split" : "")); + + /* set expr to the first arg op */ + + if (pRExC_state->num_code_blocks + && expr->op_type != OP_CONST) + { + expr = cLISTOPx(expr)->op_first; + assert( expr->op_type == OP_PUSHMARK + || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK) + || expr->op_type == OP_PADRANGE); + expr = OP_SIBLING(expr); + } + + pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count, + expr, &recompile, NULL); + + /* handle bare (possibly after overloading) regex: foo =~ $re */ + { + SV *re = pat; + if (SvROK(re)) + re = SvRV(re); + if (SvTYPE(re) == SVt_REGEXP) { + if (is_bare_re) + *is_bare_re = TRUE; + SvREFCNT_inc(re); + Safefree(pRExC_state->code_blocks); + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "Precompiled pattern%s\n", + orig_rx_flags & RXf_SPLIT ? " for split" : "")); + + return (REGEXP*)re; + } + } + + exp = SvPV_nomg(pat, plen); + + if (!eng->op_comp) { + if ((SvUTF8(pat) && IN_BYTES) + || SvGMAGICAL(pat) || SvAMAGIC(pat)) + { + /* make a temporary copy; either to convert to bytes, + * or to avoid repeating get-magic / overloaded stringify */ + pat = newSVpvn_flags(exp, plen, SVs_TEMP | + (IN_BYTES ? 0 : SvUTF8(pat))); + } + Safefree(pRExC_state->code_blocks); + return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); + } + + /* ignore the utf8ness if the pattern is 0 length */ + RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); + RExC_uni_semantics = 0; + RExC_contains_locale = 0; + RExC_contains_i = 0; + pRExC_state->runtime_code_qr = NULL; + + DEBUG_COMPILE_r({ + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", + PL_colors[4],PL_colors[5],s); + }); + + redo_first_pass: + /* we jump here if we upgrade the pattern to utf8 and have to + * recompile */ + + if ((pm_flags & PMf_USE_RE_EVAL) + /* this second condition covers the non-regex literal case, + * i.e. $foo =~ '(?{})'. */ + || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL)) + ) + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); + + /* return old regex if pattern hasn't changed */ + /* XXX: note in the below we have to check the flags as well as the + * pattern. + * + * Things get a touch tricky as we have to compare the utf8 flag + * independently from the compile flags. */ + + if ( old_re + && !recompile + && !!RX_UTF8(old_re) == !!RExC_utf8 + && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) ) + && RX_PRECOMP(old_re) + && RX_PRELEN(old_re) == plen + && memEQ(RX_PRECOMP(old_re), exp, plen) + && !runtime_code /* with runtime code, always recompile */ ) + { + Safefree(pRExC_state->code_blocks); + return old_re; + } + + rx_flags = orig_rx_flags; + + if (rx_flags & PMf_FOLD) { + RExC_contains_i = 1; + } + if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + + /* Set to use unicode semantics if the pattern is in utf8 and has the + * 'depends' charset specified, as it means unicode when utf8 */ + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); + } + + RExC_precomp = exp; + RExC_flags = rx_flags; + RExC_pm_flags = pm_flags; + + if (runtime_code) { + if (TAINTING_get && TAINT_get) + Perl_croak(aTHX_ "Eval-group in insecure regular expression"); + + if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { + /* whoops, we have a non-utf8 pattern, whilst run-time code + * got compiled as utf8. Try again with a utf8 pattern */ + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, + pRExC_state->num_code_blocks); + goto redo_first_pass; + } + } + assert(!pRExC_state->runtime_code_qr); + + RExC_sawback = 0; + + RExC_seen = 0; + RExC_maxlen = 0; + RExC_in_lookbehind = 0; + RExC_seen_zerolen = *exp == '^' ? -1 : 0; + RExC_extralen = 0; + RExC_override_recoding = 0; + RExC_in_multi_char_class = 0; + + /* First pass: determine size, legality. */ + RExC_parse = exp; + RExC_start = exp; + RExC_end = exp + plen; + RExC_naughty = 0; + RExC_npar = 1; + RExC_nestroot = 0; + RExC_size = 0L; + RExC_emit = (regnode *) &RExC_emit_dummy; + RExC_whilem_seen = 0; + RExC_open_parens = NULL; + RExC_close_parens = NULL; + RExC_opend = NULL; + RExC_paren_names = NULL; +#ifdef DEBUGGING + RExC_paren_name_list = NULL; +#endif + RExC_recurse = NULL; + RExC_study_chunk_recursed = NULL; + RExC_study_chunk_recursed_bytes= 0; + RExC_recurse_count = 0; + pRExC_state->code_index = 0; + +#if 0 /* REGC() is (currently) a NOP at the first pass. + * Clever compilers notice this and complain. --jhi */ + REGC((U8)REG_MAGIC, (char*)RExC_emit); +#endif + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); + RExC_lastnum=0; + RExC_lastparse=NULL; + ); + /* reg may croak on us, not giving us a chance to free + pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may + need it to survive as long as the regexp (qr/(?{})/). + We must check that code_blocksv is not already set, because we may + have jumped back to restart the sizing pass. */ + if (pRExC_state->code_blocks && !code_blocksv) { + code_blocksv = newSV_type(SVt_PV); + SAVEFREESV(code_blocksv); + SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks); + SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/ + } + if (reg(pRExC_state, 0, &flags,1) == NULL) { + /* It's possible to write a regexp in ascii that represents Unicode + codepoints outside of the byte range, such as via \x{100}. If we + detect such a sequence we have to convert the entire pattern to utf8 + and then recompile, as our sizing calculation will have been based + on 1 byte == 1 character, but we will need to use utf8 to encode + at least some part of the pattern, and therefore must convert the whole + thing. + -- dmq */ + if (flags & RESTART_UTF8) { + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, + pRExC_state->num_code_blocks); + goto redo_first_pass; + } + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); + } + if (code_blocksv) + SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ + + DEBUG_PARSE_r({ + PerlIO_printf(Perl_debug_log, + "Required size %"IVdf" nodes\n" + "Starting second pass (creation)\n", + (IV)RExC_size); + RExC_lastnum=0; + RExC_lastparse=NULL; + }); + + /* The first pass could have found things that force Unicode semantics */ + if ((RExC_utf8 || RExC_uni_semantics) + && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET) + { + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); + } + + /* Small enough for pointer-storage convention? + If extralen==0, this means that we will not need long jumps. */ + if (RExC_size >= 0x10000L && RExC_extralen) + RExC_size += RExC_extralen; + else + RExC_extralen = 0; + if (RExC_whilem_seen > 15) + RExC_whilem_seen = 15; + + /* Allocate space and zero-initialize. Note, the two step process + of zeroing when in debug mode, thus anything assigned has to + happen after that */ + rx = (REGEXP*) newSV_type(SVt_REGEXP); + r = ReANY(rx); + Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char, regexp_internal); + if ( r == NULL || ri == NULL ) + FAIL("Regexp out of space"); +#ifdef DEBUGGING + /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ + Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char); +#else + /* bulk initialize base fields with 0. */ + Zero(ri, sizeof(regexp_internal), char); +#endif + + /* non-zero initialization begins here */ + RXi_SET( r, ri ); + r->engine= eng; + r->extflags = rx_flags; + RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK; + + if (pm_flags & PMf_IS_QR) { + ri->code_blocks = pRExC_state->code_blocks; + ri->num_code_blocks = pRExC_state->num_code_blocks; + } + else + { + int n; + for (n = 0; n < pRExC_state->num_code_blocks; n++) + if (pRExC_state->code_blocks[n].src_regex) + SAVEFREESV(pRExC_state->code_blocks[n].src_regex); + SAVEFREEPV(pRExC_state->code_blocks); + } + + { + bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_charset = (get_regex_charset(r->extflags) + != REGEX_DEPENDS_CHARSET); + + /* The caret is output if there are any defaults: if not all the STD + * flags are set, or if no character set specifier is needed */ + bool has_default = + (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) + || ! has_charset); + bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) + == REG_RUN_ON_COMMENT_SEEN); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) + >> RXf_PMf_STD_PMMOD_SHIFT); + const char *fptr = STD_PAT_MODS; /*"msix"*/ + char *p; + /* Allocate for the worst case, which is all the std flags are turned + * on. If more precision is desired, we could do a population count of + * the flags set. This could be done with a small lookup table, or by + * shifting, masking and adding, or even, when available, assembly + * language for a machine-language population count. + * We never output a minus, as all those are defaults, so are + * covered by the caret */ + const STRLEN wraplen = plen + has_p + has_runon + + has_default /* If needs a caret */ + + /* If needs a character set specifier */ + + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) + + (sizeof(STD_PAT_MODS) - 1) + + (sizeof("(?:)") - 1); + + Newx(p, wraplen + 1, char); /* +1 for the ending NUL */ + r->xpv_len_u.xpvlenu_pv = p; + if (RExC_utf8) + SvFLAGS(rx) |= SVf_UTF8; + *p++='('; *p++='?'; + + /* If a default, cover it using the caret */ + if (has_default) { + *p++= DEFAULT_PAT_MOD; + } + if (has_charset) { + STRLEN len; + const char* const name = get_regex_charset_name(r->extflags, &len); + Copy(name, p, len, char); + p += len; + } + if (has_p) + *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ + { + char ch; + while((ch = *fptr++)) { + if(reganch & 1) + *p++ = ch; + reganch >>= 1; + } + } + + *p++ = ':'; + Copy(RExC_precomp, p, plen, char); + assert ((RX_WRAPPED(rx) - p) < 16); + r->pre_prefix = p - RX_WRAPPED(rx); + p += plen; + if (has_runon) + *p++ = '\n'; + *p++ = ')'; + *p = 0; + SvCUR_set(rx, p - RX_WRAPPED(rx)); + } + + r->intflags = 0; + r->nparens = RExC_npar - 1; /* set early to validate backrefs */ + + /* setup various meta data about recursion, this all requires + * RExC_npar to be correctly set, and a bit later on we clear it */ + if (RExC_seen & REG_RECURSE_SEEN) { + Newxz(RExC_open_parens, RExC_npar,regnode *); + SAVEFREEPV(RExC_open_parens); + Newxz(RExC_close_parens,RExC_npar,regnode *); + SAVEFREEPV(RExC_close_parens); + } + if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { + /* Note, RExC_npar is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + + ((RExC_npar & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } + + /* Useful during FAIL. */ +#ifdef RE_TRACK_PATTERN_OFFSETS + Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ + DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, + "%s %"UVuf" bytes for offset annotations.\n", + ri->u.offsets ? "Got" : "Couldn't get", + (UV)((2*RExC_size+1) * sizeof(U32)))); +#endif + SetProgLen(ri,RExC_size); + RExC_rx_sv = rx; + RExC_rx = r; + RExC_rxi = ri; + + /* Second pass: emit code. */ + RExC_flags = rx_flags; /* don't let top level (?i) bleed */ + RExC_pm_flags = pm_flags; + RExC_parse = exp; + RExC_end = exp + plen; + RExC_naughty = 0; + RExC_npar = 1; + RExC_emit_start = ri->program; + RExC_emit = ri->program; + RExC_emit_bound = ri->program + RExC_size + 1; + pRExC_state->code_index = 0; + + REGC((U8)REG_MAGIC, (char*) RExC_emit++); + if (reg(pRExC_state, 0, &flags,1) == NULL) { + ReREFCNT_dec(rx); + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); + } + /* XXXX To minimize changes to RE engine we always allocate + 3-units-long substrs field. */ + Newx(r->substrs, 1, struct reg_substr_data); + if (RExC_recurse_count) { + Newxz(RExC_recurse,RExC_recurse_count,regnode *); + SAVEFREEPV(RExC_recurse); + } + +reStudy: + r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; + Zero(r->substrs, 1, struct reg_substr_data); + if (RExC_study_chunk_recursed) + Zero(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + +#ifdef TRIE_STUDY_OPT + if (!restudied) { + StructCopy(&zero_scan_data, &data, scan_data_t); + copyRExC_state = RExC_state; + } else { + U32 seen=RExC_seen; + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); + + RExC_state = copyRExC_state; + if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; + else + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; + StructCopy(&zero_scan_data, &data, scan_data_t); + } +#else + StructCopy(&zero_scan_data, &data, scan_data_t); +#endif + + /* Dig out information for optimizations. */ + r->extflags = RExC_flags; /* was pm_op */ + /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ + + if (UTF) + SvUTF8_on(rx); /* Unicode in it? */ + ri->regstclass = NULL; + if (RExC_naughty >= 10) /* Probably an expensive pattern. */ + r->intflags |= PREGf_NAUGHTY; + scan = ri->program + 1; /* First BRANCH. */ + + /* testing for BRANCH here tells us whether there is "must appear" + data in the pattern. If there is then we can use it for optimisations */ + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. + */ + SSize_t fake; + STRLEN longest_float_length, longest_fixed_length; + regnode_ssc ch_class; /* pointed to by data */ + int stclass_flag; + SSize_t last_close = 0; /* pointed to by data */ + regnode *first= scan; + regnode *first_next= regnext(first); + /* + * Skip introductions and multiplicators >= 1 + * so that we can extract the 'meat' of the pattern that must + * match in the large if() sequence following. + * NOTE that EXACT is NOT covered here, as it is normally + * picked up by the optimiser separately. + * + * This is unfortunate as the optimiser isnt handling lookahead + * properly currently. + * + */ + while ((OP(first) == OPEN && (sawopen = 1)) || + /* An OR of *one* alternative - should not happen now. */ + (OP(first) == BRANCH && OP(first_next) != BRANCH) || + /* for now we can't handle lookbehind IFMATCH*/ + (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) || + (OP(first) == PLUS) || + (OP(first) == MINMOD) || + /* An {n,m} with n>0 */ + (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || + (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) + { + /* + * the only op that could be a regnode is PLUS, all the rest + * will be regnode_1 or regnode_2. + * + * (yves doesn't think this is true) + */ + if (OP(first) == PLUS) + sawplus = 1; + else { + if (OP(first) == MINMOD) + sawminmod = 1; + first += regarglen[OP(first)]; + } + first = NEXTOPER(first); + first_next= regnext(first); + } + + /* Starting-point info. */ + again: + DEBUG_PEEP("first:",first,0); + /* Ignore EXACT as we deal with it later. */ + if (PL_regkind[OP(first)] == EXACT) { + if (OP(first) == EXACT) + NOOP; /* Empty, get anchored substr later. */ + else + ri->regstclass = first; + } +#ifdef TRIE_STCLASS + else if (PL_regkind[OP(first)] == TRIE && + ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) + { + /* this can happen only on restudy */ + ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); + } +#endif + else if (REGNODE_SIMPLE(OP(first))) + ri->regstclass = first; + else if (PL_regkind[OP(first)] == BOUND || + PL_regkind[OP(first)] == NBOUND) + ri->regstclass = first; + else if (PL_regkind[OP(first)] == BOL) { + r->intflags |= (OP(first) == MBOL + ? PREGf_ANCH_MBOL + : (OP(first) == SBOL + ? PREGf_ANCH_SBOL + : PREGf_ANCH_BOL)); + first = NEXTOPER(first); + goto again; + } + else if (OP(first) == GPOS) { + r->intflags |= PREGf_ANCH_GPOS; + first = NEXTOPER(first); + goto again; + } + else if ((!sawopen || !RExC_sawback) && + !sawlookahead && + (OP(first) == STAR && + PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && + !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) + { + /* turn .* into ^.* with an implied $*=1 */ + const int type = + (OP(NEXTOPER(first)) == REG_ANY) + ? PREGf_ANCH_MBOL + : PREGf_ANCH_SBOL; + r->intflags |= (type | PREGf_IMPLICIT); + first = NEXTOPER(first); + goto again; + } + if (sawplus && !sawminmod && !sawlookahead + && (!sawopen || !RExC_sawback) + && !pRExC_state->num_code_blocks) /* May examine pos and $& */ + /* x+ must match at the 1st pos of run of x's */ + r->intflags |= PREGf_SKIP; + + /* Scan is after the zeroth branch, first is atomic matcher. */ +#ifdef TRIE_STUDY_OPT + DEBUG_PARSE_r( + if (!restudied) + PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + (IV)(first - scan + 1)) + ); +#else + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + (IV)(first - scan + 1)) + ); +#endif + + + /* + * If there's something expensive in the r.e., find the + * longest literal string that must appear and make it the + * regmust. Resolve ties in favor of later strings, since + * the regstart check works with the beginning of the r.e. + * and avoiding duplication strengthens checking. Not a + * strong reason, but sufficient in the absence of others. + * [Now we resolve ties in favor of the earlier string if + * it happens that c_offset_min has been invalidated, since the + * earlier string may buy us something the later one won't.] + */ + + data.longest_fixed = newSVpvs(""); + data.longest_float = newSVpvs(""); + data.last_found = newSVpvs(""); + data.longest = &(data.longest_fixed); + ENTER_with_name("study_chunk"); + SAVEFREESV(data.longest_fixed); + SAVEFREESV(data.longest_float); + SAVEFREESV(data.last_found); + first = scan; + if (!ri->regstclass) { + ssc_init(pRExC_state, &ch_class); + data.start_class = &ch_class; + stclass_flag = SCF_DO_STCLASS_AND; + } else /* XXXX Check for BOUND? */ + stclass_flag = 0; + data.last_closep = &last_close; + + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, + scan + RExC_size, /* Up to end */ + &data, -1, 0, NULL, + SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag + | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), + 0); + + + CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); + + + if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) + && data.last_start_min == 0 && data.last_end > 0 + && !RExC_seen_zerolen + && !(RExC_seen & REG_VERBARG_SEEN) + && !(RExC_seen & REG_GPOS_SEEN) + ){ + r->extflags |= RXf_CHECK_ALL; + } + scan_commit(pRExC_state, &data,&minlen,0); + + longest_float_length = CHR_SVLEN(data.longest_float); + + if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) + && S_setup_longest (aTHX_ pRExC_state, + data.longest_float, + &(r->float_utf8), + &(r->float_substr), + &(r->float_end_shift), + data.lookbehind_float, + data.offset_float_min, + data.minlen_float, + longest_float_length, + cBOOL(data.flags & SF_FL_BEFORE_EOL), + cBOOL(data.flags & SF_FL_BEFORE_MEOL))) + { + r->float_min_offset = data.offset_float_min - data.lookbehind_float; + r->float_max_offset = data.offset_float_max; + if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ + r->float_max_offset -= data.lookbehind_float; + SvREFCNT_inc_simple_void_NN(data.longest_float); + } + else { + r->float_substr = r->float_utf8 = NULL; + longest_float_length = 0; + } + + longest_fixed_length = CHR_SVLEN(data.longest_fixed); + + if (S_setup_longest (aTHX_ pRExC_state, + data.longest_fixed, + &(r->anchored_utf8), + &(r->anchored_substr), + &(r->anchored_end_shift), + data.lookbehind_fixed, + data.offset_fixed, + data.minlen_fixed, + longest_fixed_length, + cBOOL(data.flags & SF_FIX_BEFORE_EOL), + cBOOL(data.flags & SF_FIX_BEFORE_MEOL))) + { + r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; + SvREFCNT_inc_simple_void_NN(data.longest_fixed); + } + else { + r->anchored_substr = r->anchored_utf8 = NULL; + longest_fixed_length = 0; + } + LEAVE_with_name("study_chunk"); + + if (ri->regstclass + && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) + ri->regstclass = NULL; + + if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) + && stclass_flag + && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && !ssc_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); + StructCopy(data.start_class, + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); + ri->regstclass = (regnode*)RExC_rxi->data->data[n]; + r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); + regprop(r, sv, (regnode*)data.start_class, NULL); + PerlIO_printf(Perl_debug_log, + "synthetic stclass \"%s\".\n", + SvPVX_const(sv));}); + data.start_class = NULL; + } + + /* A temporary algorithm prefers floated substr to fixed one to dig + * more info. */ + if (longest_fixed_length > longest_float_length) { + r->substrs->check_ix = 0; + r->check_end_shift = r->anchored_end_shift; + r->check_substr = r->anchored_substr; + r->check_utf8 = r->anchored_utf8; + r->check_offset_min = r->check_offset_max = r->anchored_offset; + if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) + r->intflags |= PREGf_NOSCAN; + } + else { + r->substrs->check_ix = 1; + r->check_end_shift = r->float_end_shift; + r->check_substr = r->float_substr; + r->check_utf8 = r->float_utf8; + r->check_offset_min = r->float_min_offset; + r->check_offset_max = r->float_max_offset; + } + if ((r->check_substr || r->check_utf8) ) { + r->extflags |= RXf_USE_INTUIT; + if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) + r->extflags |= RXf_INTUIT_TAIL; + } + r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; + + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) + if ( (STRLEN)minlen < longest_float_length ) + minlen= longest_float_length; + if ( (STRLEN)minlen < longest_fixed_length ) + minlen= longest_fixed_length; + */ + } + else { + /* Several toplevels. Best we can is to set minlen. */ + SSize_t fake; + regnode_ssc ch_class; + SSize_t last_close = 0; + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); + + scan = ri->program + 1; + ssc_init(pRExC_state, &ch_class); + data.start_class = &ch_class; + data.last_closep = &last_close; + + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, + &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, + SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied + ? SCF_TRIE_DOING_RESTUDY + : 0), + 0); + + CHECK_RESTUDY_GOTO_butfirst(NOOP); + + r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 + = r->float_substr = r->float_utf8 = NULL; + + if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && ! ssc_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); + StructCopy(data.start_class, + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); + ri->regstclass = (regnode*)RExC_rxi->data->data[n]; + r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); + regprop(r, sv, (regnode*)data.start_class, NULL); + PerlIO_printf(Perl_debug_log, + "synthetic stclass \"%s\".\n", + SvPVX_const(sv));}); + data.start_class = NULL; + } + } + + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + r->maxlen = REG_INFTY; + } + else { + r->maxlen = RExC_maxlen; + } + + /* Guard against an embedded (?=) or (?<=) with a longer minlen than + the "real" pattern. */ + DEBUG_OPTIMISE_r({ + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); + }); + r->minlenret = minlen; + if (r->minlen < minlen) + r->minlen = minlen; + + if (RExC_seen & REG_GPOS_SEEN) + r->intflags |= PREGf_GPOS_SEEN; + if (RExC_seen & REG_LOOKBEHIND_SEEN) + r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the + lookbehind */ + if (pRExC_state->num_code_blocks) + r->extflags |= RXf_EVAL_SEEN; + if (RExC_seen & REG_CANY_SEEN) + r->intflags |= PREGf_CANY_SEEN; + if (RExC_seen & REG_VERBARG_SEEN) + { + r->intflags |= PREGf_VERBARG_SEEN; + r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ + } + if (RExC_seen & REG_CUTGROUP_SEEN) + r->intflags |= PREGf_CUTGROUP_SEEN; + if (pm_flags & PMf_USE_RE_EVAL) + r->intflags |= PREGf_USE_RE_EVAL; + if (RExC_paren_names) + RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); + else + RXp_PAREN_NAMES(r) = NULL; + + /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED + * so it can be used in pp.c */ + if (r->intflags & PREGf_ANCH) + r->extflags |= RXf_IS_ANCHORED; + + + { + /* this is used to identify "special" patterns that might result + * in Perl NOT calling the regex engine and instead doing the match "itself", + * particularly special cases in split//. By having the regex compiler + * do this pattern matching at a regop level (instead of by inspecting the pattern) + * we avoid weird issues with equivalent patterns resulting in different behavior, + * AND we allow non Perl engines to get the same optimizations by the setting the + * flags appropriately - Yves */ + regnode *first = ri->program + 1; + U8 fop = OP(first); + regnode *next = NEXTOPER(first); + U8 nop = OP(next); + + if (PL_regkind[fop] == NOTHING && nop == END) + r->extflags |= RXf_NULL; + else if (PL_regkind[fop] == BOL && nop == END) + r->extflags |= RXf_START_ONLY; + else if (fop == PLUS + && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE + && OP(regnext(first)) == END) + r->extflags |= RXf_WHITE; + else if ( r->extflags & RXf_SPLIT + && fop == EXACT + && STR_LEN(first) == 1 + && *(STRING(first)) == ' ' + && OP(regnext(first)) == END ) + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + + } + + if (RExC_contains_locale) { + RXp_EXTFLAGS(r) |= RXf_TAINTED; + } + +#ifdef DEBUGGING + if (RExC_paren_names) { + ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); + ri->data->data[ri->name_list_idx] + = (void*)SvREFCNT_inc(RExC_paren_name_list); + } else +#endif + ri->name_list_idx = 0; + + if (RExC_recurse_count) { + for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { + const regnode *scan = RExC_recurse[RExC_recurse_count-1]; + ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); + } + } + Newxz(r->offs, RExC_npar, regexp_paren_pair); + /* assume we don't need to swap parens around before we match */ + + DEBUG_DUMP_r({ + DEBUG_RExC_seen(); + PerlIO_printf(Perl_debug_log,"Final program:\n"); + regdump(r); + }); +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_OFFSETS_r(if (ri->u.offsets) { + const STRLEN len = ri->u.offsets[0]; + STRLEN i; + GET_RE_DEBUG_FLAGS_DECL; + PerlIO_printf(Perl_debug_log, + "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); + for (i = 1; i <= len; i++) { + if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) + PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", + (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); + } + PerlIO_printf(Perl_debug_log, "\n"); + }); +#endif + +#ifdef USE_ITHREADS + /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated + * by setting the regexp SV to readonly-only instead. If the + * pattern's been recompiled, the USEDness should remain. */ + if (old_re && SvREADONLY(old_re)) + SvREADONLY_on(rx); +#endif + return rx; +} + + +SV* +Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF; + + PERL_UNUSED_ARG(value); + + if (flags & RXapif_FETCH) { + return reg_named_buff_fetch(rx, key, flags); + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { + Perl_croak_no_modify(); + return NULL; + } else if (flags & RXapif_EXISTS) { + return reg_named_buff_exists(rx, key, flags) + ? &PL_sv_yes + : &PL_sv_no; + } else if (flags & RXapif_REGNAMES) { + return reg_named_buff_all(rx, flags); + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { + return reg_named_buff_scalar(rx, flags); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; + PERL_UNUSED_ARG(lastkey); + + if (flags & RXapif_FIRSTKEY) + return reg_named_buff_firstkey(rx, flags); + else if (flags & RXapif_NEXTKEY) + return reg_named_buff_nextkey(rx, flags); + else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", + (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, + const U32 flags) +{ + AV *retarray = NULL; + SV *ret; + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; + + if (flags & RXapif_ALL) + retarray=newAV(); + + if (rx && RXp_PAREN_NAMES(rx)) { + HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); + if (he_str) { + IV i; + SV* sv_dat=HeVAL(he_str); + I32 *nums=(I32*)SvPVX(sv_dat); + for ( i=0; inparens) >= nums[i] + && rx->offs[nums[i]].start != -1 + && rx->offs[nums[i]].end != -1) + { + ret = newSVpvs(""); + CALLREG_NUMBUF_FETCH(r,nums[i],ret); + if (!retarray) + return ret; + } else { + if (retarray) + ret = newSVsv(&PL_sv_undef); + } + if (retarray) + av_push(retarray, ret); + } + if (retarray) + return newRV_noinc(MUTABLE_SV(retarray)); + } + } + return NULL; +} + +bool +Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, + const U32 flags) +{ + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & RXapif_ALL) { + return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); + } else { + SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); + if (sv) { + SvREFCNT_dec_NN(sv); + return TRUE; + } else { + return FALSE; + } + } + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; + + if ( rx && RXp_PAREN_NAMES(rx) ) { + (void)hv_iterinit(RXp_PAREN_NAMES(rx)); + + return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv = RXp_PAREN_NAMES(rx); + HE *temphe; + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + return newSVhek(HeKEY_hek(temphe)); + } + } + } + return NULL; +} + +SV* +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) +{ + SV *ret; + AV *av; + SSize_t length; + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { + return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); + av = MUTABLE_AV(SvRV(ret)); + length = av_tindex(av); + SvREFCNT_dec_NN(ret); + return newSViv(length + 1); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", + (int)flags); + return NULL; + } + } + return &PL_sv_undef; +} + +SV* +Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + AV *av = newAV(); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv= RXp_PAREN_NAMES(rx); + HE *temphe; + (void)hv_iterinit(hv); + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + av_push(av, newSVhek(HeKEY_hek(temphe))); + } + } + } + + return newRV_noinc(MUTABLE_SV(av)); +} + +void +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, + SV * const sv) +{ + struct regexp *const rx = ReANY(r); + char *s = NULL; + SSize_t i = 0; + SSize_t s1, t1; + I32 n = paren; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; + + if ( n == RX_BUFF_IDX_CARET_PREMATCH + || n == RX_BUFF_IDX_CARET_FULLMATCH + || n == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } + + if (!rx->subbeg) + goto ret_undef; + + if (n == RX_BUFF_IDX_CARET_FULLMATCH) + /* no need to distinguish between them any more */ + n = RX_BUFF_IDX_FULLMATCH; + + if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) + && rx->offs[0].start != -1) + { + /* $`, ${^PREMATCH} */ + i = rx->offs[0].start; + s = rx->subbeg; + } + else + if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) + && rx->offs[0].end != -1) + { + /* $', ${^POSTMATCH} */ + s = rx->subbeg - rx->suboffset + rx->offs[0].end; + i = rx->sublen + rx->suboffset - rx->offs[0].end; + } + else + if ( 0 <= n && n <= (I32)rx->nparens && + (s1 = rx->offs[n].start) != -1 && + (t1 = rx->offs[n].end) != -1) + { + /* $&, ${^MATCH}, $1 ... */ + i = t1 - s1; + s = rx->subbeg + s1 - rx->suboffset; + } else { + goto ret_undef; + } + + assert(s >= rx->subbeg); + assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); + if (i >= 0) { +#ifdef NO_TAINT_SUPPORT + sv_setpvn(sv, s, i); +#else + const int oldtainted = TAINT_get; + TAINT_NOT; + sv_setpvn(sv, s, i); + TAINT_set(oldtainted); +#endif + if ( (rx->intflags & PREGf_CANY_SEEN) + ? (RXp_MATCH_UTF8(rx) + && (!i || is_utf8_string((U8*)s, i))) + : (RXp_MATCH_UTF8(rx)) ) + { + SvUTF8_on(sv); + } + else + SvUTF8_off(sv); + if (TAINTING_get) { + if (RXp_MATCH_TAINTED(rx)) { + if (SvTYPE(sv) >= SVt_PVMG) { + MAGIC* const mg = SvMAGIC(sv); + MAGIC* mgt; + TAINT; + SvMAGIC_set(sv, mg->mg_moremagic); + SvTAINT(sv); + if ((mgt = SvMAGIC(sv))) { + mg->mg_moremagic = mgt; + SvMAGIC_set(sv, mg); + } + } else { + TAINT; + SvTAINT(sv); + } + } else + SvTAINTED_off(sv); + } + } else { + ret_undef: + sv_setsv(sv,&PL_sv_undef); + return; + } +} + +void +Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, + SV const * const value) +{ + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; + + PERL_UNUSED_ARG(rx); + PERL_UNUSED_ARG(paren); + PERL_UNUSED_ARG(value); + + if (!PL_localizing) + Perl_croak_no_modify(); +} + +I32 +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, + const I32 paren) +{ + struct regexp *const rx = ReANY(r); + I32 i; + I32 s1, t1; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + + /* Some of this code was originally in C in F */ + switch (paren) { + case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ + case RX_BUFF_IDX_PREMATCH: /* $` */ + if (rx->offs[0].start != -1) { + i = rx->offs[0].start; + if (i > 0) { + s1 = 0; + t1 = i; + goto getlen; + } + } + return 0; + + case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ + case RX_BUFF_IDX_POSTMATCH: /* $' */ + if (rx->offs[0].end != -1) { + i = rx->sublen - rx->offs[0].end; + if (i > 0) { + s1 = rx->offs[0].end; + t1 = rx->sublen; + goto getlen; + } + } + return 0; + + default: /* $& / ${^MATCH}, $1, $2, ... */ + if (paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + i = t1 - s1; + goto getlen; + } else { + warn_undef: + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit((const SV *)sv); + return 0; + } + } + getlen: + if (i > 0 && RXp_MATCH_UTF8(rx)) { + const char * const s = rx->subbeg - rx->suboffset + s1; + const U8 *ep; + STRLEN el; + + i = t1 - s1; + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; + } + return i; +} + +SV* +Perl_reg_qr_package(pTHX_ REGEXP * const rx) +{ + PERL_ARGS_ASSERT_REG_QR_PACKAGE; + PERL_UNUSED_ARG(rx); + if (0) + return NULL; + else + return newSVpvs("Regexp"); +} + +/* Scans the name of a named buffer from the pattern. + * If flags is REG_RSN_RETURN_NULL returns null. + * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name + * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding + * to the parsed name as looked up in the RExC_paren_names hash. + * If there is an error throws a vFAIL().. type exception. + */ + +#define REG_RSN_RETURN_NULL 0 +#define REG_RSN_RETURN_NAME 1 +#define REG_RSN_RETURN_DATA 2 + +STATIC SV* +S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) +{ + char *name_start = RExC_parse; + + PERL_ARGS_ASSERT_REG_SCAN_NAME; + + assert (RExC_parse <= RExC_end); + if (RExC_parse == RExC_end) NOOP; + else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + /* skip IDFIRST by using do...while */ + if (UTF) + do { + RExC_parse += UTF8SKIP(RExC_parse); + } while (isWORDCHAR_utf8((U8*)RExC_parse)); + else + do { + RExC_parse++; + } while (isWORDCHAR(*RExC_parse)); + } else { + RExC_parse++; /* so the <- from the vFAIL is after the offending + character */ + vFAIL("Group name must start with a non-digit word character"); + } + if ( flags ) { + SV* sv_name + = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), + SVs_TEMP | (UTF ? SVf_UTF8 : 0)); + if ( flags == REG_RSN_RETURN_NAME) + return sv_name; + else if (flags==REG_RSN_RETURN_DATA) { + HE *he_str = NULL; + SV *sv_dat = NULL; + if ( ! sv_name ) /* should not happen*/ + Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); + if (RExC_paren_names) + he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! sv_dat ) + vFAIL("Reference to nonexistent named group"); + return sv_dat; + } + else { + Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", + (unsigned long) flags); + } + assert(0); /* NOT REACHED */ + } + return NULL; +} + +#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ + int rem=(int)(RExC_end - RExC_parse); \ + int cut; \ + int num; \ + int iscut=0; \ + if (rem>10) { \ + rem=10; \ + iscut=1; \ + } \ + cut=10-rem; \ + if (RExC_lastparse!=RExC_parse) \ + PerlIO_printf(Perl_debug_log," >%.*s%-*s", \ + rem, RExC_parse, \ + cut + 4, \ + iscut ? "..." : "<" \ + ); \ + else \ + PerlIO_printf(Perl_debug_log,"%16s",""); \ + \ + if (SIZE_ONLY) \ + num = RExC_size + 1; \ + else \ + num=REG_NODE_NUM(RExC_emit); \ + if (RExC_lastnum!=num) \ + PerlIO_printf(Perl_debug_log,"|%4d",num); \ + else \ + PerlIO_printf(Perl_debug_log,"|%4s",""); \ + PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ + (int)((depth*2)), "", \ + (funcname) \ + ); \ + RExC_lastnum=num; \ + RExC_lastparse=RExC_parse; \ +}) + + + +#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ + DEBUG_PARSE_MSG((funcname)); \ + PerlIO_printf(Perl_debug_log,"%4s","\n"); \ +}) +#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ + DEBUG_PARSE_MSG((funcname)); \ + PerlIO_printf(Perl_debug_log,fmt "\n",args); \ +}) + +/* This section of code defines the inversion list object and its methods. The + * interfaces are highly subject to change, so as much as possible is static to + * this file. An inversion list is here implemented as a malloc'd C UV array + * as an SVt_INVLIST scalar. + * + * An inversion list for Unicode is an array of code points, sorted by ordinal + * number. The zeroth element is the first code point in the list. The 1th + * element is the first element beyond that not in the list. In other words, + * the first range is + * invlist[0]..(invlist[1]-1) + * The other ranges follow. Thus every element whose index is divisible by two + * marks the beginning of a range that is in the list, and every element not + * divisible by two marks the beginning of a range not in the list. A single + * element inversion list that contains the single code point N generally + * consists of two elements + * invlist[0] == N + * invlist[1] == N+1 + * (The exception is when N is the highest representable value on the + * machine, in which case the list containing just it would be a single + * element, itself. By extension, if the last range in the list extends to + * infinity, then the first element of that range will be in the inversion list + * at a position that is divisible by two, and is the final element in the + * list.) + * Taking the complement (inverting) an inversion list is quite simple, if the + * first element is 0, remove it; otherwise add a 0 element at the beginning. + * This implementation reserves an element at the beginning of each inversion + * list to always contain 0; there is an additional flag in the header which + * indicates if the list begins at the 0, or is offset to begin at the next + * element. + * + * More about inversion lists can be found in "Unicode Demystified" + * Chapter 13 by Richard Gillam, published by Addison-Wesley. + * More will be coming when functionality is added later. + * + * The inversion list data structure is currently implemented as an SV pointing + * to an array of UVs that the SV thinks are bytes. This allows us to have an + * array of UV whose memory management is automatically handled by the existing + * facilities for SV's. + * + * Some of the methods should always be private to the implementation, and some + * should eventually be made public */ + +/* The header definitions are in F */ + +PERL_STATIC_INLINE UV* +S__invlist_array_init(SV* const invlist, const bool will_have_0) +{ + /* Returns a pointer to the first element in the inversion list's array. + * This is called upon initialization of an inversion list. Where the + * array begins depends on whether the list has the code point U+0000 in it + * or not. The other parameter tells it whether the code that follows this + * call is about to put a 0 in the inversion list or not. The first + * element is either the element reserved for 0, if TRUE, or the element + * after it, if FALSE */ + + bool* offset = get_invlist_offset_addr(invlist); + UV* zero_addr = (UV *) SvPVX(invlist); + + PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; + + /* Must be empty */ + assert(! _invlist_len(invlist)); + + *zero_addr = 0; + + /* 1^1 = 0; 1^0 = 1 */ + *offset = 1 ^ will_have_0; + return zero_addr + *offset; +} + +PERL_STATIC_INLINE UV* +S_invlist_array(SV* const invlist) +{ + /* Returns the pointer to the inversion list's array. Every time the + * length changes, this needs to be called in case malloc or realloc moved + * it */ + + PERL_ARGS_ASSERT_INVLIST_ARRAY; + + /* Must not be empty. If these fail, you probably didn't check for + * being non-zero before trying to get the array */ + assert(_invlist_len(invlist)); + + /* The very first element always contains zero, The array begins either + * there, or if the inversion list is offset, at the element after it. + * The offset header field determines which; it contains 0 or 1 to indicate + * how much additionally to add */ + assert(0 == *(SvPVX(invlist))); + return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist)); +} + +PERL_STATIC_INLINE void +S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) +{ + /* Sets the current number of elements stored in the inversion list. + * Updates SvCUR correspondingly */ + PERL_UNUSED_CONTEXT; + PERL_ARGS_ASSERT_INVLIST_SET_LEN; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + SvCUR_set(invlist, + (len == 0) + ? 0 + : TO_INTERNAL_SIZE(len + offset)); + assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); +} + +PERL_STATIC_INLINE IV* +S_get_invlist_previous_index_addr(SV* invlist) +{ + /* Return the address of the IV that is reserved to hold the cached index + * */ + PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->prev_index); +} + +PERL_STATIC_INLINE IV +S_invlist_previous_index(SV* const invlist) +{ + /* Returns cached index of previous search */ + + PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; + + return *get_invlist_previous_index_addr(invlist); +} + +PERL_STATIC_INLINE void +S_invlist_set_previous_index(SV* const invlist, const IV index) +{ + /* Caches for later retrieval */ + + PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; + + assert(index == 0 || index < (int) _invlist_len(invlist)); + + *get_invlist_previous_index_addr(invlist) = index; +} + +PERL_STATIC_INLINE UV +S_invlist_max(SV* const invlist) +{ + /* Returns the maximum number of elements storable in the inversion list's + * array, without having to realloc() */ + + PERL_ARGS_ASSERT_INVLIST_MAX; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Assumes worst case, in which the 0 element is not counted in the + * inversion list, so subtracts 1 for that */ + return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ + ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 + : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; +} + +#ifndef PERL_IN_XSUB_RE +SV* +Perl__new_invlist(pTHX_ IV initial_size) +{ + + /* Return a pointer to a newly constructed inversion list, with enough + * space to store 'initial_size' elements. If that number is negative, a + * system default is used instead */ + + SV* new_list; + + if (initial_size < 0) { + initial_size = 10; + } + + /* Allocate the initial space */ + new_list = newSV_type(SVt_INVLIST); + + /* First 1 is in case the zero element isn't in the list; second 1 is for + * trailing NUL */ + SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1); + invlist_set_len(new_list, 0, 0); + + /* Force iterinit() to be used to get iteration to work */ + *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX; + + *get_invlist_previous_index_addr(new_list) = 0; + + return new_list; +} + +SV* +Perl__new_invlist_C_array(pTHX_ const UV* const list) +{ + /* Return a pointer to a newly constructed inversion list, initialized to + * point to , which has to be in the exact correct inversion list + * form, including internal fields. Thus this is a dangerous routine that + * should not be used in the wrong hands. The passed in 'list' contains + * several header fields at the beginning that are not part of the + * inversion list body proper */ + + const STRLEN length = (STRLEN) list[0]; + const UV version_id = list[1]; + const bool offset = cBOOL(list[2]); +#define HEADER_LENGTH 3 + /* If any of the above changes in any way, you must change HEADER_LENGTH + * (if appropriate) and regenerate INVLIST_VERSION_ID by running + * perl -E 'say int(rand 2**31-1)' + */ +#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and + data structure type, so that one being + passed in can be validated to be an + inversion list of the correct vintage. + */ + + SV* invlist = newSV_type(SVt_INVLIST); + + PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; + + if (version_id != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + /* The generated array passed in includes header elements that aren't part + * of the list proper, so start it just after them */ + SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); + + SvLEN_set(invlist, 0); /* Means we own the contents, and the system + shouldn't touch it */ + + *(get_invlist_offset_addr(invlist)) = offset; + + /* The 'length' passed to us is the physical number of elements in the + * inversion list. But if there is an offset the logical number is one + * less than that */ + invlist_set_len(invlist, length - offset, offset); + + invlist_set_previous_index(invlist, 0); + + /* Initialize the iteration pointer. */ + invlist_iterfinish(invlist); + + SvREADONLY_on(invlist); + + return invlist; +} +#endif /* ifndef PERL_IN_XSUB_RE */ + +STATIC void +S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) +{ + /* Grow the maximum size of an inversion list */ + + PERL_ARGS_ASSERT_INVLIST_EXTEND; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Add one to account for the zero element at the beginning which may not + * be counted by the calling parameters */ + SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); +} + +PERL_STATIC_INLINE void +S_invlist_trim(SV* const invlist) +{ + PERL_ARGS_ASSERT_INVLIST_TRIM; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Change the length of the inversion list to how many entries it currently + * has */ + SvPV_shrink_to_cur((SV *) invlist); +} + +STATIC void +S__append_range_to_invlist(pTHX_ SV* const invlist, + const UV start, const UV end) +{ + /* Subject to change or removal. Append the range from 'start' to 'end' at + * the end of the inversion list. The range must be above any existing + * ones. */ + + UV* array; + UV max = invlist_max(invlist); + UV len = _invlist_len(invlist); + bool offset; + + PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; + + if (len == 0) { /* Empty lists must be initialized */ + offset = start != 0; + array = _invlist_array_init(invlist, ! offset); + } + else { + /* Here, the existing list is non-empty. The current max entry in the + * list is generally the first value not in the set, except when the + * set extends to the end of permissible values, in which case it is + * the first entry in that final set, and so this call is an attempt to + * append out-of-order */ + + UV final_element = len - 1; + array = invlist_array(invlist); + if (array[final_element] > start + || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) + { + Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", + array[final_element], start, + ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); + } + + /* Here, it is a legal append. If the new range begins with the first + * value not in the set, it is extending the set, so the new first + * value not in the set is one greater than the newly extended range. + * */ + offset = *get_invlist_offset_addr(invlist); + if (array[final_element] == start) { + if (end != UV_MAX) { + array[final_element] = end + 1; + } + else { + /* But if the end is the maximum representable on the machine, + * just let the range that this would extend to have no end */ + invlist_set_len(invlist, len - 1, offset); + } + return; + } + } + + /* Here the new range doesn't extend any existing set. Add it */ + + len += 2; /* Includes an element each for the start and end of range */ + + /* If wll overflow the existing space, extend, which may cause the array to + * be moved */ + if (max < len) { + invlist_extend(invlist, len); + + /* Have to set len here to avoid assert failure in invlist_array() */ + invlist_set_len(invlist, len, offset); + + array = invlist_array(invlist); + } + else { + invlist_set_len(invlist, len, offset); + } + + /* The next item on the list starts the range, the one after that is + * one past the new range. */ + array[len - 2] = start; + if (end != UV_MAX) { + array[len - 1] = end + 1; + } + else { + /* But if the end is the maximum representable on the machine, just let + * the range have no end */ + invlist_set_len(invlist, len - 1, offset); + } +} + +#ifndef PERL_IN_XSUB_RE + +IV +Perl__invlist_search(SV* const invlist, const UV cp) +{ + /* Searches the inversion list for the entry that contains the input code + * point . If is not in the list, -1 is returned. Otherwise, the + * return value is the index into the list's array of the range that + * contains */ + + IV low = 0; + IV mid; + IV high = _invlist_len(invlist); + const IV highest_element = high - 1; + const UV* array; + + PERL_ARGS_ASSERT__INVLIST_SEARCH; + + /* If list is empty, return failure. */ + if (high == 0) { + return -1; + } + + /* (We can't get the array unless we know the list is non-empty) */ + array = invlist_array(invlist); + + mid = invlist_previous_index(invlist); + assert(mid >=0 && mid <= highest_element); + + /* contains the cache of the result of the previous call to this + * function (0 the first time). See if this call is for the same result, + * or if it is for mid-1. This is under the theory that calls to this + * function will often be for related code points that are near each other. + * And benchmarks show that caching gives better results. We also test + * here if the code point is within the bounds of the list. These tests + * replace others that would have had to be made anyway to make sure that + * the array bounds were not exceeded, and these give us extra information + * at the same time */ + if (cp >= array[mid]) { + if (cp >= array[highest_element]) { + return highest_element; + } + + /* Here, array[mid] <= cp < array[highest_element]. This means that + * the final element is not the answer, so can exclude it; it also + * means that is not the final element, so can refer to 'mid + 1' + * safely */ + if (cp < array[mid + 1]) { + return mid; + } + high--; + low = mid + 1; + } + else { /* cp < aray[mid] */ + if (cp < array[0]) { /* Fail if outside the array */ + return -1; + } + high = mid; + if (cp >= array[mid - 1]) { + goto found_entry; + } + } + + /* Binary search. What we are looking for is such that + * array[i] <= cp < array[i+1] + * The loop below converges on the i+1. Note that there may not be an + * (i+1)th element in the array, and things work nonetheless */ + while (low < high) { + mid = (low + high) / 2; + assert(mid <= highest_element); + if (array[mid] <= cp) { /* cp >= array[mid] */ + low = mid + 1; + + /* We could do this extra test to exit the loop early. + if (cp < array[low]) { + return mid; + } + */ + } + else { /* cp < array[mid] */ + high = mid; + } + } + + found_entry: + high--; + invlist_set_previous_index(invlist, high); + return high; +} + +void +Perl__invlist_populate_swatch(SV* const invlist, + const UV start, const UV end, U8* swatch) +{ + /* populates a swatch of a swash the same way swatch_get() does in utf8.c, + * but is used when the swash has an inversion list. This makes this much + * faster, as it uses a binary search instead of a linear one. This is + * intimately tied to that function, and perhaps should be in utf8.c, + * except it is intimately tied to inversion lists as well. It assumes + * that is all 0's on input */ + + UV current = start; + const IV len = _invlist_len(invlist); + IV i; + const UV * array; + + PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH; + + if (len == 0) { /* Empty inversion list */ + return; + } + + array = invlist_array(invlist); + + /* Find which element it is */ + i = _invlist_search(invlist, start); + + /* We populate from to */ + while (current < end) { + UV upper; + + /* The inversion list gives the results for every possible code point + * after the first one in the list. Only those ranges whose index is + * even are ones that the inversion list matches. For the odd ones, + * and if the initial code point is not in the list, we have to skip + * forward to the next element */ + if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) { + i++; + if (i >= len) { /* Finished if beyond the end of the array */ + return; + } + current = array[i]; + if (current >= end) { /* Finished if beyond the end of what we + are populating */ + if (LIKELY(end < UV_MAX)) { + return; + } + + /* We get here when the upper bound is the maximum + * representable on the machine, and we are looking for just + * that code point. Have to special case it */ + i = len; + goto join_end_of_list; + } + } + assert(current >= start); + + /* The current range ends one below the next one, except don't go past + * */ + i++; + upper = (i < len && array[i] < end) ? array[i] : end; + + /* Here we are in a range that matches. Populate a bit in the 3-bit U8 + * for each code point in it */ + for (; current < upper; current++) { + const STRLEN offset = (STRLEN)(current - start); + swatch[offset >> 3] |= 1 << (offset & 7); + } + + join_end_of_list: + + /* Quit if at the end of the list */ + if (i >= len) { + + /* But first, have to deal with the highest possible code point on + * the platform. The previous code assumes that is one + * beyond where we want to populate, but that is impossible at the + * platform's infinity, so have to handle it specially */ + if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1))) + { + const STRLEN offset = (STRLEN)(end - start); + swatch[offset >> 3] |= 1 << (offset & 7); + } + return; + } + + /* Advance to the next range, which will be for code points not in the + * inversion list */ + current = array[i]; + } + + return; +} + +void +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** output) +{ + /* Take the union of two inversion lists and point to it. *output + * SHOULD BE DEFINED upon input, and if it points to one of the two lists, + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *output will be made correspondingly + * mortal. The first list, , may be NULL, in which case a copy of the + * second list is returned. If is TRUE, the union is taken + * of the complement (inversion) of instead of b itself. + * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. + * + * The algorithm is like a merge sort. + * + * XXX A potential performance improvement is to keep track as we go along + * if only one of the inputs contributes to the result, meaning the other + * is a subset of that one. In that case, we can skip the final copy and + * return the larger of the input lists, but then outside code might need + * to keep track of whether to free the input list or not */ + + const UV* array_a; /* a's array */ + const UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; + + SV* u; /* the resulting union */ + UV* array_u; + UV len_u; + + UV i_a = 0; /* current index into a's array */ + UV i_b = 0; + UV i_u = 0; + + /* running count, as explained in the algorithm source book; items are + * stopped accumulating and are output when the count changes to/from 0. + * The count is incremented when we start a range that's in the set, and + * decremented when we start a range that's not in the set. So its range + * is 0 to 2. Only when the count is zero is something not in the set. + */ + UV count = 0; + + PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + + /* If either one is empty, the union is the other one */ + if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + bool make_temp = FALSE; /* Should we mortalize the result? */ + + if (*output == a) { + if (a != NULL) { + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } + } + } + if (*output != b) { + *output = invlist_clone(b); + if (complement_b) { + _invlist_invert(*output); + } + } /* else *output already = b; */ + + if (make_temp) { + sv_2mortal(*output); + } + return; + } + else if ((len_b = _invlist_len(b)) == 0) { + bool make_temp = FALSE; + if (*output == b) { + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } + } + + /* The complement of an empty list is a list that has everything in it, + * so the union with includes everything too */ + if (complement_b) { + if (a == *output) { + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } + } + *output = _new_invlist(1); + _append_range_to_invlist(*output, 0, UV_MAX); + } + else if (*output != a) { + *output = invlist_clone(a); + } + /* else *output already = a; */ + + if (make_temp) { + sv_2mortal(*output); + } + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the union of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Size the union for the worst case: that the sets are completely + * disjoint */ + u = _new_invlist(len_a + len_b); + + /* Will contain U+0000 if either component does */ + array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0) + || (len_b > 0 && array_b[0] == 0)); + + /* Go through each list item by item, stopping when exhausted one of + * them */ + while (i_a < len_a && i_b < len_b) { + UV cp; /* The element to potentially add to the union's array */ + bool cp_in_set; /* is it in the the input list's set or not */ + + /* We need to take one or the other of the two inputs for the union. + * Since we are merging two sorted lists, we take the smaller of the + * next items. In case of a tie, we take the one that is in its set + * first. If we took one not in the set first, it would decrement the + * count, possibly to 0 which would cause it to be output as ending the + * range, and the next time through we would take the same number, and + * output it again as beginning the next range. By doing it the + * opposite way, there is no possibility that the count will be + * momentarily decremented to 0, and thus the two adjoining ranges will + * be seamlessly merged. (In a tie and both are in the set or both not + * in the set, it doesn't matter which we take first.) */ + if (array_a[i_a] < array_b[i_b] + || (array_a[i_a] == array_b[i_b] + && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) + { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); + cp= array_a[i_a++]; + } + else { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); + cp = array_b[i_b++]; + } + + /* Here, have chosen which of the two inputs to look at. Only output + * if the running count changes to/from 0, which marks the + * beginning/end of a range in that's in the set */ + if (cp_in_set) { + if (count == 0) { + array_u[i_u++] = cp; + } + count++; + } + else { + count--; + if (count == 0) { + array_u[i_u++] = cp; + } + } + } + + /* Here, we are finished going through at least one of the lists, which + * means there is something remaining in at most one. We check if the list + * that hasn't been exhausted is positioned such that we are in the middle + * of a range in its set or not. (i_a and i_b point to the element beyond + * the one we care about.) If in the set, we decrement 'count'; if 0, there + * is potentially more to output. + * There are four cases: + * 1) Both weren't in their sets, count is 0, and remains 0. What's left + * in the union is entirely from the non-exhausted set. + * 2) Both were in their sets, count is 2. Nothing further should + * be output, as everything that remains will be in the exhausted + * list's set, hence in the union; decrementing to 1 but not 0 insures + * that + * 3) the exhausted was in its set, non-exhausted isn't, count is 1. + * Nothing further should be output because the union includes + * everything from the exhausted set. Not decrementing ensures that. + * 4) the exhausted wasn't in its set, non-exhausted is, count is 1; + * decrementing to 0 insures that we look at the remainder of the + * non-exhausted set */ + if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + { + count--; + } + + /* The final length is what we've output so far, plus what else is about to + * be output. (If 'count' is non-zero, then the input list we exhausted + * has everything remaining up to the machine's limit in its set, and hence + * in the union, so there will be no further output. */ + len_u = i_u; + if (count == 0) { + /* At most one of the subexpressions will be non-zero */ + len_u += (len_a - i_a) + (len_b - i_b); + } + + /* Set result to final length, which can change the pointer to array_u, so + * re-find it */ + if (len_u != _invlist_len(u)) { + invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); + invlist_trim(u); + array_u = invlist_array(u); + } + + /* When 'count' is 0, the list that was exhausted (if one was shorter than + * the other) ended with everything above it not in its set. That means + * that the remaining part of the union is precisely the same as the + * non-exhausted list, so can just copy it unchanged. (If both list were + * exhausted at the same time, then the operations below will be both 0.) + */ + if (count == 0) { + IV copy_count; /* At most one will have a non-zero copy count */ + if ((copy_count = len_a - i_a) > 0) { + Copy(array_a + i_a, array_u + i_u, copy_count, UV); + } + else if ((copy_count = len_b - i_b) > 0) { + Copy(array_b + i_b, array_u + i_u, copy_count, UV); + } + } + + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ + if (a == *output || b == *output) { + assert(! invlist_is_iterating(*output)); + if ((SvTEMP(*output))) { + sv_2mortal(u); + } + else { + SvREFCNT_dec_NN(*output); + } + } + + *output = u; + + return; +} + +void +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** i) +{ + /* Take the intersection of two inversion lists and point to it. *i + * SHOULD BE DEFINED upon input, and if it points to one of the two lists, + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *i will be made correspondingly mortal. + * The first list, , may be NULL, in which case an empty list is + * returned. If is TRUE, the result will be the + * intersection of and the complement (or inversion) of instead of + * directly. + * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. In fact, it had bugs + * + * The algorithm is like a merge sort, and is essentially the same as the + * union above + */ + + const UV* array_a; /* a's array */ + const UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; + + SV* r; /* the resulting intersection */ + UV* array_r; + UV len_r; + + UV i_a = 0; /* current index into a's array */ + UV i_b = 0; + UV i_r = 0; + + /* running count, as explained in the algorithm source book; items are + * stopped accumulating and are output when the count changes to/from 2. + * The count is incremented when we start a range that's in the set, and + * decremented when we start a range that's not in the set. So its range + * is 0 to 2. Only when the count is 2 is something in the intersection. + */ + UV count = 0; + + PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + + /* Special case if either one is empty */ + len_a = (a == NULL) ? 0 : _invlist_len(a); + if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { + bool make_temp = FALSE; + + if (len_a != 0 && complement_b) { + + /* Here, 'a' is not empty, therefore from the above 'if', 'b' must + * be empty. Here, also we are using 'b's complement, which hence + * must be every possible code point. Thus the intersection is + * simply 'a'. */ + if (*i != a) { + if (*i == b) { + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } + } + + *i = invlist_clone(a); + } + /* else *i is already 'a' */ + + if (make_temp) { + sv_2mortal(*i); + } + return; + } + + /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The + * intersection must be empty */ + if (*i == a) { + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } + } + else if (*i == b) { + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } + } + *i = _new_invlist(0); + if (make_temp) { + sv_2mortal(*i); + } + + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the intersection of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Size the intersection for the worst case: that the intersection ends up + * fragmenting everything to be completely disjoint */ + r= _new_invlist(len_a + len_b); + + /* Will contain U+0000 iff both components do */ + array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 + && len_b > 0 && array_b[0] == 0); + + /* Go through each list item by item, stopping when exhausted one of + * them */ + while (i_a < len_a && i_b < len_b) { + UV cp; /* The element to potentially add to the intersection's + array */ + bool cp_in_set; /* Is it in the input list's set or not */ + + /* We need to take one or the other of the two inputs for the + * intersection. Since we are merging two sorted lists, we take the + * smaller of the next items. In case of a tie, we take the one that + * is not in its set first (a difference from the union algorithm). If + * we took one in the set first, it would increment the count, possibly + * to 2 which would cause it to be output as starting a range in the + * intersection, and the next time through we would take that same + * number, and output it again as ending the set. By doing it the + * opposite of this, there is no possibility that the count will be + * momentarily incremented to 2. (In a tie and both are in the set or + * both not in the set, it doesn't matter which we take first.) */ + if (array_a[i_a] < array_b[i_b] + || (array_a[i_a] == array_b[i_b] + && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) + { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); + cp= array_a[i_a++]; + } + else { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); + cp= array_b[i_b++]; + } + + /* Here, have chosen which of the two inputs to look at. Only output + * if the running count changes to/from 2, which marks the + * beginning/end of a range that's in the intersection */ + if (cp_in_set) { + count++; + if (count == 2) { + array_r[i_r++] = cp; + } + } + else { + if (count == 2) { + array_r[i_r++] = cp; + } + count--; + } + } + + /* Here, we are finished going through at least one of the lists, which + * means there is something remaining in at most one. We check if the list + * that has been exhausted is positioned such that we are in the middle + * of a range in its set or not. (i_a and i_b point to elements 1 beyond + * the ones we care about.) There are four cases: + * 1) Both weren't in their sets, count is 0, and remains 0. There's + * nothing left in the intersection. + * 2) Both were in their sets, count is 2 and perhaps is incremented to + * above 2. What should be output is exactly that which is in the + * non-exhausted set, as everything it has is also in the intersection + * set, and everything it doesn't have can't be in the intersection + * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and + * gets incremented to 2. Like the previous case, the intersection is + * everything that remains in the non-exhausted set. + * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and + * remains 1. And the intersection has nothing more. */ + if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + { + count++; + } + + /* The final length is what we've output so far plus what else is in the + * intersection. At most one of the subexpressions below will be non-zero + * */ + len_r = i_r; + if (count >= 2) { + len_r += (len_a - i_a) + (len_b - i_b); + } + + /* Set result to final length, which can change the pointer to array_r, so + * re-find it */ + if (len_r != _invlist_len(r)) { + invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); + invlist_trim(r); + array_r = invlist_array(r); + } + + /* Finish outputting any remaining */ + if (count >= 2) { /* At most one will have a non-zero copy count */ + IV copy_count; + if ((copy_count = len_a - i_a) > 0) { + Copy(array_a + i_a, array_r + i_r, copy_count, UV); + } + else if ((copy_count = len_b - i_b) > 0) { + Copy(array_b + i_b, array_r + i_r, copy_count, UV); + } + } + + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ + if (a == *i || b == *i) { + assert(! invlist_is_iterating(*i)); + if (SvTEMP(*i)) { + sv_2mortal(r); + } + else { + SvREFCNT_dec_NN(*i); + } + } + + *i = r; + + return; +} + +SV* +Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) +{ + /* Add the range from 'start' to 'end' inclusive to the inversion list's + * set. A pointer to the inversion list is returned. This may actually be + * a new list, in which case the passed in one has been destroyed. The + * passed in inversion list can be NULL, in which case a new one is created + * with just the one range in it */ + + SV* range_invlist; + UV len; + + if (invlist == NULL) { + invlist = _new_invlist(2); + len = 0; + } + else { + len = _invlist_len(invlist); + } + + /* If comes after the final entry actually in the list, can just append it + * to the end, */ + if (len == 0 + || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1) + && start >= invlist_array(invlist)[len - 1])) + { + _append_range_to_invlist(invlist, start, end); + return invlist; + } + + /* Here, can't just append things, create and return a new inversion list + * which is the union of this range and the existing inversion list */ + range_invlist = _new_invlist(2); + _append_range_to_invlist(range_invlist, start, end); + + _invlist_union(invlist, range_invlist, &invlist); + + /* The temporary can be freed */ + SvREFCNT_dec_NN(range_invlist); + + return invlist; +} + +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, + UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + _append_range_to_invlist(invlist, element0, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; +} + +#endif + +PERL_STATIC_INLINE SV* +S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { + return _add_range_to_invlist(invlist, cp, cp); +} + +#ifndef PERL_IN_XSUB_RE +void +Perl__invlist_invert(pTHX_ SV* const invlist) +{ + /* Complement the input inversion list. This adds a 0 if the list didn't + * have a zero; removes it otherwise. As described above, the data + * structure is set up so that this is very efficient */ + + PERL_ARGS_ASSERT__INVLIST_INVERT; + + assert(! invlist_is_iterating(invlist)); + + /* The inverse of matching nothing is matching everything */ + if (_invlist_len(invlist) == 0) { + _append_range_to_invlist(invlist, 0, UV_MAX); + return; + } + + *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); +} + +#endif + +PERL_STATIC_INLINE SV* +S_invlist_clone(pTHX_ SV* const invlist) +{ + + /* Return a new inversion list that is a copy of the input one, which is + * unchanged. The new list will not be mortal even if the old one was. */ + + /* Need to allocate extra space to accommodate Perl's addition of a + * trailing NUL to SvPV's, since it thinks they are always strings */ + SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); + STRLEN physical_length = SvCUR(invlist); + bool offset = *(get_invlist_offset_addr(invlist)); + + PERL_ARGS_ASSERT_INVLIST_CLONE; + + *(get_invlist_offset_addr(new_invlist)) = offset; + invlist_set_len(new_invlist, _invlist_len(invlist), offset); + Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); + + return new_invlist; +} + +PERL_STATIC_INLINE STRLEN* +S_get_invlist_iter_addr(SV* invlist) +{ + /* Return the address of the UV that contains the current iteration + * position */ + + PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->iterator); +} + +PERL_STATIC_INLINE void +S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */ +{ + PERL_ARGS_ASSERT_INVLIST_ITERINIT; + + *get_invlist_iter_addr(invlist) = 0; +} + +PERL_STATIC_INLINE void +S_invlist_iterfinish(SV* invlist) +{ + /* Terminate iterator for invlist. This is to catch development errors. + * Any iteration that is interrupted before completed should call this + * function. Functions that add code points anywhere else but to the end + * of an inversion list assert that they are not in the middle of an + * iteration. If they were, the addition would make the iteration + * problematical: if the iteration hadn't reached the place where things + * were being added, it would be ok */ + + PERL_ARGS_ASSERT_INVLIST_ITERFINISH; + + *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; +} + +STATIC bool +S_invlist_iternext(SV* invlist, UV* start, UV* end) +{ + /* An C call on must be used to set this up. + * This call sets in <*start> and <*end>, the next range in . + * Returns if successful and the next call will return the next + * range; if was already at the end of the list. If the latter, + * <*start> and <*end> are unchanged, and the next call to this function + * will start over at the beginning of the list */ + + STRLEN* pos = get_invlist_iter_addr(invlist); + UV len = _invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_ITERNEXT; + + if (*pos >= len) { + *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ + return FALSE; + } + + array = invlist_array(invlist); + + *start = array[(*pos)++]; + + if (*pos >= len) { + *end = UV_MAX; + } + else { + *end = array[(*pos)++] - 1; + } + + return TRUE; +} + +PERL_STATIC_INLINE bool +S_invlist_is_iterating(SV* const invlist) +{ + PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; + + return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; +} + +PERL_STATIC_INLINE UV +S_invlist_highest(SV* const invlist) +{ + /* Returns the highest code point that matches an inversion list. This API + * has an ambiguity, as it returns 0 under either the highest is actually + * 0, or if the list is empty. If this distinction matters to you, check + * for emptiness before calling this function */ + + UV len = _invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_HIGHEST; + + if (len == 0) { + return 0; + } + + array = invlist_array(invlist); + + /* The last element in the array in the inversion list always starts a + * range that goes to infinity. That range may be for code points that are + * matched in the inversion list, or it may be for ones that aren't + * matched. In the latter case, the highest code point in the set is one + * less than the beginning of this range; otherwise it is the final element + * of this range: infinity */ + return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) + ? UV_MAX + : array[len - 1] - 1; +} + +#ifndef PERL_IN_XSUB_RE +SV * +Perl__invlist_contents(pTHX_ SV* const invlist) +{ + /* Get the contents of an inversion list into a string SV so that they can + * be printed out. It uses the format traditionally done for debug tracing + */ + + UV start, end; + SV* output = newSVpvs("\n"); + + PERL_ARGS_ASSERT__INVLIST_CONTENTS; + + assert(! invlist_is_iterating(invlist)); + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start); + } + else if (end != start) { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n", + start, end); + } + else { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start); + } + } + + return output; +} +#endif + +#ifndef PERL_IN_XSUB_RE +void +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, + const char * const indent, SV* const invlist) +{ + /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the + * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by + * the string 'indent'. The output looks like this: + [0] 0x000A .. 0x000D + [2] 0x0085 + [4] 0x2028 .. 0x2029 + [6] 0x3104 .. INFINITY + * This means that the first range of code points matched by the list are + * 0xA through 0xD; the second range contains only the single code point + * 0x85, etc. An inversion list is an array of UVs. Two array elements + * are used to define each range (except if the final range extends to + * infinity, only a single element is needed). The array index of the + * first element for the corresponding range is given in brackets. */ + + UV start, end; + STRLEN count = 0; + + PERL_ARGS_ASSERT__INVLIST_DUMP; + + if (invlist_is_iterating(invlist)) { + Perl_dump_indent(aTHX_ level, file, + "%sCan't dump inversion list because is in middle of iterating\n", + indent); + return; + } + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", + indent, (UV)count, start); + } + else if (end != start) { + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", + indent, (UV)count, start, end); + } + else { + Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", + indent, (UV)count, start); + } + count += 2; + } +} + +void +Perl__load_PL_utf8_foldclosures (pTHX) +{ + assert(! PL_utf8_foldclosures); + + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* This string is just a short named one above \xff */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ + } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); +} +#endif + +#ifdef PERL_ARGS_ASSERT__INVLISTEQ +bool +S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) +{ + /* Return a boolean as to if the two passed in inversion lists are + * identical. The final argument, if TRUE, says to take the complement of + * the second inversion list before doing the comparison */ + + const UV* array_a = invlist_array(a); + const UV* array_b = invlist_array(b); + UV len_a = _invlist_len(a); + UV len_b = _invlist_len(b); + + UV i = 0; /* current index into the arrays */ + bool retval = TRUE; /* Assume are identical until proven otherwise */ + + PERL_ARGS_ASSERT__INVLISTEQ; + + /* If are to compare 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* The complement of nothing is everything, so would have to have + * just one element, starting at zero (ending at infinity) */ + if (len_b == 0) { + return (len_a == 1 && array_a[0] == 0); + } + else if (array_b[0] == 0) { + + /* Otherwise, to complement, we invert. Here, the first element is + * 0, just remove it. To do this, we just pretend the array starts + * one later */ + + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Make sure that the lengths are the same, as well as the final element + * before looping through the remainder. (Thus we test the length, final, + * and first elements right off the bat) */ + if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) { + retval = FALSE; + } + else for (i = 0; i < len_a - 1; i++) { + if (array_a[i] != array_b[i]) { + retval = FALSE; + break; + } + } + + return retval; +} +#endif + +#undef HEADER_LENGTH +#undef TO_INTERNAL_SIZE +#undef FROM_INTERNAL_SIZE +#undef INVLIST_VERSION_ID + +/* End of inversion list object */ + +STATIC void +S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) +{ + /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' + * constructs, and updates RExC_flags with them. On input, RExC_parse + * should point to the first flag; it is updated on output to point to the + * final ')' or ':'. There needs to be at least one flag, or this will + * abort */ + + /* for (?g), (?gc), and (?o) warnings; warning + about (?c) will warn about (?g) -- japhy */ + +#define WASTED_O 0x01 +#define WASTED_G 0x02 +#define WASTED_C 0x04 +#define WASTED_GC (WASTED_G|WASTED_C) + I32 wastedflags = 0x00; + U32 posflags = 0, negflags = 0; + U32 *flagsp = &posflags; + char has_charset_modifier = '\0'; + regex_charset cs; + bool has_use_defaults = FALSE; + const char* const seqstart = RExC_parse - 1; /* Point to the '?' */ + + PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS; + + /* '^' as an initial flag sets certain defaults */ + if (UCHARAT(RExC_parse) == '^') { + RExC_parse++; + has_use_defaults = TRUE; + STD_PMMOD_FLAGS_CLEAR(&RExC_flags); + set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics) + ? REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET); + } + + cs = get_regex_charset(RExC_flags); + if (cs == REGEX_DEPENDS_CHARSET + && (RExC_utf8 || RExC_uni_semantics)) + { + cs = REGEX_UNICODE_CHARSET; + } + + while (*RExC_parse) { + /* && strchr("iogcmsx", *RExC_parse) */ + /* (?g), (?gc) and (?o) are useless here + and must be globally applied -- japhy */ + switch (*RExC_parse) { + + /* Code for the imsx flags */ + CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + + case LOCALE_PAT_MOD: + if (has_charset_modifier) { + goto excess_modifier; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + cs = REGEX_LOCALE_CHARSET; + has_charset_modifier = LOCALE_PAT_MOD; + break; + case UNICODE_PAT_MOD: + if (has_charset_modifier) { + goto excess_modifier; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + cs = REGEX_UNICODE_CHARSET; + has_charset_modifier = UNICODE_PAT_MOD; + break; + case ASCII_RESTRICT_PAT_MOD: + if (flagsp == &negflags) { + goto neg_modifier; + } + if (has_charset_modifier) { + if (cs != REGEX_ASCII_RESTRICTED_CHARSET) { + goto excess_modifier; + } + /* Doubled modifier implies more restricted */ + cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; + } + else { + cs = REGEX_ASCII_RESTRICTED_CHARSET; + } + has_charset_modifier = ASCII_RESTRICT_PAT_MOD; + break; + case DEPENDS_PAT_MOD: + if (has_use_defaults) { + goto fail_modifiers; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + else if (has_charset_modifier) { + goto excess_modifier; + } + + /* The dual charset means unicode semantics if the + * pattern (or target, not known until runtime) are + * utf8, or something in the pattern indicates unicode + * semantics */ + cs = (RExC_utf8 || RExC_uni_semantics) + ? REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET; + has_charset_modifier = DEPENDS_PAT_MOD; + break; + excess_modifier: + RExC_parse++; + if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { + vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); + } + else if (has_charset_modifier == *(RExC_parse - 1)) { + vFAIL2("Regexp modifier \"%c\" may not appear twice", + *(RExC_parse - 1)); + } + else { + vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); + } + /*NOTREACHED*/ + neg_modifier: + RExC_parse++; + vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", + *(RExC_parse - 1)); + /*NOTREACHED*/ + case ONCE_PAT_MOD: /* 'o' */ + case GLOBAL_PAT_MOD: /* 'g' */ + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + const I32 wflagbit = *RExC_parse == 'o' + ? WASTED_O + : WASTED_G; + if (! (wastedflags & wflagbit) ) { + wastedflags |= wflagbit; + /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ + vWARN5( + RExC_parse + 1, + "Useless (%s%c) - %suse /%c modifier", + flagsp == &negflags ? "?-" : "?", + *RExC_parse, + flagsp == &negflags ? "don't " : "", + *RExC_parse + ); + } + } + break; + + case CONTINUE_PAT_MOD: /* 'c' */ + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + if (! (wastedflags & WASTED_C) ) { + wastedflags |= WASTED_GC; + /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ + vWARN3( + RExC_parse + 1, + "Useless (%sc) - %suse /gc modifier", + flagsp == &negflags ? "?-" : "?", + flagsp == &negflags ? "don't " : "" + ); + } + } + break; + case KEEPCOPY_PAT_MOD: /* 'p' */ + if (flagsp == &negflags) { + if (SIZE_ONLY) + ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); + } else { + *flagsp |= RXf_PMf_KEEPCOPY; + } + break; + case '-': + /* A flag is a default iff it is following a minus, so + * if there is a minus, it means will be trying to + * re-specify a default which is an error */ + if (has_use_defaults || flagsp == &negflags) { + goto fail_modifiers; + } + flagsp = &negflags; + wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case ':': + case ')': + RExC_flags |= posflags; + RExC_flags &= ~negflags; + set_regex_charset(&RExC_flags, cs); + if (RExC_flags & RXf_PMf_FOLD) { + RExC_contains_i = 1; + } + return; + /*NOTREACHED*/ + default: + fail_modifiers: + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); + /*NOTREACHED*/ + } + + ++RExC_parse; + } +} + +/* + - reg - regular expression, i.e. main body or parenthesized thing + * + * Caller must absorb opening parenthesis. + * + * Combining parenthesis handling with the base level of regular expression + * is a trifle forced, but the need to tie the tails of the branches to what + * follows makes it hard to avoid. + */ +#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) +#ifdef DEBUGGING +#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) +#else +#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) +#endif + +/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets + flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan + needs to be restarted. + Otherwise would only return NULL if regbranch() returns NULL, which + cannot happen. */ +STATIC regnode * +S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) + /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. + * 2 is like 1, but indicates that nextchar() has been called to advance + * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and + * this flag alerts us to the need to check for that */ +{ + regnode *ret; /* Will be the head of the group. */ + regnode *br; + regnode *lastbr; + regnode *ender = NULL; + I32 parno = 0; + I32 flags; + U32 oregflags = RExC_flags; + bool have_branch = 0; + bool is_open = 0; + I32 freeze_paren = 0; + I32 after_freeze = 0; + I32 num; /* numeric backreferences */ + + char * parse_start = RExC_parse; /* MJD */ + char * const oregcomp_parse = RExC_parse; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG; + DEBUG_PARSE("reg "); + + *flagp = 0; /* Tentatively. */ + + + /* Make an OPEN node, if parenthesized. */ + if (paren) { + + /* Under /x, space and comments can be gobbled up between the '(' and + * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such + * intervening space, as the sequence is a token, and a token should be + * indivisible */ + bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '('; + + if ( *RExC_parse == '*') { /* (*VERB:ARG) */ + char *start_verb = RExC_parse; + STRLEN verb_len = 0; + char *start_arg = NULL; + unsigned char op = 0; + int argok = 1; + int internal_argval = 0; /* internal_argval is only useful if + !argok */ + + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); + } + while ( *RExC_parse && *RExC_parse != ')' ) { + if ( *RExC_parse == ':' ) { + start_arg = RExC_parse + 1; + break; + } + RExC_parse++; + } + ++start_verb; + verb_len = RExC_parse - start_verb; + if ( start_arg ) { + RExC_parse++; + while ( *RExC_parse && *RExC_parse != ')' ) + RExC_parse++; + if ( *RExC_parse != ')' ) + vFAIL("Unterminated verb pattern argument"); + if ( RExC_parse == start_arg ) + start_arg = NULL; + } else { + if ( *RExC_parse != ')' ) + vFAIL("Unterminated verb pattern"); + } + + switch ( *start_verb ) { + case 'A': /* (*ACCEPT) */ + if ( memEQs(start_verb,verb_len,"ACCEPT") ) { + op = ACCEPT; + internal_argval = RExC_nestroot; + } + break; + case 'C': /* (*COMMIT) */ + if ( memEQs(start_verb,verb_len,"COMMIT") ) + op = COMMIT; + break; + case 'F': /* (*FAIL) */ + if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { + op = OPFAIL; + argok = 0; + } + break; + case ':': /* (*:NAME) */ + case 'M': /* (*MARK:NAME) */ + if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { + op = MARKPOINT; + argok = -1; + } + break; + case 'P': /* (*PRUNE) */ + if ( memEQs(start_verb,verb_len,"PRUNE") ) + op = PRUNE; + break; + case 'S': /* (*SKIP) */ + if ( memEQs(start_verb,verb_len,"SKIP") ) + op = SKIP; + break; + case 'T': /* (*THEN) */ + /* [19:06] :: is then */ + if ( memEQs(start_verb,verb_len,"THEN") ) { + op = CUTGROUP; + RExC_seen |= REG_CUTGROUP_SEEN; + } + break; + } + if ( ! op ) { + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL2utf8f( + "Unknown verb pattern '%"UTF8f"'", + UTF8fARG(UTF, verb_len, start_verb)); + } + if ( argok ) { + if ( start_arg && internal_argval ) { + vFAIL3("Verb pattern '%.*s' may not have an argument", + verb_len, start_verb); + } else if ( argok < 0 && !start_arg ) { + vFAIL3("Verb pattern '%.*s' has a mandatory argument", + verb_len, start_verb); + } else { + ret = reganode(pRExC_state, op, internal_argval); + if ( ! internal_argval && ! SIZE_ONLY ) { + if (start_arg) { + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); + RExC_rxi->data->data[ARG(ret)]=(void*)sv; + ret->flags = 0; + } else { + ret->flags = 1; + } + } + } + if (!internal_argval) + RExC_seen |= REG_VERBARG_SEEN; + } else if ( start_arg ) { + vFAIL3("Verb pattern '%.*s' may not have an argument", + verb_len, start_verb); + } else { + ret = reg_node(pRExC_state, op); + } + nextchar(pRExC_state); + return ret; + } + else if (*RExC_parse == '?') { /* (?...) */ + bool is_logical = 0; + const char * const seqstart = RExC_parse; + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(?...)', the '(' and '?' must be adjacent"); + } + + RExC_parse++; + paren = *RExC_parse++; + ret = NULL; /* For look-ahead/behind. */ + switch (paren) { + + case 'P': /* (?P...) variants for those used to PCRE/Python */ + paren = *RExC_parse++; + if ( paren == '<') /* (?P<...>) named capture */ + goto named_capture; + else if (paren == '>') { /* (?P>name) named recursion */ + goto named_recursion; + } + else if (paren == '=') { /* (?P=...) named backref */ + /* this pretty much dupes the code for \k in + * regatom(), if you change this make sure you change that + * */ + char* name_start = RExC_parse; + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + if (RExC_parse == name_start || *RExC_parse != ')') + /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.3s... not terminated",parse_start); + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (ASCII_FOLD_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); + *flagp |= HASWIDTH; + + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + + nextchar(pRExC_state); + return ret; + } + RExC_parse++; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL3("Sequence (%.*s...) not recognized", + RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + case '<': /* (?<...) */ + if (*RExC_parse == '!') + paren = ','; + else if (*RExC_parse != '=') + named_capture: + { /* (?<...>) */ + char *name_start; + SV *svname; + paren= '>'; + case '\'': /* (?'...') */ + name_start= RExC_parse; + svname = reg_scan_name(pRExC_state, + SIZE_ONLY /* reverse test from the others */ + ? REG_RSN_RETURN_NAME + : REG_RSN_RETURN_NULL); + if (RExC_parse == name_start || *RExC_parse != paren) + vFAIL2("Sequence (?%c... not terminated", + paren=='>' ? '<' : paren); + if (SIZE_ONLY) { + HE *he_str; + SV *sv_dat = NULL; + if (!svname) /* shouldn't happen */ + Perl_croak(aTHX_ + "panic: reg_scan_name returned NULL"); + if (!RExC_paren_names) { + RExC_paren_names= newHV(); + sv_2mortal(MUTABLE_SV(RExC_paren_names)); +#ifdef DEBUGGING + RExC_paren_name_list= newAV(); + sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); +#endif + } + he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! sv_dat ) { + /* croak baby croak */ + Perl_croak(aTHX_ + "panic: paren_name hash element allocation failed"); + } else if ( SvPOK(sv_dat) ) { + /* (?|...) can mean we have dupes so scan to check + its already been stored. Maybe a flag indicating + we are inside such a construct would be useful, + but the arrays are likely to be quite small, so + for now we punt -- dmq */ + IV count = SvIV(sv_dat); + I32 *pv = (I32*)SvPVX(sv_dat); + IV i; + for ( i = 0 ; i < count ; i++ ) { + if ( pv[i] == RExC_npar ) { + count = 0; + break; + } + } + if ( count ) { + pv = (I32*)SvGROW(sv_dat, + SvCUR(sv_dat) + sizeof(I32)+1); + SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); + pv[count] = RExC_npar; + SvIV_set(sv_dat, SvIVX(sv_dat) + 1); + } + } else { + (void)SvUPGRADE(sv_dat,SVt_PVNV); + sv_setpvn(sv_dat, (char *)&(RExC_npar), + sizeof(I32)); + SvIOK_on(sv_dat); + SvIV_set(sv_dat, 1); + } +#ifdef DEBUGGING + /* Yes this does cause a memory leak in debugging Perls + * */ + if (!av_store(RExC_paren_name_list, + RExC_npar, SvREFCNT_inc(svname))) + SvREFCNT_dec_NN(svname); +#endif + + /*sv_dump(sv_dat);*/ + } + nextchar(pRExC_state); + paren = 1; + goto capturing_parens; + } + RExC_seen |= REG_LOOKBEHIND_SEEN; + RExC_in_lookbehind++; + RExC_parse++; + /* FALLTHROUGH */ + case '=': /* (?=...) */ + RExC_seen_zerolen++; + break; + case '!': /* (?!...) */ + RExC_seen_zerolen++; + if (*RExC_parse == ')') { + ret=reg_node(pRExC_state, OPFAIL); + nextchar(pRExC_state); + return ret; + } + break; + case '|': /* (?|...) */ + /* branch reset, behave like a (?:...) except that + buffers in alternations share the same numbers */ + paren = ':'; + after_freeze = freeze_paren = RExC_npar; + break; + case ':': /* (?:...) */ + case '>': /* (?>...) */ + break; + case '$': /* (?$...) */ + case '@': /* (?@...) */ + vFAIL2("Sequence (?%c...) not implemented", (int)paren); + break; + case '0' : /* (?0) */ + case 'R' : /* (?R) */ + if (*RExC_parse != ')') + FAIL("Sequence (?R) not terminated"); + ret = reg_node(pRExC_state, GOSTART); + RExC_seen |= REG_GOSTART_SEEN; + *flagp |= POSTPONED; + nextchar(pRExC_state); + return ret; + /*notreached*/ + /* named and numeric backreferences */ + case '&': /* (?&NAME) */ + parse_start = RExC_parse - 1; + named_recursion: + { + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + } + if (RExC_parse == RExC_end || *RExC_parse != ')') + vFAIL("Sequence (?&... not terminated"); + goto gen_recurse_regop; + assert(0); /* NOT REACHED */ + case '+': + if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { + RExC_parse++; + vFAIL("Illegal pattern"); + } + goto parse_recursion; + /* NOT REACHED*/ + case '-': /* (?-1) */ + if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { + RExC_parse--; /* rewind to let it be handled later */ + goto parse_flags; + } + /* FALLTHROUGH */ + case '1': case '2': case '3': case '4': /* (?1) */ + case '5': case '6': case '7': case '8': case '9': + RExC_parse--; + parse_recursion: + num = atoi(RExC_parse); + parse_start = RExC_parse - 1; /* MJD */ + if (*RExC_parse == '-') + RExC_parse++; + while (isDIGIT(*RExC_parse)) + RExC_parse++; + if (*RExC_parse!=')') + vFAIL("Expecting close bracket"); + + gen_recurse_regop: + if ( paren == '-' ) { + /* + Diagram of capture buffer numbering. + Top line is the normal capture buffer numbers + Bottom line is the negative indexing as from + the X (the (?-2)) + + + 1 2 3 4 5 X 6 7 + /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ + - 5 4 3 2 1 X x x + + */ + num = RExC_npar + num; + if (num < 1) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + } else if ( paren == '+' ) { + num = RExC_npar + num - 1; + } + + ret = reganode(pRExC_state, GOSUB, num); + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + ARG2L_SET( ret, RExC_recurse_count++); + RExC_emit++; + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Recurse #%"UVuf" to %"IVdf"\n", + (UV)ARG(ret), (IV)ARG2L(ret))); + } else { + RExC_size++; + } + RExC_seen |= REG_RECURSE_SEEN; + Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ + Set_Node_Offset(ret, parse_start); /* MJD */ + + *flagp |= POSTPONED; + nextchar(pRExC_state); + return ret; + + assert(0); /* NOT REACHED */ + + case '?': /* (??...) */ + is_logical = 1; + if (*RExC_parse != '{') { + RExC_parse++; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f( + "Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); + /*NOTREACHED*/ + } + *flagp |= POSTPONED; + paren = *RExC_parse++; + /* FALLTHROUGH */ + case '{': /* (?{...}) */ + { + U32 n = 0; + struct reg_code_block *cb; + + RExC_seen_zerolen++; + + if ( !pRExC_state->num_code_blocks + || pRExC_state->code_index >= pRExC_state->num_code_blocks + || pRExC_state->code_blocks[pRExC_state->code_index].start + != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) + - RExC_start) + ) { + if (RExC_pm_flags & PMf_USE_RE_EVAL) + FAIL("panic: Sequence (?{...}): no code block found\n"); + FAIL("Eval-group not allowed at runtime, use re 'eval'"); + } + /* this is a pre-compiled code block (?{...}) */ + cb = &pRExC_state->code_blocks[pRExC_state->code_index]; + RExC_parse = RExC_start + cb->end; + if (!SIZE_ONLY) { + OP *o = cb->block; + if (cb->src_regex) { + n = add_data(pRExC_state, STR_WITH_LEN("rl")); + RExC_rxi->data->data[n] = + (void*)SvREFCNT_inc((SV*)cb->src_regex); + RExC_rxi->data->data[n+1] = (void*)o; + } + else { + n = add_data(pRExC_state, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); + RExC_rxi->data->data[n] = (void*)o; + } + } + pRExC_state->code_index++; + nextchar(pRExC_state); + + if (is_logical) { + regnode *eval; + ret = reg_node(pRExC_state, LOGICAL); + eval = reganode(pRExC_state, EVAL, n); + if (!SIZE_ONLY) { + ret->flags = 2; + /* for later propagation into (??{}) return value */ + eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME); + } + REGTAIL(pRExC_state, ret, eval); + /* deal with the length of this later - MJD */ + return ret; + } + ret = reganode(pRExC_state, EVAL, n); + Set_Node_Length(ret, RExC_parse - parse_start + 1); + Set_Node_Offset(ret, parse_start); + return ret; + } + case '(': /* (?(?{...})...) and (?(?=...)...) */ + { + int is_define= 0; + if (RExC_parse[0] == '?') { /* (?(?...)) */ + if (RExC_parse[1] == '=' || RExC_parse[1] == '!' + || RExC_parse[1] == '<' + || RExC_parse[1] == '{') { /* Lookahead or eval. */ + I32 flag; + regnode *tail; + + ret = reg_node(pRExC_state, LOGICAL); + if (!SIZE_ONLY) + ret->flags = 1; + + tail = reg(pRExC_state, 1, &flag, depth+1); + if (flag & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + REGTAIL(pRExC_state, ret, tail); + goto insert_if; + } + } + else if ( RExC_parse[0] == '<' /* (?()...) */ + || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ + { + char ch = RExC_parse[0] == '<' ? '>' : '\''; + char *name_start= RExC_parse++; + U32 num = 0; + SV *sv_dat=reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + if (RExC_parse == name_start || *RExC_parse != ch) + vFAIL2("Sequence (?(%c... not terminated", + (ch == '>' ? '<' : ch)); + RExC_parse++; + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + ret = reganode(pRExC_state,NGROUPP,num); + goto insert_if_check_paren; + } + else if (RExC_parse[0] == 'D' && + RExC_parse[1] == 'E' && + RExC_parse[2] == 'F' && + RExC_parse[3] == 'I' && + RExC_parse[4] == 'N' && + RExC_parse[5] == 'E') + { + ret = reganode(pRExC_state,DEFINEP,0); + RExC_parse +=6 ; + is_define = 1; + goto insert_if_check_paren; + } + else if (RExC_parse[0] == 'R') { + RExC_parse++; + parno = 0; + if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + parno = atoi(RExC_parse++); + while (isDIGIT(*RExC_parse)) + RExC_parse++; + } else if (RExC_parse[0] == '&') { + SV *sv_dat; + RExC_parse++; + sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY + ? REG_RSN_RETURN_NULL + : REG_RSN_RETURN_DATA); + parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + } + ret = reganode(pRExC_state,INSUBP,parno); + goto insert_if_check_paren; + } + else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + /* (?(1)...) */ + char c; + char *tmp; + parno = atoi(RExC_parse++); + + while (isDIGIT(*RExC_parse)) + RExC_parse++; + ret = reganode(pRExC_state, GROUPP, parno); + + insert_if_check_paren: + if (*(tmp = nextchar(pRExC_state)) != ')') { + /* nextchar also skips comments, so undo its work + * and skip over the the next character. + */ + RExC_parse = tmp; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Switch condition not recognized"); + } + insert_if: + REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); + br = regbranch(pRExC_state, &flags, 1,depth+1); + if (br == NULL) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", + (UV) flags); + } else + REGTAIL(pRExC_state, br, reganode(pRExC_state, + LONGJMP, 0)); + c = *nextchar(pRExC_state); + if (flags&HASWIDTH) + *flagp |= HASWIDTH; + if (c == '|') { + if (is_define) + vFAIL("(?(DEFINE)....) does not allow branches"); + + /* Fake one for optimizer. */ + lastbr = reganode(pRExC_state, IFTHEN, 0); + + if (!regbranch(pRExC_state, &flags, 1,depth+1)) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", + (UV) flags); + } + REGTAIL(pRExC_state, ret, lastbr); + if (flags&HASWIDTH) + *flagp |= HASWIDTH; + c = *nextchar(pRExC_state); + } + else + lastbr = NULL; + if (c != ')') + vFAIL("Switch (?(condition)... contains too many branches"); + ender = reg_node(pRExC_state, TAIL); + REGTAIL(pRExC_state, br, ender); + if (lastbr) { + REGTAIL(pRExC_state, lastbr, ender); + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); + } + else + REGTAIL(pRExC_state, ret, ender); + RExC_size++; /* XXX WHY do we need this?!! + For large programs it seems to be required + but I can't figure out why. -- dmq*/ + return ret; + } + else { + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unknown switch condition (?(...))"); + } + } + case '[': /* (?[ ... ]) */ + return handle_regex_sets(pRExC_state, NULL, flagp, depth, + oregcomp_parse); + case 0: + RExC_parse--; /* for vFAIL to print correctly */ + vFAIL("Sequence (? incomplete"); + break; + default: /* e.g., (?i) */ + --RExC_parse; + parse_flags: + parse_lparen_question_flags(pRExC_state); + if (UCHARAT(RExC_parse) != ':') { + nextchar(pRExC_state); + *flagp = TRYAGAIN; + return NULL; + } + paren = ':'; + nextchar(pRExC_state); + ret = NULL; + goto parse_rest; + } /* end switch */ + } + else { /* (...) */ + capturing_parens: + parno = RExC_npar; + RExC_npar++; + + ret = reganode(pRExC_state, OPEN, parno); + if (!SIZE_ONLY ){ + if (!RExC_nestroot) + RExC_nestroot = parno; + if (RExC_seen & REG_RECURSE_SEEN + && !RExC_open_parens[parno-1]) + { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Setting open paren #%"IVdf" to %d\n", + (IV)parno, REG_NODE_NUM(ret))); + RExC_open_parens[parno-1]= ret; + } + } + Set_Node_Length(ret, 1); /* MJD */ + Set_Node_Offset(ret, RExC_parse); /* MJD */ + is_open = 1; + } + } + else /* ! paren */ + ret = NULL; + + parse_rest: + /* Pick up the branches, linking them together. */ + parse_start = RExC_parse; /* MJD */ + br = regbranch(pRExC_state, &flags, 1,depth+1); + + /* branch_len = (paren != 0); */ + + if (br == NULL) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + } + if (*RExC_parse == '|') { + if (!SIZE_ONLY && RExC_extralen) { + reginsert(pRExC_state, BRANCHJ, br, depth+1); + } + else { /* MJD */ + reginsert(pRExC_state, BRANCH, br, depth+1); + Set_Node_Length(br, paren != 0); + Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); + } + have_branch = 1; + if (SIZE_ONLY) + RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ + } + else if (paren == ':') { + *flagp |= flags&SIMPLE; + } + if (is_open) { /* Starts with OPEN. */ + REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */ + } + else if (paren != '?') /* Not Conditional */ + ret = br; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); + lastbr = br; + while (*RExC_parse == '|') { + if (!SIZE_ONLY && RExC_extralen) { + ender = reganode(pRExC_state, LONGJMP,0); + + /* Append to the previous. */ + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); + } + if (SIZE_ONLY) + RExC_extralen += 2; /* Account for LONGJMP. */ + nextchar(pRExC_state); + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } + br = regbranch(pRExC_state, &flags, 0, depth+1); + + if (br == NULL) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + } + REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ + lastbr = br; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); + } + + if (have_branch || paren != ':') { + /* Make a closing node, and hook it on the end. */ + switch (paren) { + case ':': + ender = reg_node(pRExC_state, TAIL); + break; + case 1: case 2: + ender = reganode(pRExC_state, CLOSE, parno); + if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Setting close paren #%"IVdf" to %d\n", + (IV)parno, REG_NODE_NUM(ender))); + RExC_close_parens[parno-1]= ender; + if (RExC_nestroot == parno) + RExC_nestroot = 0; + } + Set_Node_Offset(ender,RExC_parse+1); /* MJD */ + Set_Node_Length(ender,1); /* MJD */ + break; + case '<': + case ',': + case '=': + case '!': + *flagp &= ~HASWIDTH; + /* FALLTHROUGH */ + case '>': + ender = reg_node(pRExC_state, SUCCEED); + break; + case 0: + ender = reg_node(pRExC_state, END); + if (!SIZE_ONLY) { + assert(!RExC_opend); /* there can only be one! */ + RExC_opend = ender; + } + break; + } + DEBUG_PARSE_r(if (!SIZE_ONLY) { + SV * const mysv_val1=sv_newmortal(); + SV * const mysv_val2=sv_newmortal(); + DEBUG_PARSE_MSG("lsbr"); + regprop(RExC_rx, mysv_val1, lastbr, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); + PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val1), + (IV)REG_NODE_NUM(lastbr), + SvPV_nolen_const(mysv_val2), + (IV)REG_NODE_NUM(ender), + (IV)(ender - lastbr) + ); + }); + REGTAIL(pRExC_state, lastbr, ender); + + if (have_branch && !SIZE_ONLY) { + char is_nothing= 1; + if (depth==1) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; + + /* Hook the tails of the branches to the closing node. */ + for (br = ret; br; br = regnext(br)) { + const U8 op = PL_regkind[OP(br)]; + if (op == BRANCH) { + REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); + if ( OP(NEXTOPER(br)) != NOTHING + || regnext(NEXTOPER(br)) != ender) + is_nothing= 0; + } + else if (op == BRANCHJ) { + REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); + /* for now we always disable this optimisation * / + if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING + || regnext(NEXTOPER(NEXTOPER(br))) != ender) + */ + is_nothing= 0; + } + } + if (is_nothing) { + br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret; + DEBUG_PARSE_r(if (!SIZE_ONLY) { + SV * const mysv_val1=sv_newmortal(); + SV * const mysv_val2=sv_newmortal(); + DEBUG_PARSE_MSG("NADA"); + regprop(RExC_rx, mysv_val1, ret, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); + PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val1), + (IV)REG_NODE_NUM(ret), + SvPV_nolen_const(mysv_val2), + (IV)REG_NODE_NUM(ender), + (IV)(ender - ret) + ); + }); + OP(br)= NOTHING; + if (OP(ender) == TAIL) { + NEXT_OFF(br)= 0; + RExC_emit= br + 1; + } else { + regnode *opt; + for ( opt= br + 1; opt < ender ; opt++ ) + OP(opt)= OPTIMIZED; + NEXT_OFF(br)= ender - br; + } + } + } + } + + { + const char *p; + static const char parens[] = "=!<,>"; + + if (paren && (p = strchr(parens, paren))) { + U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; + int flag = (p - parens) > 1; + + if (paren == '>') + node = SUSPEND, flag = 0; + reginsert(pRExC_state, node,ret, depth+1); + Set_Node_Cur_Length(ret, parse_start); + Set_Node_Offset(ret, parse_start + 1); + ret->flags = flag; + REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); + } + } + + /* Check for proper termination. */ + if (paren) { + /* restore original flags, but keep (?p) */ + RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); + if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ("); + } + } + else if (!paren && RExC_parse < RExC_end) { + if (*RExC_parse == ')') { + RExC_parse++; + vFAIL("Unmatched )"); + } + else + FAIL("Junk on end of regexp"); /* "Can't happen". */ + assert(0); /* NOTREACHED */ + } + + if (RExC_in_lookbehind) { + RExC_in_lookbehind--; + } + if (after_freeze > RExC_npar) + RExC_npar = after_freeze; + return(ret); +} + +/* + - regbranch - one alternative of an | operator + * + * Implements the concatenation operator. + * + * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be + * restarted. + */ +STATIC regnode * +S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) +{ + regnode *ret; + regnode *chain = NULL; + regnode *latest; + I32 flags = 0, c = 0; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGBRANCH; + + DEBUG_PARSE("brnc"); + + if (first) + ret = NULL; + else { + if (!SIZE_ONLY && RExC_extralen) + ret = reganode(pRExC_state, BRANCHJ,0); + else { + ret = reg_node(pRExC_state, BRANCH); + Set_Node_Length(ret, 1); + } + } + + if (!first && SIZE_ONLY) + RExC_extralen += 1; /* BRANCHJ */ + + *flagp = WORST; /* Tentatively. */ + + RExC_parse--; + nextchar(pRExC_state); + while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { + flags &= ~TRYAGAIN; + latest = regpiece(pRExC_state, &flags,depth+1); + if (latest == NULL) { + if (flags & TRYAGAIN) + continue; + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); + } + else if (ret == NULL) + ret = latest; + *flagp |= flags&(HASWIDTH|POSTPONED); + if (chain == NULL) /* First piece. */ + *flagp |= flags&SPSTART; + else { + RExC_naughty++; + REGTAIL(pRExC_state, chain, latest); + } + chain = latest; + c++; + } + if (chain == NULL) { /* Loop ran zero times. */ + chain = reg_node(pRExC_state, NOTHING); + if (ret == NULL) + ret = chain; + } + if (c == 1) { + *flagp |= flags&SIMPLE; + } + + return ret; +} + +/* + - regpiece - something followed by possible [*+?] + * + * Note that the branching code sequences used for ? and the general cases + * of * and + are somewhat optimized: they use the same NOTHING node as + * both the endmarker for their branch list and the body of the last branch. + * It might seem that this node could be dispensed with entirely, but the + * endmarker role is not redundant. + * + * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with + * TRYAGAIN. + * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be + * restarted. + */ +STATIC regnode * +S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) +{ + regnode *ret; + char op; + char *next; + I32 flags; + const char * const origparse = RExC_parse; + I32 min; + I32 max = REG_INFTY; +#ifdef RE_TRACK_PATTERN_OFFSETS + char *parse_start; +#endif + const char *maxpos = NULL; + + /* Save the original in case we change the emitted regop to a FAIL. */ + regnode * const orig_emit = RExC_emit; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPIECE; + + DEBUG_PARSE("piec"); + + ret = regatom(pRExC_state, &flags,depth+1); + if (ret == NULL) { + if (flags & (TRYAGAIN|RESTART_UTF8)) + *flagp |= flags & (TRYAGAIN|RESTART_UTF8); + else + FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); + return(NULL); + } + + op = *RExC_parse; + + if (op == '{' && regcurly(RExC_parse)) { + maxpos = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + parse_start = RExC_parse; /* MJD */ +#endif + next = RExC_parse + 1; + while (isDIGIT(*next) || *next == ',') { + if (*next == ',') { + if (maxpos) + break; + else + maxpos = next; + } + next++; + } + if (*next == '}') { /* got one */ + if (!maxpos) + maxpos = next; + RExC_parse++; + min = atoi(RExC_parse); + if (*maxpos == ',') + maxpos++; + else + maxpos = RExC_parse; + max = atoi(maxpos); + if (!max && *maxpos != '0') + max = REG_INFTY; /* meaning "infinity" */ + else if (max >= REG_INFTY) + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + RExC_parse = next; + nextchar(pRExC_state); + if (max < min) { /* If can't match, warn and optimize to fail + unconditionally */ + if (SIZE_ONLY) { + ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); + + /* We can't back off the size because we have to reserve + * enough space for all the things we are about to throw + * away, but we can shrink it by the ammount we are about + * to re-use here */ + RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; + } + else { + RExC_emit = orig_emit; + } + ret = reg_node(pRExC_state, OPFAIL); + return ret; + } + else if (min == max + && RExC_parse < RExC_end + && (*RExC_parse == '?' || *RExC_parse == '+')) + { + if (SIZE_ONLY) { + ckWARN2reg(RExC_parse + 1, + "Useless use of greediness modifier '%c'", + *RExC_parse); + } + /* Absorb the modifier, so later code doesn't see nor use + * it */ + nextchar(pRExC_state); + } + + do_curly: + if ((flags&SIMPLE)) { + RExC_naughty += 2 + RExC_naughty / 2; + reginsert(pRExC_state, CURLY, ret, depth+1); + Set_Node_Offset(ret, parse_start+1); /* MJD */ + Set_Node_Cur_Length(ret, parse_start); + } + else { + regnode * const w = reg_node(pRExC_state, WHILEM); + + w->flags = 0; + REGTAIL(pRExC_state, ret, w); + if (!SIZE_ONLY && RExC_extralen) { + reginsert(pRExC_state, LONGJMP,ret, depth+1); + reginsert(pRExC_state, NOTHING,ret, depth+1); + NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ + } + reginsert(pRExC_state, CURLYX,ret, depth+1); + /* MJD hk */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Length(ret, + op == '{' ? (RExC_parse - parse_start) : 1); + + if (!SIZE_ONLY && RExC_extralen) + NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ + REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); + if (SIZE_ONLY) + RExC_whilem_seen++, RExC_extralen += 3; + RExC_naughty += 4 + RExC_naughty; /* compound interest */ + } + ret->flags = 0; + + if (min > 0) + *flagp = WORST; + if (max > 0) + *flagp |= HASWIDTH; + if (!SIZE_ONLY) { + ARG1_SET(ret, (U16)min); + ARG2_SET(ret, (U16)max); + } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + + goto nest_check; + } + } + + if (!ISMULT1(op)) { + *flagp = flags; + return(ret); + } + +#if 0 /* Now runtime fix should be reliable. */ + + /* if this is reinstated, don't forget to put this back into perldiag: + + =item Regexp *+ operand could be empty at {#} in regex m/%s/ + + (F) The part of the regexp subject to either the * or + quantifier + could match an empty string. The {#} shows in the regular + expression about where the problem was discovered. + + */ + + if (!(flags&HASWIDTH) && op != '?') + vFAIL("Regexp *+ operand could be empty"); +#endif + +#ifdef RE_TRACK_PATTERN_OFFSETS + parse_start = RExC_parse; +#endif + nextchar(pRExC_state); + + *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); + + if (op == '*' && (flags&SIMPLE)) { + reginsert(pRExC_state, STAR, ret, depth+1); + ret->flags = 0; + RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + } + else if (op == '*') { + min = 0; + goto do_curly; + } + else if (op == '+' && (flags&SIMPLE)) { + reginsert(pRExC_state, PLUS, ret, depth+1); + ret->flags = 0; + RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + } + else if (op == '+') { + min = 1; + goto do_curly; + } + else if (op == '?') { + min = 0; max = 1; + goto do_curly; + } + nest_check: + if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { + SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ + ckWARN2reg(RExC_parse, + "%"UTF8f" matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); + (void)ReREFCNT_inc(RExC_rx_sv); + } + + if (RExC_parse < RExC_end && *RExC_parse == '?') { + nextchar(pRExC_state); + reginsert(pRExC_state, MINMOD, ret, depth+1); + REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); + } + else + if (RExC_parse < RExC_end && *RExC_parse == '+') { + regnode *ender; + nextchar(pRExC_state); + ender = reg_node(pRExC_state, SUCCEED); + REGTAIL(pRExC_state, ret, ender); + reginsert(pRExC_state, SUSPEND, ret, depth+1); + ret->flags = 0; + ender = reg_node(pRExC_state, TAIL); + REGTAIL(pRExC_state, ret, ender); + } + + if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { + RExC_parse++; + vFAIL("Nested quantifiers"); + } + + return(ret); +} + +STATIC bool +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, + UV *valuep, I32 *flagp, U32 depth, bool in_char_class, + const bool strict /* Apply stricter parsing rules? */ + ) +{ + + /* This is expected to be called by a parser routine that has recognized '\N' + and needs to handle the rest. RExC_parse is expected to point at the first + char following the N at the time of the call. On successful return, + RExC_parse has been updated to point to just after the sequence identified + by this routine, and <*flagp> has been updated. + + The \N may be inside (indicated by the boolean ) or outside a + character class. + + \N may begin either a named sequence, or if outside a character class, mean + to match a non-newline. For non single-quoted regexes, the tokenizer has + attempted to decide which, and in the case of a named sequence, converted it + into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, + where c1... are the characters in the sequence. For single-quoted regexes, + the tokenizer passes the \N sequence through unchanged; this code will not + attempt to determine this nor expand those, instead raising a syntax error. + The net effect is that if the beginning of the passed-in pattern isn't '{U+' + or there is no '}', it signals that this \N occurrence means to match a + non-newline. + + Only the \N{U+...} form should occur in a character class, for the same + reason that '.' inside a character class means to just match a period: it + just doesn't make sense. + + The function raises an error (via vFAIL), and doesn't return for various + syntax errors. Otherwise it returns TRUE and sets or on + success; it returns FALSE otherwise. Returns FALSE, setting *flagp to + RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is + only possible if node_p is non-NULL. + + + If is non-null, it means the caller can accept an input sequence + consisting of a just a single code point; <*valuep> is set to that value + if the input is such. + + If is non-null it signifies that the caller can accept any other + legal sequence (i.e., one that isn't just a single code point). <*node_p> + is set as follows: + 1) \N means not-a-NL: points to a newly created REG_ANY node; + 2) \N{}: points to a new NOTHING node; + 3) otherwise: points to a new EXACT node containing the resolved + string. + Note that FALSE is returned for single code point sequences if is + null. + */ + + char * endbrace; /* '}' following the name */ + char* p; + char *endchar; /* Points to '.' or '}' ending cur char in the input + stream */ + bool has_multiple_chars; /* true if the input stream contains a sequence of + more than one character */ + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_GROK_BSLASH_N; + + GET_RE_DEBUG_FLAGS; + + assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ + + /* The [^\n] meaning of \N ignores spaces and comments under the /x + * modifier. The other meaning does not, so use a temporary until we find + * out which we are being called with */ + p = (RExC_flags & RXf_PMf_EXTENDED) + ? regpatws(pRExC_state, RExC_parse, + TRUE) /* means recognize comments */ + : RExC_parse; + + /* Disambiguate between \N meaning a named character versus \N meaning + * [^\n]. The former is assumed when it can't be the latter. */ + if (*p != '{' || regcurly(p)) { + RExC_parse = p; + if (! node_p) { + /* no bare \N allowed in a charclass */ + if (in_char_class) { + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + return FALSE; + } + RExC_parse--; /* Need to back off so nextchar() doesn't skip the + current char */ + nextchar(pRExC_state); + *node_p = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + Set_Node_Length(*node_p, 1); /* MJD */ + return TRUE; + } + + /* Here, we have decided it should be a named character or sequence */ + + /* The test above made sure that the next real character is a '{', but + * under the /x modifier, it could be separated by space (or a comment and + * \n) and this is not allowed (for consistency with \x{...} and the + * tokenizer handling of \N{NAME}). */ + if (*RExC_parse != '{') { + vFAIL("Missing braces on \\N{}"); + } + + RExC_parse++; /* Skip past the '{' */ + + if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ + || ! (endbrace == RExC_parse /* nothing between the {} */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below + */ + && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) + */ + { + if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ + vFAIL("\\N{NAME} must be resolved by the lexer"); + } + + if (endbrace == RExC_parse) { /* empty: \N{} */ + bool ret = TRUE; + if (node_p) { + *node_p = reg_node(pRExC_state,NOTHING); + } + else if (in_char_class) { + if (SIZE_ONLY && in_char_class) { + if (strict) { + RExC_parse++; /* Position after the "}" */ + vFAIL("Zero length \\N{}"); + } + else { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class"); + } + } + ret = FALSE; + } + else { + return FALSE; + } + nextchar(pRExC_state); + return ret; + } + + RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ + RExC_parse += 2; /* Skip past the 'U+' */ + + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + + /* Code points are separated by dots. If none, there is only one code + * point, and is terminated by the brace */ + has_multiple_chars = (endchar < endbrace); + + if (valuep && (! has_multiple_chars || in_char_class)) { + /* We only pay attention to the first char of + multichar strings being returned in char classes. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. XXX Solution is to recharacterize as + [rest-of-class]|multi1|multi2... */ + + STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); + I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); + + *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); + + /* The tokenizer should have guaranteed validity, but it's possible to + * bypass it by using single quoting, so check */ + if (length_of_hex == 0 + || length_of_hex != (STRLEN)(endchar - RExC_parse) ) + { + RExC_parse += length_of_hex; /* Includes all the valid */ + RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ + ? UTF8SKIP(RExC_parse) + : 1; + /* Guard against malformed utf8 */ + if (RExC_parse >= endchar) { + RExC_parse = endchar; + } + vFAIL("Invalid hexadecimal number in \\N{U+...}"); + } + + if (in_char_class && has_multiple_chars) { + if (strict) { + RExC_parse = endbrace; + vFAIL("\\N{} in character class restricted to one character"); + } + else { + ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + } + } + + RExC_parse = endbrace + 1; + } + else if (! node_p || ! has_multiple_chars) { + + /* Here, the input is legal, but not according to the caller's + * options. We fail without advancing the parse, so that the + * caller can try again */ + RExC_parse = p; + return FALSE; + } + else { + + /* What is done here is to convert this to a sub-pattern of the form + * (?:\x{char1}\x{char2}...) + * and then call reg recursively. That way, it retains its atomicness, + * while not having to worry about special handling that some code + * points may have. toke.c has converted the original Unicode values + * to native, so that we can just pass on the hex values unchanged. We + * do have to set a flag to keep recoding from happening in the + * recursion */ + + SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); + STRLEN len; + char *orig_end = RExC_end; + I32 flags; + + while (RExC_parse < endbrace) { + + /* Convert to notation the rest of the code understands */ + sv_catpv(substitute_parse, "\\x{"); + sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); + sv_catpv(substitute_parse, "}"); + + /* Point to the beginning of the next character in the sequence. */ + RExC_parse = endchar + 1; + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + } + sv_catpv(substitute_parse, ")"); + + RExC_parse = SvPV(substitute_parse, len); + + /* Don't allow empty number */ + if (len < 8) { + vFAIL("Invalid hexadecimal number in \\N{U+...}"); + } + RExC_end = RExC_parse + len; + + /* The values are Unicode, and therefore not subject to recoding */ + RExC_override_recoding = 1; + + if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return FALSE; + } + FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", + (UV) flags); + } + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); + + RExC_parse = endbrace; + RExC_end = orig_end; + RExC_override_recoding = 0; + + nextchar(pRExC_state); + } + + return TRUE; +} + + +/* + * reg_recode + * + * It returns the code point in utf8 for the value in *encp. + * value: a code value in the source encoding + * encp: a pointer to an Encode object + * + * If the result from Encode is not a single character, + * it returns U+FFFD (Replacement character) and sets *encp to NULL. + */ +STATIC UV +S_reg_recode(pTHX_ const char value, SV **encp) +{ + STRLEN numlen = 1; + SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); + const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); + const STRLEN newlen = SvCUR(sv); + UV uv = UNICODE_REPLACEMENT; + + PERL_ARGS_ASSERT_REG_RECODE; + + if (newlen) + uv = SvUTF8(sv) + ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) + : *(U8*)s; + + if (!newlen || numlen != newlen) { + uv = UNICODE_REPLACEMENT; + *encp = NULL; + } + return uv; +} + +PERL_STATIC_INLINE U8 +S_compute_EXACTish(RExC_state_t *pRExC_state) +{ + U8 op; + + PERL_ARGS_ASSERT_COMPUTE_EXACTISH; + + if (! FOLD) { + return EXACT; + } + + op = get_regex_charset(RExC_flags); + if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { + op--; /* /a is same as /u, and map /aa's offset to what /a's would have + been, so there is no hole */ + } + + return op + EXACTF; +} + +PERL_STATIC_INLINE void +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, + regnode *node, I32* flagp, STRLEN len, UV code_point, + bool downgradable) +{ + /* This knows the details about sizing an EXACTish node, setting flags for + * it (by setting <*flagp>, and potentially populating it with a single + * character. + * + * If (the length in bytes) is non-zero, this function assumes that + * the node has already been populated, and just does the sizing. In this + * case should be the final code point that has already been + * placed into the node. This value will be ignored except that under some + * circumstances <*flagp> is set based on it. + * + * If is zero, the function assumes that the node is to contain only + * the single character given by and calculates what + * should be. In pass 1, it sizes the node appropriately. In pass 2, it + * additionally will populate the node's STRING with or its + * fold if folding. + * + * In both cases <*flagp> is appropriately set + * + * It knows that under FOLD, the Latin Sharp S and UTF characters above + * 255, must be folded (the former only when the rules indicate it can + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ + + bool len_passed_in = cBOOL(len != 0); + U8 character[UTF8_MAXBYTES_CASE+1]; + + PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + + /* Don't bother to check for downgrading in PASS1, as it doesn't make any + * sizing difference, and is extra work that is thrown away */ + if (downgradable && ! PASS2) { + downgradable = FALSE; + } + + if (! len_passed_in) { + if (UTF) { + if (UNI_IS_INVARIANT(code_point)) { + if (LOC || ! FOLD) { /* /l defers folding until runtime */ + *character = (U8) code_point; + } + else { /* Here is /i and not /l (toFOLD() is defined on just + ASCII, which isn't the same thing as INVARIANT on + EBCDIC, but it works there, as the extra invariants + fold to themselves) */ + *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } + } + len = 1; + } + else if (FOLD && (! LOC + || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) + { /* Folding, and ok to do so now */ + UV folded = _to_uni_fold_flags( + code_point, + character, + &len, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } + } + else if (code_point <= MAX_UTF8_TWO_BYTE) { + + /* Not folding this cp, and can output it directly */ + *character = UTF8_TWO_BYTE_HI(code_point); + *(character + 1) = UTF8_TWO_BYTE_LO(code_point); + len = 2; + } + else { + uvchr_to_utf8( character, code_point); + len = UTF8SKIP(character); + } + } /* Else pattern isn't UTF8. */ + else if (! FOLD) { + *character = (U8) code_point; + len = 1; + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { + *character = 's'; + *(character + 1) = 's'; + len = 2; + } + else { + *character = LATIN_SMALL_LETTER_SHARP_S; + len = 1; + } + } + + if (SIZE_ONLY) { + RExC_size += STR_SZ(len); + } + else { + RExC_emit += STR_SZ(len); + STR_LEN(node) = len; + if (! len_passed_in) { + Copy((char *) character, STRING(node), len, char); + } + } + + *flagp |= HASWIDTH; + + /* A single character node is SIMPLE, except for the special-cased SHARP S + * under /di. */ + if ((len == 1 || (UTF && len == UNISKIP(code_point))) + && (code_point != LATIN_SMALL_LETTER_SHARP_S + || ! FOLD || ! DEPENDS_SEMANTICS)) + { + *flagp |= SIMPLE; + } + + /* The OP may not be well defined in PASS1 */ + if (PASS2 && OP(node) == EXACTFL) { + RExC_contains_locale = 1; + } +} + + +/* return atoi(p), unless it's too big to sensibly be a backref, + * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ + +static I32 +S_backref_value(char *p) +{ + char *q = p; + + for (;isDIGIT(*q); q++) {} /* calculate length of num */ + if (q - p == 0 || q - p > 9) + return I32_MAX; + return atoi(p); +} + + +/* + - regatom - the lowest level + + Try to identify anything special at the start of the pattern. If there + is, then handle it as required. This may involve generating a single regop, + such as for an assertion; or it may involve recursing, such as to + handle a () structure. + + If the string doesn't start with something special then we gobble up + as much literal text as we can. + + Once we have been able to handle whatever type of thing started the + sequence, we return. + + Note: we have to be careful with escapes, as they can be both literal + and special, and in the case of \10 and friends, context determines which. + + A summary of the code structure is: + + switch (first_byte) { + cases for each special: + handle this special; + break; + case '\\': + switch (2nd byte) { + cases for each unambiguous special: + handle this special; + break; + cases for each ambigous special/literal: + disambiguate; + if (special) handle here + else goto defchar; + default: // unambiguously literal: + goto defchar; + } + default: // is a literal char + // FALL THROUGH + defchar: + create EXACTish node for literal; + while (more input and node isn't full) { + switch (input_byte) { + cases for each special; + make sure parse pointer is set so that the next call to + regatom will see this special first + goto loopdone; // EXACTish node terminated by prev. char + default: + append char to EXACTISH node; + } + get next input byte; + } + loopdone: + } + return the generated node; + + Specifically there are two separate switches for handling + escape sequences, with the one for handling literal escapes requiring + a dummy entry for all of the special escapes that are actually handled + by the other. + + Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with + TRYAGAIN. + Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be + restarted. + Otherwise does not return NULL. +*/ + +STATIC regnode * +S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) +{ + regnode *ret = NULL; + I32 flags = 0; + char *parse_start = RExC_parse; + U8 op; + int invert = 0; + U8 arg; + + GET_RE_DEBUG_FLAGS_DECL; + + *flagp = WORST; /* Tentatively. */ + + DEBUG_PARSE("atom"); + + PERL_ARGS_ASSERT_REGATOM; + +tryagain: + switch ((U8)*RExC_parse) { + case '^': + RExC_seen_zerolen++; + nextchar(pRExC_state); + if (RExC_flags & RXf_PMf_MULTILINE) + ret = reg_node(pRExC_state, MBOL); + else if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SBOL); + else + ret = reg_node(pRExC_state, BOL); + Set_Node_Length(ret, 1); /* MJD */ + break; + case '$': + nextchar(pRExC_state); + if (*RExC_parse) + RExC_seen_zerolen++; + if (RExC_flags & RXf_PMf_MULTILINE) + ret = reg_node(pRExC_state, MEOL); + else if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SEOL); + else + ret = reg_node(pRExC_state, EOL); + Set_Node_Length(ret, 1); /* MJD */ + break; + case '.': + nextchar(pRExC_state); + if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SANY); + else + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + Set_Node_Length(ret, 1); /* MJD */ + break; + case '[': + { + char * const oregcomp_parse = ++RExC_parse; + ret = regclass(pRExC_state, flagp,depth+1, + FALSE, /* means parse the whole char class */ + TRUE, /* allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. */ + NULL); + if (*RExC_parse != ']') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ["); + } + if (ret == NULL) { + if (*flagp & RESTART_UTF8) + return NULL; + FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", + (UV) *flagp); + } + nextchar(pRExC_state); + Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ + break; + } + case '(': + nextchar(pRExC_state); + ret = reg(pRExC_state, 2, &flags,depth+1); + if (ret == NULL) { + if (flags & TRYAGAIN) { + if (RExC_parse == RExC_end) { + /* Make parent create an empty node if needed. */ + *flagp |= TRYAGAIN; + return(NULL); + } + goto tryagain; + } + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", + (UV) flags); + } + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); + break; + case '|': + case ')': + if (flags & TRYAGAIN) { + *flagp |= TRYAGAIN; + return NULL; + } + vFAIL("Internal urp"); + /* Supposed to be caught earlier. */ + break; + case '?': + case '+': + case '*': + RExC_parse++; + vFAIL("Quantifier follows nothing"); + break; + case '\\': + /* Special Escapes + + This switch handles escape sequences that resolve to some kind + of special regop and not to literal text. Escape sequnces that + resolve to literal text are handled below in the switch marked + "Literal Escapes". + + Every entry in this switch *must* have a corresponding entry + in the literal escape switch. However, the opposite is not + required, as the default for this switch is to jump to the + literal text handling code. + */ + switch ((U8)*++RExC_parse) { + /* Special Escapes */ + case 'A': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, SBOL); + *flagp |= SIMPLE; + goto finish_meta_pat; + case 'G': + ret = reg_node(pRExC_state, GPOS); + RExC_seen |= REG_GPOS_SEEN; + *flagp |= SIMPLE; + goto finish_meta_pat; + case 'K': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, KEEPS); + *flagp |= SIMPLE; + /* XXX:dmq : disabling in-place substitution seems to + * be necessary here to avoid cases of memory corruption, as + * with: C<$_="x" x 80; s/x\K/y/> -- rgs + */ + RExC_seen |= REG_LOOKBEHIND_SEEN; + goto finish_meta_pat; + case 'Z': + ret = reg_node(pRExC_state, SEOL); + *flagp |= SIMPLE; + RExC_seen_zerolen++; /* Do not optimize RE away */ + goto finish_meta_pat; + case 'z': + ret = reg_node(pRExC_state, EOS); + *flagp |= SIMPLE; + RExC_seen_zerolen++; /* Do not optimize RE away */ + goto finish_meta_pat; + case 'C': + ret = reg_node(pRExC_state, CANY); + RExC_seen |= REG_CANY_SEEN; + *flagp |= HASWIDTH|SIMPLE; + if (SIZE_ONLY) { + ckWARNdep(RExC_parse+1, "\\C is deprecated"); + } + goto finish_meta_pat; + case 'X': + ret = reg_node(pRExC_state, CLUMP); + *flagp |= HASWIDTH; + goto finish_meta_pat; + + case 'W': + invert = 1; + /* FALLTHROUGH */ + case 'w': + arg = ANYOF_WORDCHAR; + goto join_posix; + + case 'b': + RExC_seen_zerolen++; + RExC_seen |= REG_LOOKBEHIND_SEEN; + op = BOUND + get_regex_charset(RExC_flags); + if (op > BOUNDA) { /* /aa is same as /a */ + op = BOUNDA; + } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } + ret = reg_node(pRExC_state, op); + FLAGS(ret) = get_regex_charset(RExC_flags); + *flagp |= SIMPLE; + if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); + } + goto finish_meta_pat; + case 'B': + RExC_seen_zerolen++; + RExC_seen |= REG_LOOKBEHIND_SEEN; + op = NBOUND + get_regex_charset(RExC_flags); + if (op > NBOUNDA) { /* /aa is same as /a */ + op = NBOUNDA; + } + else if (op == NBOUNDL) { + RExC_contains_locale = 1; + } + ret = reg_node(pRExC_state, op); + FLAGS(ret) = get_regex_charset(RExC_flags); + *flagp |= SIMPLE; + if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); + } + goto finish_meta_pat; + + case 'D': + invert = 1; + /* FALLTHROUGH */ + case 'd': + arg = ANYOF_DIGIT; + goto join_posix; + + case 'R': + ret = reg_node(pRExC_state, LNBREAK); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + + case 'H': + invert = 1; + /* FALLTHROUGH */ + case 'h': + arg = ANYOF_BLANK; + op = POSIXU; + goto join_posix_op_known; + + case 'V': + invert = 1; + /* FALLTHROUGH */ + case 'v': + arg = ANYOF_VERTWS; + op = POSIXU; + goto join_posix_op_known; + + case 'S': + invert = 1; + /* FALLTHROUGH */ + case 's': + arg = ANYOF_SPACE; + + join_posix: + + op = POSIXD + get_regex_charset(RExC_flags); + if (op > POSIXA) { /* /aa is same as /a */ + op = POSIXA; + } + else if (op == POSIXL) { + RExC_contains_locale = 1; + } + + join_posix_op_known: + + if (invert) { + op += NPOSIXD - POSIXD; + } + + ret = reg_node(pRExC_state, op); + if (! SIZE_ONLY) { + FLAGS(ret) = namedclass_to_classnum(arg); + } + + *flagp |= HASWIDTH|SIMPLE; + /* FALLTHROUGH */ + + finish_meta_pat: + nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ + break; + case 'p': + case 'P': + { +#ifdef DEBUGGING + char* parse_start = RExC_parse - 2; +#endif + + RExC_parse--; + + ret = regclass(pRExC_state, flagp,depth+1, + TRUE, /* means just parse this element */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. + It would be a bug if these returned + non-portables */ + NULL); + /* regclass() can only return RESTART_UTF8 if multi-char folds + are allowed. */ + if (!ret) + FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", + (UV) *flagp); + + RExC_parse--; + + Set_Node_Offset(ret, parse_start + 2); + Set_Node_Cur_Length(ret, parse_start); + nextchar(pRExC_state); + } + break; + case 'N': + /* Handle \N and \N{NAME} with multiple code points here and not + * below because it can be multicharacter. join_exact() will join + * them up later on. Also this makes sure that things like + * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq. + * The options to the grok function call causes it to fail if the + * sequence is just a single code point. We then go treat it as + * just another character in the current EXACT node, and hence it + * gets uniform treatment with all the other characters. The + * special treatment for quantifiers is not needed for such single + * character sequences */ + ++RExC_parse; + if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE, + FALSE /* not strict */ )) { + if (*flagp & RESTART_UTF8) + return NULL; + RExC_parse--; + goto defchar; + } + break; + case 'k': /* Handle \k and \k'NAME' */ + parse_named_seq: + { + char ch= RExC_parse[1]; + if (ch != '<' && ch != '\'' && ch != '{') { + RExC_parse++; + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.2s... not terminated",parse_start); + } else { + /* this pretty much dupes the code for (?P=...) in reg(), if + you change this make sure you change that */ + char* name_start = (RExC_parse += 2); + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; + if (RExC_parse == name_start || *RExC_parse != ch) + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.3s... not terminated",parse_start); + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (ASCII_FOLD_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); + *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + nextchar(pRExC_state); + + } + break; + } + case 'g': + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + { + I32 num; + bool hasbrace = 0; + + if (*RExC_parse == 'g') { + bool isrel = 0; + + RExC_parse++; + if (*RExC_parse == '{') { + RExC_parse++; + hasbrace = 1; + } + if (*RExC_parse == '-') { + RExC_parse++; + isrel = 1; + } + if (hasbrace && !isDIGIT(*RExC_parse)) { + if (isrel) RExC_parse--; + RExC_parse -= 2; + goto parse_named_seq; + } + + num = S_backref_value(RExC_parse); + if (num == 0) + vFAIL("Reference to invalid group 0"); + else if (num == I32_MAX) { + if (isDIGIT(*RExC_parse)) + vFAIL("Reference to nonexistent group"); + else + vFAIL("Unterminated \\g... pattern"); + } + + if (isrel) { + num = RExC_npar - num; + if (num < 1) + vFAIL("Reference to nonexistent or unclosed group"); + } + } + else { + num = S_backref_value(RExC_parse); + /* bare \NNN might be backref or octal - if it is larger than or equal + * RExC_npar then it is assumed to be and octal escape. + * Note RExC_npar is +1 from the actual number of parens*/ + if (num == I32_MAX || (num > 9 && num >= RExC_npar + && *RExC_parse != '8' && *RExC_parse != '9')) + { + /* Probably a character specified in octal, e.g. \35 */ + goto defchar; + } + } + + /* at this point RExC_parse definitely points to a backref + * number */ + { +#ifdef RE_TRACK_PATTERN_OFFSETS + char * const parse_start = RExC_parse - 1; /* MJD */ +#endif + while (isDIGIT(*RExC_parse)) + RExC_parse++; + if (hasbrace) { + if (*RExC_parse != '}') + vFAIL("Unterminated \\g{...} pattern"); + RExC_parse++; + } + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) + vFAIL("Reference to nonexistent group"); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? REF + : (ASCII_FOLD_RESTRICTED) + ? REFFA + : (AT_LEAST_UNI_SEMANTICS) + ? REFFU + : (LOC) + ? REFFL + : REFF), + num); + *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + RExC_parse--; + nextchar(pRExC_state); + } + } + break; + case '\0': + if (RExC_parse >= RExC_end) + FAIL("Trailing \\"); + /* FALLTHROUGH */ + default: + /* Do not generate "unrecognized" warnings here, we fall + back into the quick-grab loop below */ + parse_start--; + goto defchar; + } + break; + + case '#': + if (RExC_flags & RXf_PMf_EXTENDED) { + RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); + if (RExC_parse < RExC_end) + goto tryagain; + } + /* FALLTHROUGH */ + + default: + + parse_start = RExC_parse - 1; + + RExC_parse++; + + defchar: { + STRLEN len = 0; + UV ender = 0; + char *p; + char *s; +#define MAX_NODE_STRING_SIZE 127 + char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; + char *s0; + U8 upper_parse = MAX_NODE_STRING_SIZE; + U8 node_type = compute_EXACTish(pRExC_state); + bool next_is_quantifier; + char * oldp = NULL; + + /* We can convert EXACTF nodes to EXACTFU if they contain only + * characters that match identically regardless of the target + * string's UTF8ness. The reason to do this is that EXACTF is not + * trie-able, EXACTFU is. + * + * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * contain only above-Latin1 characters (hence must be in UTF8), + * which don't participate in folds with Latin1-range characters, + * as the latter's folds aren't known until runtime. (We don't + * need to figure this out until pass 2) */ + bool maybe_exactfu = PASS2 + && (node_type == EXACTF || node_type == EXACTFL); + + /* If a folding node contains only code points that don't + * participate in folds, it can be changed into an EXACT node, + * which allows the optimizer more things to look for */ + bool maybe_exact; + + ret = reg_node(pRExC_state, node_type); + + /* In pass1, folded, we use a temporary buffer instead of the + * actual node, as the node doesn't exist yet */ + s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret); + + s0 = s; + + reparse: + + /* We do the EXACTFish to EXACT node only if folding. (And we + * don't need to figure this out until pass 2) */ + maybe_exact = FOLD && PASS2; + + /* XXX The node can hold up to 255 bytes, yet this only goes to + * 127. I (khw) do not know why. Keeping it somewhat less than + * 255 allows us to not have to worry about overflow due to + * converting to utf8 and fold expansion, but that value is + * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes + * split up by this limit into a single one using the real max of + * 255. Even at 127, this breaks under rare circumstances. If + * folding, we do not want to split a node at a character that is a + * non-final in a multi-char fold, as an input string could just + * happen to want to match across the node boundary. The join + * would solve that problem if the join actually happens. But a + * series of more than two nodes in a row each of 127 would cause + * the first join to succeed to get to 254, but then there wouldn't + * be room for the next one, which could at be one of those split + * multi-char folds. I don't know of any fool-proof solution. One + * could back off to end with only a code point that isn't such a + * non-final, but it is possible for there not to be any in the + * entire node. */ + for (p = RExC_parse - 1; + len < upper_parse && p < RExC_end; + len++) + { + oldp = p; + + if (RExC_flags & RXf_PMf_EXTENDED) + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ + switch ((U8)*p) { + case '^': + case '$': + case '.': + case '[': + case '(': + case ')': + case '|': + goto loopdone; + case '\\': + /* Literal Escapes Switch + + This switch is meant to handle escape sequences that + resolve to a literal character. + + Every escape sequence that represents something + else, like an assertion or a char class, is handled + in the switch marked 'Special Escapes' above in this + routine, but also has an entry here as anything that + isn't explicitly mentioned here will be treated as + an unescaped equivalent literal. + */ + + switch ((U8)*++p) { + /* These are all the special escapes. */ + case 'A': /* Start assertion */ + case 'b': case 'B': /* Word-boundary assertion*/ + case 'C': /* Single char !DANGEROUS! */ + case 'd': case 'D': /* digit class */ + case 'g': case 'G': /* generic-backref, pos assertion */ + case 'h': case 'H': /* HORIZWS */ + case 'k': case 'K': /* named backref, keep marker */ + case 'p': case 'P': /* Unicode property */ + case 'R': /* LNBREAK */ + case 's': case 'S': /* space class */ + case 'v': case 'V': /* VERTWS */ + case 'w': case 'W': /* word class */ + case 'X': /* eXtended Unicode "combining + character sequence" */ + case 'z': case 'Z': /* End of line/string assertion */ + --p; + goto loopdone; + + /* Anything after here is an escape that resolves to a + literal. (Except digits, which may or may not) + */ + case 'n': + ender = '\n'; + p++; + break; + case 'N': /* Handle a single-code point named character. */ + /* The options cause it to fail if a multiple code + * point sequence. Handle those in the switch() above + * */ + RExC_parse = p + 1; + if (! grok_bslash_N(pRExC_state, NULL, &ender, + flagp, depth, FALSE, + FALSE /* not strict */ )) + { + if (*flagp & RESTART_UTF8) + FAIL("panic: grok_bslash_N set RESTART_UTF8"); + RExC_parse = p = oldp; + goto loopdone; + } + p = RExC_parse; + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + case 'r': + ender = '\r'; + p++; + break; + case 't': + ender = '\t'; + p++; + break; + case 'f': + ender = '\f'; + p++; + break; + case 'e': + ender = ASCII_TO_NATIVE('\033'); + p++; + break; + case 'a': + ender = '\a'; + p++; + break; + case 'o': + { + UV result; + const char* error_msg; + + bool valid = grok_bslash_o(&p, + &result, + &error_msg, + TRUE, /* out warnings */ + FALSE, /* not strict */ + TRUE, /* Output warnings + for non- + portables */ + UTF); + if (! valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); + } + ender = result; + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + } + case 'x': + { + UV result = UV_MAX; /* initialize to erroneous + value */ + const char* error_msg; + + bool valid = grok_bslash_x(&p, + &result, + &error_msg, + TRUE, /* out warnings */ + FALSE, /* not strict */ + TRUE, /* Output warnings + for non- + portables */ + UTF); + if (! valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); + } + ender = result; + + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + } + case 'c': + p++; + ender = grok_bslash_c(*p++, SIZE_ONLY); + break; + case '8': case '9': /* must be a backreference */ + --p; + goto loopdone; + case '1': case '2': case '3':case '4': + case '5': case '6': case '7': + /* When we parse backslash escapes there is ambiguity + * between backreferences and octal escapes. Any escape + * from \1 - \9 is a backreference, any multi-digit + * escape which does not start with 0 and which when + * evaluated as decimal could refer to an already + * parsed capture buffer is a backslash. Anything else + * is octal. + * + * Note this implies that \118 could be interpreted as + * 118 OR as "\11" . "8" depending on whether there + * were 118 capture buffers defined already in the + * pattern. */ + + /* NOTE, RExC_npar is 1 more than the actual number of + * parens we have seen so far, hence the < RExC_npar below. */ + + if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) + { /* Not to be treated as an octal constant, go + find backref */ + --p; + goto loopdone; + } + /* FALLTHROUGH */ + case '0': + { + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; + STRLEN numlen = 3; + ender = grok_oct(p, &numlen, &flags, NULL); + if (ender > 0xff) { + REQUIRE_UTF8; + } + p += numlen; + if (SIZE_ONLY /* like \08, \178 */ + && numlen < 3 + && p < RExC_end + && isDIGIT(*p) && ckWARN(WARN_REGEXP)) + { + reg_warn_non_literal_string( + p + 1, + form_short_octal_warning(p, numlen)); + } + } + if (PL_encoding && ender < 0x100) + goto recode_encoding; + break; + recode_encoding: + if (! RExC_override_recoding) { + SV* enc = PL_encoding; + ender = reg_recode((const char)(U8)ender, &enc); + if (!enc && SIZE_ONLY) + ckWARNreg(p, "Invalid escape in the specified encoding"); + REQUIRE_UTF8; + } + break; + case '\0': + if (p >= RExC_end) + FAIL("Trailing \\"); + /* FALLTHROUGH */ + default: + if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { + /* Include any { following the alpha to emphasize + * that it could be part of an escape at some point + * in the future */ + int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1; + ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p); + } + goto normal_default; + } /* End of switch on '\' */ + break; + case '{': + /* Currently we don't warn when the lbrace is at the start + * of a construct. This catches it in the middle of a + * literal string, or when its the first thing after + * something like "\b" */ + if (! SIZE_ONLY + && (len || (p > RExC_start && isALPHA_A(*(p -1))))) + { + ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through"); + } + /*FALLTHROUGH*/ + default: /* A literal character */ + normal_default: + if (UTF8_IS_START(*p) && UTF) { + STRLEN numlen; + ender = utf8n_to_uvchr((U8*)p, RExC_end - p, + &numlen, UTF8_ALLOW_DEFAULT); + p += numlen; + } + else + ender = (U8) *p++; + break; + } /* End of switch on the literal */ + + /* Here, have looked at the literal character and + * contains its ordinal,

points to the character after it + */ + + if ( RExC_flags & RXf_PMf_EXTENDED) + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ + + /* If the next thing is a quantifier, it applies to this + * character only, which means that this character has to be in + * its own node and can't just be appended to the string in an + * existing node, so if there are already other characters in + * the node, close the node with just them, and set up to do + * this character again next time through, when it will be the + * only thing in its new node */ + if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len) + { + p = oldp; + goto loopdone; + } + + if (! FOLD /* The simple case, just append the literal */ + || (LOC /* Also don't fold for tricky chars under /l */ + && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) + { + if (UTF) { + const STRLEN unilen = reguni(pRExC_state, ender, s); + if (unilen > 0) { + s += unilen; + len += unilen; + } + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len--; + } + else { + REGC((char)ender, s++); + } + + /* Can get here if folding only if is one of the /l + * characters whose fold depends on the locale. The + * occurrence of any of these indicate that we can't + * simplify things */ + if (FOLD) { + maybe_exact = FALSE; + maybe_exactfu = FALSE; + } + } + else /* FOLD */ + if (! ( UTF + /* See comments for join_exact() as to why we fold this + * non-UTF at compile time */ + || (node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S))) + { + /* Here, are folding and are not UTF-8 encoded; therefore + * the character must be in the range 0-255, and is not /l + * (Not /l because we already handled these under /l in + * is_PROBLEMATIC_LOCALE_FOLD_cp */ + if (IS_IN_SOME_FOLD_L1(ender)) { + maybe_exact = FALSE; + + /* See if the character's fold differs between /d and + * /u. This includes the multi-char fold SHARP S to + * 'ss' */ + if (maybe_exactfu + && (PL_fold[ender] != PL_fold_latin1[ender] + || ender == LATIN_SMALL_LETTER_SHARP_S + || (len > 0 + && isARG2_lower_or_UPPER_ARG1('s', ender) + && isARG2_lower_or_UPPER_ARG1('s', + *(s-1))))) + { + maybe_exactfu = FALSE; + } + } + + /* Even when folding, we store just the input character, as + * we have an array that finds its fold quickly */ + *(s++) = (char) ender; + } + else { /* FOLD and UTF */ + /* Unlike the non-fold case, we do actually have to + * calculate the results here in pass 1. This is for two + * reasons, the folded length may be longer than the + * unfolded, and we have to calculate how many EXACTish + * nodes it will take; and we may run out of room in a node + * in the middle of a potential multi-char fold, and have + * to back off accordingly. (Hence we can't use REGC for + * the simple case just below.) */ + + UV folded; + if (isASCII(ender)) { + folded = toFOLD(ender); + *(s)++ = (U8) folded; + } + else { + STRLEN foldlen; + + folded = _to_uni_fold_flags( + ender, + (U8 *) s, + &foldlen, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + s += foldlen; + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len += foldlen - 1; + } + /* If this node only contains non-folding code points so + * far, see if this new one is also non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have to check + * further to see if anything folds to it */ + if (_invlist_contains_cp(PL_utf8_foldable, + ender)) + { + maybe_exact = FALSE; + } + } + } + ender = folded; + } + + if (next_is_quantifier) { + + /* Here, the next input is a quantifier, and to get here, + * the current character is the only one in the node. + * Also, here doesn't include the final byte for this + * character */ + len++; + goto loopdone; + } + + } /* End of loop through literal characters */ + + /* Here we have either exhausted the input or ran out of room in + * the node. (If we encountered a character that can't be in the + * node, transfer is made directly to , and so we + * wouldn't have fallen off the end of the loop.) In the latter + * case, we artificially have to split the node into two, because + * we just don't have enough space to hold everything. This + * creates a problem if the final character participates in a + * multi-character fold in the non-final position, as a match that + * should have occurred won't, due to the way nodes are matched, + * and our artificial boundary. So back off until we find a non- + * problematic character -- one that isn't at the beginning or + * middle of such a fold. (Either it doesn't participate in any + * folds, or appears only in the final position of all the folds it + * does participate in.) A better solution with far fewer false + * positives, and that would fill the nodes more completely, would + * be to actually have available all the multi-character folds to + * test against, and to back-off only far enough to be sure that + * this node isn't ending with a partial one. is set + * further below (if we need to reparse the node) to include just + * up through that final non-problematic character that this code + * identifies, so when it is set to less than the full node, we can + * skip the rest of this */ + if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { + + const STRLEN full_len = len; + + assert(len >= MAX_NODE_STRING_SIZE); + + /* Here, points to the final byte of the final character. + * Look backwards through the string until find a non- + * problematic character */ + + if (! UTF) { + + /* This has no multi-char folds to non-UTF characters */ + if (ASCII_FOLD_RESTRICTED) { + goto loopdone; + } + + while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { } + len = s - s0 + 1; + } + else { + if (! PL_NonL1NonFinalFold) { + PL_NonL1NonFinalFold = _new_invlist_C_array( + NonL1_Perl_Non_Final_Folds_invlist); + } + + /* Point to the first byte of the final character */ + s = (char *) utf8_hop((U8 *) s, -1); + + while (s >= s0) { /* Search backwards until find + non-problematic char */ + if (UTF8_IS_INVARIANT(*s)) { + + /* There are no ascii characters that participate + * in multi-char folds under /aa. In EBCDIC, the + * non-ascii invariants are all control characters, + * so don't ever participate in any folds. */ + if (ASCII_FOLD_RESTRICTED + || ! IS_NON_FINAL_FOLD(*s)) + { + break; + } + } + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + *s, *(s+1)))) + { + break; + } + } + else if (! _invlist_contains_cp( + PL_NonL1NonFinalFold, + valid_utf8_to_uvchr((U8 *) s, NULL))) + { + break; + } + + /* Here, the current character is problematic in that + * it does occur in the non-final position of some + * fold, so try the character before it, but have to + * special case the very first byte in the string, so + * we don't read outside the string */ + s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); + } /* End of loop backwards through the string */ + + /* If there were only problematic characters in the string, + * will point to before s0, in which case the length + * should be 0, otherwise include the length of the + * non-problematic character just found */ + len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); + } + + /* Here, have found the final character, if any, that is + * non-problematic as far as ending the node without splitting + * it across a potential multi-char fold. contains the + * number of bytes in the node up-to and including that + * character, or is 0 if there is no such character, meaning + * the whole node contains only problematic characters. In + * this case, give up and just take the node as-is. We can't + * do any better */ + if (len == 0) { + len = full_len; + + /* If the node ends in an 's' we make sure it stays EXACTF, + * as if it turns into an EXACTFU, it could later get + * joined with another 's' that would then wrongly match + * the sharp s */ + if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) + { + maybe_exactfu = FALSE; + } + } else { + + /* Here, the node does contain some characters that aren't + * problematic. If one such is the final character in the + * node, we are done */ + if (len == full_len) { + goto loopdone; + } + else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) { + + /* If the final character is problematic, but the + * penultimate is not, back-off that last character to + * later start a new node with it */ + p = oldp; + goto loopdone; + } + + /* Here, the final non-problematic character is earlier + * in the input than the penultimate character. What we do + * is reparse from the beginning, going up only as far as + * this final ok one, thus guaranteeing that the node ends + * in an acceptable character. The reason we reparse is + * that we know how far in the character is, but we don't + * know how to correlate its position with the input parse. + * An alternate implementation would be to build that + * correlation as we go along during the original parse, + * but that would entail extra work for every node, whereas + * this code gets executed only when the string is too + * large for the node, and the final two characters are + * problematic, an infrequent occurrence. Yet another + * possible strategy would be to save the tail of the + * string, and the next time regatom is called, initialize + * with that. The problem with this is that unless you + * back off one more character, you won't be guaranteed + * regatom will get called again, unless regbranch, + * regpiece ... are also changed. If you do back off that + * extra character, so that there is input guaranteed to + * force calling regatom, you can't handle the case where + * just the first character in the node is acceptable. I + * (khw) decided to try this method which doesn't have that + * pitfall; if performance issues are found, we can do a + * combination of the current approach plus that one */ + upper_parse = len; + len = 0; + s = s0; + goto reparse; + } + } /* End of verifying node ends with an appropriate char */ + + loopdone: /* Jumped to when encounters something that shouldn't be in + the node */ + + /* I (khw) don't know if you can get here with zero length, but the + * old code handled this situation by creating a zero-length EXACT + * node. Might as well be NOTHING instead */ + if (len == 0) { + OP(ret) = NOTHING; + } + else { + if (FOLD) { + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds; + * similarly for 'maybe_exactfu' and code points that match + * differently depending on UTF8ness of the target string + * (for /u), or depending on locale for /l */ + if (maybe_exact) { + OP(ret) = EXACT; + } + else if (maybe_exactfu) { + OP(ret) = EXACTFU; + } + } + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); + } + + RExC_parse = p - 1; + Set_Node_Cur_Length(ret, parse_start); + nextchar(pRExC_state); + { + /* len is STRLEN which is unsigned, need to copy to signed */ + IV iv = len; + if (iv < 0) + vFAIL("Internal disaster"); + } + + } /* End of label 'defchar:' */ + break; + } /* End of giant switch on input character */ + + return(ret); +} + +STATIC char * +S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) +{ + /* Returns the next non-pattern-white space, non-comment character (the + * latter only if 'recognize_comment is true) in the string p, which is + * ended by RExC_end. See also reg_skipcomment */ + const char *e = RExC_end; + + PERL_ARGS_ASSERT_REGPATWS; + + while (p < e) { + STRLEN len; + if ((len = is_PATWS_safe(p, e, UTF))) { + p += len; + } + else if (recognize_comment && *p == '#') { + p = reg_skipcomment(pRExC_state, p); + } + else + break; + } + return p; +} + +STATIC void +S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) +{ + /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It + * sets up the bitmap and any flags, removing those code points from the + * inversion list, setting it to NULL should it become completely empty */ + + PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; + assert(PL_regkind[OP(node)] == ANYOF); + + ANYOF_BITMAP_ZERO(node); + if (*invlist_ptr) { + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through *invlist_ptr */ + invlist_iterinit(*invlist_ptr); + while (invlist_iternext(*invlist_ptr, &start, &end)) { + UV high; + int i; + + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; + } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } + + /* Quit if are above what we should change */ + if (start > 255) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < 255) ? end : 255; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(node, i)) { + ANYOF_BITMAP_SET(node, i); + } + } + } + invlist_iterfinish(*invlist_ptr); + + /* Done with loop; remove any code points that are in the bitmap from + * *invlist_ptr; similarly for code points above latin1 if we have a + * flag to match all of them anyways */ + if (change_invlist) { + _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + } + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + } + + /* If have completely emptied it, remove it completely */ + if (_invlist_len(*invlist_ptr) == 0) { + SvREFCNT_dec_NN(*invlist_ptr); + *invlist_ptr = NULL; + } + } +} + +/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. + Character classes ([:foo:]) can also be negated ([:^foo:]). + Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. + Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, + but trigger failures because they are currently unimplemented. */ + +#define POSIXCC_DONE(c) ((c) == ':') +#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') +#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) + +PERL_STATIC_INLINE I32 +S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) +{ + I32 namedclass = OOB_NAMEDCLASS; + + PERL_ARGS_ASSERT_REGPPOSIXCC; + + if (value == '[' && RExC_parse + 1 < RExC_end && + /* I smell either [: or [= or [. -- POSIX has been here, right? */ + POSIXCC(UCHARAT(RExC_parse))) + { + const char c = UCHARAT(RExC_parse); + char* const s = RExC_parse++; + + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) + RExC_parse++; + if (RExC_parse == RExC_end) { + if (strict) { + + /* Try to give a better location for the error (than the end of + * the string) by looking for the matching ']' */ + RExC_parse = s; + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { + RExC_parse++; + } + vFAIL2("Unmatched '%c' in POSIX class", c); + } + /* Grandfather lone [:, [=, [. */ + RExC_parse = s; + } + else { + const char* const t = RExC_parse++; /* skip over the c */ + assert(*t == c); + + if (UCHARAT(RExC_parse) == ']') { + const char *posixcc = s + 1; + RExC_parse++; /* skip over the ending ] */ + + if (*s == ':') { + const I32 complement = *posixcc == '^' ? *posixcc++ : 0; + const I32 skip = t - posixcc; + + /* Initially switch on the length of the name. */ + switch (skip) { + case 4: + if (memEQ(posixcc, "word", 4)) /* this is not POSIX, + this is the Perl \w + */ + namedclass = ANYOF_WORDCHAR; + break; + case 5: + /* Names all of length 5. */ + /* alnum alpha ascii blank cntrl digit graph lower + print punct space upper */ + /* Offset 4 gives the best switch position. */ + switch (posixcc[4]) { + case 'a': + if (memEQ(posixcc, "alph", 4)) /* alpha */ + namedclass = ANYOF_ALPHA; + break; + case 'e': + if (memEQ(posixcc, "spac", 4)) /* space */ + namedclass = ANYOF_PSXSPC; + break; + case 'h': + if (memEQ(posixcc, "grap", 4)) /* graph */ + namedclass = ANYOF_GRAPH; + break; + case 'i': + if (memEQ(posixcc, "asci", 4)) /* ascii */ + namedclass = ANYOF_ASCII; + break; + case 'k': + if (memEQ(posixcc, "blan", 4)) /* blank */ + namedclass = ANYOF_BLANK; + break; + case 'l': + if (memEQ(posixcc, "cntr", 4)) /* cntrl */ + namedclass = ANYOF_CNTRL; + break; + case 'm': + if (memEQ(posixcc, "alnu", 4)) /* alnum */ + namedclass = ANYOF_ALPHANUMERIC; + break; + case 'r': + if (memEQ(posixcc, "lowe", 4)) /* lower */ + namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; + else if (memEQ(posixcc, "uppe", 4)) /* upper */ + namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; + break; + case 't': + if (memEQ(posixcc, "digi", 4)) /* digit */ + namedclass = ANYOF_DIGIT; + else if (memEQ(posixcc, "prin", 4)) /* print */ + namedclass = ANYOF_PRINT; + else if (memEQ(posixcc, "punc", 4)) /* punct */ + namedclass = ANYOF_PUNCT; + break; + } + break; + case 6: + if (memEQ(posixcc, "xdigit", 6)) + namedclass = ANYOF_XDIGIT; + break; + } + + if (namedclass == OOB_NAMEDCLASS) + vFAIL2utf8f( + "POSIX class [:%"UTF8f":] unknown", + UTF8fARG(UTF, t - s - 1, s + 1)); + + /* The #defines are structured so each complement is +1 to + * the normal one */ + if (complement) { + namedclass++; + } + assert (posixcc[skip] == ':'); + assert (posixcc[skip+1] == ']'); + } else if (!SIZE_ONLY) { + /* [[=foo=]] and [[.foo.]] are still future. */ + + /* adjust RExC_parse so the warning shows after + the class closes */ + while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') + RExC_parse++; + vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + } + } else { + /* Maternal grandfather: + * "[:" ending in ":" but not in ":]" */ + if (strict) { + vFAIL("Unmatched '[' in POSIX class"); + } + + /* Grandfather lone [:, [=, [. */ + RExC_parse = s; + } + } + } + + return namedclass; +} + +STATIC bool +S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state) +{ + /* This applies some heuristics at the current parse position (which should + * be at a '[') to see if what follows might be intended to be a [:posix:] + * class. It returns true if it really is a posix class, of course, but it + * also can return true if it thinks that what was intended was a posix + * class that didn't quite make it. + * + * It will return true for + * [:alphanumerics: + * [:alphanumerics] (as long as the ] isn't followed immediately by a + * ')' indicating the end of the (?[ + * [:any garbage including %^&$ punctuation:] + * + * This is designed to be called only from S_handle_regex_sets; it could be + * easily adapted to be called from the spot at the beginning of regclass() + * that checks to see in a normal bracketed class if the surrounding [] + * have been omitted ([:word:] instead of [[:word:]]). But doing so would + * change long-standing behavior, so I (khw) didn't do that */ + char* p = RExC_parse + 1; + char first_char = *p; + + PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS; + + assert(*(p - 1) == '['); + + if (! POSIXCC(first_char)) { + return FALSE; + } + + p++; + while (p < RExC_end && isWORDCHAR(*p)) p++; + + if (p >= RExC_end) { + return FALSE; + } + + if (p - RExC_parse > 2 /* Got at least 1 word character */ + && (*p == first_char + || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')'))) + { + return TRUE; + } + + p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse); + + return (p + && p - RExC_parse > 2 /* [:] evaluates to colon; + [::] is a bad posix class. */ + && first_char == *(p - 1)); +} + +STATIC regnode * +S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, + I32 *flagp, U32 depth, + char * const oregcomp_parse) +{ + /* Handle the (?[...]) construct to do set operations */ + + U8 curchar; + UV start, end; /* End points of code point ranges */ + SV* result_string; + char *save_end, *save_parse; + SV* final; + STRLEN len; + regnode* node; + AV* stack; + const bool save_fold = FOLD; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; + + if (LOC) { + vFAIL("(?[...]) not valid in locale"); + } + RExC_uni_semantics = 1; + + /* This will return only an ANYOF regnode, or (unlikely) something smaller + * (such as EXACT). Thus we can skip most everything if just sizing. We + * call regclass to handle '[]' so as to not have to reinvent its parsing + * rules here (throwing away the size it computes each time). And, we exit + * upon an unescaped ']' that isn't one ending a regclass. To do both + * these things, we need to realize that something preceded by a backslash + * is escaped, so we have to keep track of backslashes */ + if (SIZE_ONLY) { + UV depth = 0; /* how many nested (?[...]) constructs */ + + Perl_ck_warner_d(aTHX_ + packWARN(WARN_EXPERIMENTAL__REGEX_SETS), + "The regex_sets feature is experimental" REPORT_LOCATION, + UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), + UTF8fARG(UTF, + RExC_end - RExC_start - (RExC_parse - RExC_precomp), + RExC_precomp + (RExC_parse - RExC_precomp))); + + while (RExC_parse < RExC_end) { + SV* current = NULL; + RExC_parse = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + switch (*RExC_parse) { + case '?': + if (RExC_parse[1] == '[') depth++, RExC_parse++; + /* FALLTHROUGH */ + default: + break; + case '\\': + /* Skip the next byte (which could cause us to end up in + * the middle of a UTF-8 character, but since none of those + * are confusable with anything we currently handle in this + * switch (invariants all), it's safe. We'll just hit the + * default: case next time and keep on incrementing until + * we find one of the invariants we do handle. */ + RExC_parse++; + break; + case '[': + { + /* If this looks like it is a [:posix:] class, leave the + * parse pointer at the '[' to fool regclass() into + * thinking it is part of a '[[:posix:]]'. That function + * will use strict checking to force a syntax error if it + * doesn't work out to a legitimate class */ + bool is_posix_class + = could_it_be_a_POSIX_class(pRExC_state); + if (! is_posix_class) { + RExC_parse++; + } + + /* regclass() can only return RESTART_UTF8 if multi-char + folds are allowed. */ + if (!regclass(pRExC_state, flagp,depth+1, + is_posix_class, /* parse the whole char + class only if not a + posix class */ + FALSE, /* don't allow multi-char folds */ + TRUE, /* silence non-portable warnings. */ + ¤t)) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", + (UV) *flagp); + + /* function call leaves parse pointing to the ']', except + * if we faked it */ + if (is_posix_class) { + RExC_parse--; + } + + SvREFCNT_dec(current); /* In case it returned something */ + break; + } + + case ']': + if (depth--) break; + RExC_parse++; + if (RExC_parse < RExC_end + && *RExC_parse == ')') + { + node = reganode(pRExC_state, ANYOF, 0); + RExC_size += ANYOF_SKIP; + nextchar(pRExC_state); + Set_Node_Length(node, + RExC_parse - oregcomp_parse + 1); /* MJD */ + return node; + } + goto no_close; + } + RExC_parse++; + } + + no_close: + FAIL("Syntax error in (?[...])"); + } + + /* Pass 2 only after this. Everything in this construct is a + * metacharacter. Operands begin with either a '\' (for an escape + * sequence), or a '[' for a bracketed character class. Any other + * character should be an operator, or parenthesis for grouping. Both + * types of operands are handled by calling regclass() to parse them. It + * is called with a parameter to indicate to return the computed inversion + * list. The parsing here is implemented via a stack. Each entry on the + * stack is a single character representing one of the operators, or the + * '('; or else a pointer to an operand inversion list. */ + +#define IS_OPERAND(a) (! SvIOK(a)) + + /* The stack starts empty. It is a syntax error if the first thing parsed + * is a binary operator; everything else is pushed on the stack. When an + * operand is parsed, the top of the stack is examined. If it is a binary + * operator, the item before it should be an operand, and both are replaced + * by the result of doing that operation on the new operand and the one on + * the stack. Thus a sequence of binary operands is reduced to a single + * one before the next one is parsed. + * + * A unary operator may immediately follow a binary in the input, for + * example + * [a] + ! [b] + * When an operand is parsed and the top of the stack is a unary operator, + * the operation is performed, and then the stack is rechecked to see if + * this new operand is part of a binary operation; if so, it is handled as + * above. + * + * A '(' is simply pushed on the stack; it is valid only if the stack is + * empty, or the top element of the stack is an operator or another '(' + * (for which the parenthesized expression will become an operand). By the + * time the corresponding ')' is parsed everything in between should have + * been parsed and evaluated to a single operand (or else is a syntax + * error), and is handled as a regular operand */ + + sv_2mortal((SV *)(stack = newAV())); + + while (RExC_parse < RExC_end) { + I32 top_index = av_tindex(stack); + SV** top_ptr; + SV* current = NULL; + + /* Skip white space */ + RExC_parse = regpatws(pRExC_state, RExC_parse, + TRUE /* means recognize comments */ ); + if (RExC_parse >= RExC_end) { + Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); + } + if ((curchar = UCHARAT(RExC_parse)) == ']') { + break; + } + + switch (curchar) { + + case '?': + if (av_tindex(stack) >= 0 /* This makes sure that we can + safely subtract 1 from + RExC_parse in the next clause. + If we have something on the + stack, we have parsed something + */ + && UCHARAT(RExC_parse - 1) == '(' + && RExC_parse < RExC_end) + { + /* If is a '(?', could be an embedded '(?flags:(?[...])'. + * This happens when we have some thing like + * + * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; + * ... + * qr/(?[ \p{Digit} & $thai_or_lao ])/; + * + * Here we would be handling the interpolated + * '$thai_or_lao'. We handle this by a recursive call to + * ourselves which returns the inversion list the + * interpolated expression evaluates to. We use the flags + * from the interpolated pattern. */ + U32 save_flags = RExC_flags; + const char * const save_parse = ++RExC_parse; + + parse_lparen_question_flags(pRExC_state); + + if (RExC_parse == save_parse /* Makes sure there was at + least one flag (or this + embedding wasn't compiled) + */ + || RExC_parse >= RExC_end - 4 + || UCHARAT(RExC_parse) != ':' + || UCHARAT(++RExC_parse) != '(' + || UCHARAT(++RExC_parse) != '?' + || UCHARAT(++RExC_parse) != '[') + { + + /* In combination with the above, this moves the + * pointer to the point just after the first erroneous + * character (or if there are no flags, to where they + * should have been) */ + if (RExC_parse >= RExC_end - 4) { + RExC_parse = RExC_end; + } + else if (RExC_parse != save_parse) { + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + } + vFAIL("Expecting '(?flags:(?[...'"); + } + RExC_parse++; + (void) handle_regex_sets(pRExC_state, ¤t, flagp, + depth+1, oregcomp_parse); + + /* Here, 'current' contains the embedded expression's + * inversion list, and RExC_parse points to the trailing + * ']'; the next character should be the ')' which will be + * paired with the '(' that has been put on the stack, so + * the whole embedded expression reduces to '(operand)' */ + RExC_parse++; + + RExC_flags = save_flags; + goto handle_operand; + } + /* FALLTHROUGH */ + + default: + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unexpected character"); + + case '\\': + /* regclass() can only return RESTART_UTF8 if multi-char + folds are allowed. */ + if (!regclass(pRExC_state, flagp,depth+1, + TRUE, /* means parse just the next thing */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. */ + ¤t)) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", + (UV) *flagp); + /* regclass() will return with parsing just the \ sequence, + * leaving the parse pointer at the next thing to parse */ + RExC_parse--; + goto handle_operand; + + case '[': /* Is a bracketed character class */ + { + bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state); + + if (! is_posix_class) { + RExC_parse++; + } + + /* regclass() can only return RESTART_UTF8 if multi-char + folds are allowed. */ + if(!regclass(pRExC_state, flagp,depth+1, + is_posix_class, /* parse the whole char class + only if not a posix class */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. */ + ¤t)) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", + (UV) *flagp); + /* function call leaves parse pointing to the ']', except if we + * faked it */ + if (is_posix_class) { + RExC_parse--; + } + + goto handle_operand; + } + + case '&': + case '|': + case '+': + case '-': + case '^': + if (top_index < 0 + || ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) + || ! IS_OPERAND(*top_ptr)) + { + RExC_parse++; + vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar); + } + av_push(stack, newSVuv(curchar)); + break; + + case '!': + av_push(stack, newSVuv(curchar)); + break; + + case '(': + if (top_index >= 0) { + top_ptr = av_fetch(stack, top_index, FALSE); + assert(top_ptr); + if (IS_OPERAND(*top_ptr)) { + RExC_parse++; + vFAIL("Unexpected '(' with no preceding operator"); + } + } + av_push(stack, newSVuv(curchar)); + break; + + case ')': + { + SV* lparen; + if (top_index < 1 + || ! (current = av_pop(stack)) + || ! IS_OPERAND(current) + || ! (lparen = av_pop(stack)) + || IS_OPERAND(lparen) + || SvUV(lparen) != '(') + { + SvREFCNT_dec(current); + RExC_parse++; + vFAIL("Unexpected ')'"); + } + top_index -= 2; + SvREFCNT_dec_NN(lparen); + + /* FALLTHROUGH */ + } + + handle_operand: + + /* Here, we have an operand to process, in 'current' */ + + if (top_index < 0) { /* Just push if stack is empty */ + av_push(stack, current); + } + else { + SV* top = av_pop(stack); + SV *prev = NULL; + char current_operator; + + if (IS_OPERAND(top)) { + SvREFCNT_dec_NN(top); + SvREFCNT_dec_NN(current); + vFAIL("Operand with no preceding operator"); + } + current_operator = (char) SvUV(top); + switch (current_operator) { + case '(': /* Push the '(' back on followed by the new + operand */ + av_push(stack, top); + av_push(stack, current); + SvREFCNT_inc(top); /* Counters the '_dec' done + just after the 'break', so + it doesn't get wrongly freed + */ + break; + + case '!': + _invlist_invert(current); + + /* Unlike binary operators, the top of the stack, + * now that this unary one has been popped off, may + * legally be an operator, and we now have operand + * for it. */ + top_index--; + SvREFCNT_dec_NN(top); + goto handle_operand; + + case '&': + prev = av_pop(stack); + _invlist_intersection(prev, + current, + ¤t); + av_push(stack, current); + break; + + case '|': + case '+': + prev = av_pop(stack); + _invlist_union(prev, current, ¤t); + av_push(stack, current); + break; + + case '-': + prev = av_pop(stack);; + _invlist_subtract(prev, current, ¤t); + av_push(stack, current); + break; + + case '^': /* The union minus the intersection */ + { + SV* i = NULL; + SV* u = NULL; + SV* element; + + prev = av_pop(stack); + _invlist_union(prev, current, &u); + _invlist_intersection(prev, current, &i); + /* _invlist_subtract will overwrite current + without freeing what it already contains */ + element = current; + _invlist_subtract(u, i, ¤t); + av_push(stack, current); + SvREFCNT_dec_NN(i); + SvREFCNT_dec_NN(u); + SvREFCNT_dec_NN(element); + break; + } + + default: + Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); + } + SvREFCNT_dec_NN(top); + SvREFCNT_dec(prev); + } + } + + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + } + + if (av_tindex(stack) < 0 /* Was empty */ + || ((final = av_pop(stack)) == NULL) + || ! IS_OPERAND(final) + || av_tindex(stack) >= 0) /* More left on stack */ + { + vFAIL("Incomplete expression within '(?[ ])'"); + } + + /* Here, 'final' is the resultant inversion list from evaluating the + * expression. Return it if so requested */ + if (return_invlist) { + *return_invlist = final; + return END; + } + + /* Otherwise generate a resultant node, based on 'final'. regclass() is + * expecting a string of ranges and individual code points */ + invlist_iterinit(final); + result_string = newSVpvs(""); + while (invlist_iternext(final, &start, &end)) { + if (start == end) { + Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start); + } + else { + Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}", + start, end); + } + } + + save_parse = RExC_parse; + RExC_parse = SvPV(result_string, len); + save_end = RExC_end; + RExC_end = RExC_parse + len; + + /* We turn off folding around the call, as the class we have constructed + * already has all folding taken into consideration, and we don't want + * regclass() to add to that */ + RExC_flags &= ~RXf_PMf_FOLD; + /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. + */ + node = regclass(pRExC_state, flagp,depth+1, + FALSE, /* means parse the whole char class */ + FALSE, /* don't allow multi-char folds */ + TRUE, /* silence non-portable warnings. The above may very + well have generated non-portable code points, but + they're valid on this machine */ + NULL); + if (!node) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, + PTR2UV(flagp)); + if (save_fold) { + RExC_flags |= RXf_PMf_FOLD; + } + RExC_parse = save_parse + 1; + RExC_end = save_end; + SvREFCNT_dec_NN(final); + SvREFCNT_dec_NN(result_string); + + nextchar(pRExC_state); + Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ + return node; +} +#undef IS_OPERAND + +STATIC void +S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) +{ + /* This hard-codes the Latin1/above-Latin1 folding rules, so that an + * innocent-looking character class, like /[ks]/i won't have to go out to + * disk to find the possible matches. + * + * This should be called only for a Latin1-range code points, cp, which is + * known to be involved in a simple fold with other code points above + * Latin1. It would give false results if /aa has been specified. + * Multi-char folds are outside the scope of this, and must be handled + * specially. + * + * XXX It would be better to generate these via regen, in case a new + * version of the Unicode standard adds new mappings, though that is not + * really likely, and may be caught by the default: case of the switch + * below. */ + + PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; + + assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp)); + + switch (cp) { + case 'k': + case 'K': + *invlist = + add_cp_to_invlist(*invlist, KELVIN_SIGN); + break; + case 's': + case 'S': + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); + break; + case MICRO_SIGN: + *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); + *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist = add_cp_to_invlist(*invlist, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); + break; + default: + /* Use deprecated warning to increase the chances of this being + * output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); + break; + } +} + +/* The names of properties whose definitions are not known at compile time are + * stored in this SV, after a constant heading. So if the length has been + * changed since initialization, then there is a run-time definition. */ +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ + (SvCUR(listsv) != initial_listsv_len) + +STATIC regnode * +S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, + const bool stop_at_1, /* Just parse the next thing, don't + look for a full character class */ + bool allow_multi_folds, + const bool silence_non_portable, /* Don't output warnings + about too large + characters */ + SV** ret_invlist) /* Return an inversion list, not a node */ +{ + /* parse a bracketed class specification. Most of these will produce an + * ANYOF node; but something like [a] will produce an EXACT node; [aA], an + * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex + * under /i with multi-character folds: it will be rewritten following the + * paradigm of this example, where the s are characters which + * fold to multiple character sequences: + * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i + * gets effectively rewritten as: + * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i + * reg() gets called (recursively) on the rewritten version, and this + * function will return what it constructs. (Actually the s + * aren't physically removed from the [abcdefghi], it's just that they are + * ignored in the recursion by means of a flag: + * .) + * + * ANYOF nodes contain a bit map for the first 256 characters, with the + * corresponding bit set if that character is in the list. For characters + * above 255, a range list or swash is used. There are extra bits for \w, + * etc. in locale ANYOFs, as what these match is not determinable at + * compile time + * + * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs + * to be restarted. This can only happen if ret_invlist is non-NULL. + */ + + UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; + IV range = 0; + UV value = OOB_UNICODE, save_value = OOB_UNICODE; + regnode *ret; + STRLEN numlen; + IV namedclass = OOB_NAMEDCLASS; + char *rangebegin = NULL; + bool need_class = 0; + SV *listsv = NULL; + STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more + than just initialized. */ + SV* properties = NULL; /* Code points that match \p{} \P{} */ + SV* posixes = NULL; /* Code points that match classes like [:word:], + extended beyond the Latin1 range. These have to + be kept separate from other code points for much + of this function because their handling is + different under /i, and for most classes under + /d as well */ + SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept + separate for a while from the non-complemented + versions because of complications with /d + matching */ + UV element_count = 0; /* Number of distinct elements in the class. + Optimizations may be possible if this is tiny */ + AV * multi_char_matches = NULL; /* Code points that fold to more than one + character; used under /i */ + UV n; + char * stop_ptr = RExC_end; /* where to stop parsing */ + const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white + space? */ + const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */ + + /* Unicode properties are stored in a swash; this holds the current one + * being parsed. If this swash is the only above-latin1 component of the + * character class, an optimization is to pass it directly on to the + * execution engine. Otherwise, it is set to NULL to indicate that there + * are other things in the class that have to be dealt with at execution + * time */ + SV* swash = NULL; /* Code points that match \p{} \P{} */ + + /* Set if a component of this character class is user-defined; just passed + * on to the engine */ + bool has_user_defined_property = FALSE; + + /* inversion list of code points this node matches only when the target + * string is in UTF-8. (Because is under /d) */ + SV* depends_list = NULL; + + /* Inversion list of code points this node matches regardless of things + * like locale, folding, utf8ness of the target string */ + SV* cp_list = NULL; + + /* Like cp_list, but code points on this list need to be checked for things + * that fold to/from them under /i */ + SV* cp_foldable_list = NULL; + + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + +#ifdef EBCDIC + /* In a range, counts how many 0-2 of the ends of it came from literals, + * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ + UV literal_endpoint = 0; +#endif + bool invert = FALSE; /* Is this class to be complemented */ + + bool warn_super = ALWAYS_WARN_SUPER; + + regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in + case we need to change the emitted regop to an EXACT. */ + const char * orig_parse = RExC_parse; + const SSize_t orig_size = RExC_size; + bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCLASS; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + DEBUG_PARSE("clas"); + + /* Assume we are going to generate an ANYOF node. */ + ret = reganode(pRExC_state, ANYOF, 0); + + if (SIZE_ONLY) { + RExC_size += ANYOF_SKIP; + listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ + } + else { + ANYOF_FLAGS(ret) = 0; + + RExC_emit += ANYOF_SKIP; + listsv = newSVpvs_flags("# comment\n", SVs_TEMP); + initial_listsv_len = SvCUR(listsv); + SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ + } + + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + + if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ + RExC_parse++; + invert = TRUE; + allow_multi_folds = FALSE; + RExC_naughty++; + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + } + + /* Check that they didn't say [:posix:] instead of [[:posix:]] */ + if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) { + const char *s = RExC_parse; + const char c = *s++; + + while (isWORDCHAR(*s)) + s++; + if (*s && c == *s && s[1] == ']') { + SAVEFREESV(RExC_rx_sv); + ckWARN3reg(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); + (void)ReREFCNT_inc(RExC_rx_sv); + } + } + + /* If the caller wants us to just parse a single element, accomplish this + * by faking the loop ending condition */ + if (stop_at_1 && RExC_end > RExC_parse) { + stop_ptr = RExC_parse + 1; + } + + /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */ + if (UCHARAT(RExC_parse) == ']') + goto charclassloop; + +parseit: + while (1) { + if (RExC_parse >= stop_ptr) { + break; + } + + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + + if (UCHARAT(RExC_parse) == ']') { + break; + } + + charclassloop: + + namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ + save_value = value; + save_prevvalue = prevvalue; + + if (!range) { + rangebegin = RExC_parse; + element_count++; + } + if (UTF) { + value = utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); + + if (value == '[' + && RExC_parse < RExC_end + && POSIXCC(UCHARAT(RExC_parse))) + { + namedclass = regpposixcc(pRExC_state, value, strict); + } + else if (value == '\\') { + if (UTF) { + value = utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); + + /* Some compilers cannot handle switching on 64-bit integer + * values, therefore value cannot be an UV. Yes, this will + * be a problem later if we want switch on Unicode. + * A similar issue a little bit later when switching on + * namedclass. --jhi */ + + /* If the \ is escaping white space when white space is being + * skipped, it means that that white space is wanted literally, and + * is already in 'value'. Otherwise, need to translate the escape + * into what it signifies. */ + if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) { + + case 'w': namedclass = ANYOF_WORDCHAR; break; + case 'W': namedclass = ANYOF_NWORDCHAR; break; + case 's': namedclass = ANYOF_SPACE; break; + case 'S': namedclass = ANYOF_NSPACE; break; + case 'd': namedclass = ANYOF_DIGIT; break; + case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; + case 'N': /* Handle \N{NAME} in class */ + { + /* We only pay attention to the first char of + multichar strings being returned. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. */ + if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, + TRUE, /* => charclass */ + strict)) + { + if (*flagp & RESTART_UTF8) + FAIL("panic: grok_bslash_N set RESTART_UTF8"); + goto parseit; + } + } + break; + case 'p': + case 'P': + { + char *e; + + /* We will handle any undefined properties ourselves */ + U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF + /* And we actually would prefer to get + * the straight inversion list of the + * swash, since we will be accessing it + * anyway, to save a little time */ + |_CORE_SWASH_INIT_ACCEPT_INVLIST; + + if (RExC_parse >= RExC_end) + vFAIL2("Empty \\%c{}", (U8)value); + if (*RExC_parse == '{') { + const U8 c = (U8)value; + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL2("Missing right brace on \\%c{}", c); + while (isSPACE(*RExC_parse)) + RExC_parse++; + if (e == RExC_parse) + vFAIL2("Empty \\%c{}", c); + n = e - RExC_parse; + while (isSPACE(*(RExC_parse + n - 1))) + n--; + } + else { + e = RExC_parse; + n = 1; + } + if (!SIZE_ONLY) { + SV* invlist; + char* name; + + if (UCHARAT(RExC_parse) == '^') { + RExC_parse++; + n--; + /* toggle. (The rhs xor gets the single bit that + * differs between P and p; the other xor inverts just + * that bit) */ + value ^= 'P' ^ 'p'; + + while (isSPACE(*RExC_parse)) { + RExC_parse++; + n--; + } + } + /* Try to get the definition of the property into + * . If /i is in effect, the effective property + * will have its name be <__NAME_i>. The design is + * discussed in commit + * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ + name = savepv(Perl_form(aTHX_ + "%s%.*s%s\n", + (FOLD) ? "__" : "", + (int)n, + RExC_parse, + (FOLD) ? "_i" : "" + )); + + /* Look up the property name, and get its swash and + * inversion list, if the property is found */ + if (swash) { + SvREFCNT_dec_NN(swash); + } + swash = _core_swash_init("utf8", name, &PL_sv_undef, + 1, /* binary */ + 0, /* not tr/// */ + NULL, /* No inversion list */ + &swash_init_flags + ); + if (! swash || ! (invlist = _get_swash_invlist(swash))) { + HV* curpkg = (IN_PERL_COMPILETIME) + ? PL_curstash + : CopSTASH(PL_curcop); + if (swash) { + SvREFCNT_dec_NN(swash); + swash = NULL; + } + + /* Here didn't find it. It could be a user-defined + * property that will be available at run-time. If we + * accept only compile-time properties, is an error; + * otherwise add it to the list for run-time look up */ + if (ret_invlist) { + RExC_parse = e + 1; + vFAIL2utf8f( + "Property '%"UTF8f"' is unknown", + UTF8fARG(UTF, n, name)); + } + + /* If the property name doesn't already have a package + * name, add the current one to it so that it can be + * referred to outside it. [perl #121777] */ + if (curpkg && ! instr(name, "::")) { + char* pkgname = HvNAME(curpkg); + if (strNE(pkgname, "main")) { + char* full_name = Perl_form(aTHX_ + "%s::%s", + pkgname, + name); + n = strlen(full_name); + Safefree(name); + name = savepvn(full_name, n); + } + } + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", + (value == 'p' ? '+' : '!'), + UTF8fARG(UTF, n, name)); + has_user_defined_property = TRUE; + + /* We don't know yet, so have to assume that the + * property could match something in the Latin1 range, + * hence something that isn't utf8. Note that this + * would cause things in to match + * inappropriately, except that any \p{}, including + * this one forces Unicode semantics, which means there + * is no */ + ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; + } + else { + + /* Here, did get the swash and its inversion list. If + * the swash is from a user-defined property, then this + * whole character class should be regarded as such */ + if (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) + { + has_user_defined_property = TRUE; + } + else if + /* We warn on matching an above-Unicode code point + * if the match would return true, except don't + * warn for \p{All}, which has exactly one element + * = 0 */ + (_invlist_contains_cp(invlist, 0x110000) + && (! (_invlist_len(invlist) == 1 + && *invlist_array(invlist) == 0))) + { + warn_super = TRUE; + } + + + /* Invert if asking for the complement */ + if (value == 'P') { + _invlist_union_complement_2nd(properties, + invlist, + &properties); + + /* The swash can't be used as-is, because we've + * inverted things; delay removing it to here after + * have copied its invlist above */ + SvREFCNT_dec_NN(swash); + swash = NULL; + } + else { + _invlist_union(properties, invlist, &properties); + } + } + Safefree(name); + } + RExC_parse = e + 1; + namedclass = ANYOF_UNIPROP; /* no official name, but it's + named */ + + /* \p means they want Unicode semantics */ + RExC_uni_semantics = 1; + } + break; + case 'n': value = '\n'; break; + case 'r': value = '\r'; break; + case 't': value = '\t'; break; + case 'f': value = '\f'; break; + case 'b': value = '\b'; break; + case 'e': value = ASCII_TO_NATIVE('\033');break; + case 'a': value = '\a'; break; + case 'o': + RExC_parse--; /* function expects to be pointed at the 'o' */ + { + const char* error_msg; + bool valid = grok_bslash_o(&RExC_parse, + &value, + &error_msg, + SIZE_ONLY, /* warnings in pass + 1 only */ + strict, + silence_non_portable, + UTF); + if (! valid) { + vFAIL(error_msg); + } + } + if (PL_encoding && value < 0x100) { + goto recode_encoding; + } + break; + case 'x': + RExC_parse--; /* function expects to be pointed at the 'x' */ + { + const char* error_msg; + bool valid = grok_bslash_x(&RExC_parse, + &value, + &error_msg, + TRUE, /* Output warnings */ + strict, + silence_non_portable, + UTF); + if (! valid) { + vFAIL(error_msg); + } + } + if (PL_encoding && value < 0x100) + goto recode_encoding; + break; + case 'c': + value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); + break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + { + /* Take 1-3 octal digits */ + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; + numlen = (strict) ? 4 : 3; + value = grok_oct(--RExC_parse, &numlen, &flags, NULL); + RExC_parse += numlen; + if (numlen != 3) { + if (strict) { + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Need exactly 3 octal digits"); + } + else if (! SIZE_ONLY /* like \08, \178 */ + && numlen < 3 + && RExC_parse < RExC_end + && isDIGIT(*RExC_parse) + && ckWARN(WARN_REGEXP)) + { + SAVEFREESV(RExC_rx_sv); + reg_warn_non_literal_string( + RExC_parse + 1, + form_short_octal_warning(RExC_parse, numlen)); + (void)ReREFCNT_inc(RExC_rx_sv); + } + } + if (PL_encoding && value < 0x100) + goto recode_encoding; + break; + } + recode_encoding: + if (! RExC_override_recoding) { + SV* enc = PL_encoding; + value = reg_recode((const char)(U8)value, &enc); + if (!enc) { + if (strict) { + vFAIL("Invalid escape in the specified encoding"); + } + else if (SIZE_ONLY) { + ckWARNreg(RExC_parse, + "Invalid escape in the specified encoding"); + } + } + break; + } + default: + /* Allow \_ to not give an error */ + if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') { + if (strict) { + vFAIL2("Unrecognized escape \\%c in character class", + (int)value); + } + else { + SAVEFREESV(RExC_rx_sv); + ckWARN2reg(RExC_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); + (void)ReREFCNT_inc(RExC_rx_sv); + } + } + break; + } /* End of switch on char following backslash */ + } /* end of handling backslash escape sequences */ +#ifdef EBCDIC + else + literal_endpoint++; +#endif + + /* Here, we have the current token in 'value' */ + + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + U8 classnum; + + /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a + * literal, as is the character that began the false range, i.e. + * the 'a' in the examples */ + if (range) { + if (!SIZE_ONLY) { + const int w = (RExC_parse >= rangebegin) + ? RExC_parse - rangebegin + : 0; + if (strict) { + vFAIL2utf8f( + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); + } + else { + SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ + ckWARN2reg(RExC_parse, + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); + (void)ReREFCNT_inc(RExC_rx_sv); + cp_list = add_cp_to_invlist(cp_list, '-'); + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, + prevvalue); + } + } + + range = 0; /* this was not a true range */ + element_count += 2; /* So counts for three values */ + } + + classnum = namedclass_to_classnum(namedclass); + + if (LOC && namedclass < ANYOF_POSIXL_MAX +#ifndef HAS_ISASCII + && classnum != _CC_ASCII +#endif + ) { + /* What the Posix classes (like \w, [:space:]) match in locale + * isn't knowable under locale until actual match time. Room + * must be reserved (one time per outer bracketed class) to + * store such classes. The space will contain a bit for each + * named class that is to be matched against. This isn't + * needed for \p{} and pseudo-classes, as they are not affected + * by locale, and hence are dealt with separately */ + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_POSIXL_ZERO(ret); + } + + /* Coverity thinks it is possible for this to be negative; both + * jhi and khw think it's not, but be safer */ + assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL) + || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0); + + /* See if it already matches the complement of this POSIX + * class */ + if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) + && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) + ? -1 + : 1))) + { + posixl_matches_all = TRUE; + break; /* No need to continue. Since it matches both + e.g., \w and \W, it matches everything, and the + bracketed class can be optimized into qr/./s */ + } + + /* Add this class to those that should be checked at runtime */ + ANYOF_POSIXL_SET(ret, namedclass); + + /* The above-Latin1 characters are not subject to locale rules. + * Just add them, in the second pass, to the + * unconditionally-matched list */ + if (! SIZE_ONLY) { + SV* scratch_list = NULL; + + /* Get the list of the above-Latin1 code points this + * matches */ + _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, + PL_XPosix_ptrs[classnum], + + /* Odd numbers are complements, like + * NDIGIT, NASCII, ... */ + namedclass % 2 != 0, + &scratch_list); + /* Checking if 'cp_list' is NULL first saves an extra + * clone. Its reference count will be decremented at the + * next union, etc, or if this is the only instance, at the + * end of the routine */ + if (! cp_list) { + cp_list = scratch_list; + } + else { + _invlist_union(cp_list, scratch_list, &cp_list); + SvREFCNT_dec_NN(scratch_list); + } + continue; /* Go get next character */ + } + } + else if (! SIZE_ONLY) { + + /* Here, not in pass1 (in that pass we skip calculating the + * contents of this class), and is /l, or is a POSIX class for + * which /l doesn't matter (or is a Unicode property, which is + * skipped here). */ + if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ + if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ + + /* Here, should be \h, \H, \v, or \V. None of /d, /i + * nor /l make a difference in what these match, + * therefore we just add what they match to cp_list. */ + if (classnum != _CC_VERTSPACE) { + assert( namedclass == ANYOF_HORIZWS + || namedclass == ANYOF_NHORIZWS); + + /* It turns out that \h is just a synonym for + * XPosixBlank */ + classnum = _CC_BLANK; + } + + _invlist_union_maybe_complement_2nd( + cp_list, + PL_XPosix_ptrs[classnum], + namedclass % 2 != 0, /* Complement if odd + (NHORIZWS, NVERTWS) + */ + &cp_list); + } + } + else { /* Garden variety class. If is NASCII, NDIGIT, ... + complement and use nposixes */ + SV** posixes_ptr = namedclass % 2 == 0 + ? &posixes + : &nposixes; + SV** source_ptr = &PL_XPosix_ptrs[classnum]; + _invlist_union_maybe_complement_2nd( + *posixes_ptr, + *source_ptr, + namedclass % 2 != 0, + posixes_ptr); + } + continue; /* Go get next character */ + } + } /* end of namedclass \blah */ + + /* Here, we have a single value. If 'range' is set, it is the ending + * of a range--check its validity. Later, we will handle each + * individual code point in the range. If 'range' isn't set, this + * could be the beginning of a range, so check for that by looking + * ahead to see if the next real character to be processed is the range + * indicator--the minus sign */ + + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + + if (range) { + if (prevvalue > value) /* b-a */ { + const int w = RExC_parse - rangebegin; + vFAIL2utf8f( + "Invalid [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); + range = 0; /* not a valid range */ + } + } + else { + prevvalue = value; /* save the beginning of the potential range */ + if (! stop_at_1 /* Can't be a range if parsing just one thing */ + && *RExC_parse == '-') + { + char* next_char_ptr = RExC_parse + 1; + if (skip_white) { /* Get the next real char after the '-' */ + next_char_ptr = regpatws(pRExC_state, + RExC_parse + 1, + FALSE); /* means don't recognize + comments */ + } + + /* If the '-' is at the end of the class (just before the ']', + * it is a literal minus; otherwise it is a range */ + if (next_char_ptr < RExC_end && *next_char_ptr != ']') { + RExC_parse = next_char_ptr; + + /* a bad range like \w-, [:word:]- ? */ + if (namedclass > OOB_NAMEDCLASS) { + if (strict || ckWARN(WARN_REGEXP)) { + const int w = + RExC_parse >= rangebegin ? + RExC_parse - rangebegin : 0; + if (strict) { + vFAIL4("False [] range \"%*.*s\"", + w, w, rangebegin); + } + else { + vWARN4(RExC_parse, + "False [] range \"%*.*s\"", + w, w, rangebegin); + } + } + if (!SIZE_ONLY) { + cp_list = add_cp_to_invlist(cp_list, '-'); + } + element_count++; + } else + range = 1; /* yeah, it's a range! */ + continue; /* but do it the next time */ + } + } + } + + /* Here, is the beginning of the range, if any; or + * if not */ + + /* non-Latin1 code point implies unicode semantics. Must be set in + * pass1 so is there for the whole of pass 2 */ + if (value > 255) { + RExC_uni_semantics = 1; + } + + /* Ready to process either the single value, or the completed range. + * For single-valued non-inverted ranges, we consider the possibility + * of multi-char folds. (We made a conscious decision to not do this + * for the other cases because it can often lead to non-intuitive + * results. For example, you have the peculiar case that: + * "s s" =~ /^[^\xDF]+$/i => Y + * "ss" =~ /^[^\xDF]+$/i => N + * + * See [perl #89750] */ + if (FOLD && allow_multi_folds && value == prevvalue) { + if (value == LATIN_SMALL_LETTER_SHARP_S + || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, + value))) + { + /* Here is indeed a multi-char fold. Get what it is */ + + U8 foldbuf[UTF8_MAXBYTES_CASE]; + STRLEN foldlen; + + UV folded = _to_uni_fold_flags( + value, + foldbuf, + &foldlen, + FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED + ? FOLD_FLAGS_NOMIX_ASCII + : 0) + ); + + /* Here, should be the first character of the + * multi-char fold of , with containing the + * whole thing. But, if this fold is not allowed (because of + * the flags), will be the same as , and should + * be processed like any other character, so skip the special + * handling */ + if (folded != value) { + + /* Skip if we are recursed, currently parsing the class + * again. Otherwise add this character to the list of + * multi-char folds. */ + if (! RExC_in_multi_char_class) { + AV** this_array_ptr; + AV* this_array; + STRLEN cp_count = utf8_length(foldbuf, + foldbuf + foldlen); + SV* multi_fold = sv_2mortal(newSVpvs("")); + + Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); + + + if (! multi_char_matches) { + multi_char_matches = newAV(); + } + + /* is actually an array of arrays. + * There will be one or two top-level elements: [2], + * and/or [3]. The [2] element is an array, each + * element thereof is a character which folds to TWO + * characters; [3] is for folds to THREE characters. + * (Unicode guarantees a maximum of 3 characters in any + * fold.) When we rewrite the character class below, + * we will do so such that the longest folds are + * written first, so that it prefers the longest + * matching strings first. This is done even if it + * turns out that any quantifier is non-greedy, out of + * programmer laziness. Tom Christiansen has agreed + * that this is ok. This makes the test for the + * ligature 'ffi' come before the test for 'ff' */ + if (av_exists(multi_char_matches, cp_count)) { + this_array_ptr = (AV**) av_fetch(multi_char_matches, + cp_count, FALSE); + this_array = *this_array_ptr; + } + else { + this_array = newAV(); + av_store(multi_char_matches, cp_count, + (SV*) this_array); + } + av_push(this_array, multi_fold); + } + + /* This element should not be processed further in this + * class */ + element_count--; + value = save_value; + prevvalue = save_prevvalue; + continue; + } + } + } + + /* Deal with this element of the class */ + if (! SIZE_ONLY) { +#ifndef EBCDIC + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); +#else + SV* this_range = _new_invlist(1); + _append_range_to_invlist(this_range, prevvalue, value); + + /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous. + * If this range was specified using something like 'i-j', we want + * to include only the 'i' and the 'j', and not anything in + * between, so exclude non-ASCII, non-alphabetics from it. + * However, if the range was specified with something like + * [\x89-\x91] or [\x89-j], all code points within it should be + * included. literal_endpoint==2 means both ends of the range used + * a literal character, not \x{foo} */ + if (literal_endpoint == 2 + && ((prevvalue >= 'a' && value <= 'z') + || (prevvalue >= 'A' && value <= 'Z'))) + { + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII], + &this_range); + + /* Since this above only contains ascii, the intersection of it + * with anything will still yield only ascii */ + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], + &this_range); + } + _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); + literal_endpoint = 0; +#endif + } + + range = 0; /* this range (if it was one) is done now */ + } /* End of loop through all the text within the brackets */ + + /* If anything in the class expands to more than one character, we have to + * deal with them by building up a substitute parse string, and recursively + * calling reg() on it, instead of proceeding */ + if (multi_char_matches) { + SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP); + I32 cp_count; + STRLEN len; + char *save_end = RExC_end; + char *save_parse = RExC_parse; + bool first_time = TRUE; /* First multi-char occurrence doesn't get + a "|" */ + I32 reg_flags; + + assert(! invert); +#if 0 /* Have decided not to deal with multi-char folds in inverted classes, + because too confusing */ + if (invert) { + sv_catpv(substitute_parse, "(?:"); + } +#endif + + /* Look at the longest folds first */ + for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { + + if (av_exists(multi_char_matches, cp_count)) { + AV** this_array_ptr; + SV* this_sequence; + + this_array_ptr = (AV**) av_fetch(multi_char_matches, + cp_count, FALSE); + while ((this_sequence = av_pop(*this_array_ptr)) != + &PL_sv_undef) + { + if (! first_time) { + sv_catpv(substitute_parse, "|"); + } + first_time = FALSE; + + sv_catpv(substitute_parse, SvPVX(this_sequence)); + } + } + } + + /* If the character class contains anything else besides these + * multi-character folds, have to include it in recursive parsing */ + if (element_count) { + sv_catpv(substitute_parse, "|["); + sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); + sv_catpv(substitute_parse, "]"); + } + + sv_catpv(substitute_parse, ")"); +#if 0 + if (invert) { + /* This is a way to get the parse to skip forward a whole named + * sequence instead of matching the 2nd character when it fails the + * first */ + sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)"); + } +#endif + + RExC_parse = SvPV(substitute_parse, len); + RExC_end = RExC_parse + len; + RExC_in_multi_char_class = 1; + RExC_emit = (regnode *)orig_emit; + + ret = reg(pRExC_state, 1, ®_flags, depth+1); + + *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8); + + RExC_parse = save_parse; + RExC_end = save_end; + RExC_in_multi_char_class = 0; + SvREFCNT_dec_NN(multi_char_matches); + return ret; + } + + /* Here, we've gone through the entire class and dealt with multi-char + * folds. We are now in a position that we can do some checks to see if we + * can optimize this ANYOF node into a simpler one, even in Pass 1. + * Currently we only do two checks: + * 1) is in the unlikely event that the user has specified both, eg. \w and + * \W under /l, then the class matches everything. (This optimization + * is done only to make the optimizer code run later work.) + * 2) if the character class contains only a single element (including a + * single range), we see if there is an equivalent node for it. + * Other checks are possible */ + if (! ret_invlist /* Can't optimize if returning the constructed + inversion list */ + && (UNLIKELY(posixl_matches_all) || element_count == 1)) + { + U8 op = END; + U8 arg = 0; + + if (UNLIKELY(posixl_matches_all)) { + op = SANY; + } + else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like + \w or [:digit:] or \p{foo} + */ + + /* All named classes are mapped into POSIXish nodes, with its FLAG + * argument giving which class it is */ + switch ((I32)namedclass) { + case ANYOF_UNIPROP: + break; + + /* These don't depend on the charset modifiers. They always + * match under /u rules */ + case ANYOF_NHORIZWS: + case ANYOF_HORIZWS: + namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS; + /* FALLTHROUGH */ + + case ANYOF_NVERTWS: + case ANYOF_VERTWS: + op = POSIXU; + goto join_posix; + + /* The actual POSIXish node for all the rest depends on the + * charset modifier. The ones in the first set depend only on + * ASCII or, if available on this platform, locale */ + case ANYOF_ASCII: + case ANYOF_NASCII: +#ifdef HAS_ISASCII + op = (LOC) ? POSIXL : POSIXA; +#else + op = POSIXA; +#endif + goto join_posix; + + case ANYOF_NCASED: + case ANYOF_LOWER: + case ANYOF_NLOWER: + case ANYOF_UPPER: + case ANYOF_NUPPER: + /* under /a could be alpha */ + if (FOLD) { + if (ASCII_RESTRICTED) { + namedclass = ANYOF_ALPHA + (namedclass % 2); + } + else if (! LOC) { + break; + } + } + /* FALLTHROUGH */ + + /* The rest have more possibilities depending on the charset. + * We take advantage of the enum ordering of the charset + * modifiers to get the exact node type, */ + default: + op = POSIXD + get_regex_charset(RExC_flags); + if (op > POSIXA) { /* /aa is same as /a */ + op = POSIXA; + } + + join_posix: + /* The odd numbered ones are the complements of the + * next-lower even number one */ + if (namedclass % 2 == 1) { + invert = ! invert; + namedclass--; + } + arg = namedclass_to_classnum(namedclass); + break; + } + } + else if (value == prevvalue) { + + /* Here, the class consists of just a single code point */ + + if (invert) { + if (! LOC && value == '\n') { + op = REG_ANY; /* Optimize [^\n] */ + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + else if (value < 256 || UTF) { + + /* Optimize a single value into an EXACTish node, but not if it + * would require converting the pattern to UTF-8. */ + op = compute_EXACTish(pRExC_state); + } + } /* Otherwise is a range */ + else if (! LOC) { /* locale could vary these */ + if (prevvalue == '0') { + if (value == '9') { + arg = _CC_DIGIT; + op = POSIXA; + } + } + else if (prevvalue == 'A') { + if (value == 'Z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; + op = POSIXA; + } + } + else if (prevvalue == 'a') { + if (value == 'z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; + op = POSIXA; + } + } + } + + /* Here, we have changed away from its initial value iff we found + * an optimization */ + if (op != END) { + + /* Throw away this ANYOF regnode, and emit the calculated one, + * which should correspond to the beginning, not current, state of + * the parse */ + const char * cur_parse = RExC_parse; + RExC_parse = (char *)orig_parse; + if ( SIZE_ONLY) { + if (! LOC) { + + /* To get locale nodes to not use the full ANYOF size would + * require moving the code above that writes the portions + * of it that aren't in other nodes to after this point. + * e.g. ANYOF_POSIXL_SET */ + RExC_size = orig_size; + } + } + else { + RExC_emit = (regnode *)orig_emit; + if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } + if (invert) { + op += NPOSIXD - POSIXD; + } + } + } + + ret = reg_node(pRExC_state, op); + + if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { + if (! SIZE_ONLY) { + FLAGS(ret) = arg; + } + *flagp |= HASWIDTH|SIMPLE; + } + else if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); + } + + RExC_parse = (char *) cur_parse; + + SvREFCNT_dec(posixes); + SvREFCNT_dec(nposixes); + SvREFCNT_dec(cp_list); + SvREFCNT_dec(cp_foldable_list); + return ret; + } + } + + if (SIZE_ONLY) + return ret; + /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/ + + /* If folding, we calculate all characters that could fold to or from the + * ones already on the list */ + if (cp_foldable_list) { + if (FOLD) { + UV start, end; /* End points of code point ranges */ + + SV* fold_intersection = NULL; + SV** use_list; + + /* Our calculated list will be for Unicode rules. For locale + * matching, we have to keep a separate list that is consulted at + * runtime only when the locale indicates Unicode rules. For + * non-locale, we just use to the general list */ + if (LOC) { + use_list = &only_utf8_locale_list; + } + else { + use_list = &cp_list; + } + + /* Only the characters in this class that participate in folds need + * be checked. Get the intersection of this class and all the + * possible characters that are foldable. This can quickly narrow + * down a large class */ + _invlist_intersection(PL_utf8_foldable, cp_foldable_list, + &fold_intersection); + + /* The folds for all the Latin1 characters are hard-coded into this + * program, but we have to go out to disk to get the others. */ + if (invlist_highest(cp_foldable_list) >= 256) { + + /* This is a hash that for a particular fold gives all + * characters that are involved in it */ + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + } + + /* Now look at the foldable characters in this class individually */ + invlist_iterinit(fold_intersection); + while (invlist_iternext(fold_intersection, &start, &end)) { + UV j; + + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + SV** listp; + + if (j < 256) { + + if (IS_IN_SOME_FOLD_L1(j)) { + + /* ASCII is always matched; non-ASCII is matched + * only under Unicode rules (which could happen + * under /l if the locale is a UTF-8 one */ + if (isASCII(j) || ! DEPENDS_SEMANTICS) { + *use_list = add_cp_to_invlist(*use_list, + PL_fold_latin1[j]); + } + else { + depends_list = + add_cp_to_invlist(depends_list, + PL_fold_latin1[j]); + } + } + + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + { + add_above_Latin1_folds(pRExC_state, + (U8) j, + use_list); + } + continue; + } + + /* Here is an above Latin1 character. We don't have the + * rules hard-coded for it. First, get its fold. This is + * the simple fold, as the multi-character folds have been + * handled earlier and separated out */ + _to_uni_fold_flags(j, foldbuf, &foldlen, + (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0); + + /* Single character fold of above Latin1. Add everything in + * its fold closure to the list that this node should match. + * The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j)))) + { + continue; + } + + /* Folds under /l which cross the 255/256 boundary + * are added to a separate list. (These are valid + * only when the locale is UTF-8.) */ + if (c < 256 && LOC) { + *use_list = add_cp_to_invlist(*use_list, c); + continue; + } + + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + /* Similarly folds involving non-ascii Latin1 + * characters under /d are added to their list */ + depends_list = add_cp_to_invlist(depends_list, + c); + } + } + } + } + } + SvREFCNT_dec_NN(fold_intersection); + } + + /* Now that we have finished adding all the folds, there is no reason + * to keep the foldable list separate */ + _invlist_union(cp_list, cp_foldable_list, &cp_list); + SvREFCNT_dec_NN(cp_foldable_list); + } + + /* And combine the result (if any) with any inversion list from posix + * classes. The lists are kept separate up to now because we don't want to + * fold the classes (folding of those is automatically handled by the swash + * fetching code) */ + if (posixes || nposixes) { + if (posixes && AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, nothing above ASCII matches these */ + _invlist_intersection(posixes, + PL_XPosix_ptrs[_CC_ASCII], + &posixes); + } + if (nposixes) { + if (DEPENDS_SEMANTICS) { + /* Under /d, everything in the upper half of the Latin1 range + * matches these complements */ + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; + } + else if (AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, everything above ASCII matches these + * complements */ + _invlist_union_complement_2nd(nposixes, + PL_XPosix_ptrs[_CC_ASCII], + &nposixes); + } + if (posixes) { + _invlist_union(posixes, nposixes, &posixes); + SvREFCNT_dec_NN(nposixes); + } + else { + posixes = nposixes; + } + } + if (! DEPENDS_SEMANTICS) { + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec_NN(posixes); + } + else { + cp_list = posixes; + } + } + else { + /* Under /d, we put into a separate list the Latin1 things that + * match only when the target string is utf8 */ + SV* nonascii_but_latin1_properties = NULL; + _invlist_intersection(posixes, PL_UpperLatin1, + &nonascii_but_latin1_properties); + _invlist_subtract(posixes, nonascii_but_latin1_properties, + &posixes); + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec_NN(posixes); + } + else { + cp_list = posixes; + } + + if (depends_list) { + _invlist_union(depends_list, nonascii_but_latin1_properties, + &depends_list); + SvREFCNT_dec_NN(nonascii_but_latin1_properties); + } + else { + depends_list = nonascii_but_latin1_properties; + } + } + } + + /* And combine the result (if any) with any inversion list from properties. + * The lists are kept separate up to now so that we can distinguish the two + * in regards to matching above-Unicode. A run-time warning is generated + * if a Unicode property is matched against a non-Unicode code point. But, + * we allow user-defined properties to match anything, without any warning, + * and we also suppress the warning if there is a portion of the character + * class that isn't a Unicode property, and which matches above Unicode, \W + * or [\x{110000}] for example. + * (Note that in this case, unlike the Posix one above, there is no + * , because having a Unicode property forces Unicode + * semantics */ + if (properties) { + if (cp_list) { + + /* If it matters to the final outcome, see if a non-property + * component of the class matches above Unicode. If so, the + * warning gets suppressed. This is true even if just a single + * such code point is specified, as though not strictly correct if + * another such code point is matched against, the fact that they + * are using above-Unicode code points indicates they should know + * the issues involved */ + if (warn_super) { + warn_super = ! (invert + ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); + } + + _invlist_union(properties, cp_list, &cp_list); + SvREFCNT_dec_NN(properties); + } + else { + cp_list = properties; + } + + if (warn_super) { + ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; + } + } + + /* Here, we have calculated what code points should be in the character + * class. + * + * Now we can see about various optimizations. Fold calculation (which we + * did above) needs to take place before inversion. Otherwise /[^k]/i + * would invert to include K, which under /i would match k, which it + * shouldn't. Therefore we can't invert folded locale now, as it won't be + * folded until runtime */ + + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching). We know to set the flag if we have a non-NULL list for UTF-8 + * locales, or the class matches at least one 0-255 range code point */ + if (LOC && FOLD) { + if (only_utf8_locale_list) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + else if (cp_list) { /* Look to see if there a 0-255 code point is in + the list */ + UV start, end; + invlist_iterinit(cp_list); + if (invlist_iternext(cp_list, &start, &end) && start < 256) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + invlist_iterfinish(cp_list); + } + } + + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known + * at compile time. Besides not inverting folded locale now, we can't + * invert if there are things such as \w, which aren't known until runtime + * */ + if (cp_list + && invert + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! depends_list + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + { + _invlist_invert(cp_list); + + /* Any swash can't be used as-is, because we've inverted things */ + if (swash) { + SvREFCNT_dec_NN(swash); + swash = NULL; + } + + /* Clear the invert flag since have just done it here */ + invert = FALSE; + } + + if (ret_invlist) { + *ret_invlist = cp_list; + SvREFCNT_dec(swash); + + /* Discard the generated node */ + if (SIZE_ONLY) { + RExC_size = orig_size; + } + else { + RExC_emit = orig_emit; + } + return orig_emit; + } + + /* Some character classes are equivalent to other nodes. Such nodes take + * up less room and generally fewer operations to execute than ANYOF nodes. + * Above, we checked for and optimized into some such equivalents for + * certain common classes that are easy to test. Getting to this point in + * the code means that the class didn't get optimized there. Since this + * code is only executed in Pass 2, it is too late to save space--it has + * been allocated in Pass 1, and currently isn't given back. But turning + * things into an EXACTish node can allow the optimizer to join it to any + * adjacent such nodes. And if the class is equivalent to things like /./, + * expensive run-time swashes can be avoided. Now that we have more + * complete information, we can find things necessarily missed by the + * earlier code. I (khw) am not sure how much to look for here. It would + * be easy, but perhaps too slow, to check any candidates against all the + * node types they could possibly match using _invlistEQ(). */ + + if (cp_list + && ! invert + && ! depends_list + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + + /* We don't optimize if we are supposed to make sure all non-Unicode + * code points raise a warning, as only ANYOF nodes have this check. + * */ + && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) + { + UV start, end; + U8 op = END; /* The optimzation node-type */ + const char * cur_parse= RExC_parse; + + invlist_iterinit(cp_list); + if (! invlist_iternext(cp_list, &start, &end)) { + + /* Here, the list is empty. This happens, for example, when a + * Unicode property is the only thing in the character class, and + * it doesn't match anything. (perluniprops.pod notes such + * properties) */ + op = OPFAIL; + *flagp |= HASWIDTH|SIMPLE; + } + else if (start == end) { /* The range is a single code point */ + if (! invlist_iternext(cp_list, &start, &end) + + /* Don't do this optimization if it would require changing + * the pattern to UTF-8 */ + && (start < 256 || UTF)) + { + /* Here, the list contains a single code point. Can optimize + * into an EXACTish node */ + + value = start; + + if (! FOLD) { + op = EXACT; + } + else if (LOC) { + + /* A locale node under folding with one code point can be + * an EXACTFL, as its fold won't be calculated until + * runtime */ + op = EXACTFL; + } + else { + + /* Here, we are generally folding, but there is only one + * code point to match. If we have to, we use an EXACT + * node, but it would be better for joining with adjacent + * nodes in the optimization pass if we used the same + * EXACTFish node that any such are likely to be. We can + * do this iff the code point doesn't participate in any + * folds. For example, an EXACTF of a colon is the same as + * an EXACT one, since nothing folds to or from a colon. */ + if (value < 256) { + if (IS_IN_SOME_FOLD_L1(value)) { + op = EXACT; + } + } + else { + if (_invlist_contains_cp(PL_utf8_foldable, value)) { + op = EXACT; + } + } + + /* If we haven't found the node type, above, it means we + * can use the prevailing one */ + if (op == END) { + op = compute_EXACTish(pRExC_state); + } + } + } + } + else if (start == 0) { + if (end == UV_MAX) { + op = SANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + else if (end == '\n' - 1 + && invlist_iternext(cp_list, &start, &end) + && start == '\n' + 1 && end == UV_MAX) + { + op = REG_ANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + invlist_iterfinish(cp_list); + + if (op != END) { + RExC_parse = (char *)orig_parse; + RExC_emit = (regnode *)orig_emit; + + ret = reg_node(pRExC_state, op); + + RExC_parse = (char *)cur_parse; + + if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); + } + + SvREFCNT_dec_NN(cp_list); + return ret; + } + } + + /* Here, contains all the code points we can determine at + * compile time that match under all conditions. Go through it, and + * for things that belong in the bitmap, put them there, and delete from + * . While we are at it, see if everything above 255 is in the + * list, and if so, set a flag to speed up execution */ + + populate_ANYOF_from_invlist(ret, &cp_list); + + if (invert) { + ANYOF_FLAGS(ret) |= ANYOF_INVERT; + } + + /* Here, the bitmap has been populated with all the Latin1 code points that + * always match. Can now add to the overall list those that match only + * when the target string is UTF-8 (). */ + if (depends_list) { + if (cp_list) { + _invlist_union(cp_list, depends_list, &cp_list); + SvREFCNT_dec_NN(depends_list); + } + else { + cp_list = depends_list; + } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; + } + + /* If there is a swash and more than one element, we can't use the swash in + * the optimization below. */ + if (swash && element_count > 1) { + SvREFCNT_dec_NN(swash); + swash = NULL; + } + + set_ANYOF_arg(pRExC_state, ret, cp_list, + (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv : NULL, + only_utf8_locale_list, + swash, has_user_defined_property); + + *flagp |= HASWIDTH|SIMPLE; + + if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + + return ret; +} + +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + +STATIC void +S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, + regnode* const node, + SV* const cp_list, + SV* const runtime_defns, + SV* const only_utf8_locale_list, + SV* const swash, + const bool has_user_defined_property) +{ + /* Sets the arg field of an ANYOF-type node 'node', using information about + * the node passed-in. If there is nothing outside the node's bitmap, the + * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to + * the count returned by add_data(), having allocated and stored an array, + * av, that that count references, as follows: + * av[0] stores the character class description in its textual form. + * This is used later (regexec.c:Perl_regclass_swash()) to + * initialize the appropriate swash, and is also useful for dumping + * the regnode. This is set to &PL_sv_undef if the textual + * description is not needed at run-time (as happens if the other + * elements completely define the class) + * av[1] if &PL_sv_undef, is a placeholder to later contain the swash + * computed from av[0]. But if no further computation need be done, + * the swash is stored here now (and av[0] is &PL_sv_undef). + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead + * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. + * (Otherwise everything needed is already in av[0] and av[1]) + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ + + UV n; + + PERL_ARGS_ASSERT_SET_ANYOF_ARG; + + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); + ARG_SET(node, ANYOF_NONBITMAP_EMPTY); + } + else { + AV * const av = newAV(); + SV *rv; + + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + av_store(av, 0, (runtime_defns) + ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); + if (swash) { + assert(cp_list); + av_store(av, 1, swash); + SvREFCNT_dec_NN(cp_list); + } + else { + av_store(av, 1, &PL_sv_undef); + if (cp_list) { + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); + } + } + + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + + rv = newRV_noinc(MUTABLE_SV(av)); + n = add_data(pRExC_state, STR_WITH_LEN("s")); + RExC_rxi->data->data[n] = (void*)rv; + ARG_SET(node, n); + } +} + + +/* reg_skipcomment() + + Absorbs an /x style # comment from the input stream, + returning a pointer to the first character beyond the comment, or if the + comment terminates the pattern without anything following it, this returns + one past the final character of the pattern (in other words, RExC_end) and + sets the REG_RUN_ON_COMMENT_SEEN flag. + + Note it's the callers responsibility to ensure that we are + actually in /x mode + +*/ + +PERL_STATIC_INLINE char* +S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) +{ + PERL_ARGS_ASSERT_REG_SKIPCOMMENT; + + assert(*p == '#'); + + while (p < RExC_end) { + if (*(++p) == '\n') { + return p+1; + } + } + + /* we ran off the end of the pattern without ending the comment, so we have + * to add an \n when wrapping */ + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; + return p; +} + +/* nextchar() + + Advances the parse position, and optionally absorbs + "whitespace" from the inputstream. + + Without /x "whitespace" means (?#...) style comments only, + with /x this means (?#...) and # comments and whitespace proper. + + Returns the RExC_parse point from BEFORE the scan occurs. + + This is the /x friendly way of saying RExC_parse++. +*/ + +STATIC char* +S_nextchar(pTHX_ RExC_state_t *pRExC_state) +{ + char* const retval = RExC_parse++; + + PERL_ARGS_ASSERT_NEXTCHAR; + + for (;;) { + if (RExC_end - RExC_parse >= 3 + && *RExC_parse == '(' + && RExC_parse[1] == '?' + && RExC_parse[2] == '#') + { + while (*RExC_parse != ')') { + if (RExC_parse == RExC_end) + FAIL("Sequence (?#... not terminated"); + RExC_parse++; + } + RExC_parse++; + continue; + } + if (RExC_flags & RXf_PMf_EXTENDED) { + char * p = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + if (p != RExC_parse) { + RExC_parse = p; + continue; + } + } + return retval; + } +} + +/* +- reg_node - emit a node +*/ +STATIC regnode * /* Location. */ +S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) +{ + regnode *ptr; + regnode * const ret = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NODE; + + if (SIZE_ONLY) { + SIZE_ALIGN(RExC_size); + RExC_size += 1; + return(ret); + } + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, (void*)RExC_emit, (void*)RExC_emit_bound); + + NODE_ALIGN_FILL(ret); + ptr = ret; + FILL_ADVANCE_NODE(ptr, op); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG( + ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", + "reg_node", __LINE__, + PL_reg_name[op], + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(RExC_emit - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); + } +#endif + RExC_emit = ptr; + return(ret); +} + +/* +- reganode - emit a node with an argument +*/ +STATIC regnode * /* Location. */ +S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) +{ + regnode *ptr; + regnode * const ret = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGANODE; + + if (SIZE_ONLY) { + SIZE_ALIGN(RExC_size); + RExC_size += 2; + /* + We can't do this: + + assert(2==regarglen[op]+1); + + Anything larger than this has to allocate the extra amount. + If we changed this to be: + + RExC_size += (1 + regarglen[op]); + + then it wouldn't matter. Its not clear what side effect + might come from that so its not done so far. + -- dmq + */ + return(ret); + } + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, (void*)RExC_emit, (void*)RExC_emit_bound); + + NODE_ALIGN_FILL(ret); + ptr = ret; + FILL_ADVANCE_NODE_ARG(ptr, op, arg); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + "reganode", + __LINE__, + PL_reg_name[op], + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? + "Overwriting end of array!\n" : "OK", + (UV)(RExC_emit - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Cur_Node_Offset; + } +#endif + RExC_emit = ptr; + return(ret); +} + +/* +- reguni - emit (if appropriate) a Unicode character +*/ +PERL_STATIC_INLINE STRLEN +S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) +{ + PERL_ARGS_ASSERT_REGUNI; + + return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); +} + +/* +- reginsert - insert an operator in front of already-emitted operand +* +* Means relocating the operand. +*/ +STATIC void +S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) +{ + regnode *src; + regnode *dst; + regnode *place; + const int offset = regarglen[(U8)op]; + const int size = NODE_STEP_REGNODE + offset; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGINSERT; + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(depth); +/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ + DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); + if (SIZE_ONLY) { + RExC_size += size; + return; + } + + src = RExC_emit; + RExC_emit += size; + dst = RExC_emit; + if (RExC_open_parens) { + int paren; + /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ + for ( paren=0 ; paren < RExC_npar ; paren++ ) { + if ( RExC_open_parens[paren] >= opnd ) { + /*DEBUG_PARSE_FMT("open"," - %d",size);*/ + RExC_open_parens[paren] += size; + } else { + /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ + } + if ( RExC_close_parens[paren] >= opnd ) { + /*DEBUG_PARSE_FMT("close"," - %d",size);*/ + RExC_close_parens[paren] += size; + } else { + /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ + } + } + } + + while (src > opnd) { + StructCopy(--src, --dst, regnode); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD 20010112 */ + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", + "reg_insert", + __LINE__, + PL_reg_name[op], + (UV)(dst - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(src - RExC_emit_start), + (UV)(dst - RExC_emit_start), + (UV)RExC_offsets[0])); + Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); + Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); + } +#endif + } + + + place = opnd; /* Op node, where operand used to be. */ +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + "reginsert", + __LINE__, + PL_reg_name[op], + (UV)(place - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(place - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Node_Offset(place, RExC_parse); + Set_Node_Length(place, 1); + } +#endif + src = NEXTOPER(place); + FILL_ADVANCE_NODE(place, op); + Zero(src, offset, regnode); +} + +/* +- regtail - set the next-pointer at the end of a node chain of p to val. +- SEE ALSO: regtail_study +*/ +/* TODO: All three parms should be const */ +STATIC void +S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) +{ + regnode *scan; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + if (SIZE_ONLY) + return; + + /* Find last node. */ + scan = p; + for (;;) { + regnode * const temp = regnext(scan); + DEBUG_PARSE_r({ + SV * const mysv=sv_newmortal(); + DEBUG_PARSE_MSG((scan==p ? "tail" : "")); + regprop(RExC_rx, mysv, scan, NULL); + PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", + SvPV_nolen_const(mysv), REG_NODE_NUM(scan), + (temp == NULL ? "->" : ""), + (temp == NULL ? PL_reg_name[OP(val)] : "") + ); + }); + if (temp == NULL) + break; + scan = temp; + } + + if (reg_off_by_arg[OP(scan)]) { + ARG_SET(scan, val - scan); + } + else { + NEXT_OFF(scan) = val - scan; + } +} + +#ifdef DEBUGGING +/* +- regtail_study - set the next-pointer at the end of a node chain of p to val. +- Look for optimizable sequences at the same time. +- currently only looks for EXACT chains. + +This is experimental code. The idea is to use this routine to perform +in place optimizations on branches and groups as they are constructed, +with the long term intention of removing optimization from study_chunk so +that it is purely analytical. + +Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used +to control which is which. + +*/ +/* TODO: All four parms should be const */ + +STATIC U8 +S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) +{ + dVAR; + regnode *scan; + U8 exact = PSEUDO; +#ifdef EXPERIMENTAL_INPLACESCAN + I32 min = 0; +#endif + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL_STUDY; + + + if (SIZE_ONLY) + return exact; + + /* Find last node. */ + + scan = p; + for (;;) { + regnode * const temp = regnext(scan); +#ifdef EXPERIMENTAL_INPLACESCAN + if (PL_regkind[OP(scan)] == EXACT) { + bool unfolded_multi_char; /* Unexamined in this routine */ + if (join_exact(pRExC_state, scan, &min, + &unfolded_multi_char, 1, val, depth+1)) + return EXACT; + } +#endif + if ( exact ) { + switch (OP(scan)) { + case EXACT: + case EXACTF: + case EXACTFA_NO_TRIE: + case EXACTFA: + case EXACTFU: + case EXACTFU_SS: + case EXACTFL: + if( exact == PSEUDO ) + exact= OP(scan); + else if ( exact != OP(scan) ) + exact= 0; + case NOTHING: + break; + default: + exact= 0; + } + } + DEBUG_PARSE_r({ + SV * const mysv=sv_newmortal(); + DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); + regprop(RExC_rx, mysv, scan, NULL); + PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", + SvPV_nolen_const(mysv), + REG_NODE_NUM(scan), + PL_reg_name[exact]); + }); + if (temp == NULL) + break; + scan = temp; + } + DEBUG_PARSE_r({ + SV * const mysv_val=sv_newmortal(); + DEBUG_PARSE_MSG(""); + regprop(RExC_rx, mysv_val, val, NULL); + PerlIO_printf(Perl_debug_log, + "~ attach to %s (%"IVdf") offset to %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(val), + (IV)(val - scan) + ); + }); + if (reg_off_by_arg[OP(scan)]) { + ARG_SET(scan, val - scan); + } + else { + NEXT_OFF(scan) = val - scan; + } + + return exact; +} +#endif + +/* + - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form + */ +#ifdef DEBUGGING + +static void +S_regdump_intflags(pTHX_ const char *lead, const U32 flags) +{ + int bit; + int set=0; + + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bitprogram, ri->program + 1, NULL, NULL, sv, 0, 0); + + /* Header fields of interest. */ + if (r->anchored_substr) { + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), + RE_SV_DUMPLEN(r->anchored_substr), 30); + PerlIO_printf(Perl_debug_log, + "anchored %s%s at %"IVdf" ", + s, RE_SV_TAIL(r->anchored_substr), + (IV)r->anchored_offset); + } else if (r->anchored_utf8) { + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), + RE_SV_DUMPLEN(r->anchored_utf8), 30); + PerlIO_printf(Perl_debug_log, + "anchored utf8 %s%s at %"IVdf" ", + s, RE_SV_TAIL(r->anchored_utf8), + (IV)r->anchored_offset); + } + if (r->float_substr) { + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), + RE_SV_DUMPLEN(r->float_substr), 30); + PerlIO_printf(Perl_debug_log, + "floating %s%s at %"IVdf"..%"UVuf" ", + s, RE_SV_TAIL(r->float_substr), + (IV)r->float_min_offset, (UV)r->float_max_offset); + } else if (r->float_utf8) { + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), + RE_SV_DUMPLEN(r->float_utf8), 30); + PerlIO_printf(Perl_debug_log, + "floating utf8 %s%s at %"IVdf"..%"UVuf" ", + s, RE_SV_TAIL(r->float_utf8), + (IV)r->float_min_offset, (UV)r->float_max_offset); + } + if (r->check_substr || r->check_utf8) + PerlIO_printf(Perl_debug_log, + (const char *) + (r->check_substr == r->float_substr + && r->check_utf8 == r->float_utf8 + ? "(checking floating" : "(checking anchored")); + if (r->intflags & PREGf_NOSCAN) + PerlIO_printf(Perl_debug_log, " noscan"); + if (r->extflags & RXf_CHECK_ALL) + PerlIO_printf(Perl_debug_log, " isall"); + if (r->check_substr || r->check_utf8) + PerlIO_printf(Perl_debug_log, ") "); + + if (ri->regstclass) { + regprop(r, sv, ri->regstclass, NULL); + PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); + } + if (r->intflags & PREGf_ANCH) { + PerlIO_printf(Perl_debug_log, "anchored"); + if (r->intflags & PREGf_ANCH_BOL) + PerlIO_printf(Perl_debug_log, "(BOL)"); + if (r->intflags & PREGf_ANCH_MBOL) + PerlIO_printf(Perl_debug_log, "(MBOL)"); + if (r->intflags & PREGf_ANCH_SBOL) + PerlIO_printf(Perl_debug_log, "(SBOL)"); + if (r->intflags & PREGf_ANCH_GPOS) + PerlIO_printf(Perl_debug_log, "(GPOS)"); + PerlIO_putc(Perl_debug_log, ' '); + } + if (r->intflags & PREGf_GPOS_SEEN) + PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); + if (r->intflags & PREGf_SKIP) + PerlIO_printf(Perl_debug_log, "plus "); + if (r->intflags & PREGf_IMPLICIT) + PerlIO_printf(Perl_debug_log, "implicit "); + PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen); + if (r->extflags & RXf_EVAL_SEEN) + PerlIO_printf(Perl_debug_log, "with eval "); + PerlIO_printf(Perl_debug_log, "\n"); + DEBUG_FLAGS_r({ + regdump_extflags("r->extflags: ",r->extflags); + regdump_intflags("r->intflags: ",r->intflags); + }); +#else + PERL_ARGS_ASSERT_REGDUMP; + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(r); +#endif /* DEBUGGING */ +} + +/* +- regprop - printable representation of opcode, with run time support +*/ + +void +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) +{ +#ifdef DEBUGGING + dVAR; + int k; + + /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ + static const char * const anyofs[] = { +#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \ + || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \ + || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \ + || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \ + || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \ + || _CC_VERTSPACE != 16 + #error Need to adjust order of anyofs[] +#endif + "\\w", + "\\W", + "\\d", + "\\D", + "[:alpha:]", + "[:^alpha:]", + "[:lower:]", + "[:^lower:]", + "[:upper:]", + "[:^upper:]", + "[:punct:]", + "[:^punct:]", + "[:print:]", + "[:^print:]", + "[:alnum:]", + "[:^alnum:]", + "[:graph:]", + "[:^graph:]", + "[:cased:]", + "[:^cased:]", + "\\s", + "\\S", + "[:blank:]", + "[:^blank:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:ascii:]", + "[:^ascii:]", + "\\v", + "\\V" + }; + RXi_GET_DECL(prog,progi); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPROP; + + sv_setpvs(sv, ""); + + if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ + /* It would be nice to FAIL() here, but this may be called from + regexec.c, and it would be hard to supply pRExC_state. */ + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(o), (int)REGNODE_MAX); + sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ + + k = PL_regkind[OP(o)]; + + if (k == EXACT) { + sv_catpvs(sv, " "); + /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) + * is a crude hack but it may be the best for now since + * we have no flag "this EXACTish node was UTF-8" + * --jhi */ + pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], + PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_ESCAPE_NONASCII | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT | + PERL_PV_PRETTY_NOCLEAR + ); + } else if (k == TRIE) { + /* print the details of the trie in dumpuntil instead, as + * progi->data isn't available here */ + const char op = OP(o); + const U32 n = ARG(o); + const reg_ac_data * const ac = IS_TRIE_AC(op) ? + (reg_ac_data *)progi->data->data[n] : + NULL; + const reg_trie_data * const trie + = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; + + Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); + DEBUG_TRIE_COMPILE_r( + Perl_sv_catpvf(aTHX_ sv, + "", + (UV)trie->startstate, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ); + ); + if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { + sv_catpvs(sv, "["); + (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) + ? ANYOF_BITMAP(o) + : TRIE_BITMAP(trie)); + sv_catpvs(sv, "]"); + } + + } else if (k == CURLY) { + if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ + Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); + } + else if (k == WHILEM && o->flags) /* Ordinal/of */ + Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); + else if (k == REF || k == OPEN || k == CLOSE + || k == GROUPP || OP(o)==ACCEPT) + { + Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ + if ( RXp_PAREN_NAMES(prog) ) { + if ( k != REF || (OP(o) < NREF)) { + AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); + SV **name= av_fetch(list, ARG(o), 0 ); + if (name) + Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); + } + else { + AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); + SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); + I32 *nums=(I32*)SvPVX(sv_dat); + SV **name= av_fetch(list, nums[0], 0 ); + I32 n; + if (name) { + for ( n=0; noffs[n].start; + if (prog->lastparen < n || ln == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } + } else if (k == GOSUB) + /* Paren and offset */ + Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); + else if (k == VERB) { + if (!o->flags) + Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); + } else if (k == LOGICAL) + /* 2: embedded, otherwise 1 */ + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); + else if (k == ANYOF) { + const U8 flags = ANYOF_FLAGS(o); + int do_sep = 0; + + + if (flags & ANYOF_LOCALE_FLAGS) + sv_catpvs(sv, "{loc}"); + if (flags & ANYOF_LOC_FOLD) + sv_catpvs(sv, "{i}"); + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + if (flags & ANYOF_INVERT) + sv_catpvs(sv, "^"); + + /* output what the standard cp 0-255 bitmap matches */ + do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); + + /* output any special charclass tests (used entirely under use + * locale) * */ + if (ANYOF_POSIXL_TEST_ANY_SET(o)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(o,i)) { + sv_catpv(sv, anyofs[i]); + do_sep = 1; + } + } + } + + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) + { + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + if (flags & ANYOF_INVERT) + /*make sure the invert info is in each */ + sv_catpvs(sv, "^"); + } + + if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { + sv_catpvs(sv, "{non-utf8-latin1-all}"); + } + + /* output information about the unicode matching */ + if (flags & ANYOF_ABOVE_LATIN1_ALL) + sv_catpvs(sv, "{unicode_all}"); + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { + SV *lv; /* Set if there is something outside the bit map. */ + bool byte_output = FALSE; /* If something in the bitmap has + been output */ + SV *only_utf8_locale; + + /* Get the stuff that wasn't in the bitmap */ + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); + if (lv && lv != &PL_sv_undef) { + char *s = savesvpv(lv); + char * const origs = s; + + while (*s && *s != '\n') + s++; + + if (*s == '\n') { + const char * const t = ++s; + + if (flags & ANYOF_NONBITMAP_NON_UTF8) { + sv_catpvs(sv, "{outside bitmap}"); + } + else { + sv_catpvs(sv, "{utf8}"); + } + + if (byte_output) { + sv_catpvs(sv, " "); + } + + while (*s) { + if (*s == '\n') { + + /* Truncate very long output */ + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } + *s = ' '; + } + else if (*s == '\t') { + *s = '-'; + } + s++; + } + if (s[-1] == ' ') + s[-1] = 0; + + sv_catpv(sv, t); + } + + out_dump: + + Safefree(origs); + SvREFCNT_dec_NN(lv); + } + + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } + } + invlist_iterfinish(only_utf8_locale); + } + } + } + + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + } + else if (k == POSIXD || k == NPOSIXD) { + U8 index = FLAGS(o) * 2; + if (index < C_ARRAY_LENGTH(anyofs)) { + if (*anyofs[index] != '[') { + sv_catpv(sv, "["); + } + sv_catpv(sv, anyofs[index]); + if (*anyofs[index] != '[') { + sv_catpv(sv, "]"); + } + } + else { + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + } + } + else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) + Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); +#else + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(sv); + PERL_UNUSED_ARG(o); + PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); +#endif /* DEBUGGING */ +} + + + +SV * +Perl_re_intuit_string(pTHX_ REGEXP * const r) +{ /* Assume that RE_INTUIT is set */ + struct regexp *const prog = ReANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_STRING; + PERL_UNUSED_CONTEXT; + + DEBUG_COMPILE_r( + { + const char * const s = SvPV_nolen_const(prog->check_substr + ? prog->check_substr : prog->check_utf8); + + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, + "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", + PL_colors[4], + prog->check_substr ? "" : "utf8 ", + PL_colors[5],PL_colors[0], + s, + PL_colors[1], + (strlen(s) > 60 ? "..." : "")); + } ); + + return prog->check_substr ? prog->check_substr : prog->check_utf8; +} + +/* + pregfree() + + handles refcounting and freeing the perl core regexp structure. When + it is necessary to actually free the structure the first thing it + does is call the 'free' method of the regexp_engine associated to + the regexp, allowing the handling of the void *pprivate; member + first. (This routine is not overridable by extensions, which is why + the extensions free is called first.) + + See regdupe and regdupe_internal if you change anything here. +*/ +#ifndef PERL_IN_XSUB_RE +void +Perl_pregfree(pTHX_ REGEXP *r) +{ + SvREFCNT_dec(r); +} + +void +Perl_pregfree2(pTHX_ REGEXP *rx) +{ + struct regexp *const r = ReANY(rx); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_PREGFREE2; + + if (r->mother_re) { + ReREFCNT_dec(r->mother_re); + } else { + CALLREGFREE_PVT(rx); /* free the private data */ + SvREFCNT_dec(RXp_PAREN_NAMES(r)); + Safefree(r->xpv_len_u.xpvlenu_pv); + } + if (r->substrs) { + SvREFCNT_dec(r->anchored_substr); + SvREFCNT_dec(r->anchored_utf8); + SvREFCNT_dec(r->float_substr); + SvREFCNT_dec(r->float_utf8); + Safefree(r->substrs); + } + RX_MATCH_COPY_FREE(rx); +#ifdef PERL_ANY_COW + SvREFCNT_dec(r->saved_copy); +#endif + Safefree(r->offs); + SvREFCNT_dec(r->qr_anoncv); + rx->sv_u.svu_rx = 0; +} + +/* reg_temp_copy() + + This is a hacky workaround to the structural issue of match results + being stored in the regexp structure which is in turn stored in + PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern + could be PL_curpm in multiple contexts, and could require multiple + result sets being associated with the pattern simultaneously, such + as when doing a recursive match with (??{$qr}) + + The solution is to make a lightweight copy of the regexp structure + when a qr// is returned from the code executed by (??{$qr}) this + lightweight copy doesn't actually own any of its data except for + the starp/end and the actual regexp structure itself. + +*/ + + +REGEXP * +Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) +{ + struct regexp *ret; + struct regexp *const r = ReANY(rx); + const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV; + + PERL_ARGS_ASSERT_REG_TEMP_COPY; + + if (!ret_x) + ret_x = (REGEXP*) newSV_type(SVt_REGEXP); + else { + SvOK_off((SV *)ret_x); + if (islv) { + /* For PVLVs, SvANY points to the xpvlv body while sv_u points + to the regexp. (For SVt_REGEXPs, sv_upgrade has already + made both spots point to the same regexp body.) */ + REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP); + assert(!SvPVX(ret_x)); + ret_x->sv_u.svu_rx = temp->sv_any; + temp->sv_any = NULL; + SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL; + SvREFCNT_dec_NN(temp); + /* SvCUR still resides in the xpvlv struct, so the regexp copy- + ing below will not set it. */ + SvCUR_set(ret_x, SvCUR(rx)); + } + } + /* This ensures that SvTHINKFIRST(sv) is true, and hence that + sv_force_normal(sv) is called. */ + SvFAKE_on(ret_x); + ret = ReANY(ret_x); + + SvFLAGS(ret_x) |= SvUTF8(rx); + /* We share the same string buffer as the original regexp, on which we + hold a reference count, incremented when mother_re is set below. + The string pointer is copied here, being part of the regexp struct. + */ + memcpy(&(ret->xpv_cur), &(r->xpv_cur), + sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); + if (r->offs) { + const I32 npar = r->nparens+1; + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + } + if (r->substrs) { + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + SvREFCNT_inc_void(ret->anchored_substr); + SvREFCNT_inc_void(ret->anchored_utf8); + SvREFCNT_inc_void(ret->float_substr); + SvREFCNT_inc_void(ret->float_utf8); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + } + RX_MATCH_COPIED_off(ret_x); +#ifdef PERL_ANY_COW + ret->saved_copy = NULL; +#endif + ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); + SvREFCNT_inc_void(ret->qr_anoncv); + + return ret_x; +} +#endif + +/* regfree_internal() + + Free the private data in a regexp. This is overloadable by + extensions. Perl takes care of the regexp structure in pregfree(), + this covers the *pprivate pointer which technically perl doesn't + know about, however of course we have to handle the + regexp_internal structure when no extension is in use. + + Note this is called before freeing anything in the regexp + structure. + */ + +void +Perl_regfree_internal(pTHX_ REGEXP * const rx) +{ + struct regexp *const r = ReANY(rx); + RXi_GET_DECL(r,ri); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGFREE_INTERNAL; + + DEBUG_COMPILE_r({ + if (!PL_colorset) + reginitcolors(); + { + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RX_UTF8(rx), + dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); + PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", + PL_colors[4],PL_colors[5],s); + } + }); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) + Safefree(ri->u.offsets); /* 20010421 MJD */ +#endif + if (ri->code_blocks) { + int n; + for (n = 0; n < ri->num_code_blocks; n++) + SvREFCNT_dec(ri->code_blocks[n].src_regex); + Safefree(ri->code_blocks); + } + + if (ri->data) { + int n = ri->data->count; + + while (--n >= 0) { + /* If you add a ->what type here, update the comment in regcomp.h */ + switch (ri->data->what[n]) { + case 'a': + case 'r': + case 's': + case 'S': + case 'u': + SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); + break; + case 'f': + Safefree(ri->data->data[n]); + break; + case 'l': + case 'L': + break; + case 'T': + { /* Aho Corasick add-on structure for a trie node. + Used in stclass optimization only */ + U32 refcount; + reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; +#ifdef USE_ITHREADS + dVAR; +#endif + OP_REFCNT_LOCK; + refcount = --aho->refcount; + OP_REFCNT_UNLOCK; + if ( !refcount ) { + PerlMemShared_free(aho->states); + PerlMemShared_free(aho->fail); + /* do this last!!!! */ + PerlMemShared_free(ri->data->data[n]); + /* we should only ever get called once, so + * assert as much, and also guard the free + * which /might/ happen twice. At the least + * it will make code anlyzers happy and it + * doesn't cost much. - Yves */ + assert(ri->regstclass); + if (ri->regstclass) { + PerlMemShared_free(ri->regstclass); + ri->regstclass = 0; + } + } + } + break; + case 't': + { + /* trie structure. */ + U32 refcount; + reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; +#ifdef USE_ITHREADS + dVAR; +#endif + OP_REFCNT_LOCK; + refcount = --trie->refcount; + OP_REFCNT_UNLOCK; + if ( !refcount ) { + PerlMemShared_free(trie->charmap); + PerlMemShared_free(trie->states); + PerlMemShared_free(trie->trans); + if (trie->bitmap) + PerlMemShared_free(trie->bitmap); + if (trie->jump) + PerlMemShared_free(trie->jump); + PerlMemShared_free(trie->wordinfo); + /* do this last!!!! */ + PerlMemShared_free(ri->data->data[n]); + } + } + break; + default: + Perl_croak(aTHX_ "panic: regfree data code '%c'", + ri->data->what[n]); + } + } + Safefree(ri->data->what); + Safefree(ri->data); + } + + Safefree(ri); +} + +#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t)) +#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t)) +#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) + +/* + re_dup - duplicate a regexp. + + This routine is expected to clone a given regexp structure. It is only + compiled under USE_ITHREADS. + + After all of the core data stored in struct regexp is duplicated + the regexp_engine.dupe method is used to copy any private data + stored in the *pprivate pointer. This allows extensions to handle + any duplication it needs to do. + + See pregfree() and regfree_internal() if you change anything here. +*/ +#if defined(USE_ITHREADS) +#ifndef PERL_IN_XSUB_RE +void +Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) +{ + dVAR; + I32 npar; + const struct regexp *r = ReANY(sstr); + struct regexp *ret = ReANY(dstr); + + PERL_ARGS_ASSERT_RE_DUP_GUTS; + + npar = r->nparens+1; + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + + if (ret->substrs) { + /* Do it this way to avoid reading from *r after the StructCopy(). + That way, if any of the sv_dup_inc()s dislodge *r from the L1 + cache, it doesn't matter. */ + const bool anchored = r->check_substr + ? r->check_substr == r->anchored_substr + : r->check_utf8 == r->anchored_utf8; + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); + ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); + ret->float_substr = sv_dup_inc(ret->float_substr, param); + ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + + if (ret->check_substr) { + if (anchored) { + assert(r->check_utf8 == r->anchored_utf8); + ret->check_substr = ret->anchored_substr; + ret->check_utf8 = ret->anchored_utf8; + } else { + assert(r->check_substr == r->float_substr); + assert(r->check_utf8 == r->float_utf8); + ret->check_substr = ret->float_substr; + ret->check_utf8 = ret->float_utf8; + } + } else if (ret->check_utf8) { + if (anchored) { + ret->check_utf8 = ret->anchored_utf8; + } else { + ret->check_utf8 = ret->float_utf8; + } + } + } + + RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); + ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); + + if (ret->pprivate) + RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); + + if (RX_MATCH_COPIED(dstr)) + ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); + else + ret->subbeg = NULL; +#ifdef PERL_ANY_COW + ret->saved_copy = NULL; +#endif + + /* Whether mother_re be set or no, we need to copy the string. We + cannot refrain from copying it when the storage points directly to + our mother regexp, because that's + 1: a buffer in a different thread + 2: something we no longer hold a reference on + so we need to copy it locally. */ + RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); + ret->mother_re = NULL; +} +#endif /* PERL_IN_XSUB_RE */ + +/* + regdupe_internal() + + This is the internal complement to regdupe() which is used to copy + the structure pointed to by the *pprivate pointer in the regexp. + This is the core version of the extension overridable cloning hook. + The regexp structure being duplicated will be copied by perl prior + to this and will be provided as the regexp *r argument, however + with the /old/ structures pprivate pointer value. Thus this routine + may override any copying normally done by perl. + + It returns a pointer to the new regexp_internal structure. +*/ + +void * +Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) +{ + dVAR; + struct regexp *const r = ReANY(rx); + regexp_internal *reti; + int len; + RXi_GET_DECL(r,ri); + + PERL_ARGS_ASSERT_REGDUPE_INTERNAL; + + len = ProgLen(ri); + + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), + char, regexp_internal); + Copy(ri->program, reti->program, len+1, regnode); + + reti->num_code_blocks = ri->num_code_blocks; + if (ri->code_blocks) { + int n; + Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block, + struct reg_code_block); + Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks, + struct reg_code_block); + for (n = 0; n < ri->num_code_blocks; n++) + reti->code_blocks[n].src_regex = (REGEXP*) + sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param); + } + else + reti->code_blocks = NULL; + + reti->regstclass = NULL; + + if (ri->data) { + struct reg_data *d; + const int count = ri->data->count; + int i; + + Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), + char, struct reg_data); + Newx(d->what, count, U8); + + d->count = count; + for (i = 0; i < count; i++) { + d->what[i] = ri->data->what[i]; + switch (d->what[i]) { + /* see also regcomp.h and regfree_internal() */ + case 'a': /* actually an AV, but the dup function is identical. */ + case 'r': + case 's': + case 'S': + case 'u': /* actually an HV, but the dup function is identical. */ + d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); + break; + case 'f': + /* This is cheating. */ + Newx(d->data[i], 1, regnode_ssc); + StructCopy(ri->data->data[i], d->data[i], regnode_ssc); + reti->regstclass = (regnode*)d->data[i]; + break; + case 'T': + /* Trie stclasses are readonly and can thus be shared + * without duplication. We free the stclass in pregfree + * when the corresponding reg_ac_data struct is freed. + */ + reti->regstclass= ri->regstclass; + /* FALLTHROUGH */ + case 't': + OP_REFCNT_LOCK; + ((reg_trie_data*)ri->data->data[i])->refcount++; + OP_REFCNT_UNLOCK; + /* FALLTHROUGH */ + case 'l': + case 'L': + d->data[i] = ri->data->data[i]; + break; + default: + Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", + ri->data->what[i]); + } + } + + reti->data = d; + } + else + reti->data = NULL; + + reti->name_list_idx = ri->name_list_idx; + +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) { + Newx(reti->u.offsets, 2*len+1, U32); + Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); + } +#else + SetProgLen(reti,len); +#endif + + return (void*)reti; +} + +#endif /* USE_ITHREADS */ + +#ifndef PERL_IN_XSUB_RE + +/* + - regnext - dig the "next" pointer out of a node + */ +regnode * +Perl_regnext(pTHX_ regnode *p) +{ + I32 offset; + + if (!p) + return(NULL); + + if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(p), (int)REGNODE_MAX); + } + + offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); + if (offset == 0) + return(NULL); + + return(p+offset); +} +#endif + +STATIC void +S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) +{ + va_list args; + STRLEN l1 = strlen(pat1); + STRLEN l2 = strlen(pat2); + char buf[512]; + SV *msv; + const char *message; + + PERL_ARGS_ASSERT_RE_CROAK2; + + if (l1 > 510) + l1 = 510; + if (l1 + l2 > 510) + l2 = 510 - l1; + Copy(pat1, buf, l1 , char); + Copy(pat2, buf + l1, l2 , char); + buf[l1 + l2] = '\n'; + buf[l1 + l2 + 1] = '\0'; + va_start(args, pat2); + msv = vmess(buf, &args); + va_end(args); + message = SvPV_const(msv,l1); + if (l1 > 512) + l1 = 512; + Copy(message, buf, l1 , char); + /* l1-1 to avoid \n */ + Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); +} + +/* XXX Here's a total kludge. But we need to re-enter for swash routines. */ + +#ifndef PERL_IN_XSUB_RE +void +Perl_save_re_context(pTHX) +{ + /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ + if (PL_curpm) { + const REGEXP * const rx = PM_GETRE(PL_curpm); + if (rx) { + U32 i; + for (i = 1; i <= RX_NPARENS(rx); i++) { + char digits[TYPE_CHARS(long)]; + const STRLEN len = my_snprintf(digits, sizeof(digits), + "%lu", (long)i); + GV *const *const gvp + = (GV**)hv_fetch(PL_defstash, digits, len, 0); + + if (gvp) { + GV * const gv = *gvp; + if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) + save_scalar(gv); + } + } + } + } +} +#endif + +#ifdef DEBUGGING + +STATIC void +S_put_byte(pTHX_ SV *sv, int c) +{ + PERL_ARGS_ASSERT_PUT_BYTE; + + if (!isPRINT(c)) { + switch (c) { + case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; + case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; + case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; + case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; + case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; + + default: + Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + break; + } + } + else { + const char string = c; + if (c == '-' || c == ']' || c == '\\' || c == '^') + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } +} + +STATIC void +S_put_range(pTHX_ SV *sv, UV start, UV end) +{ + + /* Appends to 'sv' a displayable version of the range of code points from + * 'start' to 'end'. It assumes that only ASCII printables are displayable + * as-is (though some of these will be escaped by put_byte()). For the + * time being, this subroutine only works for latin1 (< 256) code points */ + + assert(start <= end); + + PERL_ARGS_ASSERT_PUT_RANGE; + + while (start <= end) { + if (end - start < 3) { /* Individual chars in short ranges */ + for (; start <= end; start++) { + put_byte(sv, start); + } + break; + } + + /* For small ranges that include printable ASCII characters, it's more + * legible to print those characters rather than hex values. For + * larger ranges that include more than printables, it's probably + * clearer to just give the start and end points of the range in hex, + * and that's all we can do if there aren't any printables within the + * range + * + * On ASCII platforms the range of printables is contiguous. If the + * entire range is printable, we print each character as such. If the + * range is partially printable and partially not, it's less likely + * that the individual printables are meaningful, especially if all or + * almost all of them are in the range. But we err on the side of the + * individual printables being meaningful by using the hex only if the + * range contains all but 2 of the printables. + * + * On EBCDIC platforms, the printables are scattered around so that the + * maximum range length containing only them is about 10. Anything + * longer we treat as hex; otherwise we examine the range character by + * character to see */ +#ifdef EBCDIC + if (start < 256 && (((end < 255) ? end : 255) - start <= 10)) +#else + if ((isPRINT_A(start) && isPRINT_A(end)) + || (end >= 0x7F && (isPRINT_A(start) && start > 0x21)) + || ((end < 0x7D && isPRINT_A(end)) && start < 0x20)) +#endif + { + /* If the range beginning isn't an ASCII printable, we find the + * last such in the range, then split the output, so all the + * non-printables are in one subrange; then process the remaining + * portion as usual. If the entire range isn't printables, we + * don't split, but drop down to print as hex */ + if (! isPRINT_A(start)) { + UV temp_end = start + 1; + while (temp_end <= end && ! isPRINT_A(temp_end)) { + temp_end++; + } + if (temp_end <= end) { + put_range(sv, start, temp_end - 1); + start = temp_end; + continue; + } + } + + /* If the range beginning is a digit, output a subrange of just the + * digits, then process the remaining portion as usual */ + if (isDIGIT_A(start)) { + put_byte(sv, start); + sv_catpvs(sv, "-"); + while (start <= end && isDIGIT_A(start)) start++; + put_byte(sv, start - 1); + continue; + } + + /* Similarly for alphabetics. Because in both ASCII and EBCDIC, + * the code points for upper and lower A-Z and a-z aren't + * intermixed, the resulting subrange will consist solely of either + * upper- or lower- alphabetics */ + if (isALPHA_A(start)) { + put_byte(sv, start); + sv_catpvs(sv, "-"); + while (start <= end && isALPHA_A(start)) start++; + put_byte(sv, start - 1); + continue; + } + + /* We output any remaining printables as individual characters */ + if (isPUNCT_A(start) || isSPACE_A(start)) { + while (start <= end && (isPUNCT_A(start) || isSPACE_A(start))) { + put_byte(sv, start); + start++; + } + continue; + } + } + + /* Here is a control or non-ascii. Output the range or subrange as + * hex. */ + Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", + start, + (end < 256) ? end : 255); + break; + } +} + +STATIC bool +S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually + * output anything */ + + int i; + bool has_output_anything = FALSE; + + PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; + + for (i = 0; i < 256; i++) { + if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) { + + /* The character at index i should be output. Find the next + * character that should NOT be output */ + int j; + for (j = i + 1; j <= 256; j++) { + if (! BITMAP_TEST((U8 *) bitmap, j)) { + break; + } + } + + /* Everything between them is a single range that should be output + * */ + put_range(sv, i, j - 1); + has_output_anything = TRUE; + i = j; + } + } + + return has_output_anything; +} + +#define CLEAR_OPTSTART \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ + " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ + } STMT_END + +#define DUMPUNTIL(b,e) \ + CLEAR_OPTSTART; \ + node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); + +STATIC const regnode * +S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, + const regnode *last, const regnode *plast, + SV* sv, I32 indent, U32 depth) +{ + dVAR; + U8 op = PSEUDO; /* Arbitrary non-END op. */ + const regnode *next; + const regnode *optstart= NULL; + + RXi_GET_DECL(r,ri); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMPUNTIL; + +#ifdef DEBUG_DUMPUNTIL + PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, + last ? last-start : 0,plast ? plast-start : 0); +#endif + + if (plast && plast < last) + last= plast; + + while (PL_regkind[op] != END && (!last || node < last)) { + assert(node); + /* While that wasn't END last time... */ + NODE_ALIGN(node); + op = OP(node); + if (op == CLOSE || op == WHILEM) + indent--; + next = regnext((regnode *)node); + + /* Where, what. */ + if (OP(node) == OPTIMIZED) { + if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) + optstart = node; + else + goto after_print; + } else + CLEAR_OPTSTART; + + regprop(r, sv, node, NULL); + PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), + (int)(2*indent + 1), "", SvPVX_const(sv)); + + if (OP(node) != OPTIMIZED) { + if (next == NULL) /* Next ptr. */ + PerlIO_printf(Perl_debug_log, " (0)"); + else if (PL_regkind[(U8)op] == BRANCH + && PL_regkind[OP(next)] != BRANCH ) + PerlIO_printf(Perl_debug_log, " (FAIL)"); + else + PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); + (void)PerlIO_putc(Perl_debug_log, '\n'); + } + + after_print: + if (PL_regkind[(U8)op] == BRANCHJ) { + assert(next); + { + const regnode *nnode = (OP(next) == LONGJMP + ? regnext((regnode *)next) + : next); + if (last && nnode > last) + nnode = last; + DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); + } + } + else if (PL_regkind[(U8)op] == BRANCH) { + assert(next); + DUMPUNTIL(NEXTOPER(node), next); + } + else if ( PL_regkind[(U8)op] == TRIE ) { + const regnode *this_trie = node; + const char op = OP(node); + const U32 n = ARG(node); + const reg_ac_data * const ac = op>=AHOCORASICK ? + (reg_ac_data *)ri->data->data[n] : + NULL; + const reg_trie_data * const trie = + (reg_trie_data*)ri->data->data[optrie]; +#ifdef DEBUGGING + AV *const trie_words + = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); +#endif + const regnode *nextbranch= NULL; + I32 word_idx; + sv_setpvs(sv, ""); + for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { + SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); + + PerlIO_printf(Perl_debug_log, "%*s%s ", + (int)(2*(indent+3)), "", + elem_ptr + ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), + SvCUR(*elem_ptr), 60, + PL_colors[0], PL_colors[1], + (SvUTF8(*elem_ptr) + ? PERL_PV_ESCAPE_UNI + : 0) + | PERL_PV_PRETTY_ELLIPSES + | PERL_PV_PRETTY_LTGT + ) + : "???" + ); + if (trie->jump) { + U16 dist= trie->jump[word_idx+1]; + PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", + (UV)((dist ? this_trie + dist : next) - start)); + if (dist) { + if (!nextbranch) + nextbranch= this_trie + trie->jump[0]; + DUMPUNTIL(this_trie + dist, nextbranch); + } + if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) + nextbranch= regnext((regnode *)nextbranch); + } else { + PerlIO_printf(Perl_debug_log, "\n"); + } + } + if (last && next > last) + node= last; + else + node= next; + } + else if ( op == CURLY ) { /* "next" might be very big: optimizer */ + DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, + NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); + } + else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { + assert(next); + DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); + } + else if ( op == PLUS || op == STAR) { + DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); + } + else if (PL_regkind[(U8)op] == ANYOF) { + /* arglen 1 + class block */ + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); + node = NEXTOPER(node); + } + else if (PL_regkind[(U8)op] == EXACT) { + /* Literal string, where present. */ + node += NODE_SZ_STR(node) - 1; + node = NEXTOPER(node); + } + else { + node = NEXTOPER(node); + node += regarglen[(U8)op]; + } + if (op == CURLYX || op == OPEN) + indent++; + } + CLEAR_OPTSTART; +#ifdef DEBUG_DUMPUNTIL + PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); +#endif + return node; +} + +#endif /* DEBUGGING */ + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * ex: set ts=8 sts=4 sw=4 et: + */ diff --git a/src/5021002/orig/regexec.c b/src/5021002/orig/regexec.c new file mode 100644 index 0000000..58b3f60 --- /dev/null +++ b/src/5021002/orig/regexec.c @@ -0,0 +1,8174 @@ +/* regexec.c + */ + +/* + * One Ring to rule them all, One Ring to find them + & + * [p.v of _The Lord of the Rings_, opening poem] + * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] + * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"] + */ + +/* This file contains functions for executing a regular expression. See + * also regcomp.c which funnily enough, contains functions for compiling + * a regular expression. + * + * This file is also copied at build time to ext/re/re_exec.c, where + * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. + * This causes the main functions to be compiled under new names and with + * debugging support added, which makes "use re 'debug'" work. + */ + +/* NOTE: this is derived from Henry Spencer's regexp code, and should not + * confused with the original package (see point 3 below). Thanks, Henry! + */ + +/* Additional note: this code is very heavily munged from Henry's version + * in places. In some spots I've traded clarity for efficiency, so don't + * blame Henry for some of the lack of readability. + */ + +/* The names of the functions have been changed from regcomp and + * regexec to pregcomp and pregexec in order to avoid conflicts + * with the POSIX routines of the same names. +*/ + +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +/* + * pregcomp and pregexec -- regsub and regerror are not used in perl + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. + * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * + **** Alterations to Henry's code are... + **** + **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + **** by Larry Wall and others + **** + **** You may distribute under the terms of either the GNU General Public + **** License or the Artistic License, as specified in the README file. + * + * Beware that some of this code is subtly aware of the way operator + * precedence is structured in regular expressions. Serious changes in + * regular-expression syntax might require a total rethink. + */ +#include "EXTERN.h" +#define PERL_IN_REGEXEC_C +#include "perl.h" + +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +#else +# include "regcomp.h" +#endif + +#include "inline_invlist.c" +#include "unicode_constants.h" + +#ifdef DEBUGGING +/* At least one required character in the target string is expressible only in + * UTF-8. */ +static const char* const non_utf8_target_but_utf8_required + = "Can't match, because target string needs to be in UTF-8\n"; +#endif + +#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ + goto target; \ +} STMT_END + +#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) + +#ifndef STATIC +#define STATIC static +#endif + +/* Valid only for non-utf8 strings: avoids the reginclass + * call if there are no complications: i.e., if everything matchable is + * straight forward in the bitmap */ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \ + : ANYOF_BITMAP_TEST(p,*(c))) + +/* + * Forwards. + */ + +#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) +#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b) + +#define HOPc(pos,off) \ + (char *)(reginfo->is_utf8_target \ + ? reghop3((U8*)pos, off, \ + (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ + : (U8*)(pos + off)) + +#define HOPBACKc(pos, off) \ + (char*)(reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \ + : (pos - off >= reginfo->strbeg) \ + ? (U8*)pos - off \ + : NULL) + +#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) +#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) + +/* lim must be +ve. Returns NULL on overshoot */ +#define HOPMAYBE3(pos,off,lim) \ + (reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ + : ((U8*)pos + off <= lim) \ + ? (U8*)pos + off \ + : NULL) + +/* like HOP3, but limits the result to <= lim even for the non-utf8 case. + * off must be >=0; args should be vars rather than expressions */ +#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ + ? reghop3((U8*)(pos), off, (U8*)(lim)) \ + : (U8*)((pos + off) > lim ? lim : (pos + off))) + +#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ + ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ + : (U8*)(pos + off)) +#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) + +#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ +#define NEXTCHR_IS_EOS (nextchr < 0) + +#define SET_nextchr \ + nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) + +#define SET_locinput(p) \ + locinput = (p); \ + SET_nextchr + + +#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \ + if (!swash_ptr) { \ + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \ + swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \ + 1, 0, invlist, &flags); \ + assert(swash_ptr); \ + } \ + } STMT_END + +/* If in debug mode, we test that a known character properly matches */ +#ifdef DEBUGGING +# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ + property_name, \ + invlist, \ + utf8_char_in_property) \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \ + assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); +#else +# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ + property_name, \ + invlist, \ + utf8_char_in_property) \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) +#endif + +#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ + PL_utf8_swash_ptrs[_CC_WORDCHAR], \ + "", \ + PL_XPosix_ptrs[_CC_WORDCHAR], \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); + +#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ + STMT_START { \ + LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ + "_X_regular_begin", \ + NULL, \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ + LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ + "_X_extend", \ + NULL, \ + COMBINING_GRAVE_ACCENT_UTF8); \ + } STMT_END + +#define PLACEHOLDER /* Something for the preprocessor to grab onto */ +/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ + +/* for use after a quantifier and before an EXACT-like node -- japhy */ +/* it would be nice to rework regcomp.sym to generate this stuff. sigh + * + * NOTE that *nothing* that affects backtracking should be in here, specifically + * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a + * node that is in between two EXACT like nodes when ascertaining what the required + * "follow" character is. This should probably be moved to regex compile time + * although it may be done at run time beause of the REF possibility - more + * investigation required. -- demerphq +*/ +#define JUMPABLE(rn) ( \ + OP(rn) == OPEN || \ + (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ + OP(rn) == EVAL || \ + OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ + OP(rn) == PLUS || OP(rn) == MINMOD || \ + OP(rn) == KEEPS || \ + (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ +) +#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) + +#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF ) + +#if 0 +/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so + we don't need this definition. */ +#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) + +#else +/* ... so we use this as its faster. */ +#define IS_TEXT(rn) ( OP(rn)==EXACT ) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) +#define IS_TEXTF(rn) ( OP(rn)==EXACTF ) +#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) + +#endif + +/* + Search for mandatory following text node; for lookahead, the text must + follow but for lookbehind (rn->flags != 0) we skip to the next step. +*/ +#define FIND_NEXT_IMPT(rn) STMT_START { \ + while (JUMPABLE(rn)) { \ + const OPCODE type = OP(rn); \ + if (type == SUSPEND || PL_regkind[type] == CURLY) \ + rn = NEXTOPER(NEXTOPER(rn)); \ + else if (type == PLUS) \ + rn = NEXTOPER(rn); \ + else if (type == IFMATCH) \ + rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ + else rn += NEXT_OFF(rn); \ + } \ +} STMT_END + +/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode. + * These are for the pre-composed Hangul syllables, which are all in a + * contiguous block and arranged there in such a way so as to facilitate + * alorithmic determination of their characteristics. As such, they don't need + * a swash, but can be determined by simple arithmetic. Almost all are + * GCB=LVT, but every 28th one is a GCB=LV */ +#define SBASE 0xAC00 /* Start of block */ +#define SCount 11172 /* Length of block */ +#define TCount 28 + +#define SLAB_FIRST(s) (&(s)->states[0]) +#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) + +static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); +static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); +static regmatch_state * S_push_slab(pTHX); + +#define REGCP_PAREN_ELEMS 3 +#define REGCP_OTHER_ELEMS 3 +#define REGCP_FRAME_ELEMS 1 +/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and + * are needed for the regexp context stack bookkeeping. */ + +STATIC CHECKPOINT +S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) +{ + const int retval = PL_savestack_ix; + const int paren_elems_to_push = + (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS; + const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; + const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; + I32 p; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCPPUSH; + + if (paren_elems_to_push < 0) + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u", + (int)paren_elems_to_push, (int)maxopenparen, + (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS); + + if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) + Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf + " out of range (%lu-%ld)", + total_elems, + (unsigned long)maxopenparen, + (long)parenfloor); + + SSGROW(total_elems + REGCP_FRAME_ELEMS); + + DEBUG_BUFFERS_r( + if ((int)maxopenparen > (int)parenfloor) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { +/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ + SSPUSHIV(rex->offs[p].end); + SSPUSHIV(rex->offs[p].start); + SSPUSHINT(rex->offs[p].start_tmp); + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", + (UV)p, + (IV)rex->offs[p].start, + (IV)rex->offs[p].start_tmp, + (IV)rex->offs[p].end + )); + } +/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ + SSPUSHINT(maxopenparen); + SSPUSHINT(rex->lastparen); + SSPUSHINT(rex->lastcloseparen); + SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ + + return retval; +} + +/* These are needed since we do not localize EVAL nodes: */ +#define REGCP_SET(cp) \ + DEBUG_STATE_r( \ + PerlIO_printf(Perl_debug_log, \ + " Setting an EVAL scope, savestack=%"IVdf"\n", \ + (IV)PL_savestack_ix)); \ + cp = PL_savestack_ix + +#define REGCP_UNWIND(cp) \ + DEBUG_STATE_r( \ + if (cp != PL_savestack_ix) \ + PerlIO_printf(Perl_debug_log, \ + " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \ + (IV)(cp), (IV)PL_savestack_ix)); \ + regcpblow(cp) + +#define UNWIND_PAREN(lp, lcp) \ + for (n = rex->lastparen; n > lp; n--) \ + rex->offs[n].end = -1; \ + rex->lastparen = n; \ + rex->lastcloseparen = lcp; + + +STATIC void +S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) +{ + UV i; + U32 paren; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCPPOP; + + /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ + i = SSPOPUV; + assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ + i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ + rex->lastcloseparen = SSPOPINT; + rex->lastparen = SSPOPINT; + *maxopenparen_p = SSPOPINT; + + i -= REGCP_OTHER_ELEMS; + /* Now restore the parentheses context. */ + DEBUG_BUFFERS_r( + if (i || rex->lastparen + 1 <= rex->nparens) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + paren = *maxopenparen_p; + for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { + SSize_t tmps; + rex->offs[paren].start_tmp = SSPOPINT; + rex->offs[paren].start = SSPOPIV; + tmps = SSPOPIV; + if (paren <= rex->lastparen) + rex->offs[paren].end = tmps; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n", + (UV)paren, + (IV)rex->offs[paren].start, + (IV)rex->offs[paren].start_tmp, + (IV)rex->offs[paren].end, + (paren > rex->lastparen ? "(skipped)" : "")); + ); + paren--; + } +#if 1 + /* It would seem that the similar code in regtry() + * already takes care of this, and in fact it is in + * a better location to since this code can #if 0-ed out + * but the code in regtry() is needed or otherwise tests + * requiring null fields (pat.t#187 and split.t#{13,14} + * (as of patchlevel 7877) will fail. Then again, + * this code seems to be necessary or otherwise + * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ + * --jhi updated by dapm */ + for (i = rex->lastparen + 1; i <= rex->nparens; i++) { + if (i > *maxopenparen_p) + rex->offs[i].start = -1; + rex->offs[i].end = -1; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %s ..-1 undeffing\n", + (UV)i, + (i > *maxopenparen_p) ? "-1" : " " + )); + } +#endif +} + +/* restore the parens and associated vars at savestack position ix, + * but without popping the stack */ + +STATIC void +S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p) +{ + I32 tmpix = PL_savestack_ix; + PL_savestack_ix = ix; + regcppop(rex, maxopenparen_p); + PL_savestack_ix = tmpix; +} + +#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ + +STATIC bool +S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) +{ + /* Returns a boolean as to whether or not 'character' is a member of the + * Posix character class given by 'classnum' that should be equivalent to a + * value in the typedef '_char_class_number'. + * + * Ideally this could be replaced by a just an array of function pointers + * to the C library functions that implement the macros this calls. + * However, to compile, the precise function signatures are required, and + * these may vary from platform to to platform. To avoid having to figure + * out what those all are on each platform, I (khw) am using this method, + * which adds an extra layer of function call overhead (unless the C + * optimizer strips it away). But we don't particularly care about + * performance with locales anyway. */ + + switch ((_char_class_number) classnum) { + case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character); + case _CC_ENUM_ALPHA: return isALPHA_LC(character); + case _CC_ENUM_ASCII: return isASCII_LC(character); + case _CC_ENUM_BLANK: return isBLANK_LC(character); + case _CC_ENUM_CASED: return isLOWER_LC(character) + || isUPPER_LC(character); + case _CC_ENUM_CNTRL: return isCNTRL_LC(character); + case _CC_ENUM_DIGIT: return isDIGIT_LC(character); + case _CC_ENUM_GRAPH: return isGRAPH_LC(character); + case _CC_ENUM_LOWER: return isLOWER_LC(character); + case _CC_ENUM_PRINT: return isPRINT_LC(character); + case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character); + case _CC_ENUM_PUNCT: return isPUNCT_LC(character); + case _CC_ENUM_SPACE: return isSPACE_LC(character); + case _CC_ENUM_UPPER: return isUPPER_LC(character); + case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character); + case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character); + default: /* VERTSPACE should never occur in locales */ + Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum); + } + + assert(0); /* NOTREACHED */ + return FALSE; +} + +STATIC bool +S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) +{ + /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded + * 'character' is a member of the Posix character class given by 'classnum' + * that should be equivalent to a value in the typedef + * '_char_class_number'. + * + * This just calls isFOO_lc on the code point for the character if it is in + * the range 0-255. Outside that range, all characters avoid Unicode + * rules, ignoring any locale. So use the Unicode function if this class + * requires a swash, and use the Unicode macro otherwise. */ + + PERL_ARGS_ASSERT_ISFOO_UTF8_LC; + + if (UTF8_IS_INVARIANT(*character)) { + return isFOO_lc(classnum, *character); + } + else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { + return isFOO_lc(classnum, + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); + } + + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Initialize the swash unless done already */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + + return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) + character, + TRUE /* is UTF */ )); + } + + switch ((_char_class_number) classnum) { + case _CC_ENUM_SPACE: + case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character); + + case _CC_ENUM_BLANK: return is_HORIZWS_high(character); + case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character); + case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character); + default: return 0; /* Things like CNTRL are always + below 256 */ + } + + assert(0); /* NOTREACHED */ + return FALSE; +} + +/* + * pregexec and friends + */ + +#ifndef PERL_IN_XSUB_RE +/* + - pregexec - match a regexp against a string + */ +I32 +Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, + char *strbeg, SSize_t minend, SV *screamer, U32 nosave) +/* stringarg: the point in the string at which to begin matching */ +/* strend: pointer to null at end of string */ +/* strbeg: real beginning of string */ +/* minend: end of match must be >= minend bytes after stringarg. */ +/* screamer: SV being matched: only used for utf8 flag, pos() etc; string + * itself is accessed via the pointers above */ +/* nosave: For optimizations. */ +{ + PERL_ARGS_ASSERT_PREGEXEC; + + return + regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, + nosave ? 0 : REXEC_COPY_STR); +} +#endif + + + +/* re_intuit_start(): + * + * Based on some optimiser hints, try to find the earliest position in the + * string where the regex could match. + * + * rx: the regex to match against + * sv: the SV being matched: only used for utf8 flag; the string + * itself is accessed via the pointers below. Note that on + * something like an overloaded SV, SvPOK(sv) may be false + * and the string pointers may point to something unrelated to + * the SV itself. + * strbeg: real beginning of string + * strpos: the point in the string at which to begin matching + * strend: pointer to the byte following the last char of the string + * flags currently unused; set to 0 + * data: currently unused; set to NULL + * + * The basic idea of re_intuit_start() is to use some known information + * about the pattern, namely: + * + * a) the longest known anchored substring (i.e. one that's at a + * constant offset from the beginning of the pattern; but not + * necessarily at a fixed offset from the beginning of the + * string); + * b) the longest floating substring (i.e. one that's not at a constant + * offset from the beginning of the pattern); + * c) Whether the pattern is anchored to the string; either + * an absolute anchor: /^../, or anchored to \n: /^.../m, + * or anchored to pos(): /\G/; + * d) A start class: a real or synthetic character class which + * represents which characters are legal at the start of the pattern; + * + * to either quickly reject the match, or to find the earliest position + * within the string at which the pattern might match, thus avoiding + * running the full NFA engine at those earlier locations, only to + * eventually fail and retry further along. + * + * Returns NULL if the pattern can't match, or returns the address within + * the string which is the earliest place the match could occur. + * + * The longest of the anchored and floating substrings is called 'check' + * and is checked first. The other is called 'other' and is checked + * second. The 'other' substring may not be present. For example, + * + * /(abc|xyz)ABC\d{0,3}DEFG/ + * + * will have + * + * check substr (float) = "DEFG", offset 6..9 chars + * other substr (anchored) = "ABC", offset 3..3 chars + * stclass = [ax] + * + * Be aware that during the course of this function, sometimes 'anchored' + * refers to a substring being anchored relative to the start of the + * pattern, and sometimes to the pattern itself being anchored relative to + * the string. For example: + * + * /\dabc/: "abc" is anchored to the pattern; + * /^\dabc/: "abc" is anchored to the pattern and the string; + * /\d+abc/: "abc" is anchored to neither the pattern nor the string; + * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, + * but the pattern is anchored to the string. + */ + +char * +Perl_re_intuit_start(pTHX_ + REGEXP * const rx, + SV *sv, + const char * const strbeg, + char *strpos, + char *strend, + const U32 flags, + re_scream_pos_data *data) +{ + struct regexp *const prog = ReANY(rx); + SSize_t start_shift = prog->check_offset_min; + /* Should be nonnegative! */ + SSize_t end_shift = 0; + /* current lowest pos in string where the regex can start matching */ + char *rx_origin = strpos; + SV *check; + const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ + U8 other_ix = 1 - prog->substrs->check_ix; + bool ml_anch = 0; + char *other_last = strpos;/* latest pos 'other' substr already checked to */ + char *check_at = NULL; /* check substr found at this pos */ + const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; + RXi_GET_DECL(prog,progi); + regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ + regmatch_info *const reginfo = ®info_buf; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_START; + PERL_UNUSED_ARG(flags); + PERL_UNUSED_ARG(data); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: trying to determine minimum start position...\n")); + + /* for now, assume that all substr offsets are positive. If at some point + * in the future someone wants to do clever things with look-behind and + * -ve offsets, they'll need to fix up any code in this function + * which uses these offsets. See the thread beginning + * <20140113145929.GF27210@iabyn.com> + */ + assert(prog->substrs->data[0].min_offset >= 0); + assert(prog->substrs->data[0].max_offset >= 0); + assert(prog->substrs->data[1].min_offset >= 0); + assert(prog->substrs->data[1].max_offset >= 0); + assert(prog->substrs->data[2].min_offset >= 0); + assert(prog->substrs->data[2].max_offset >= 0); + + /* for now, assume that if both present, that the floating substring + * doesn't start before the anchored substring. + * If you break this assumption (e.g. doing better optimisations + * with lookahead/behind), then you'll need to audit the code in this + * function carefully first + */ + assert( + ! ( (prog->anchored_utf8 || prog->anchored_substr) + && (prog->float_utf8 || prog->float_substr)) + || (prog->float_min_offset >= prog->anchored_offset)); + + /* byte rather than char calculation for efficiency. It fails + * to quickly reject some cases that can't match, but will reject + * them later after doing full char arithmetic */ + if (prog->minlen > strend - strpos) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too short...\n")); + goto fail; + } + + reginfo->is_utf8_target = cBOOL(utf8_target); + reginfo->info_aux = NULL; + reginfo->strbeg = strbeg; + reginfo->strend = strend; + reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); + reginfo->intuit = 1; + /* not actually used within intuit, but zero for safety anyway */ + reginfo->poscache_maxiter = 0; + + if (utf8_target) { + if (!prog->check_utf8 && prog->check_substr) + to_utf8_substr(prog); + check = prog->check_utf8; + } else { + if (!prog->check_substr && prog->check_utf8) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); + } + } + check = prog->check_substr; + } + + /* dump the various substring data */ + DEBUG_OPTIMISE_MORE_r({ + int i; + for (i=0; i<=2; i++) { + SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr + : prog->substrs->data[i].substr); + if (!sv) + continue; + + PerlIO_printf(Perl_debug_log, + " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf + " useful=%"IVdf" utf8=%d [%s]\n", + i, + (IV)prog->substrs->data[i].min_offset, + (IV)prog->substrs->data[i].max_offset, + (IV)prog->substrs->data[i].end_shift, + BmUSEFUL(sv), + utf8_target ? 1 : 0, + SvPEEK(sv)); + } + }); + + if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ + + /* ml_anch: check after \n? + * + * A note about IMPLICIT: on an un-anchored pattern beginning + * with /.*.../, these flags will have been added by the + * compiler: + * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL + * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL + */ + ml_anch = (prog->intflags & PREGf_ANCH_MBOL) + && !(prog->intflags & PREGf_IMPLICIT); + + if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { + /* we are only allowed to match at BOS or \G */ + + /* trivially reject if there's a BOS anchor and we're not at BOS. + * + * Note that we don't try to do a similar quick reject for + * \G, since generally the caller will have calculated strpos + * based on pos() and gofs, so the string is already correctly + * anchored by definition; and handling the exceptions would + * be too fiddly (e.g. REXEC_IGNOREPOS). + */ + if ( strpos != strbeg + && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Not at start...\n")); + goto fail; + } + + /* in the presence of an anchor, the anchored (relative to the + * start of the regex) substr must also be anchored relative + * to strpos. So quickly reject if substr isn't found there. + * This works for \G too, because the caller will already have + * subtracted gofs from pos, and gofs is the offset from the + * \G to the start of the regex. For example, in /.abc\Gdef/, + * where substr="abcdef", pos()=3, gofs=4, offset_min=1: + * caller will have set strpos=pos()-4; we look for the substr + * at position pos()-4+1, which lines up with the "a" */ + + if (prog->check_offset_min == prog->check_offset_max + && !(prog->intflags & PREGf_CANY_SEEN)) + { + /* Substring at constant offset from beg-of-str... */ + SSize_t slen = SvCUR(check); + char *s = HOP3c(strpos, prog->check_offset_min, strend); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Looking for check substr at fixed offset %"IVdf"...\n", + (IV)prog->check_offset_min)); + + if (SvTAIL(check)) { + /* In this case, the regex is anchored at the end too. + * Unless it's a multiline match, the lengths must match + * exactly, give or take a \n. NB: slen >= 1 since + * the last char of check is \n */ + if (!multiline + && ( strend - s > slen + || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n'))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too long...\n")); + goto fail_finish; + } + /* Now should match s[0..slen-2] */ + slen--; + } + if (slen && (*SvPVX_const(check) != *s + || (slen > 1 && memNE(SvPVX_const(check), s, slen)))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String not equal...\n")); + goto fail_finish; + } + + check_at = s; + goto success_at_start; + } + } + } + + end_shift = prog->check_end_shift; + +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ + if (end_shift < 0) + Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", + (IV)end_shift, RX_PRECOMP(prog)); +#endif + + restart: + + /* This is the (re)entry point of the main loop in this function. + * The goal of this loop is to: + * 1) find the "check" substring in the region rx_origin..strend + * (adjusted by start_shift / end_shift). If not found, reject + * immediately. + * 2) If it exists, look for the "other" substr too if defined; for + * example, if the check substr maps to the anchored substr, then + * check the floating substr, and vice-versa. If not found, go + * back to (1) with rx_origin suitably incremented. + * 3) If we find an rx_origin position that doesn't contradict + * either of the substrings, then check the possible additional + * constraints on rx_origin of /^.../m or a known start class. + * If these fail, then depending on which constraints fail, jump + * back to here, or to various other re-entry points further along + * that skip some of the first steps. + * 4) If we pass all those tests, update the BmUSEFUL() count on the + * substring. If the start position was determined to be at the + * beginning of the string - so, not rejected, but not optimised, + * since we have to run regmatch from position 0 - decrement the + * BmUSEFUL() count. Otherwise increment it. + */ + + + /* first, look for the 'check' substring */ + + { + U8* start_point; + U8* end_point; + + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, + " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf + " Start shift: %"IVdf" End shift %"IVdf + " Real end Shift: %"IVdf"\n", + (IV)(rx_origin - strpos), + (IV)prog->check_offset_min, + (IV)start_shift, + (IV)end_shift, + (IV)prog->check_end_shift); + }); + + if (prog->intflags & PREGf_CANY_SEEN) { + start_point= (U8*)(rx_origin + start_shift); + end_point= (U8*)(strend - end_shift); + if (start_point > end_point) + goto fail_finish; + } else { + end_point = HOP3(strend, -end_shift, strbeg); + start_point = HOPMAYBE3(rx_origin, start_shift, end_point); + if (!start_point) + goto fail_finish; + } + + + /* If the regex is absolutely anchored to either the start of the + * string (BOL,SBOL) or to pos() (ANCH_GPOS), then + * check_offset_max represents an upper bound on the string where + * the substr could start. For the ANCH_GPOS case, we assume that + * the caller of intuit will have already set strpos to + * pos()-gofs, so in this case strpos + offset_max will still be + * an upper bound on the substr. + */ + if (!ml_anch + && prog->intflags & PREGf_ANCH + && prog->check_offset_max != SSize_t_MAX) + { + SSize_t len = SvCUR(check) - !!SvTAIL(check); + const char * const anchor = + (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); + + /* do a bytes rather than chars comparison. It's conservative; + * so it skips doing the HOP if the result can't possibly end + * up earlier than the old value of end_point. + */ + if ((char*)end_point - anchor > prog->check_offset_max) { + end_point = HOP3lim((U8*)anchor, + prog->check_offset_max, + end_point -len) + + len; + } + } + + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n", + (int)(end_point - start_point), + (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), + start_point); + }); + + check_at = fbm_instr( start_point, end_point, + check, multiline ? FBMrf_MULTILINE : 0); + + /* Update the count-of-usability, remove useless subpatterns, + unshift s. */ + + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(check), RE_SV_DUMPLEN(check), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s", + (check_at ? "Found" : "Did not find"), + (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) + ? "anchored" : "floating"), + quoted, + RE_SV_TAIL(check), + (check_at ? " at offset " : "...\n") ); + }); + + if (!check_at) + goto fail_finish; + /* Finish the diagnostic message */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) ); + + /* set rx_origin to the minimum position where the regex could start + * matching, given the constraint of the just-matched check substring. + * But don't set it lower than previously. + */ + + if (check_at - rx_origin > prog->check_offset_max) + rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); + } + + + /* now look for the 'other' substring if defined */ + + if (utf8_target ? prog->substrs->data[other_ix].utf8_substr + : prog->substrs->data[other_ix].substr) + { + /* Take into account the "other" substring. */ + char *last, *last1; + char *s; + SV* must; + struct reg_substr_datum *other; + + do_other_substr: + other = &prog->substrs->data[other_ix]; + + /* if "other" is anchored: + * we've previously found a floating substr starting at check_at. + * This means that the regex origin must lie somewhere + * between min (rx_origin): HOP3(check_at, -check_offset_max) + * and max: HOP3(check_at, -check_offset_min) + * (except that min will be >= strpos) + * So the fixed substr must lie somewhere between + * HOP3(min, anchored_offset) + * HOP3(max, anchored_offset) + SvCUR(substr) + */ + + /* if "other" is floating + * Calculate last1, the absolute latest point where the + * floating substr could start in the string, ignoring any + * constraints from the earlier fixed match. It is calculated + * as follows: + * + * strend - prog->minlen (in chars) is the absolute latest + * position within the string where the origin of the regex + * could appear. The latest start point for the floating + * substr is float_min_offset(*) on from the start of the + * regex. last1 simply combines thee two offsets. + * + * (*) You might think the latest start point should be + * float_max_offset from the regex origin, and technically + * you'd be correct. However, consider + * /a\d{2,4}bcd\w/ + * Here, float min, max are 3,5 and minlen is 7. + * This can match either + * /a\d\dbcd\w/ + * /a\d\d\dbcd\w/ + * /a\d\d\d\dbcd\w/ + * In the first case, the regex matches minlen chars; in the + * second, minlen+1, in the third, minlen+2. + * In the first case, the floating offset is 3 (which equals + * float_min), in the second, 4, and in the third, 5 (which + * equals float_max). In all cases, the floating string bcd + * can never start more than 4 chars from the end of the + * string, which equals minlen - float_min. As the substring + * starts to match more than float_min from the start of the + * regex, it makes the regex match more than minlen chars, + * and the two cancel each other out. So we can always use + * float_min - minlen, rather than float_max - minlen for the + * latest position in the string. + * + * Note that -minlen + float_min_offset is equivalent (AFAIKT) + * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift + */ + + assert(prog->minlen >= other->min_offset); + last1 = HOP3c(strend, + other->min_offset - prog->minlen, strbeg); + + if (other_ix) {/* i.e. if (other-is-float) */ + /* last is the latest point where the floating substr could + * start, *given* any constraints from the earlier fixed + * match. This constraint is that the floating string starts + * <= float_max_offset chars from the regex origin (rx_origin). + * If this value is less than last1, use it instead. + */ + assert(rx_origin <= last1); + last = + /* this condition handles the offset==infinity case, and + * is a short-cut otherwise. Although it's comparing a + * byte offset to a char length, it does so in a safe way, + * since 1 char always occupies 1 or more bytes, + * so if a string range is (last1 - rx_origin) bytes, + * it will be less than or equal to (last1 - rx_origin) + * chars; meaning it errs towards doing the accurate HOP3 + * rather than just using last1 as a short-cut */ + (last1 - rx_origin) < other->max_offset + ? last1 + : (char*)HOP3lim(rx_origin, other->max_offset, last1); + } + else { + assert(strpos + start_shift <= check_at); + last = HOP4c(check_at, other->min_offset - start_shift, + strbeg, strend); + } + + s = HOP3c(rx_origin, other->min_offset, strend); + if (s < other_last) /* These positions already checked */ + s = other_last; + + must = utf8_target ? other->utf8_substr : other->substr; + assert(SvPOK(must)); + s = fbm_instr( + (unsigned char*)s, + (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0), + must, + multiline ? FBMrf_MULTILINE : 0 + ); + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s", + s ? "Found" : "Contradicts", + other_ix ? "floating" : "anchored", + quoted, RE_SV_TAIL(must)); + }); + + + if (!s) { + /* last1 is latest possible substr location. If we didn't + * find it before there, we never will */ + if (last >= last1) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + + /* try to find the check substr again at a later + * position. Maybe next time we'll find the "other" substr + * in range too */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", trying %s at offset %ld...\n", + (other_ix ? "floating" : "anchored"), + (long)(HOP3c(check_at, 1, strend) - strpos))); + + other_last = HOP3c(last, 1, strend) /* highest failure */; + rx_origin = + other_ix /* i.e. if other-is-float */ + ? HOP3c(rx_origin, 1, strend) + : HOP4c(last, 1 - other->min_offset, strbeg, strend); + goto restart; + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + + if (other_ix) { /* if (other-is-float) */ + /* other_last is set to s, not s+1, since its possible for + * a floating substr to fail first time, then succeed + * second time at the same floating position; e.g.: + * "-AB--AABZ" =~ /\wAB\d*Z/ + * The first time round, anchored and float match at + * "-(AB)--AAB(Z)" then fail on the initial \w character + * class. Second time round, they match at "-AB--A(AB)(Z)". + */ + other_last = s; + } + else { + rx_origin = HOP3c(s, -other->min_offset, strbeg); + other_last = HOP3c(s, 1, strend); + } + } + } + else { + DEBUG_OPTIMISE_MORE_r( + PerlIO_printf(Perl_debug_log, + " Check-only match: offset min:%"IVdf" max:%"IVdf + " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf + " strend-strpos:%"IVdf"\n", + (IV)prog->check_offset_min, + (IV)prog->check_offset_max, + (IV)(check_at-strpos), + (IV)(rx_origin-strpos), + (IV)(rx_origin-check_at), + (IV)(strend-strpos) + ) + ); + } + + postprocess_substr_matches: + + /* handle the extra constraint of /^.../m if present */ + + if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { + char *s; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for /^/m anchor")); + + /* we have failed the constraint of a \n before rx_origin. + * Find the next \n, if any, even if it's beyond the current + * anchored and/or floating substrings. Whether we should be + * scanning ahead for the next \n or the next substr is debatable. + * On the one hand you'd expect rare substrings to appear less + * often than \n's. On the other hand, searching for \n means + * we're effectively flipping been check_substr and "\n" on each + * iteration as the current "rarest" string candidate, which + * means for example that we'll quickly reject the whole string if + * hasn't got a \n, rather than trying every substr position + * first + */ + + s = HOP3c(strend, - prog->minlen, strpos); + if (s <= rx_origin || + ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Did not find /%s^%s/m...\n", + PL_colors[0], PL_colors[1])); + goto fail_finish; + } + + /* earliest possible origin is 1 char after the \n. + * (since *rx_origin == '\n', it's safe to ++ here rather than + * HOP(rx_origin, 1)) */ + rx_origin++; + + if (prog->substrs->check_ix == 0 /* check is anchored */ + || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) + { + /* Position contradicts check-string; either because + * check was anchored (and thus has no wiggle room), + * or check was float and rx_origin is above the float range */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + goto restart; + } + + /* if we get here, the check substr must have been float, + * is in range, and we may or may not have had an anchored + * "other" substr which still contradicts */ + assert(prog->substrs->check_ix); /* check is float */ + + if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { + /* whoops, the anchored "other" substr exists, so we still + * contradict. On the other hand, the float "check" substr + * didn't contradict, so just retry the anchored "other" + * substr */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos), + (long)(rx_origin - strpos + prog->anchored_offset))); + goto do_other_substr; + } + + /* success: we don't contradict the found floating substring + * (and there's no anchored substr). */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " (multiline anchor test skipped)\n")); + } + + success_at_start: + + + /* if we have a starting character class, then test that extra constraint. + * (trie stclasses are too expensive to use here, we are better off to + * leave it to regmatch itself) */ + + if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { + const U8* const str = (U8*)STRING(progi->regstclass); + + /* XXX this value could be pre-computed */ + const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT + ? (reginfo->is_utf8_pat + ? utf8_distance(str + STR_LEN(progi->regstclass), str) + : STR_LEN(progi->regstclass)) + : 1); + char * endpos; + char *s; + /* latest pos that a matching float substr constrains rx start to */ + char *rx_max_float = NULL; + + /* if the current rx_origin is anchored, either by satisfying an + * anchored substring constraint, or a /^.../m constraint, then we + * can reject the current origin if the start class isn't found + * at the current position. If we have a float-only match, then + * rx_origin is constrained to a range; so look for the start class + * in that range. if neither, then look for the start class in the + * whole rest of the string */ + + /* XXX DAPM it's not clear what the minlen test is for, and why + * it's not used in the floating case. Nothing in the test suite + * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. + * Here are some old comments, which may or may not be correct: + * + * minlen == 0 is possible if regstclass is \b or \B, + * and the fixed substr is ''$. + * Since minlen is already taken into account, rx_origin+1 is + * before strend; accidentally, minlen >= 1 guaranties no false + * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : + * 0) below assumes that regstclass does not come from lookahead... + * If regstclass takes bytelength more than 1: If charlength==1, OK. + * This leaves EXACTF-ish only, which are dealt with in + * find_byclass(). + */ + + if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) + endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend); + else if (prog->float_substr || prog->float_utf8) { + rx_max_float = HOP3c(check_at, -start_shift, strbeg); + endpos= HOP3c(rx_max_float, cl_l, strend); + } + else + endpos= strend; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for class: start_shift: %"IVdf" check_at: %"IVdf + " rx_origin: %"IVdf" endpos: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), + (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); + + s = find_byclass(prog, progi->regstclass, rx_origin, endpos, + reginfo); + if (!s) { + if (endpos == strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); + goto fail; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " This position contradicts STCLASS...\n") ); + if ((prog->intflags & PREGf_ANCH) && !ml_anch + && !(prog->intflags & PREGf_IMPLICIT)) + goto fail; + + /* Contradict one of substrings */ + if (prog->anchored_substr || prog->anchored_utf8) { + if (prog->substrs->check_ix == 1) { /* check is float */ + /* Have both, check_string is floating */ + assert(rx_origin + start_shift <= check_at); + if (rx_origin + start_shift != check_at) { + /* not at latest position float substr could match: + * Recheck anchored substring, but not floating. + * The condition above is in bytes rather than + * chars for efficiency. It's conservative, in + * that it errs on the side of doing 'goto + * do_other_substr', where a more accurate + * char-based calculation will be done */ + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for anchored substr starting at offset %ld...\n", + (long)(other_last - strpos)) ); + goto do_other_substr; + } + } + } + else { + /* float-only */ + + if (ml_anch) { + /* In the presence of ml_anch, we might be able to + * find another \n without breaking the current float + * constraint. */ + + /* strictly speaking this should be HOP3c(..., 1, ...), + * but since we goto a block of code that's going to + * search for the next \n if any, its safe here */ + rx_origin++; + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for /%s^%s/m starting at offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos)) ); + goto postprocess_substr_matches; + } + + /* strictly speaking this can never be true; but might + * be if we ever allow intuit without substrings */ + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + goto fail; + + rx_origin = rx_max_float; + } + + /* at this point, any matching substrings have been + * contradicted. Start again... */ + + rx_origin = HOP3c(rx_origin, 1, strend); + + /* uses bytes rather than char calculations for efficiency. + * It's conservative: it errs on the side of doing 'goto restart', + * where there is code that does a proper char-based test */ + if (rx_origin + start_shift + end_shift > strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); + goto fail; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for %s substr starting at offset %ld...\n", + (prog->substrs->check_ix ? "floating" : "anchored"), + (long)(rx_origin + start_shift - strpos)) ); + goto restart; + } + + /* Success !!! */ + + if (rx_origin != s) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " By STCLASS: moving %ld --> %ld\n", + (long)(rx_origin - strpos), (long)(s - strpos)) + ); + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Does not contradict STCLASS...\n"); + ); + } + } + + /* Decide whether using the substrings helped */ + + if (rx_origin != strpos) { + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n")); + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + } + else { + /* The found rx_origin position does not prohibit matching at + * strpos, so calling intuit didn't gain us anything. Decrement + * the BmUSEFUL() count on the check substring, and if we reach + * zero, free it. */ + if (!(prog->intflags & PREGf_NAUGHTY) + && (utf8_target ? ( + prog->check_utf8 /* Could be deleted already */ + && --BmUSEFUL(prog->check_utf8) < 0 + && (prog->check_utf8 == prog->float_utf8) + ) : ( + prog->check_substr /* Could be deleted already */ + && --BmUSEFUL(prog->check_substr) < 0 + && (prog->check_substr == prog->float_substr) + ))) + { + /* If flags & SOMETHING - do not do it many times on the same match */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n")); + /* XXX Does the destruction order has to change with utf8_target? */ + SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); + SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); + prog->check_substr = prog->check_utf8 = NULL; /* disable */ + prog->float_substr = prog->float_utf8 = NULL; /* clear */ + check = NULL; /* abort */ + /* XXXX This is a remnant of the old implementation. It + looks wasteful, since now INTUIT can use many + other heuristics. */ + prog->extflags &= ~RXf_USE_INTUIT; + } + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) ); + + return rx_origin; + + fail_finish: /* Substring not found */ + if (prog->check_substr || prog->check_utf8) /* could be removed already */ + BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ + fail: + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", + PL_colors[4], PL_colors[5])); + return NULL; +} + + +#define DECL_TRIE_TYPE(scan) \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ + trie_type = ((scan->flags == EXACT) \ + ? (utf8_target ? trie_utf8 : trie_plain) \ + : (scan->flags == EXACTFA) \ + ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + +#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ +STMT_START { \ + STRLEN skiplen; \ + U8 flags = FOLD_FLAGS_FULL; \ + switch (trie_type) { \ + case trie_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALLTHROUGH */ \ + case trie_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ + len = UTF8SKIP(uc); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_latin_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALLTHROUGH */ \ + case trie_latin_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_utf8: \ + uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + break; \ + case trie_plain: \ + uvc = (UV)*uc; \ + len = 1; \ + } \ + if (uvc < 256) { \ + charid = trie->charmap[ uvc ]; \ + } \ + else { \ + charid = 0; \ + if (widecharmap) { \ + SV** const svpp = hv_fetch(widecharmap, \ + (char*)&uvc, sizeof(UV), 0); \ + if (svpp) \ + charid = (U16)SvIV(*svpp); \ + } \ + } \ +} STMT_END + +#define DUMP_EXEC_POS(li,s,doutf8) \ + dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ + startpos, doutf8) + +#define REXEC_FBC_EXACTISH_SCAN(COND) \ +STMT_START { \ + while (s <= e) { \ + if ( (COND) \ + && (ln == 1 || folder(s, pat_string, ln)) \ + && (reginfo->intuit || regtry(reginfo, &s)) )\ + goto got_it; \ + s++; \ + } \ +} STMT_END + +#define REXEC_FBC_UTF8_SCAN(CODE) \ +STMT_START { \ + while (s < strend) { \ + CODE \ + s += UTF8SKIP(s); \ + } \ +} STMT_END + +#define REXEC_FBC_SCAN(CODE) \ +STMT_START { \ + while (s < strend) { \ + CODE \ + s++; \ + } \ +} STMT_END + +#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \ +REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \ + if (COND) { \ + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \ + goto got_it; \ + else \ + tmp = doevery; \ + } \ + else \ + tmp = 1; \ +) + +#define REXEC_FBC_CLASS_SCAN(COND) \ +REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ + if (COND) { \ + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \ + goto got_it; \ + else \ + tmp = doevery; \ + } \ + else \ + tmp = 1; \ +) + +#define REXEC_FBC_CSCAN(CONDUTF8,COND) \ + if (utf8_target) { \ + REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \ + } \ + else { \ + REXEC_FBC_CLASS_SCAN(COND); \ + } + +/* The three macros below are slightly different versions of the same logic. + * + * The first is for /a and /aa when the target string is UTF-8. This can only + * match ascii, but it must advance based on UTF-8. The other two handle the + * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking + * for the boundary (or non-boundary) between a word and non-word character. + * The utf8 and non-utf8 cases have the same logic, but the details must be + * different. Find the "wordness" of the character just prior to this one, and + * compare it with the wordness of this one. If they differ, we have a + * boundary. At the beginning of the string, pretend that the previous + * character was a new-line. + * + * All these macros uncleanly have side-effects with each other and outside + * variables. So far it's been too much trouble to clean-up + * + * TEST_NON_UTF8 is the macro or function to call to test if its byte input is + * a word character or not. + * IF_SUCCESS is code to do if it finds that we are at a boundary between + * word/non-word + * IF_FAIL is code to do if we aren't at a boundary between word/non-word + * + * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we + * are looking for a boundary or for a non-boundary. If we are looking for a + * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and + * see if this tentative match actually works, and if so, to quit the loop + * here. And vice-versa if we are looking for a non-boundary. + * + * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and + * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of + * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be + * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal + * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that + * complement. But in that branch we complement tmp, meaning that at the + * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s), + * which means at the top of the loop in the next iteration, it is + * TEST_NON_UTF8(s-1) */ +#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + tmp = !tmp; \ + IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + +/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and + * TEST_UTF8 is a macro that for the same input code points returns identically + * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */ +#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \ + if (s == reginfo->strbeg) { \ + tmp = '\n'; \ + } \ + else { /* Back-up to the start of the previous character */ \ + U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ + tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ + 0, UTF8_ALLOW_DEFAULT); \ + } \ + tmp = TEST_UV(tmp); \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ + REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ + if (tmp == ! (TEST_UTF8((U8 *) s))) { \ + tmp = !tmp; \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); + +/* Like the above two macros. UTF8_CODE is the complete code for handling + * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc + * macros below */ +#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + if (utf8_target) { \ + UTF8_CODE \ + } \ + else { /* Not utf8 */ \ + tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_SCAN( /* advances s while s < strend */ \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + IF_SUCCESS; \ + tmp = !tmp; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + } \ + /* Here, things have been set up by the previous code so that tmp is the \ + * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \ + * utf8ness of the target). We also have to check if this matches against \ + * the EOS, which we treat as a \n (which is the same value in both UTF-8 \ + * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \ + * string */ \ + if (tmp == ! TEST_NON_UTF8('\n')) { \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } + +/* This is the macro to use when we want to see if something that looks like it + * could match, actually does, and if so exits the loop */ +#define REXEC_FBC_TRYIT \ + if ((reginfo->intuit || regtry(reginfo, &s))) \ + goto got_it + +/* The only difference between the BOUND and NBOUND cases is that + * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in + * NBOUND. This is accomplished by passing it as either the if or else clause, + * with the other one being empty (PLACEHOLDER is defined as empty). + * + * The TEST_FOO parameters are for operating on different forms of input, but + * all should be ones that return identically for the same underlying code + * points */ +#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ + TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ + TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ + TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + +#define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ + TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + + +/* We know what class REx starts with. Try to find this position... */ +/* if reginfo->intuit, its a dryrun */ +/* annoyingly all the vars in this routine have different names from their counterparts + in regmatch. /grrr */ +STATIC char * +S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, + const char *strend, regmatch_info *reginfo) +{ + dVAR; + const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; + char *pat_string; /* The pattern's exactish string */ + char *pat_end; /* ptr to end char of pat_string */ + re_fold_t folder; /* Function for computing non-utf8 folds */ + const U8 *fold_array; /* array for folding ords < 256 */ + STRLEN ln; + STRLEN lnc; + U8 c1; + U8 c2; + char *e; + I32 tmp = 1; /* Scratch variable? */ + const bool utf8_target = reginfo->is_utf8_target; + UV utf8_fold_flags = 0; + const bool is_utf8_pat = reginfo->is_utf8_pat; + bool to_complement = FALSE; /* Invert the result? Taking the xor of this + with a result inverts that result, as 0^1 = + 1 and 1^1 = 0 */ + _char_class_number classnum; + + RXi_GET_DECL(prog,progi); + + PERL_ARGS_ASSERT_FIND_BYCLASS; + + /* We know what class it must start with. */ + switch (OP(c)) { + case ANYOF: + if (utf8_target) { + REXEC_FBC_UTF8_CLASS_SCAN( + reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); + } + else { + REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); + } + break; + case CANY: + REXEC_FBC_SCAN( + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) + goto got_it; + else + tmp = doevery; + ); + break; + + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + if (is_utf8_pat || utf8_target) { + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf_utf8; + } + fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */ + folder = foldEQ_latin1; /* /a, except the sharp s one which */ + goto do_exactf_non_utf8; /* isn't dealt with by these */ + + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + if (utf8_target) { + utf8_fold_flags = 0; + goto do_exactf_utf8; + } + fold_array = PL_fold; + folder = foldEQ; + goto do_exactf_non_utf8; + + case EXACTFL: + if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_exactf_utf8; + } + fold_array = PL_fold_locale; + folder = foldEQ_locale; + goto do_exactf_non_utf8; + + case EXACTFU_SS: + if (is_utf8_pat) { + utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; + } + goto do_exactf_utf8; + + case EXACTFU: + if (is_utf8_pat || utf8_target) { + utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; + goto do_exactf_utf8; + } + + /* Any 'ss' in the pattern should have been replaced by regcomp, + * so we don't have to worry here about this single special case + * in the Latin1 range */ + fold_array = PL_fold_latin1; + folder = foldEQ_latin1; + + /* FALLTHROUGH */ + + do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there + are no glitches with fold-length differences + between the target string and pattern */ + + /* The idea in the non-utf8 EXACTF* cases is to first find the + * first character of the EXACTF* node and then, if necessary, + * case-insensitively compare the full text of the node. c1 is the + * first character. c2 is its fold. This logic will not work for + * Unicode semantics and the german sharp ss, which hence should + * not be compiled into a node that gets here. */ + pat_string = STRING(c); + ln = STR_LEN(c); /* length to match in octets/bytes */ + + /* We know that we have to match at least 'ln' bytes (which is the + * same as characters, since not utf8). If we have to match 3 + * characters, and there are only 2 availabe, we know without + * trying that it will fail; so don't start a match past the + * required minimum number from the far end */ + e = HOP3c(strend, -((SSize_t)ln), s); + + if (reginfo->intuit && e < s) { + e = s; /* Due to minlen logic of intuit() */ + } + + c1 = *pat_string; + c2 = fold_array[c1]; + if (c1 == c2) { /* If char and fold are the same */ + REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1); + } + else { + REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2); + } + break; + + do_exactf_utf8: + { + unsigned expansion; + + /* If one of the operands is in utf8, we can't use the simpler folding + * above, due to the fact that many different characters can have the + * same fold, or portion of a fold, or different- length fold */ + pat_string = STRING(c); + ln = STR_LEN(c); /* length to match in octets/bytes */ + pat_end = pat_string + ln; + lnc = is_utf8_pat /* length to match in characters */ + ? utf8_length((U8 *) pat_string, (U8 *) pat_end) + : ln; + + /* We have 'lnc' characters to match in the pattern, but because of + * multi-character folding, each character in the target can match + * up to 3 characters (Unicode guarantees it will never exceed + * this) if it is utf8-encoded; and up to 2 if not (based on the + * fact that the Latin 1 folds are already determined, and the + * only multi-char fold in that range is the sharp-s folding to + * 'ss'. Thus, a pattern character can match as little as 1/3 of a + * string character. Adjust lnc accordingly, rounding up, so that + * if we need to match at least 4+1/3 chars, that really is 5. */ + expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; + lnc = (lnc + expansion - 1) / expansion; + + /* As in the non-UTF8 case, if we have to match 3 characters, and + * only 2 are left, it's guaranteed to fail, so don't start a + * match that would require us to go beyond the end of the string + */ + e = HOP3c(strend, -((SSize_t)lnc), s); + + if (reginfo->intuit && e < s) { + e = s; /* Due to minlen logic of intuit() */ + } + + /* XXX Note that we could recalculate e to stop the loop earlier, + * as the worst case expansion above will rarely be met, and as we + * go along we would usually find that e moves further to the left. + * This would happen only after we reached the point in the loop + * where if there were no expansion we should fail. Unclear if + * worth the expense */ + + while (s <= e) { + char *my_strend= (char *)strend; + if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, + pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags) + && (reginfo->intuit || regtry(reginfo, &s)) ) + { + goto got_it; + } + s += (utf8_target) ? UTF8SKIP(s) : 1; + } + break; + } + + case BOUNDL: + FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); + break; + case NBOUNDL: + FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); + break; + case BOUND: + FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case BOUNDA: + FBC_BOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A); + break; + case NBOUND: + FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case NBOUNDA: + FBC_NBOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A); + break; + case BOUNDU: + FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case NBOUNDU: + FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case LNBREAK: + REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend), + is_LNBREAK_latin1_safe(s, strend) + ); + break; + + /* The argument to all the POSIX node types is the class number to pass to + * _generic_isCC() to build a mask for searching in PL_charclass[] */ + + case NPOSIXL: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: + REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), + to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); + break; + + case NPOSIXD: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXD: + if (utf8_target) { + goto posix_utf8; + } + goto posixa; + + case NPOSIXA: + if (utf8_target) { + /* The complement of something that matches only ASCII matches all + * non-ASCII, plus everything in ASCII that isn't in the class. */ + REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s) + || ! _generic_isCC_A(*s, FLAGS(c))); + break; + } + + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXA: + posixa: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character. */ + REXEC_FBC_CLASS_SCAN( + to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c)))); + break; + + case NPOSIXU: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: + if (! utf8_target) { + REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s, + FLAGS(c)))); + } + else { + + posix_utf8: + classnum = (_char_class_number) FLAGS(c); + if (classnum < _FIRST_NON_SWASH_CC) { + while (s < strend) { + + /* We avoid loading in the swash as long as possible, but + * should we have to, we jump to a separate loop. This + * extra 'if' statement is what keeps this code from being + * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */ + if (UTF8_IS_ABOVE_LATIN1(*s)) { + goto found_above_latin1; + } + if ((UTF8_IS_INVARIANT(*s) + && to_complement ^ cBOOL(_generic_isCC((U8) *s, + classnum))) + || (UTF8_IS_DOWNGRADEABLE_START(*s) + && to_complement ^ cBOOL( + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), + classnum)))) + { + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) + goto got_it; + else { + tmp = doevery; + } + } + else { + tmp = 1; + } + s += UTF8SKIP(s); + } + } + else switch (classnum) { /* These classes are implemented as + macros */ + case _CC_ENUM_SPACE: /* XXX would require separate code if we + revert the change of \v matching this */ + /* FALLTHROUGH */ + + case _CC_ENUM_PSXSPC: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isSPACE_utf8(s))); + break; + + case _CC_ENUM_BLANK: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isBLANK_utf8(s))); + break; + + case _CC_ENUM_XDIGIT: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isXDIGIT_utf8(s))); + break; + + case _CC_ENUM_VERTSPACE: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isVERTWS_utf8(s))); + break; + + case _CC_ENUM_CNTRL: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isCNTRL_utf8(s))); + break; + + default: + Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum); + assert(0); /* NOTREACHED */ + } + } + break; + + found_above_latin1: /* Here we have to load a swash to get the result + for the current code point */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + + /* This is a copy of the loop above for swash classes, though using the + * FBC macro instead of being expanded out. Since we've loaded the + * swash, we don't have to check for that each time through the loop */ + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(_generic_utf8( + classnum, + s, + swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) s, TRUE)))); + break; + + case AHOCORASICKC: + case AHOCORASICK: + { + DECL_TRIE_TYPE(c); + /* what trie are we using right now */ + reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ]; + reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ]; + HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); + + const char *last_start = strend - trie->minlen; +#ifdef DEBUGGING + const char *real_start = s; +#endif + STRLEN maxlen = trie->maxlen; + SV *sv_points; + U8 **points; /* map of where we were in the input string + when reading a given char. For ASCII this + is unnecessary overhead as the relationship + is always 1:1, but for Unicode, especially + case folded Unicode this is not true. */ + U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + U8 *bitmap=NULL; + + + GET_RE_DEBUG_FLAGS_DECL; + + /* We can't just allocate points here. We need to wrap it in + * an SV so it gets freed properly if there is a croak while + * running the match */ + ENTER; + SAVETMPS; + sv_points=newSV(maxlen * sizeof(U8 *)); + SvCUR_set(sv_points, + maxlen * sizeof(U8 *)); + SvPOK_on(sv_points); + sv_2mortal(sv_points); + points=(U8**)SvPV_nolen(sv_points ); + if ( trie_type != trie_utf8_fold + && (trie->bitmap || OP(c)==AHOCORASICKC) ) + { + if (trie->bitmap) + bitmap=(U8*)trie->bitmap; + else + bitmap=(U8*)ANYOF_BITMAP(c); + } + /* this is the Aho-Corasick algorithm modified a touch + to include special handling for long "unknown char" sequences. + The basic idea being that we use AC as long as we are dealing + with a possible matching char, when we encounter an unknown char + (and we have not encountered an accepting state) we scan forward + until we find a legal starting char. + AC matching is basically that of trie matching, except that when + we encounter a failing transition, we fall back to the current + states "fail state", and try the current char again, a process + we repeat until we reach the root state, state 1, or a legal + transition. If we fail on the root state then we can either + terminate if we have reached an accepting state previously, or + restart the entire process from the beginning if we have not. + + */ + while (s <= last_start) { + const U32 uniflags = UTF8_ALLOW_DEFAULT; + U8 *uc = (U8*)s; + U16 charid = 0; + U32 base = 1; + U32 state = 1; + UV uvc = 0; + STRLEN len = 0; + STRLEN foldlen = 0; + U8 *uscan = (U8*)NULL; + U8 *leftmost = NULL; +#ifdef DEBUGGING + U32 accepted_word= 0; +#endif + U32 pointpos = 0; + + while ( state && uc <= (U8*)strend ) { + int failed=0; + U32 word = aho->states[ state ].wordnum; + + if( state==1 ) { + if ( bitmap ) { + DEBUG_TRIE_EXECUTE_r( + if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + dump_exec_pos( (char *)uc, c, strend, real_start, + (char *)uc, utf8_target ); + PerlIO_printf( Perl_debug_log, + " Scanning for legal start char...\n"); + } + ); + if (utf8_target) { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc += UTF8SKIP(uc); + } + } else { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc++; + } + } + s= (char *)uc; + } + if (uc >(U8*)last_start) break; + } + + if ( word ) { + U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ]; + if (!leftmost || lpos < leftmost) { + DEBUG_r(accepted_word=word); + leftmost= lpos; + } + if (base==0) break; + + } + points[pointpos++ % maxlen]= uc; + if (foldlen || uc < (U8*)strend) { + REXEC_TRIE_READ_CHAR(trie_type, trie, + widecharmap, uc, + uscan, len, uvc, charid, foldlen, + foldbuf, uniflags); + DEBUG_TRIE_EXECUTE_r({ + dump_exec_pos( (char *)uc, c, strend, + real_start, s, utf8_target); + PerlIO_printf(Perl_debug_log, + " Charid:%3u CP:%4"UVxf" ", + charid, uvc); + }); + } + else { + len = 0; + charid = 0; + } + + + do { +#ifdef DEBUGGING + word = aho->states[ state ].wordnum; +#endif + base = aho->states[ state ].trans.base; + + DEBUG_TRIE_EXECUTE_r({ + if (failed) + dump_exec_pos( (char *)uc, c, strend, real_start, + s, utf8_target ); + PerlIO_printf( Perl_debug_log, + "%sState: %4"UVxf", word=%"UVxf, + failed ? " Fail transition to " : "", + (UV)state, (UV)word); + }); + if ( base ) { + U32 tmp; + I32 offset; + if (charid && + ( ((offset = base + charid + - 1 - trie->uniquecharcount)) >= 0) + && ((U32)offset < trie->lasttrans) + && trie->trans[offset].check == state + && (tmp=trie->trans[offset].next)) + { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - legal\n")); + state = tmp; + break; + } + else { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - fail\n")); + failed = 1; + state = aho->fail[state]; + } + } + else { + /* we must be accepting here */ + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - accepting\n")); + failed = 1; + break; + } + } while(state); + uc += len; + if (failed) { + if (leftmost) + break; + if (!state) state = 1; + } + } + if ( aho->states[ state ].wordnum ) { + U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ]; + if (!leftmost || lpos < leftmost) { + DEBUG_r(accepted_word=aho->states[ state ].wordnum); + leftmost = lpos; + } + } + if (leftmost) { + s = (char*)leftmost; + DEBUG_TRIE_EXECUTE_r({ + PerlIO_printf( + Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n", + (UV)accepted_word, (IV)(s - real_start) + ); + }); + if (reginfo->intuit || regtry(reginfo, &s)) { + FREETMPS; + LEAVE; + goto got_it; + } + s = HOPc(s,1); + DEBUG_TRIE_EXECUTE_r({ + PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n"); + }); + } else { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log,"No match.\n")); + break; + } + } + FREETMPS; + LEAVE; + } + break; + default: + Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); + } + return 0; + got_it: + return s; +} + +/* set RX_SAVED_COPY, RX_SUBBEG etc. + * flags have same meanings as with regexec_flags() */ + +static void +S_reg_set_capture_string(pTHX_ REGEXP * const rx, + char *strbeg, + char *strend, + SV *sv, + U32 flags, + bool utf8_target) +{ + struct regexp *const prog = ReANY(rx); + + if (flags & REXEC_COPY_STR) { +#ifdef PERL_ANY_COW + if (SvCANCOW(sv)) { + if (DEBUG_C_TEST) { + PerlIO_printf(Perl_debug_log, + "Copy on write: regexp capture, type %d\n", + (int) SvTYPE(sv)); + } + /* Create a new COW SV to share the match string and store + * in saved_copy, unless the current COW SV in saved_copy + * is valid and suitable for our purpose */ + if (( prog->saved_copy + && SvIsCOW(prog->saved_copy) + && SvPOKp(prog->saved_copy) + && SvIsCOW(sv) + && SvPOKp(sv) + && SvPVX(sv) == SvPVX(prog->saved_copy))) + { + /* just reuse saved_copy SV */ + if (RXp_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RXp_MATCH_COPIED_off(prog); + } + } + else { + /* create new COW SV to share string */ + RX_MATCH_COPY_FREE(rx); + prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); + } + prog->subbeg = (char *)SvPVX_const(prog->saved_copy); + assert (SvPOKp(prog->saved_copy)); + prog->sublen = strend - strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + } else +#endif + { + SSize_t min = 0; + SSize_t max = strend - strbeg; + SSize_t sublen; + + if ( (flags & REXEC_COPY_SKIP_POST) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_RIGHT) + ) { /* don't copy $' part of string */ + U32 n = 0; + max = -1; + /* calculate the right-most part of the string covered + * by a capture. Due to look-ahead, this may be to + * the right of $&, so we have to scan all captures */ + while (n <= prog->lastparen) { + if (prog->offs[n].end > max) + max = prog->offs[n].end; + n++; + } + if (max == -1) + max = (PL_sawampersand & SAWAMPERSAND_LEFT) + ? prog->offs[0].start + : 0; + assert(max >= 0 && max <= strend - strbeg); + } + + if ( (flags & REXEC_COPY_SKIP_PRE) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_LEFT) + ) { /* don't copy $` part of string */ + U32 n = 0; + min = max; + /* calculate the left-most part of the string covered + * by a capture. Due to look-behind, this may be to + * the left of $&, so we have to scan all captures */ + while (min && n <= prog->lastparen) { + if ( prog->offs[n].start != -1 + && prog->offs[n].start < min) + { + min = prog->offs[n].start; + } + n++; + } + if ((PL_sawampersand & SAWAMPERSAND_RIGHT) + && min > prog->offs[0].end + ) + min = prog->offs[0].end; + + } + + assert(min >= 0 && min <= max && min <= strend - strbeg); + sublen = max - min; + + if (RX_MATCH_COPIED(rx)) { + if (sublen > prog->sublen) + prog->subbeg = + (char*)saferealloc(prog->subbeg, sublen+1); + } + else + prog->subbeg = (char*)safemalloc(sublen+1); + Copy(strbeg + min, prog->subbeg, sublen, char); + prog->subbeg[sublen] = '\0'; + prog->suboffset = min; + prog->sublen = sublen; + RX_MATCH_COPIED_on(rx); + } + prog->subcoffset = prog->suboffset; + if (prog->suboffset && utf8_target) { + /* Convert byte offset to chars. + * XXX ideally should only compute this if @-/@+ + * has been seen, a la PL_sawampersand ??? */ + + /* If there's a direct correspondence between the + * string which we're matching and the original SV, + * then we can use the utf8 len cache associated with + * the SV. In particular, it means that under //g, + * sv_pos_b2u() will use the previously cached + * position to speed up working out the new length of + * subcoffset, rather than counting from the start of + * the string each time. This stops + * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; + * from going quadratic */ + if (SvPOKp(sv) && SvPVX(sv) == strbeg) + prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, + SV_GMAGIC|SV_CONST_RETURN); + else + prog->subcoffset = utf8_length((U8*)strbeg, + (U8*)(strbeg+prog->suboffset)); + } + } + else { + RX_MATCH_COPY_FREE(rx); + prog->subbeg = strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + prog->sublen = strend - strbeg; + } +} + + + + +/* + - regexec_flags - match a regexp against a string + */ +I32 +Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, + char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) +/* stringarg: the point in the string at which to begin matching */ +/* strend: pointer to null at end of string */ +/* strbeg: real beginning of string */ +/* minend: end of match must be >= minend bytes after stringarg. */ +/* sv: SV being matched: only used for utf8 flag, pos() etc; string + * itself is accessed via the pointers above */ +/* data: May be used for some additional optimizations. + Currently unused. */ +/* flags: For optimizations. See REXEC_* in regexp.h */ + +{ + struct regexp *const prog = ReANY(rx); + char *s; + regnode *c; + char *startpos; + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ + const bool utf8_target = cBOOL(DO_UTF8(sv)); + I32 multiline; + RXi_GET_DECL(prog,progi); + regmatch_info reginfo_buf; /* create some info to pass to regtry etc */ + regmatch_info *const reginfo = ®info_buf; + regexp_paren_pair *swap = NULL; + I32 oldsave; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGEXEC_FLAGS; + PERL_UNUSED_ARG(data); + + /* Be paranoid... */ + if (prog == NULL || stringarg == NULL) { + Perl_croak(aTHX_ "NULL regexp parameter"); + } + + DEBUG_EXECUTE_r( + debug_start_match(rx, utf8_target, stringarg, strend, + "Matching"); + ); + + startpos = stringarg; + + if (prog->intflags & PREGf_GPOS_SEEN) { + MAGIC *mg; + + /* set reginfo->ganch, the position where \G can match */ + + reginfo->ganch = + (flags & REXEC_IGNOREPOS) + ? stringarg /* use start pos rather than pos() */ + : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) + /* Defined pos(): */ + ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) + : strbeg; /* pos() not defined; use start of string */ + + DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, + "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg))); + + /* in the presence of \G, we may need to start looking earlier in + * the string than the suggested start point of stringarg: + * if prog->gofs is set, then that's a known, fixed minimum + * offset, such as + * /..\G/: gofs = 2 + * /ab|c\G/: gofs = 1 + * or if the minimum offset isn't known, then we have to go back + * to the start of the string, e.g. /w+\G/ + */ + + if (prog->intflags & PREGf_ANCH_GPOS) { + startpos = reginfo->ganch - prog->gofs; + if (startpos < + ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) + { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "fail: ganch-gofs before earliest possible start\n")); + return 0; + } + } + else if (prog->gofs) { + if (startpos - prog->gofs < strbeg) + startpos = strbeg; + else + startpos -= prog->gofs; + } + else if (prog->intflags & PREGf_GPOS_FLOAT) + startpos = strbeg; + } + + minlen = prog->minlen; + if ((startpos + minlen) > strend || startpos < strbeg) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "Regex match can't succeed, so not even tried\n")); + return 0; + } + + /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), + * which will call destuctors to reset PL_regmatch_state, free higher + * PL_regmatch_slabs, and clean up regmatch_info_aux and + * regmatch_info_aux_eval */ + + oldsave = PL_savestack_ix; + + s = startpos; + + if ((prog->extflags & RXf_USE_INTUIT) + && !(flags & REXEC_CHECKED)) + { + s = re_intuit_start(rx, sv, strbeg, startpos, strend, + flags, NULL); + if (!s) + return 0; + + if (prog->extflags & RXf_CHECK_ALL) { + /* we can match based purely on the result of INTUIT. + * Set up captures etc just for $& and $-[0] + * (an intuit-only match wont have $1,$2,..) */ + assert(!prog->nparens); + + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (s < stringarg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + + /* match via INTUIT shouldn't have any captures. + * Let @-, @+, $^N know */ + prog->lastparen = prog->lastcloseparen = 0; + RX_MATCH_UTF8_set(rx, utf8_target); + prog->offs[0].start = s - strbeg; + prog->offs[0].end = utf8_target + ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + : s - strbeg + prog->minlenret; + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, strend, + sv, flags, utf8_target); + + return 1; + } + } + + multiline = prog->extflags & RXf_PMf_MULTILINE; + + if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "String too short [regexec_flags]...\n")); + goto phooey; + } + + /* Check validity of program. */ + if (UCHARAT(progi->program) != REG_MAGIC) { + Perl_croak(aTHX_ "corrupted regexp program"); + } + + RX_MATCH_TAINTED_off(rx); + + reginfo->prog = rx; /* Yes, sorry that this is confusing. */ + reginfo->intuit = 0; + reginfo->is_utf8_target = cBOOL(utf8_target); + reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); + reginfo->warned = FALSE; + reginfo->strbeg = strbeg; + reginfo->sv = sv; + reginfo->poscache_maxiter = 0; /* not yet started a countdown */ + reginfo->strend = strend; + /* see how far we have to get to not match where we matched before */ + reginfo->till = stringarg + minend; + + if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { + /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after + S_cleanup_regmatch_info_aux has executed (registered by + SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies + magic belonging to this SV. + Not newSVsv, either, as it does not COW. + */ + assert(!IS_PADGV(sv)); + reginfo->sv = newSV(0); + SvSetSV_nosteal(reginfo->sv, sv); + SAVEFREESV(reginfo->sv); + } + + /* reserve next 2 or 3 slots in PL_regmatch_state: + * slot N+0: may currently be in use: skip it + * slot N+1: use for regmatch_info_aux struct + * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s + * slot N+3: ready for use by regmatch() + */ + + { + regmatch_state *old_regmatch_state; + regmatch_slab *old_regmatch_slab; + int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1; + + /* on first ever match, allocate first slab */ + if (!PL_regmatch_slab) { + Newx(PL_regmatch_slab, 1, regmatch_slab); + PL_regmatch_slab->prev = NULL; + PL_regmatch_slab->next = NULL; + PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); + } + + old_regmatch_state = PL_regmatch_state; + old_regmatch_slab = PL_regmatch_slab; + + for (i=0; i <= max; i++) { + if (i == 1) + reginfo->info_aux = &(PL_regmatch_state->u.info_aux); + else if (i ==2) + reginfo->info_aux_eval = + reginfo->info_aux->info_aux_eval = + &(PL_regmatch_state->u.info_aux_eval); + + if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) + PL_regmatch_state = S_push_slab(aTHX); + } + + /* note initial PL_regmatch_state position; at end of match we'll + * pop back to there and free any higher slabs */ + + reginfo->info_aux->old_regmatch_state = old_regmatch_state; + reginfo->info_aux->old_regmatch_slab = old_regmatch_slab; + reginfo->info_aux->poscache = NULL; + + SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux); + + if ((prog->extflags & RXf_EVAL_SEEN)) + S_setup_eval_state(aTHX_ reginfo); + else + reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL; + } + + /* If there is a "must appear" string, look for it. */ + + if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { + /* We have to be careful. If the previous successful match + was from this regex we don't want a subsequent partially + successful match to clobber the old results. + So when we detect this possibility we add a swap buffer + to the re, and switch the buffer each match. If we fail, + we switch it back; otherwise we leave it swapped. + */ + swap = prog->offs; + /* do we need a save destructor here for eval dies? */ + Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap), + PTR2UV(prog->offs) + )); + } + + /* Simplest case: anchored match need be tried only once. */ + /* [unless only anchor is BOL and multiline is set] */ + if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { + if (s == startpos && regtry(reginfo, &s)) + goto got_it; + else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */ + { + char *end; + + if (minlen) + dontbother = minlen - 1; + end = HOP3c(strend, -dontbother, strbeg) - 1; + /* for multiline we only have to try after newlines */ + if (prog->check_substr || prog->check_utf8) { + /* because of the goto we can not easily reuse the macros for bifurcating the + unicode/non-unicode match modes here like we do elsewhere - demerphq */ + if (utf8_target) { + if (s == startpos) + goto after_try_utf8; + while (1) { + if (regtry(reginfo, &s)) { + goto got_it; + } + after_try_utf8: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, strbeg, + s + UTF8SKIP(s), strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s += UTF8SKIP(s); + } + } + } /* end search for check string in unicode */ + else { + if (s == startpos) { + goto after_try_latin; + } + while (1) { + if (regtry(reginfo, &s)) { + goto got_it; + } + after_try_latin: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, strbeg, + s + 1, strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s++; + } + } + } /* end search for check string in latin*/ + } /* end search for check string */ + else { /* search for newline */ + if (s > startpos) { + /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/ + s--; + } + /* We can use a more efficient search as newlines are the same in unicode as they are in latin */ + while (s <= end) { /* note it could be possible to match at the end of the string */ + if (*s++ == '\n') { /* don't need PL_utf8skip here */ + if (regtry(reginfo, &s)) + goto got_it; + } + } + } /* end search for newline */ + } /* end anchored/multiline check string search */ + goto phooey; + } else if (prog->intflags & PREGf_ANCH_GPOS) + { + /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ + assert(prog->intflags & PREGf_GPOS_SEEN); + /* For anchored \G, the only position it can match from is + * (ganch-gofs); we already set startpos to this above; if intuit + * moved us on from there, we can't possibly succeed */ + assert(startpos == reginfo->ganch - prog->gofs); + if (s == startpos && regtry(reginfo, &s)) + goto got_it; + goto phooey; + } + + /* Messy cases: unanchored match. */ + if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { + /* we have /x+whatever/ */ + /* it must be a one character string (XXXX Except is_utf8_pat?) */ + char ch; +#ifdef DEBUGGING + int did_match = 0; +#endif + if (utf8_target) { + if (! prog->anchored_utf8) { + to_utf8_substr(prog); + } + ch = SvPVX_const(prog->anchored_utf8)[0]; + REXEC_FBC_SCAN( + if (*s == ch) { + DEBUG_EXECUTE_r( did_match = 1 ); + if (regtry(reginfo, &s)) goto got_it; + s += UTF8SKIP(s); + while (s < strend && *s == ch) + s += UTF8SKIP(s); + } + ); + + } + else { + if (! prog->anchored_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + ch = SvPVX_const(prog->anchored_substr)[0]; + REXEC_FBC_SCAN( + if (*s == ch) { + DEBUG_EXECUTE_r( did_match = 1 ); + if (regtry(reginfo, &s)) goto got_it; + s++; + while (s < strend && *s == ch) + s++; + } + ); + } + DEBUG_EXECUTE_r(if (!did_match) + PerlIO_printf(Perl_debug_log, + "Did not find anchored character...\n") + ); + } + else if (prog->anchored_substr != NULL + || prog->anchored_utf8 != NULL + || ((prog->float_substr != NULL || prog->float_utf8 != NULL) + && prog->float_max_offset < strend - s)) { + SV *must; + SSize_t back_max; + SSize_t back_min; + char *last; + char *last1; /* Last position checked before */ +#ifdef DEBUGGING + int did_match = 0; +#endif + if (prog->anchored_substr || prog->anchored_utf8) { + if (utf8_target) { + if (! prog->anchored_utf8) { + to_utf8_substr(prog); + } + must = prog->anchored_utf8; + } + else { + if (! prog->anchored_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + must = prog->anchored_substr; + } + back_max = back_min = prog->anchored_offset; + } else { + if (utf8_target) { + if (! prog->float_utf8) { + to_utf8_substr(prog); + } + must = prog->float_utf8; + } + else { + if (! prog->float_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + must = prog->float_substr; + } + back_max = prog->float_max_offset; + back_min = prog->float_min_offset; + } + + if (back_min<0) { + last = strend; + } else { + last = HOP3c(strend, /* Cannot start after this */ + -(SSize_t)(CHR_SVLEN(must) + - (SvTAIL(must) != 0) + back_min), strbeg); + } + if (s > reginfo->strbeg) + last1 = HOPc(s, -1); + else + last1 = s - 1; /* bogus */ + + /* XXXX check_substr already used to find "s", can optimize if + check_substr==must. */ + dontbother = 0; + strend = HOPc(strend, -dontbother); + while ( (s <= last) && + (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), + (unsigned char*)strend, must, + multiline ? FBMrf_MULTILINE : 0)) ) { + DEBUG_EXECUTE_r( did_match = 1 ); + if (HOPc(s, -back_max) > last1) { + last1 = HOPc(s, -back_min); + s = HOPc(s, -back_max); + } + else { + char * const t = (last1 >= reginfo->strbeg) + ? HOPc(last1, 1) : last1 + 1; + + last1 = HOPc(s, -back_min); + s = t; + } + if (utf8_target) { + while (s <= last1) { + if (regtry(reginfo, &s)) + goto got_it; + if (s >= last1) { + s++; /* to break out of outer loop */ + break; + } + s += UTF8SKIP(s); + } + } + else { + while (s <= last1) { + if (regtry(reginfo, &s)) + goto got_it; + s++; + } + } + } + DEBUG_EXECUTE_r(if (!did_match) { + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n", + ((must == prog->anchored_substr || must == prog->anchored_utf8) + ? "anchored" : "floating"), + quoted, RE_SV_TAIL(must)); + }); + goto phooey; + } + else if ( (c = progi->regstclass) ) { + if (minlen) { + const OPCODE op = OP(progi->regstclass); + /* don't bother with what can't match */ + if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE) + strend = HOPc(strend, -(minlen - 1)); + } + DEBUG_EXECUTE_r({ + SV * const prop = sv_newmortal(); + regprop(prog, prop, c, reginfo); + { + RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), + s,strend-s,60); + PerlIO_printf(Perl_debug_log, + "Matching stclass %.*s against %s (%d bytes)\n", + (int)SvCUR(prop), SvPVX_const(prop), + quoted, (int)(strend - s)); + } + }); + if (find_byclass(prog, c, s, strend, reginfo)) + goto got_it; + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n")); + } + else { + dontbother = 0; + if (prog->float_substr != NULL || prog->float_utf8 != NULL) { + /* Trim the end. */ + char *last= NULL; + SV* float_real; + STRLEN len; + const char *little; + + if (utf8_target) { + if (! prog->float_utf8) { + to_utf8_substr(prog); + } + float_real = prog->float_utf8; + } + else { + if (! prog->float_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + float_real = prog->float_substr; + } + + little = SvPV_const(float_real, len); + if (SvTAIL(float_real)) { + /* This means that float_real contains an artificial \n on + * the end due to the presence of something like this: + * /foo$/ where we can match both "foo" and "foo\n" at the + * end of the string. So we have to compare the end of the + * string first against the float_real without the \n and + * then against the full float_real with the string. We + * have to watch out for cases where the string might be + * smaller than the float_real or the float_real without + * the \n. */ + char *checkpos= strend - len; + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%sChecking for float_real.%s\n", + PL_colors[4], PL_colors[5])); + if (checkpos + 1 < strbeg) { + /* can't match, even if we remove the trailing \n + * string is too short to match */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString shorter than required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } else if (memEQ(checkpos + 1, little, len - 1)) { + /* can match, the end of the string matches without the + * "\n" */ + last = checkpos + 1; + } else if (checkpos < strbeg) { + /* cant match, string is too short when the "\n" is + * included */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString does not contain required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } else if (!multiline) { + /* non multiline match, so compare with the "\n" at the + * end of the string */ + if (memEQ(checkpos, little, len)) { + last= checkpos; + } else { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString does not contain required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } + } else { + /* multiline match, so we have to search for a place + * where the full string is located */ + goto find_last; + } + } else { + find_last: + if (len) + last = rninstr(s, strend, little, little + len); + else + last = strend; /* matching "$" */ + } + if (!last) { + /* at one point this block contained a comment which was + * probably incorrect, which said that this was a "should not + * happen" case. Even if it was true when it was written I am + * pretty sure it is not anymore, so I have removed the comment + * and replaced it with this one. Yves */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "String does not contain required substring, cannot match.\n" + )); + goto phooey; + } + dontbother = strend - last + prog->float_min_offset; + } + if (minlen && (dontbother < minlen)) + dontbother = minlen - 1; + strend -= dontbother; /* this one's always in bytes! */ + /* We don't know much -- general case. */ + if (utf8_target) { + for (;;) { + if (regtry(reginfo, &s)) + goto got_it; + if (s >= strend) + break; + s += UTF8SKIP(s); + }; + } + else { + do { + if (regtry(reginfo, &s)) + goto got_it; + } while (s++ < strend); + } + } + + /* Failure. */ + goto phooey; + +got_it: + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (prog->offs[0].start < stringarg - strbeg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + + DEBUG_BUFFERS_r( + if (swap) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap) + ); + ); + Safefree(swap); + + /* clean up; this will trigger destructors that will free all slabs + * above the current one, and cleanup the regmatch_info_aux + * and regmatch_info_aux_eval sructs */ + + LEAVE_SCOPE(oldsave); + + if (RXp_PAREN_NAMES(prog)) + (void)hv_iterinit(RXp_PAREN_NAMES(prog)); + + RX_MATCH_UTF8_set(rx, utf8_target); + + /* make sure $`, $&, $', and $digit will work later */ + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, reginfo->strend, + sv, flags, utf8_target); + + return 1; + +phooey: + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", + PL_colors[4], PL_colors[5])); + + /* clean up; this will trigger destructors that will free all slabs + * above the current one, and cleanup the regmatch_info_aux + * and regmatch_info_aux_eval sructs */ + + LEAVE_SCOPE(oldsave); + + if (swap) { + /* we failed :-( roll it back */ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(prog->offs), + PTR2UV(swap) + )); + Safefree(prog->offs); + prog->offs = swap; + } + return 0; +} + + +/* Set which rex is pointed to by PL_reg_curpm, handling ref counting. + * Do inc before dec, in case old and new rex are the same */ +#define SET_reg_curpm(Re2) \ + if (reginfo->info_aux_eval) { \ + (void)ReREFCNT_inc(Re2); \ + ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ + PM_SETRE((PL_reg_curpm), (Re2)); \ + } + + +/* + - regtry - try match at specific point + */ +STATIC I32 /* 0 failure, 1 success */ +S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) +{ + CHECKPOINT lastcp; + REGEXP *const rx = reginfo->prog; + regexp *const prog = ReANY(rx); + SSize_t result; + RXi_GET_DECL(prog,progi); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTRY; + + reginfo->cutpoint=NULL; + + prog->offs[0].start = *startposp - reginfo->strbeg; + prog->lastparen = 0; + prog->lastcloseparen = 0; + + /* XXXX What this code is doing here?!!! There should be no need + to do this again and again, prog->lastparen should take care of + this! --ilya*/ + + /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. + * Actually, the code in regcppop() (which Ilya may be meaning by + * prog->lastparen), is not needed at all by the test suite + * (op/regexp, op/pat, op/split), but that code is needed otherwise + * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ + * Meanwhile, this code *is* needed for the + * above-mentioned test suite tests to succeed. The common theme + * on those tests seems to be returning null fields from matches. + * --jhi updated by dapm */ +#if 1 + if (prog->nparens) { + regexp_paren_pair *pp = prog->offs; + I32 i; + for (i = prog->nparens; i > (I32)prog->lastparen; i--) { + ++pp; + pp->start = -1; + pp->end = -1; + } + } +#endif + REGCP_SET(lastcp); + result = regmatch(reginfo, *startposp, progi->program + 1); + if (result != -1) { + prog->offs[0].end = result; + return 1; + } + if (reginfo->cutpoint) + *startposp= reginfo->cutpoint; + REGCP_UNWIND(lastcp); + return 0; +} + + +#define sayYES goto yes +#define sayNO goto no +#define sayNO_SILENT goto no_silent + +/* we dont use STMT_START/END here because it leads to + "unreachable code" warnings, which are bogus, but distracting. */ +#define CACHEsayNO \ + if (ST.cache_mask) \ + reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \ + sayNO + +/* this is used to determine how far from the left messages like + 'failed...' are printed. It should be set such that messages + are inline with the regop output that created them. +*/ +#define REPORT_CODE_OFF 32 + + +#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */ +#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */ +#define CHRTEST_NOT_A_CP_1 -999 +#define CHRTEST_NOT_A_CP_2 -998 + +/* grab a new slab and return the first slot in it */ + +STATIC regmatch_state * +S_push_slab(pTHX) +{ +#if PERL_VERSION < 9 && !defined(PERL_CORE) + dMY_CXT; +#endif + regmatch_slab *s = PL_regmatch_slab->next; + if (!s) { + Newx(s, 1, regmatch_slab); + s->prev = PL_regmatch_slab; + s->next = NULL; + PL_regmatch_slab->next = s; + } + PL_regmatch_slab = s; + return SLAB_FIRST(s); +} + + +/* push a new state then goto it */ + +#define PUSH_STATE_GOTO(state, node, input) \ + pushinput = input; \ + scan = node; \ + st->resume_state = state; \ + goto push_state; + +/* push a new state with success backtracking, then goto it */ + +#define PUSH_YES_STATE_GOTO(state, node, input) \ + pushinput = input; \ + scan = node; \ + st->resume_state = state; \ + goto push_yes_state; + + + + +/* + +regmatch() - main matching routine + +This is basically one big switch statement in a loop. We execute an op, +set 'next' to point the next op, and continue. If we come to a point which +we may need to backtrack to on failure such as (A|B|C), we push a +backtrack state onto the backtrack stack. On failure, we pop the top +state, and re-enter the loop at the state indicated. If there are no more +states to pop, we return failure. + +Sometimes we also need to backtrack on success; for example /A+/, where +after successfully matching one A, we need to go back and try to +match another one; similarly for lookahead assertions: if the assertion +completes successfully, we backtrack to the state just before the assertion +and then carry on. In these cases, the pushed state is marked as +'backtrack on success too'. This marking is in fact done by a chain of +pointers, each pointing to the previous 'yes' state. On success, we pop to +the nearest yes state, discarding any intermediate failure-only states. +Sometimes a yes state is pushed just to force some cleanup code to be +called at the end of a successful match or submatch; e.g. (??{$re}) uses +it to free the inner regex. + +Note that failure backtracking rewinds the cursor position, while +success backtracking leaves it alone. + +A pattern is complete when the END op is executed, while a subpattern +such as (?=foo) is complete when the SUCCESS op is executed. Both of these +ops trigger the "pop to last yes state if any, otherwise return true" +behaviour. + +A common convention in this function is to use A and B to refer to the two +subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is +the subpattern to be matched possibly multiple times, while B is the entire +rest of the pattern. Variable and state names reflect this convention. + +The states in the main switch are the union of ops and failure/success of +substates associated with with that op. For example, IFMATCH is the op +that does lookahead assertions /(?=A)B/ and so the IFMATCH state means +'execute IFMATCH'; while IFMATCH_A is a state saying that we have just +successfully matched A and IFMATCH_A_fail is a state saying that we have +just failed to match A. Resume states always come in pairs. The backtrack +state we push is marked as 'IFMATCH_A', but when that is popped, we resume +at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking +on success or failure. + +The struct that holds a backtracking state is actually a big union, with +one variant for each major type of op. The variable st points to the +top-most backtrack struct. To make the code clearer, within each +block of code we #define ST to alias the relevant union. + +Here's a concrete example of a (vastly oversimplified) IFMATCH +implementation: + + switch (state) { + .... + +#define ST st->u.ifmatch + + case IFMATCH: // we are executing the IFMATCH op, (?=A)B + ST.foo = ...; // some state we wish to save + ... + // push a yes backtrack state with a resume value of + // IFMATCH_A/IFMATCH_A_fail, then continue execution at the + // first node of A: + PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput); + // NOTREACHED + + case IFMATCH_A: // we have successfully executed A; now continue with B + next = B; + bar = ST.foo; // do something with the preserved value + break; + + case IFMATCH_A_fail: // A failed, so the assertion failed + ...; // do some housekeeping, then ... + sayNO; // propagate the failure + +#undef ST + + ... + } + +For any old-timers reading this who are familiar with the old recursive +approach, the code above is equivalent to: + + case IFMATCH: // we are executing the IFMATCH op, (?=A)B + { + int foo = ... + ... + if (regmatch(A)) { + next = B; + bar = foo; + break; + } + ...; // do some housekeeping, then ... + sayNO; // propagate the failure + } + +The topmost backtrack state, pointed to by st, is usually free. If you +want to claim it, populate any ST.foo fields in it with values you wish to +save, then do one of + + PUSH_STATE_GOTO(resume_state, node, newinput); + PUSH_YES_STATE_GOTO(resume_state, node, newinput); + +which sets that backtrack state's resume value to 'resume_state', pushes a +new free entry to the top of the backtrack stack, then goes to 'node'. +On backtracking, the free slot is popped, and the saved state becomes the +new free state. An ST.foo field in this new top state can be temporarily +accessed to retrieve values, but once the main loop is re-entered, it +becomes available for reuse. + +Note that the depth of the backtrack stack constantly increases during the +left-to-right execution of the pattern, rather than going up and down with +the pattern nesting. For example the stack is at its maximum at Z at the +end of the pattern, rather than at X in the following: + + /(((X)+)+)+....(Y)+....Z/ + +The only exceptions to this are lookahead/behind assertions and the cut, +(?>A), which pop all the backtrack states associated with A before +continuing. + +Backtrack state structs are allocated in slabs of about 4K in size. +PL_regmatch_state and st always point to the currently active state, +and PL_regmatch_slab points to the slab currently containing +PL_regmatch_state. The first time regmatch() is called, the first slab is +allocated, and is never freed until interpreter destruction. When the slab +is full, a new one is allocated and chained to the end. At exit from +regmatch(), slabs allocated since entry are freed. + +*/ + + +#define DEBUG_STATE_pp(pp) \ + DEBUG_STATE_r({ \ + DUMP_EXEC_POS(locinput, scan, utf8_target); \ + PerlIO_printf(Perl_debug_log, \ + " %*s"pp" %s%s%s%s%s\n", \ + depth*2, "", \ + PL_reg_name[st->resume_state], \ + ((st==yes_state||st==mark_state) ? "[" : ""), \ + ((st==yes_state) ? "Y" : ""), \ + ((st==mark_state) ? "M" : ""), \ + ((st==yes_state||st==mark_state) ? "]" : "") \ + ); \ + }); + + +#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1) + +#ifdef DEBUGGING + +STATIC void +S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, + const char *start, const char *end, const char *blurb) +{ + const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; + + PERL_ARGS_ASSERT_DEBUG_START_MATCH; + + if (!PL_colorset) + reginitcolors(); + { + RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), + RX_PRECOMP_const(prog), RX_PRELEN(prog), 60); + + RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), + start, end - start, 60); + + PerlIO_printf(Perl_debug_log, + "%s%s REx%s %s against %s\n", + PL_colors[4], blurb, PL_colors[5], s0, s1); + + if (utf8_target||utf8_pat) + PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n", + utf8_pat ? "pattern" : "", + utf8_pat && utf8_target ? " and " : "", + utf8_target ? "string" : "" + ); + } +} + +STATIC void +S_dump_exec_pos(pTHX_ const char *locinput, + const regnode *scan, + const char *loc_regeol, + const char *loc_bostr, + const char *loc_reg_starttry, + const bool utf8_target) +{ + const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; + const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ + int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); + /* The part of the string before starttry has one color + (pref0_len chars), between starttry and current + position another one (pref_len - pref0_len chars), + after the current position the third one. + We assume that pref0_len <= pref_len, otherwise we + decrease pref0_len. */ + int pref_len = (locinput - loc_bostr) > (5 + taill) - l + ? (5 + taill) - l : locinput - loc_bostr; + int pref0_len; + + PERL_ARGS_ASSERT_DUMP_EXEC_POS; + + while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) + pref_len++; + pref0_len = pref_len - (locinput - loc_reg_starttry); + if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) + l = ( loc_regeol - locinput > (5 + taill) - pref_len + ? (5 + taill) - pref_len : loc_regeol - locinput); + while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) + l--; + if (pref0_len < 0) + pref0_len = 0; + if (pref0_len > pref_len) + pref0_len = pref_len; + { + const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0; + + RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), + (locinput - pref_len),pref0_len, 60, 4, 5); + + RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), + (locinput - pref_len + pref0_len), + pref_len - pref0_len, 60, 2, 3); + + RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), + locinput, loc_regeol - locinput, 10, 0, 1); + + const STRLEN tlen=len0+len1+len2; + PerlIO_printf(Perl_debug_log, + "%4"IVdf" <%.*s%.*s%s%.*s>%*s|", + (IV)(locinput - loc_bostr), + len0, s0, + len1, s1, + (docolor ? "" : "> <"), + len2, s2, + (int)(tlen > 19 ? 0 : 19 - tlen), + ""); + } +} + +#endif + +/* reg_check_named_buff_matched() + * Checks to see if a named buffer has matched. The data array of + * buffer numbers corresponding to the buffer is expected to reside + * in the regexp->data->data array in the slot stored in the ARG() of + * node involved. Note that this routine doesn't actually care about the + * name, that information is not preserved from compilation to execution. + * Returns the index of the leftmost defined buffer with the given name + * or 0 if non of the buffers matched. + */ +STATIC I32 +S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan) +{ + I32 n; + RXi_GET_DECL(rex,rexi); + SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + I32 *nums=(I32*)SvPVX(sv_dat); + + PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; + + for ( n=0; nlastparen >= nums[n] && + rex->offs[nums[n]].end != -1) + { + return nums[n]; + } + } + return 0; +} + + +static bool +S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, + U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo) +{ + /* This function determines if there are one or two characters that match + * the first character of the passed-in EXACTish node , and if + * so, returns them in the passed-in pointers. + * + * If it determines that no possible character in the target string can + * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if + * the first character in requires UTF-8 to represent, and the + * target string isn't in UTF-8.) + * + * If there are more than two characters that could match the beginning of + * , or if more context is required to determine a match or not, + * it sets both * and * to CHRTEST_VOID. + * + * The motiviation behind this function is to allow the caller to set up + * tight loops for matching. If is of type EXACT, there is + * only one possible character that can match its first character, and so + * the situation is quite simple. But things get much more complicated if + * folding is involved. It may be that the first character of an EXACTFish + * node doesn't participate in any possible fold, e.g., punctuation, so it + * can be matched only by itself. The vast majority of characters that are + * in folds match just two things, their lower and upper-case equivalents. + * But not all are like that; some have multiple possible matches, or match + * sequences of more than one character. This function sorts all that out. + * + * Consider the patterns A*B or A*?B where A and B are arbitrary. In a + * loop of trying to match A*, we know we can't exit where the thing + * following it isn't a B. And something can't be a B unless it is the + * beginning of B. By putting a quick test for that beginning in a tight + * loop, we can rule out things that can't possibly be B without having to + * break out of the loop, thus avoiding work. Similarly, if A is a single + * character, we can make a tight loop matching A*, using the outputs of + * this function. + * + * If the target string to match isn't in UTF-8, and there aren't + * complications which require CHRTEST_VOID, * and * are set to + * the one or two possible octets (which are characters in this situation) + * that can match. In all cases, if there is only one character that can + * match, * and * will be identical. + * + * If the target string is in UTF-8, the buffers pointed to by + * and will contain the one or two UTF-8 sequences of bytes that + * can match the beginning of . They should be declared with at + * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is + * undefined what these contain.) If one or both of the buffers are + * invariant under UTF-8, *, and * will also be set to the + * corresponding invariant. If variant, the corresponding * and/or + * * will be set to a negative number(s) that shouldn't match any code + * point (unless inappropriately coerced to unsigned). * will equal + * * if and only if and are the same. */ + + const bool utf8_target = reginfo->is_utf8_target; + + UV c1 = CHRTEST_NOT_A_CP_1; + UV c2 = CHRTEST_NOT_A_CP_2; + bool use_chrtest_void = FALSE; + const bool is_utf8_pat = reginfo->is_utf8_pat; + + /* Used when we have both utf8 input and utf8 output, to avoid converting + * to/from code points */ + bool utf8_has_been_setup = FALSE; + + dVAR; + + U8 *pat = (U8*)STRING(text_node); + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + + if (OP(text_node) == EXACT) { + + /* In an exact node, only one thing can be matched, that first + * character. If both the pat and the target are UTF-8, we can just + * copy the input to the output, avoiding finding the code point of + * that character */ + if (!is_utf8_pat) { + c2 = c1 = *pat; + } + else if (utf8_target) { + Copy(pat, c1_utf8, UTF8SKIP(pat), U8); + Copy(pat, c2_utf8, UTF8SKIP(pat), U8); + utf8_has_been_setup = TRUE; + } + else { + c2 = c1 = valid_utf8_to_uvchr(pat, NULL); + } + } + else { /* an EXACTFish node */ + U8 *pat_end = pat + STR_LEN(text_node); + + /* An EXACTFL node has at least some characters unfolded, because what + * they match is not known until now. So, now is the time to fold + * the first few of them, as many as are needed to determine 'c1' and + * 'c2' later in the routine. If the pattern isn't UTF-8, we only need + * to fold if in a UTF-8 locale, and then only the Sharp S; everything + * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we + * need to fold as many characters as a single character can fold to, + * so that later we can check if the first ones are such a multi-char + * fold. But, in such a pattern only locale-problematic characters + * aren't folded, so we can skip this completely if the first character + * in the node isn't one of the tricky ones */ + if (OP(text_node) == EXACTFL) { + + if (! is_utf8_pat) { + if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) + { + folded[0] = folded[1] = 's'; + pat = folded; + pat_end = folded + 2; + } + } + else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { + U8 *s = pat; + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD_LC(*s); + s++; + } + else { + STRLEN len; + _to_utf8_fold_flags(s, + d, + &len, + FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); + d += len; + s += UTF8SKIP(s); + } + } + + pat = folded; + pat_end = d; + } + } + + if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) + || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) + { + /* Multi-character folds require more context to sort out. Also + * PL_utf8_foldclosures used below doesn't handle them, so have to + * be handled outside this routine */ + use_chrtest_void = TRUE; + } + else { /* an EXACTFish node which doesn't begin with a multi-char fold */ + c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; + if (c1 > 255) { + /* Load the folds hash, if not already done */ + SV** listp; + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + + /* The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((! (listp = hv_fetch(PL_utf8_foldclosures, + (char *) pat, + UTF8SKIP(pat), + FALSE)))) + { + /* Not found in the hash, therefore there are no folds + * containing it, so there is only a single character that + * could match */ + c2 = c1; + } + else { /* Does participate in folds */ + AV* list = (AV*) *listp; + if (av_tindex(list) != 1) { + + /* If there aren't exactly two folds to this, it is + * outside the scope of this function */ + use_chrtest_void = TRUE; + } + else { /* There are two. Get them */ + SV** c_p = av_fetch(list, 0, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c1 = SvUV(*c_p); + + c_p = av_fetch(list, 1, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c2 = SvUV(*c_p); + + /* Folds that cross the 255/256 boundary are forbidden + * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and + * one is ASCIII. Since the pattern character is above + * 255, and its only other match is below 256, the only + * legal match will be to itself. We have thrown away + * the original, so have to compute which is the one + * above 255. */ + if ((c1 < 256) != (c2 < 256)) { + if ((OP(text_node) == EXACTFL + && ! IN_UTF8_CTYPE_LOCALE) + || ((OP(text_node) == EXACTFA + || OP(text_node) == EXACTFA_NO_TRIE) + && (isASCII(c1) || isASCII(c2)))) + { + if (c1 < 256) { + c1 = c2; + } + else { + c2 = c1; + } + } + } + } + } + } + else /* Here, c1 is <= 255 */ + if (utf8_target + && HAS_NONLATIN1_FOLD_CLOSURE(c1) + && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) + && ((OP(text_node) != EXACTFA + && OP(text_node) != EXACTFA_NO_TRIE) + || ! isASCII(c1))) + { + /* Here, there could be something above Latin1 in the target + * which folds to this character in the pattern. All such + * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more + * than two characters involved in their folds, so are outside + * the scope of this function */ + if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + } + else { + use_chrtest_void = TRUE; + } + } + else { /* Here nothing above Latin1 can fold to the pattern + character */ + switch (OP(text_node)) { + + case EXACTFL: /* /l rules */ + c2 = PL_fold_locale[c1]; + break; + + case EXACTF: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + if (! utf8_target) { /* /d rules */ + c2 = PL_fold[c1]; + break; + } + /* FALLTHROUGH */ + /* /u rules for all these. This happens to work for + * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA_NO_TRIE: /* This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: + c2 = PL_fold_latin1[c1]; + break; + + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); + assert(0); /* NOTREACHED */ + } + } + } + } + + /* Here have figured things out. Set up the returns */ + if (use_chrtest_void) { + *c2p = *c1p = CHRTEST_VOID; + } + else if (utf8_target) { + if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */ + uvchr_to_utf8(c1_utf8, c1); + uvchr_to_utf8(c2_utf8, c2); + } + + /* Invariants are stored in both the utf8 and byte outputs; Use + * negative numbers otherwise for the byte ones. Make sure that the + * byte ones are the same iff the utf8 ones are the same */ + *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1; + *c2p = (UTF8_IS_INVARIANT(*c2_utf8)) + ? *c2_utf8 + : (c1 == c2) + ? CHRTEST_NOT_A_CP_1 + : CHRTEST_NOT_A_CP_2; + } + else if (c1 > 255) { + if (c2 > 255) { /* both possibilities are above what a non-utf8 string + can represent */ + return FALSE; + } + + *c1p = *c2p = c2; /* c2 is the only representable value */ + } + else { /* c1 is representable; see about c2 */ + *c1p = c1; + *c2p = (c2 < 256) ? c2 : c1; + } + + return TRUE; +} + +/* returns -1 on failure, $+[0] on success */ +STATIC SSize_t +S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) +{ +#if PERL_VERSION < 9 && !defined(PERL_CORE) + dMY_CXT; +#endif + dVAR; + const bool utf8_target = reginfo->is_utf8_target; + const U32 uniflags = UTF8_ALLOW_DEFAULT; + REGEXP *rex_sv = reginfo->prog; + regexp *rex = ReANY(rex_sv); + RXi_GET_DECL(rex,rexi); + /* the current state. This is a cached copy of PL_regmatch_state */ + regmatch_state *st; + /* cache heavy used fields of st in registers */ + regnode *scan; + regnode *next; + U32 n = 0; /* general value; init to avoid compiler warning */ + SSize_t ln = 0; /* len or last; init to avoid compiler warning */ + char *locinput = startpos; + char *pushinput; /* where to continue after a PUSH */ + I32 nextchr; /* is always set to UCHARAT(locinput) */ + + bool result = 0; /* return value of S_regmatch */ + int depth = 0; /* depth of backtrack stack */ + U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ + const U32 max_nochange_depth = + (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ? + 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; + regmatch_state *yes_state = NULL; /* state to pop to on success of + subpattern */ + /* mark_state piggy backs on the yes_state logic so that when we unwind + the stack on success we can update the mark_state as we go */ + regmatch_state *mark_state = NULL; /* last mark state we have seen */ + regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ + struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ + U32 state_num; + bool no_final = 0; /* prevent failure from backtracking? */ + bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ + char *startpoint = locinput; + SV *popmark = NULL; /* are we looking for a mark? */ + SV *sv_commit = NULL; /* last mark name seen in failure */ + SV *sv_yes_mark = NULL; /* last mark name we have seen + during a successful match */ + U32 lastopen = 0; /* last open we saw */ + bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; + SV* const oreplsv = GvSVn(PL_replgv); + /* these three flags are set by various ops to signal information to + * the very next op. They have a useful lifetime of exactly one loop + * iteration, and are not preserved or restored by state pushes/pops + */ + bool sw = 0; /* the condition value in (?(cond)a|b) */ + bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ + int logical = 0; /* the following EVAL is: + 0: (?{...}) + 1: (?(?{...})X|Y) + 2: (??{...}) + or the following IFMATCH/UNLESSM is: + false: plain (?=foo) + true: used as a condition: (?(?=foo)) + */ + PAD* last_pad = NULL; + dMULTICALL; + I32 gimme = G_SCALAR; + CV *caller_cv = NULL; /* who called us */ + CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ + CHECKPOINT runops_cp; /* savestack position before executing EVAL */ + U32 maxopenparen = 0; /* max '(' index seen so far */ + int to_complement; /* Invert the result? */ + _char_class_number classnum; + bool is_utf8_pat = reginfo->is_utf8_pat; + +#ifdef DEBUGGING + GET_RE_DEBUG_FLAGS_DECL; +#endif + + /* protect against undef(*^R) */ + SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); + + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ + multicall_oldcatch = 0; + multicall_cv = NULL; + cx = NULL; + PERL_UNUSED_VAR(multicall_cop); + PERL_UNUSED_VAR(newsp); + + + PERL_ARGS_ASSERT_REGMATCH; + + DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ + PerlIO_printf(Perl_debug_log,"regmatch start\n"); + })); + + st = PL_regmatch_state; + + /* Note that nextchr is a byte even in UTF */ + SET_nextchr; + scan = prog; + while (scan != NULL) { + + DEBUG_EXECUTE_r( { + SV * const prop = sv_newmortal(); + regnode *rnext=regnext(scan); + DUMP_EXEC_POS( locinput, scan, utf8_target ); + regprop(rex, prop, scan, reginfo); + + PerlIO_printf(Perl_debug_log, + "%3"IVdf":%*s%s(%"IVdf")\n", + (IV)(scan - rexi->program), depth*2, "", + SvPVX_const(prop), + (PL_regkind[OP(scan)] == END || !rnext) ? + 0 : (IV)(rnext - rexi->program)); + }); + + next = scan + NEXT_OFF(scan); + if (next == scan) + next = NULL; + state_num = OP(scan); + + reenter_switch: + to_complement = 0; + + SET_nextchr; + assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); + + switch (state_num) { + case BOL: /* /^../ */ + case SBOL: /* /^../s */ + if (locinput == reginfo->strbeg) + break; + sayNO; + + case MBOL: /* /^../m */ + if (locinput == reginfo->strbeg || + (!NEXTCHR_IS_EOS && locinput[-1] == '\n')) + { + break; + } + sayNO; + + case GPOS: /* \G */ + if (locinput == reginfo->ganch) + break; + sayNO; + + case KEEPS: /* \K */ + /* update the startpoint */ + st->u.keeper.val = rex->offs[0].start; + rex->offs[0].start = locinput - reginfo->strbeg; + PUSH_STATE_GOTO(KEEPS_next, next, locinput); + assert(0); /*NOTREACHED*/ + case KEEPS_next_fail: + /* rollback the start point change */ + rex->offs[0].start = st->u.keeper.val; + sayNO_SILENT; + assert(0); /*NOTREACHED*/ + + case MEOL: /* /..$/m */ + if (!NEXTCHR_IS_EOS && nextchr != '\n') + sayNO; + break; + + case EOL: /* /..$/ */ + /* FALLTHROUGH */ + case SEOL: /* /..$/s */ + if (!NEXTCHR_IS_EOS && nextchr != '\n') + sayNO; + if (reginfo->strend - locinput > 1) + sayNO; + break; + + case EOS: /* \z */ + if (!NEXTCHR_IS_EOS) + sayNO; + break; + + case SANY: /* /./s */ + if (NEXTCHR_IS_EOS) + sayNO; + goto increment_locinput; + + case CANY: /* \C */ + if (NEXTCHR_IS_EOS) + sayNO; + locinput++; + break; + + case REG_ANY: /* /./ */ + if ((NEXTCHR_IS_EOS) || nextchr == '\n') + sayNO; + goto increment_locinput; + + +#undef ST +#define ST st->u.trie + case TRIEC: /* (ab|cd) with known charclass */ + /* In this case the charclass data is available inline so + we can fail fast without a lot of extra overhead. + */ + if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + assert(0); /* NOTREACHED */ + } + /* FALLTHROUGH */ + case TRIE: /* (ab|cd) */ + /* the basic plan of execution of the trie is: + * At the beginning, run though all the states, and + * find the longest-matching word. Also remember the position + * of the shortest matching word. For example, this pattern: + * 1 2 3 4 5 + * ab|a|x|abcd|abc + * when matched against the string "abcde", will generate + * accept states for all words except 3, with the longest + * matching word being 4, and the shortest being 2 (with + * the position being after char 1 of the string). + * + * Then for each matching word, in word order (i.e. 1,2,4,5), + * we run the remainder of the pattern; on each try setting + * the current position to the character following the word, + * returning to try the next word on failure. + * + * We avoid having to build a list of words at runtime by + * using a compile-time structure, wordinfo[].prev, which + * gives, for each word, the previous accepting word (if any). + * In the case above it would contain the mappings 1->2, 2->0, + * 3->0, 4->5, 5->1. We can use this table to generate, from + * the longest word (4 above), a list of all words, by + * following the list of prev pointers; this gives us the + * unordered list 4,5,1,2. Then given the current word we have + * just tried, we can go through the list and find the + * next-biggest word to try (so if we just failed on word 2, + * the next in the list is 4). + * + * Since at runtime we don't record the matching position in + * the string for each word, we have to work that out for + * each word we're about to process. The wordinfo table holds + * the character length of each word; given that we recorded + * at the start: the position of the shortest word and its + * length in chars, we just need to move the pointer the + * difference between the two char lengths. Depending on + * Unicode status and folding, that's cheap or expensive. + * + * This algorithm is optimised for the case where are only a + * small number of accept states, i.e. 0,1, or maybe 2. + * With lots of accepts states, and having to try all of them, + * it becomes quadratic on number of accept states to find all + * the next words. + */ + + { + /* what type of TRIE am I? (utf8 makes this contextual) */ + DECL_TRIE_TYPE(scan); + + /* what trie are we using right now */ + reg_trie_data * const trie + = (reg_trie_data*)rexi->data->data[ ARG( scan ) ]; + HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); + U32 state = trie->startstate; + + if ( trie->bitmap + && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr))) + { + if (trie->states[ state ].wordnum) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %smatched empty string...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + if (!trie->jump) + break; + } else { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + } + } + + { + U8 *uc = ( U8* )locinput; + + STRLEN len = 0; + STRLEN foldlen = 0; + U8 *uscan = (U8*)NULL; + U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + U32 charcount = 0; /* how many input chars we have matched */ + U32 accepted = 0; /* have we seen any accepting states? */ + + ST.jump = trie->jump; + ST.me = scan; + ST.firstpos = NULL; + ST.longfold = FALSE; /* char longer if folded => it's harder */ + ST.nextword = 0; + + /* fully traverse the TRIE; note the position of the + shortest accept state and the wordnum of the longest + accept state */ + + while ( state && uc <= (U8*)(reginfo->strend) ) { + U32 base = trie->states[ state ].trans.base; + UV uvc = 0; + U16 charid = 0; + U16 wordnum; + wordnum = trie->states[ state ].wordnum; + + if (wordnum) { /* it's an accept state */ + if (!accepted) { + accepted = 1; + /* record first match position */ + if (ST.longfold) { + ST.firstpos = (U8*)locinput; + ST.firstchars = 0; + } + else { + ST.firstpos = uc; + ST.firstchars = charcount; + } + } + if (!ST.nextword || wordnum < ST.nextword) + ST.nextword = wordnum; + ST.topword = wordnum; + } + + DEBUG_TRIE_EXECUTE_r({ + DUMP_EXEC_POS( (char *)uc, scan, utf8_target ); + PerlIO_printf( Perl_debug_log, + "%*s %sState: %4"UVxf" Accepted: %c ", + 2+depth * 2, "", PL_colors[4], + (UV)state, (accepted ? 'Y' : 'N')); + }); + + /* read a char and goto next state */ + if ( base && (foldlen || uc < (U8*)(reginfo->strend))) { + I32 offset; + REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, + uscan, len, uvc, charid, foldlen, + foldbuf, uniflags); + charcount++; + if (foldlen>0) + ST.longfold = TRUE; + if (charid && + ( ((offset = + base + charid - 1 - trie->uniquecharcount)) >= 0) + + && ((U32)offset < trie->lasttrans) + && trie->trans[offset].check == state) + { + state = trie->trans[offset].next; + } + else { + state = 0; + } + uc += len; + + } + else { + state = 0; + } + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log, + "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n", + charid, uvc, (UV)state, PL_colors[5] ); + ); + } + if (!accepted) + sayNO; + + /* calculate total number of accept states */ + { + U16 w = ST.topword; + accepted = 0; + while (w) { + w = trie->wordinfo[w].prev; + accepted++; + } + ST.accepted = accepted; + } + + DEBUG_EXECUTE_r( + PerlIO_printf( Perl_debug_log, + "%*s %sgot %"IVdf" possible matches%s\n", + REPORT_CODE_OFF + depth * 2, "", + PL_colors[4], (IV)ST.accepted, PL_colors[5] ); + ); + goto trie_first_try; /* jump into the fail handler */ + }} + assert(0); /* NOTREACHED */ + + case TRIE_next_fail: /* we failed - try next alternative */ + { + U8 *uc; + if ( ST.jump) { + REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + if (!--ST.accepted) { + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sTRIE failed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + PL_colors[5] ); + }); + sayNO_SILENT; + } + { + /* Find next-highest word to process. Note that this code + * is O(N^2) per trie run (O(N) per branch), so keep tight */ + U16 min = 0; + U16 word; + U16 const nextword = ST.nextword; + reg_trie_wordinfo * const wordinfo + = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo; + for (word=ST.topword; word; word=wordinfo[word].prev) { + if (word > nextword && (!min || word < min)) + min = word; + } + ST.nextword = min; + } + + trie_first_try: + if (do_cutgroup) { + do_cutgroup = 0; + no_final = 0; + } + + if ( ST.jump) { + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + REGCP_SET(ST.cp); + } + + /* find start char of end of current word */ + { + U32 chars; /* how many chars to skip */ + reg_trie_data * const trie + = (reg_trie_data*)rexi->data->data[ARG(ST.me)]; + + assert((trie->wordinfo[ST.nextword].len - trie->prefixlen) + >= ST.firstchars); + chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen) + - ST.firstchars; + uc = ST.firstpos; + + if (ST.longfold) { + /* the hard option - fold each char in turn and find + * its folded length (which may be different */ + U8 foldbuf[UTF8_MAXBYTES_CASE + 1]; + STRLEN foldlen; + STRLEN len; + UV uvc; + U8 *uscan; + + while (chars) { + if (utf8_target) { + uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, + uniflags); + uc += len; + } + else { + uvc = *uc; + uc++; + } + uvc = to_uni_fold(uvc, foldbuf, &foldlen); + uscan = foldbuf; + while (foldlen) { + if (!--chars) + break; + uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, + uniflags); + uscan += len; + foldlen -= len; + } + } + } + else { + if (utf8_target) + while (chars--) + uc += UTF8SKIP(uc); + else + uc += chars; + } + } + + scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) + ? ST.jump[ST.nextword] + : NEXT_OFF(ST.me)); + + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sTRIE matched word #%d, continuing%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + ST.nextword, + PL_colors[5] + ); + }); + + if (ST.accepted > 1 || has_cutgroup) { + PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc); + assert(0); /* NOTREACHED */ + } + /* only one choice left - just continue */ + DEBUG_EXECUTE_r({ + AV *const trie_words + = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]); + SV ** const tmp = av_fetch( trie_words, + ST.nextword-1, 0 ); + SV *sv= tmp ? sv_newmortal() : NULL; + + PerlIO_printf( Perl_debug_log, + "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], + ST.nextword, + tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII + ) + : "not compiled under -Dr", + PL_colors[5] ); + }); + + locinput = (char*)uc; + continue; /* execute rest of RE */ + assert(0); /* NOTREACHED */ + } +#undef ST + + case EXACT: { /* /abc/ */ + char *s = STRING(scan); + ln = STR_LEN(scan); + if (utf8_target != is_utf8_pat) { + /* The target and the pattern have differing utf8ness. */ + char *l = locinput; + const char * const e = s + ln; + + if (utf8_target) { + /* The target is utf8, the pattern is not utf8. + * Above-Latin1 code points can't match the pattern; + * invariants match exactly, and the other Latin1 ones need + * to be downgraded to a single byte in order to do the + * comparison. (If we could be confident that the target + * is not malformed, this could be refactored to have fewer + * tests by just assuming that if the first bytes match, it + * is an invariant, but there are tests in the test suite + * dealing with (??{...}) which violate this) */ + while (s < e) { + if (l >= reginfo->strend + || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) + { + sayNO; + } + if (UTF8_IS_INVARIANT(*(U8*)l)) { + if (*l != *s) { + sayNO; + } + l++; + } + else { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { + sayNO; + } + l += 2; + } + s++; + } + } + else { + /* The target is not utf8, the pattern is utf8. */ + while (s < e) { + if (l >= reginfo->strend + || UTF8_IS_ABOVE_LATIN1(* (U8*) s)) + { + sayNO; + } + if (UTF8_IS_INVARIANT(*(U8*)s)) { + if (*s != *l) { + sayNO; + } + s++; + } + else { + if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { + sayNO; + } + s += 2; + } + l++; + } + } + locinput = l; + } + else { + /* The target and the pattern have the same utf8ness. */ + /* Inline the first character, for speed. */ + if (reginfo->strend - locinput < ln + || UCHARAT(s) != nextchr + || (ln > 1 && memNE(s, locinput, ln))) + { + sayNO; + } + locinput += ln; + } + break; + } + + case EXACTFL: { /* /abc/il */ + re_fold_t folder; + const U8 * fold_array; + const char * s; + U32 fold_utf8_flags; + + folder = foldEQ_locale; + fold_array = PL_fold_locale; + fold_utf8_flags = FOLDEQ_LOCALE; + goto do_exactf; + + case EXACTFU_SS: /* /\x{df}/iu */ + case EXACTFU: /* /abc/iu */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; + goto do_exactf; + + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: /* /abc/iaa */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf; + + case EXACTF: /* /abc/i This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + folder = foldEQ; + fold_array = PL_fold; + fold_utf8_flags = 0; + + do_exactf: + s = STRING(scan); + ln = STR_LEN(scan); + + if (utf8_target + || is_utf8_pat + || state_num == EXACTFU_SS + || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) + { + /* Either target or the pattern are utf8, or has the issue where + * the fold lengths may differ. */ + const char * const l = locinput; + char *e = reginfo->strend; + + if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat, + l, &e, 0, utf8_target, fold_utf8_flags)) + { + sayNO; + } + locinput = e; + break; + } + + /* Neither the target nor the pattern are utf8 */ + if (UCHARAT(s) != nextchr + && !NEXTCHR_IS_EOS + && UCHARAT(s) != fold_array[nextchr]) + { + sayNO; + } + if (reginfo->strend - locinput < ln) + sayNO; + if (ln > 1 && ! folder(s, locinput, ln)) + sayNO; + locinput += ln; + break; + } + + /* XXX Could improve efficiency by separating these all out using a + * macro or in-line function. At that point regcomp.c would no longer + * have to set the FLAGS fields of these */ + case BOUNDL: /* /\b/l */ + case NBOUNDL: /* /\B/l */ + case BOUND: /* /\b/ */ + case BOUNDU: /* /\b/u */ + case BOUNDA: /* /\b/a */ + case NBOUND: /* /\B/ */ + case NBOUNDU: /* /\B/u */ + case NBOUNDA: /* /\B/a */ + /* was last char in word? */ + if (utf8_target + && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET + && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET) + { + if (locinput == reginfo->strbeg) + ln = '\n'; + else { + const U8 * const r = + reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); + + ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, + 0, uniflags); + } + if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { + ln = isWORDCHAR_uni(ln); + if (NEXTCHR_IS_EOS) + n = 0; + else { + LOAD_UTF8_CHARCLASS_ALNUM(); + n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput, + utf8_target); + } + } + else { + ln = isWORDCHAR_LC_uvchr(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); + } + } + else { + + /* Here the string isn't utf8, or is utf8 and only ascii + * characters are to match \w. In the latter case looking at + * the byte just prior to the current one may be just the final + * byte of a multi-byte character. This is ok. There are two + * cases: + * 1) it is a single byte character, and then the test is doing + * just what it's supposed to. + * 2) it is a multi-byte character, in which case the final + * byte is never mistakable for ASCII, and so the test + * will say it is not a word character, which is the + * correct answer. */ + ln = (locinput != reginfo->strbeg) ? + UCHARAT(locinput - 1) : '\n'; + switch (FLAGS(scan)) { + case REGEX_UNICODE_CHARSET: + ln = isWORDCHAR_L1(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr); + break; + case REGEX_LOCALE_CHARSET: + ln = isWORDCHAR_LC(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr); + break; + case REGEX_DEPENDS_CHARSET: + ln = isWORDCHAR(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr); + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + ln = isWORDCHAR_A(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr); + break; + default: + Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan)); + } + } + /* Note requires that all BOUNDs be lower than all NBOUNDs in + * regcomp.sym */ + if (((!ln) == (!n)) == (OP(scan) < NBOUND)) + sayNO; + break; + + case ANYOF: /* /[abc]/ */ + if (NEXTCHR_IS_EOS) + sayNO; + if (utf8_target) { + if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + utf8_target)) + sayNO; + locinput += UTF8SKIP(locinput); + } + else { + if (!REGINCLASS(rex, scan, (U8*)locinput)) + sayNO; + locinput++; + } + break; + + /* The argument (FLAGS) to all the POSIX node types is the class number + * */ + + case NPOSIXL: /* \W or [:^punct:] etc. under /l */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: /* \w or [:punct:] etc. under /l */ + if (NEXTCHR_IS_EOS) + sayNO; + + /* Use isFOO_lc() for characters within Latin1. (Note that + * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else + * wouldn't be invariant) */ + if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { + if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) { + sayNO; + } + } + else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { + if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, + *(locinput + 1)))))) + { + sayNO; + } + } + else { /* Here, must be an above Latin-1 code point */ + goto utf8_posix_not_eos; + } + + /* Here, must be utf8 */ + locinput += UTF8SKIP(locinput); + break; + + case NPOSIXD: /* \W or [:^punct:] etc. under /d */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXD: /* \w or [:punct:] etc. under /d */ + if (utf8_target) { + goto utf8_posix; + } + goto posixa; + + case NPOSIXA: /* \W or [:^punct:] etc. under /a */ + + if (NEXTCHR_IS_EOS) { + sayNO; + } + + /* All UTF-8 variants match */ + if (! UTF8_IS_INVARIANT(nextchr)) { + goto increment_locinput; + } + + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXA: /* \w or [:punct:] etc. under /a */ + + posixa: + /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in + * UTF-8, and also from NPOSIXA even in UTF-8 when the current + * character is a single byte */ + + if (NEXTCHR_IS_EOS + || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr, + FLAGS(scan))))) + { + sayNO; + } + + /* Here we are either not in utf8, or we matched a utf8-invariant, + * so the next char is the next byte */ + locinput++; + break; + + case NPOSIXU: /* \W or [:^punct:] etc. under /u */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: /* \w or [:punct:] etc. under /u */ + utf8_posix: + if (NEXTCHR_IS_EOS) { + sayNO; + } + utf8_posix_not_eos: + + /* Use _generic_isCC() for characters within Latin1. (Note that + * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else + * wouldn't be invariant) */ + if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { + if (! (to_complement ^ cBOOL(_generic_isCC(nextchr, + FLAGS(scan))))) + { + sayNO; + } + locinput++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { + if (! (to_complement + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, + *(locinput + 1)), + FLAGS(scan))))) + { + sayNO; + } + locinput += 2; + } + else { /* Handle above Latin-1 code points */ + classnum = (_char_class_number) FLAGS(scan); + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Here, uses a swash to find such code points. Load if if + * not done already */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] + = _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + if (! (to_complement + ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) locinput, TRUE)))) + { + sayNO; + } + } + else { /* Here, uses macros to find above Latin-1 code points */ + switch (classnum) { + case _CC_ENUM_SPACE: /* XXX would require separate + code if we revert the change + of \v matching this */ + case _CC_ENUM_PSXSPC: + if (! (to_complement + ^ cBOOL(is_XPERLSPACE_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_BLANK: + if (! (to_complement + ^ cBOOL(is_HORIZWS_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_XDIGIT: + if (! (to_complement + ^ cBOOL(is_XDIGIT_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_VERTSPACE: + if (! (to_complement + ^ cBOOL(is_VERTWS_high(locinput)))) + { + sayNO; + } + break; + default: /* The rest, e.g. [:cntrl:], can't match + above Latin1 */ + if (! to_complement) { + sayNO; + } + break; + } + } + locinput += UTF8SKIP(locinput); + } + break; + + case CLUMP: /* Match \X: logical Unicode character. This is defined as + a Unicode extended Grapheme Cluster */ + /* From http://www.unicode.org/reports/tr29 (5.2 version). An + extended Grapheme Cluster is: + + CR LF + | Prepend* Begin Extend* + | . + + Begin is: ( Special_Begin | ! Control ) + Special_Begin is: ( Regional-Indicator+ | Hangul-syllable ) + Extend is: ( Grapheme_Extend | Spacing_Mark ) + Control is: [ GCB_Control | CR | LF ] + Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) )) + + If we create a 'Regular_Begin' = Begin - Special_Begin, then + we can rewrite + + Begin is ( Regular_Begin + Special Begin ) + + It turns out that 98.4% of all Unicode code points match + Regular_Begin. Doing it this way eliminates a table match in + the previous implementation for almost all Unicode code points. + + There is a subtlety with Prepend* which showed up in testing. + Note that the Begin, and only the Begin is required in: + | Prepend* Begin Extend* + Also, Begin contains '! Control'. A Prepend must be a + '! Control', which means it must also be a Begin. What it + comes down to is that if we match Prepend* and then find no + suitable Begin afterwards, that if we backtrack the last + Prepend, that one will be a suitable Begin. + */ + + if (NEXTCHR_IS_EOS) + sayNO; + if (! utf8_target) { + + /* Match either CR LF or '.', as all the other possibilities + * require utf8 */ + locinput++; /* Match the . or CR */ + if (nextchr == '\r' /* And if it was CR, and the next is LF, + match the LF */ + && locinput < reginfo->strend + && UCHARAT(locinput) == '\n') + { + locinput++; + } + } + else { + + /* Utf8: See if is ( CR LF ); already know that locinput < + * reginfo->strend, so locinput+1 is in bounds */ + if ( nextchr == '\r' && locinput+1 < reginfo->strend + && UCHARAT(locinput + 1) == '\n') + { + locinput += 2; + } + else { + STRLEN len; + + /* In case have to backtrack to beginning, then match '.' */ + char *starting = locinput; + + /* In case have to backtrack the last prepend */ + char *previous_prepend = NULL; + + LOAD_UTF8_CHARCLASS_GCB(); + + /* Match (prepend)* */ + while (locinput < reginfo->strend + && (len = is_GCB_Prepend_utf8(locinput))) + { + previous_prepend = locinput; + locinput += len; + } + + /* As noted above, if we matched a prepend character, but + * the next thing won't match, back off the last prepend we + * matched, as it is guaranteed to match the begin */ + if (previous_prepend + && (locinput >= reginfo->strend + || (! swash_fetch(PL_utf8_X_regular_begin, + (U8*)locinput, utf8_target) + && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput))) + ) + { + locinput = previous_prepend; + } + + /* Note that here we know reginfo->strend > locinput, as we + * tested that upon input to this switch case, and if we + * moved locinput forward, we tested the result just above + * and it either passed, or we backed off so that it will + * now pass */ + if (swash_fetch(PL_utf8_X_regular_begin, + (U8*)locinput, utf8_target)) { + locinput += UTF8SKIP(locinput); + } + else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) { + + /* Here did not match the required 'Begin' in the + * second term. So just match the very first + * character, the '.' of the final term of the regex */ + locinput = starting + UTF8SKIP(starting); + goto exit_utf8; + } else { + + /* Here is a special begin. It can be composed of + * several individual characters. One possibility is + * RI+ */ + if ((len = is_GCB_RI_utf8(locinput))) { + locinput += len; + while (locinput < reginfo->strend + && (len = is_GCB_RI_utf8(locinput))) + { + locinput += len; + } + } else if ((len = is_GCB_T_utf8(locinput))) { + /* Another possibility is T+ */ + locinput += len; + while (locinput < reginfo->strend + && (len = is_GCB_T_utf8(locinput))) + { + locinput += len; + } + } else { + + /* Here, neither RI+ nor T+; must be some other + * Hangul. That means it is one of the others: L, + * LV, LVT or V, and matches: + * L* (L | LVT T* | V * V* T* | LV V* T*) */ + + /* Match L* */ + while (locinput < reginfo->strend + && (len = is_GCB_L_utf8(locinput))) + { + locinput += len; + } + + /* Here, have exhausted L*. If the next character + * is not an LV, LVT nor V, it means we had to have + * at least one L, so matches L+ in the original + * equation, we have a complete hangul syllable. + * Are done. */ + + if (locinput < reginfo->strend + && is_GCB_LV_LVT_V_utf8(locinput)) + { + /* Otherwise keep going. Must be LV, LVT or V. + * See if LVT, by first ruling out V, then LV */ + if (! is_GCB_V_utf8(locinput) + /* All but every TCount one is LV */ + && (valid_utf8_to_uvchr((U8 *) locinput, + NULL) + - SBASE) + % TCount != 0) + { + locinput += UTF8SKIP(locinput); + } else { + + /* Must be V or LV. Take it, then match + * V* */ + locinput += UTF8SKIP(locinput); + while (locinput < reginfo->strend + && (len = is_GCB_V_utf8(locinput))) + { + locinput += len; + } + } + + /* And any of LV, LVT, or V can be followed + * by T* */ + while (locinput < reginfo->strend + && (len = is_GCB_T_utf8(locinput))) + { + locinput += len; + } + } + } + } + + /* Match any extender */ + while (locinput < reginfo->strend + && swash_fetch(PL_utf8_X_extend, + (U8*)locinput, utf8_target)) + { + locinput += UTF8SKIP(locinput); + } + } + exit_utf8: + if (locinput > reginfo->strend) sayNO; + } + break; + + case NREFFL: /* /\g{name}/il */ + { /* The capture buffer cases. The ones beginning with N for the + named buffers just convert to the equivalent numbered and + pretend they were called as the corresponding numbered buffer + op. */ + /* don't initialize these in the declaration, it makes C++ + unhappy */ + const char *s; + char type; + re_fold_t folder; + const U8 *fold_array; + UV utf8_fold_flags; + + folder = foldEQ_locale; + fold_array = PL_fold_locale; + type = REFFL; + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_nref; + + case NREFFA: /* /\g{name}/iaa */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + type = REFFA; + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_nref; + + case NREFFU: /* /\g{name}/iu */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + type = REFFU; + utf8_fold_flags = 0; + goto do_nref; + + case NREFF: /* /\g{name}/i */ + folder = foldEQ; + fold_array = PL_fold; + type = REFF; + utf8_fold_flags = 0; + goto do_nref; + + case NREF: /* /\g{name}/ */ + type = REF; + folder = NULL; + fold_array = NULL; + utf8_fold_flags = 0; + do_nref: + + /* For the named back references, find the corresponding buffer + * number */ + n = reg_check_named_buff_matched(rex,scan); + + if ( ! n ) { + sayNO; + } + goto do_nref_ref_common; + + case REFFL: /* /\1/il */ + folder = foldEQ_locale; + fold_array = PL_fold_locale; + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_ref; + + case REFFA: /* /\1/iaa */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_ref; + + case REFFU: /* /\1/iu */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + utf8_fold_flags = 0; + goto do_ref; + + case REFF: /* /\1/i */ + folder = foldEQ; + fold_array = PL_fold; + utf8_fold_flags = 0; + goto do_ref; + + case REF: /* /\1/ */ + folder = NULL; + fold_array = NULL; + utf8_fold_flags = 0; + + do_ref: + type = OP(scan); + n = ARG(scan); /* which paren pair */ + + do_nref_ref_common: + ln = rex->offs[n].start; + reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ + if (rex->lastparen < n || ln == -1) + sayNO; /* Do not match unless seen CLOSEn. */ + if (ln == rex->offs[n].end) + break; + + s = reginfo->strbeg + ln; + if (type != REF /* REF can do byte comparison */ + && (utf8_target || type == REFFU || type == REFFL)) + { + char * limit = reginfo->strend; + + /* This call case insensitively compares the entire buffer + * at s, with the current input starting at locinput, but + * not going off the end given by reginfo->strend, and + * returns in upon success, how much of the + * current input was matched */ + if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target, + locinput, &limit, 0, utf8_target, utf8_fold_flags)) + { + sayNO; + } + locinput = limit; + break; + } + + /* Not utf8: Inline the first character, for speed. */ + if (!NEXTCHR_IS_EOS && + UCHARAT(s) != nextchr && + (type == REF || + UCHARAT(s) != fold_array[nextchr])) + sayNO; + ln = rex->offs[n].end - ln; + if (locinput + ln > reginfo->strend) + sayNO; + if (ln > 1 && (type == REF + ? memNE(s, locinput, ln) + : ! folder(s, locinput, ln))) + sayNO; + locinput += ln; + break; + } + + case NOTHING: /* null op; e.g. the 'nothing' following + * the '*' in m{(a+|b)*}' */ + break; + case TAIL: /* placeholder while compiling (A|B|C) */ + break; + + case BACK: /* ??? doesn't appear to be used ??? */ + break; + +#undef ST +#define ST st->u.eval + { + SV *ret; + REGEXP *re_sv; + regexp *re; + regexp_internal *rei; + regnode *startpoint; + + case GOSTART: /* (?R) */ + case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */ + if (cur_eval && cur_eval->locinput==locinput) { + if (cur_eval->u.eval.close_paren == (U32)ARG(scan)) + Perl_croak(aTHX_ "Infinite recursion in regex"); + if ( ++nochange_depth > max_nochange_depth ) + Perl_croak(aTHX_ + "Pattern subroutine nesting without pos change" + " exceeded limit in regex"); + } else { + nochange_depth = 0; + } + re_sv = rex_sv; + re = rex; + rei = rexi; + if (OP(scan)==GOSUB) { + startpoint = scan + ARG2L(scan); + ST.close_paren = ARG(scan); + } else { + startpoint = rei->program+1; + ST.close_paren = 0; + } + + /* Save all the positions seen so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + + /* and then jump to the code we share with EVAL */ + goto eval_recurse_doit; + + assert(0); /* NOTREACHED */ + + case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ + if (cur_eval && cur_eval->locinput==locinput) { + if ( ++nochange_depth > max_nochange_depth ) + Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); + } else { + nochange_depth = 0; + } + { + /* execute the code in the {...} */ + + dSP; + IV before; + OP * const oop = PL_op; + COP * const ocurcop = PL_curcop; + OP *nop; + CV *newcv; + + /* save *all* paren positions */ + regcppush(rex, 0, maxopenparen); + REGCP_SET(runops_cp); + + if (!caller_cv) + caller_cv = find_runcv(NULL); + + n = ARG(scan); + + if (rexi->data->what[n] == 'r') { /* code from an external qr */ + newcv = (ReANY( + (REGEXP*)(rexi->data->data[n]) + ))->qr_anoncv + ; + nop = (OP*)rexi->data->data[n+1]; + } + else if (rexi->data->what[n] == 'l') { /* literal code */ + newcv = caller_cv; + nop = (OP*)rexi->data->data[n]; + assert(CvDEPTH(newcv)); + } + else { + /* literal with own CV */ + assert(rexi->data->what[n] == 'L'); + newcv = rex->qr_anoncv; + nop = (OP*)rexi->data->data[n]; + } + + /* normally if we're about to execute code from the same + * CV that we used previously, we just use the existing + * CX stack entry. However, its possible that in the + * meantime we may have backtracked, popped from the save + * stack, and undone the SAVECOMPPAD(s) associated with + * PUSH_MULTICALL; in which case PL_comppad no longer + * points to newcv's pad. */ + if (newcv != last_pushed_cv || PL_comppad != last_pad) + { + U8 flags = (CXp_SUB_RE | + ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0)); + if (last_pushed_cv) { + CHANGE_MULTICALL_FLAGS(newcv, flags); + } + else { + PUSH_MULTICALL_FLAGS(newcv, flags); + } + last_pushed_cv = newcv; + } + else { + /* these assignments are just to silence compiler + * warnings */ + multicall_cop = NULL; + newsp = NULL; + } + last_pad = PL_comppad; + + /* the initial nextstate you would normally execute + * at the start of an eval (which would cause error + * messages to come from the eval), may be optimised + * away from the execution path in the regex code blocks; + * so manually set PL_curcop to it initially */ + { + OP *o = cUNOPx(nop)->op_first; + assert(o->op_type == OP_NULL); + if (o->op_targ == OP_SCOPE) { + o = cUNOPo->op_first; + } + else { + assert(o->op_targ == OP_LEAVE); + o = cUNOPo->op_first; + assert(o->op_type == OP_ENTER); + o = OP_SIBLING(o); + } + + if (o->op_type != OP_STUB) { + assert( o->op_type == OP_NEXTSTATE + || o->op_type == OP_DBSTATE + || (o->op_type == OP_NULL + && ( o->op_targ == OP_NEXTSTATE + || o->op_targ == OP_DBSTATE + ) + ) + ); + PL_curcop = (COP*)o; + } + } + nop = nop->op_next; + + DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, + " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) ); + + rex->offs[0].end = locinput - reginfo->strbeg; + if (reginfo->info_aux_eval->pos_magic) + MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, + reginfo->sv, reginfo->strbeg, + locinput - reginfo->strbeg); + + if (sv_yes_mark) { + SV *sv_mrk = get_sv("REGMARK", 1); + sv_setsv(sv_mrk, sv_yes_mark); + } + + /* we don't use MULTICALL here as we want to call the + * first op of the block of interest, rather than the + * first op of the sub */ + before = (IV)(SP-PL_stack_base); + PL_op = nop; + CALLRUNOPS(aTHX); /* Scalar context. */ + SPAGAIN; + if ((IV)(SP-PL_stack_base) == before) + ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ + else { + ret = POPs; + PUTBACK; + } + + /* before restoring everything, evaluate the returned + * value, so that 'uninit' warnings don't use the wrong + * PL_op or pad. Also need to process any magic vars + * (e.g. $1) *before* parentheses are restored */ + + PL_op = NULL; + + re_sv = NULL; + if (logical == 0) /* (?{})/ */ + sv_setsv(save_scalar(PL_replgv), ret); /* $^R */ + else if (logical == 1) { /* /(?(?{...})X|Y)/ */ + sw = cBOOL(SvTRUE(ret)); + logical = 0; + } + else { /* /(??{}) */ + /* if its overloaded, let the regex compiler handle + * it; otherwise extract regex, or stringify */ + if (SvGMAGICAL(ret)) + ret = sv_mortalcopy(ret); + if (!SvAMAGIC(ret)) { + SV *sv = ret; + if (SvROK(sv)) + sv = SvRV(sv); + if (SvTYPE(sv) == SVt_REGEXP) + re_sv = (REGEXP*) sv; + else if (SvSMAGICAL(ret)) { + MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); + if (mg) + re_sv = (REGEXP *) mg->mg_obj; + } + + /* force any undef warnings here */ + if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { + ret = sv_mortalcopy(ret); + (void) SvPV_force_nolen(ret); + } + } + + } + + /* *** Note that at this point we don't restore + * PL_comppad, (or pop the CxSUB) on the assumption it may + * be used again soon. This is safe as long as nothing + * in the regexp code uses the pad ! */ + PL_op = oop; + PL_curcop = ocurcop; + S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); + PL_curpm = PL_reg_curpm; + + if (logical != 2) + break; + } + + /* only /(??{})/ from now on */ + logical = 0; + { + /* extract RE object from returned value; compiling if + * necessary */ + + if (re_sv) { + re_sv = reg_temp_copy(NULL, re_sv); + } + else { + U32 pm_flags = 0; + + if (SvUTF8(ret) && IN_BYTES) { + /* In use 'bytes': make a copy of the octet + * sequence, but without the flag on */ + STRLEN len; + const char *const p = SvPV(ret, len); + ret = newSVpvn_flags(p, len, SVs_TEMP); + } + if (rex->intflags & PREGf_USE_RE_EVAL) + pm_flags |= PMf_USE_RE_EVAL; + + /* if we got here, it should be an engine which + * supports compiling code blocks and stuff */ + assert(rex->engine && rex->engine->op_comp); + assert(!(scan->flags & ~RXf_PMf_COMPILETIME)); + re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, + rex->engine, NULL, NULL, + /* copy /msix etc to inner pattern */ + scan->flags, + pm_flags); + + if (!(SvFLAGS(ret) + & (SVs_TEMP | SVs_GMG | SVf_ROK)) + && (!SvPADTMP(ret) || SvREADONLY(ret))) { + /* This isn't a first class regexp. Instead, it's + caching a regexp onto an existing, Perl visible + scalar. */ + sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); + } + } + SAVEFREESV(re_sv); + re = ReANY(re_sv); + } + RXp_MATCH_COPIED_off(re); + re->subbeg = rex->subbeg; + re->sublen = rex->sublen; + re->suboffset = rex->suboffset; + re->subcoffset = rex->subcoffset; + re->lastparen = 0; + re->lastcloseparen = 0; + rei = RXi_GET(re); + DEBUG_EXECUTE_r( + debug_start_match(re_sv, utf8_target, locinput, + reginfo->strend, "Matching embedded"); + ); + startpoint = rei->program + 1; + ST.close_paren = 0; /* only used for GOSUB */ + /* Save all the seen positions so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + /* and set maxopenparen to 0, since we are starting a "fresh" match */ + maxopenparen = 0; + /* run the pattern returned from (??{...}) */ + + eval_recurse_doit: /* Share code with GOSUB below this line + * At this point we expect the stack context to be + * set up correctly */ + + /* invalidate the S-L poscache. We're now executing a + * different set of WHILEM ops (and their associated + * indexes) against the same string, so the bits in the + * cache are meaningless. Setting maxiter to zero forces + * the cache to be invalidated and zeroed before reuse. + * XXX This is too dramatic a measure. Ideally we should + * save the old cache and restore when running the outer + * pattern again */ + reginfo->poscache_maxiter = 0; + + /* the new regexp might have a different is_utf8_pat than we do */ + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); + + ST.prev_rex = rex_sv; + ST.prev_curlyx = cur_curlyx; + rex_sv = re_sv; + SET_reg_curpm(rex_sv); + rex = re; + rexi = rei; + cur_curlyx = NULL; + ST.B = next; + ST.prev_eval = cur_eval; + cur_eval = st; + /* now continue from first node in postoned RE */ + PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput); + assert(0); /* NOTREACHED */ + } + + case EVAL_AB: /* cleanup after a successful (??{A})B */ + /* note: this is called twice; first after popping B, then A */ + rex_sv = ST.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); + SET_reg_curpm(rex_sv); + rex = ReANY(rex_sv); + rexi = RXi_GET(rex); + { + /* preserve $^R across LEAVE's. See Bug 121070. */ + SV *save_sv= GvSV(PL_replgv); + SvREFCNT_inc(save_sv); + regcpblow(ST.cp); /* LEAVE in disguise */ + sv_setsv(GvSV(PL_replgv), save_sv); + SvREFCNT_dec(save_sv); + } + cur_eval = ST.prev_eval; + cur_curlyx = ST.prev_curlyx; + + /* Invalidate cache. See "invalidate" comment above. */ + reginfo->poscache_maxiter = 0; + if ( nochange_depth ) + nochange_depth--; + sayYES; + + + case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ + /* note: this is called twice; first after popping B, then A */ + rex_sv = ST.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); + SET_reg_curpm(rex_sv); + rex = ReANY(rex_sv); + rexi = RXi_GET(rex); + + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); + cur_eval = ST.prev_eval; + cur_curlyx = ST.prev_curlyx; + /* Invalidate cache. See "invalidate" comment above. */ + reginfo->poscache_maxiter = 0; + if ( nochange_depth ) + nochange_depth--; + sayNO_SILENT; +#undef ST + + case OPEN: /* ( */ + n = ARG(scan); /* which paren pair */ + rex->offs[n].start_tmp = locinput - reginfo->strbeg; + if (n > maxopenparen) + maxopenparen = n; + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n", + PTR2UV(rex), + PTR2UV(rex->offs), + (UV)n, + (IV)rex->offs[n].start_tmp, + (UV)maxopenparen + )); + lastopen = n; + break; + +/* XXX really need to log other places start/end are set too */ +#define CLOSE_CAPTURE \ + rex->offs[n].start = rex->offs[n].start_tmp; \ + rex->offs[n].end = locinput - reginfo->strbeg; \ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \ + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \ + PTR2UV(rex), \ + PTR2UV(rex->offs), \ + (UV)n, \ + (IV)rex->offs[n].start, \ + (IV)rex->offs[n].end \ + )) + + case CLOSE: /* ) */ + n = ARG(scan); /* which paren pair */ + CLOSE_CAPTURE; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; + if (cur_eval && cur_eval->u.eval.close_paren == n) { + goto fake_end; + } + break; + + case ACCEPT: /* (*ACCEPT) */ + if (ARG(scan)){ + regnode *cursor; + for (cursor=scan; + cursor && OP(cursor)!=END; + cursor=regnext(cursor)) + { + if ( OP(cursor)==CLOSE ){ + n = ARG(cursor); + if ( n <= lastopen ) { + CLOSE_CAPTURE; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; + if ( n == ARG(scan) || (cur_eval && + cur_eval->u.eval.close_paren == n)) + break; + } + } + } + } + goto fake_end; + /*NOTREACHED*/ + + case GROUPP: /* (?(1)) */ + n = ARG(scan); /* which paren pair */ + sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); + break; + + case NGROUPP: /* (?()) */ + /* reg_check_named_buff_matched returns 0 for no match */ + sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); + break; + + case INSUBP: /* (?(R)) */ + n = ARG(scan); + sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n)); + break; + + case DEFINEP: /* (?(DEFINE)) */ + sw = 0; + break; + + case IFTHEN: /* (?(cond)A|B) */ + reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ + if (sw) + next = NEXTOPER(NEXTOPER(scan)); + else { + next = scan + ARG(scan); + if (OP(next) == IFTHEN) /* Fake one. */ + next = NEXTOPER(NEXTOPER(next)); + } + break; + + case LOGICAL: /* modifier for EVAL and IFMATCH */ + logical = scan->flags; + break; + +/******************************************************************* + +The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/ +pattern, where A and B are subpatterns. (For simple A, CURLYM or +STAR/PLUS/CURLY/CURLYN are used instead.) + +A*B is compiled as + +On entry to the subpattern, CURLYX is called. This pushes a CURLYX +state, which contains the current count, initialised to -1. It also sets +cur_curlyx to point to this state, with any previous value saved in the +state block. + +CURLYX then jumps straight to the WHILEM op, rather than executing A, +since the pattern may possibly match zero times (i.e. it's a while {} loop +rather than a do {} while loop). + +Each entry to WHILEM represents a successful match of A. The count in the +CURLYX block is incremented, another WHILEM state is pushed, and execution +passes to A or B depending on greediness and the current count. + +For example, if matching against the string a1a2a3b (where the aN are +substrings that match /A/), then the match progresses as follows: (the +pushed states are interspersed with the bits of strings matched so far): + + + + a1 + a1 a2 + a1 a2 a3 + a1 a2 a3 b + +(Contrast this with something like CURLYM, which maintains only a single +backtrack state: + + a1 + a1 a2 + a1 a2 a3 + a1 a2 a3 b +) + +Each WHILEM state block marks a point to backtrack to upon partial failure +of A or B, and also contains some minor state data related to that +iteration. The CURLYX block, pointed to by cur_curlyx, contains the +overall state, such as the count, and pointers to the A and B ops. + +This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx +must always point to the *current* CURLYX block, the rules are: + +When executing CURLYX, save the old cur_curlyx in the CURLYX state block, +and set cur_curlyx to point the new block. + +When popping the CURLYX block after a successful or unsuccessful match, +restore the previous cur_curlyx. + +When WHILEM is about to execute B, save the current cur_curlyx, and set it +to the outer one saved in the CURLYX block. + +When popping the WHILEM block after a successful or unsuccessful B match, +restore the previous cur_curlyx. + +Here's an example for the pattern (AI* BI)*BO +I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: + +cur_ +curlyx backtrack stack +------ --------------- +NULL +CO +CI ai +CO ai bi +NULL ai bi bo + +At this point the pattern succeeds, and we work back down the stack to +clean up, restoring as we go: + +CO ai bi +CI ai +CO +NULL + +*******************************************************************/ + +#define ST st->u.curlyx + + case CURLYX: /* start of /A*B/ (for complex A) */ + { + /* No need to save/restore up to this paren */ + I32 parenfloor = scan->flags; + + assert(next); /* keep Coverity happy */ + if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ + next += ARG(next); + + /* XXXX Probably it is better to teach regpush to support + parenfloor > maxopenparen ... */ + if (parenfloor > (I32)rex->lastparen) + parenfloor = rex->lastparen; /* Pessimization... */ + + ST.prev_curlyx= cur_curlyx; + cur_curlyx = st; + ST.cp = PL_savestack_ix; + + /* these fields contain the state of the current curly. + * they are accessed by subsequent WHILEMs */ + ST.parenfloor = parenfloor; + ST.me = scan; + ST.B = next; + ST.minmod = minmod; + minmod = 0; + ST.count = -1; /* this will be updated by WHILEM */ + ST.lastloc = NULL; /* this will be updated by WHILEM */ + + PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput); + assert(0); /* NOTREACHED */ + } + + case CURLYX_end: /* just finished matching all of A*B */ + cur_curlyx = ST.prev_curlyx; + sayYES; + assert(0); /* NOTREACHED */ + + case CURLYX_end_fail: /* just failed to match all of A*B */ + regcpblow(ST.cp); + cur_curlyx = ST.prev_curlyx; + sayNO; + assert(0); /* NOTREACHED */ + + +#undef ST +#define ST st->u.whilem + + case WHILEM: /* just matched an A in /A*B/ (for complex A) */ + { + /* see the discussion above about CURLYX/WHILEM */ + I32 n; + int min, max; + regnode *A; + + assert(cur_curlyx); /* keep Coverity happy */ + + min = ARG1(cur_curlyx->u.curlyx.me); + max = ARG2(cur_curlyx->u.curlyx.me); + A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS; + n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ + ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; + ST.cache_offset = 0; + ST.cache_mask = 0; + + + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%*s whilem: matched %ld out of %d..%d\n", + REPORT_CODE_OFF+depth*2, "", (long)n, min, max) + ); + + /* First just match a string of min A's. */ + + if (n < min) { + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, + maxopenparen); + cur_curlyx->u.curlyx.lastloc = locinput; + REGCP_SET(ST.lastcp); + + PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput); + assert(0); /* NOTREACHED */ + } + + /* If degenerate A matches "", assume A done. */ + + if (locinput == cur_curlyx->u.curlyx.lastloc) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%*s whilem: empty match detected, trying continuation...\n", + REPORT_CODE_OFF+depth*2, "") + ); + goto do_whilem_B_max; + } + + /* super-linear cache processing. + * + * The idea here is that for certain types of CURLYX/WHILEM - + * principally those whose upper bound is infinity (and + * excluding regexes that have things like \1 and other very + * non-regular expresssiony things), then if a pattern like + * /....A*.../ fails and we backtrack to the WHILEM, then we + * make a note that this particular WHILEM op was at string + * position 47 (say) when the rest of pattern failed. Then, if + * we ever find ourselves back at that WHILEM, and at string + * position 47 again, we can just fail immediately rather than + * running the rest of the pattern again. + * + * This is very handy when patterns start to go + * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up + * with a combinatorial explosion of backtracking. + * + * The cache is implemented as a bit array, with one bit per + * string byte position per WHILEM op (up to 16) - so its + * between 0.25 and 2x the string size. + * + * To avoid allocating a poscache buffer every time, we do an + * initially countdown; only after we have executed a WHILEM + * op (string-length x #WHILEMs) times do we allocate the + * cache. + * + * The top 4 bits of scan->flags byte say how many different + * relevant CURLLYX/WHILEM op pairs there are, while the + * bottom 4-bits is the identifying index number of this + * WHILEM. + */ + + if (scan->flags) { + + if (!reginfo->poscache_maxiter) { + /* start the countdown: Postpone detection until we + * know the match is not *that* much linear. */ + reginfo->poscache_maxiter + = (reginfo->strend - reginfo->strbeg + 1) + * (scan->flags>>4); + /* possible overflow for long strings and many CURLYX's */ + if (reginfo->poscache_maxiter < 0) + reginfo->poscache_maxiter = I32_MAX; + reginfo->poscache_iter = reginfo->poscache_maxiter; + } + + if (reginfo->poscache_iter-- == 0) { + /* initialise cache */ + const SSize_t size = (reginfo->poscache_maxiter + 7)/8; + regmatch_info_aux *const aux = reginfo->info_aux; + if (aux->poscache) { + if ((SSize_t)reginfo->poscache_size < size) { + Renew(aux->poscache, size, char); + reginfo->poscache_size = size; + } + Zero(aux->poscache, size, char); + } + else { + reginfo->poscache_size = size; + Newxz(aux->poscache, size, char); + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%swhilem: Detected a super-linear match, switching on caching%s...\n", + PL_colors[4], PL_colors[5]) + ); + } + + if (reginfo->poscache_iter < 0) { + /* have we already failed at this position? */ + SSize_t offset, mask; + + reginfo->poscache_iter = -1; /* stop eventual underflow */ + offset = (scan->flags & 0xf) - 1 + + (locinput - reginfo->strbeg) + * (scan->flags>>4); + mask = 1 << (offset % 8); + offset /= 8; + if (reginfo->info_aux->poscache[offset] & mask) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%*s whilem: (cache) already tried at this position...\n", + REPORT_CODE_OFF+depth*2, "") + ); + sayNO; /* cache records failure */ + } + ST.cache_offset = offset; + ST.cache_mask = mask; + } + } + + /* Prefer B over A for minimal matching. */ + + if (cur_curlyx->u.curlyx.minmod) { + ST.save_curlyx = cur_curlyx; + cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; + ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor, + maxopenparen); + REGCP_SET(ST.lastcp); + PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, + locinput); + assert(0); /* NOTREACHED */ + } + + /* Prefer A over B for maximal matching. */ + + if (n < max) { /* More greed allowed? */ + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, + maxopenparen); + cur_curlyx->u.curlyx.lastloc = locinput; + REGCP_SET(ST.lastcp); + PUSH_STATE_GOTO(WHILEM_A_max, A, locinput); + assert(0); /* NOTREACHED */ + } + goto do_whilem_B_max; + } + assert(0); /* NOTREACHED */ + + case WHILEM_B_min: /* just matched B in a minimal match */ + case WHILEM_B_max: /* just matched B in a maximal match */ + cur_curlyx = ST.save_curlyx; + sayYES; + assert(0); /* NOTREACHED */ + + case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ + cur_curlyx = ST.save_curlyx; + cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; + cur_curlyx->u.curlyx.count--; + CACHEsayNO; + assert(0); /* NOTREACHED */ + + case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ + /* FALLTHROUGH */ + case WHILEM_A_pre_fail: /* just failed to match even minimal A */ + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); + cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; + cur_curlyx->u.curlyx.count--; + CACHEsayNO; + assert(0); /* NOTREACHED */ + + case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); /* Restore some previous $s? */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "%*s whilem: failed, trying continuation...\n", + REPORT_CODE_OFF+depth*2, "") + ); + do_whilem_B_max: + if (cur_curlyx->u.curlyx.count >= REG_INFTY + && ckWARN(WARN_REGEXP) + && !reginfo->warned) + { + reginfo->warned = TRUE; + Perl_warner(aTHX_ packWARN(WARN_REGEXP), + "Complex regular subexpression recursion limit (%d) " + "exceeded", + REG_INFTY - 1); + } + + /* now try B */ + ST.save_curlyx = cur_curlyx; + cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; + PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, + locinput); + assert(0); /* NOTREACHED */ + + case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ + cur_curlyx = ST.save_curlyx; + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); + + if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) { + /* Maximum greed exceeded */ + if (cur_curlyx->u.curlyx.count >= REG_INFTY + && ckWARN(WARN_REGEXP) + && !reginfo->warned) + { + reginfo->warned = TRUE; + Perl_warner(aTHX_ packWARN(WARN_REGEXP), + "Complex regular subexpression recursion " + "limit (%d) exceeded", + REG_INFTY - 1); + } + cur_curlyx->u.curlyx.count--; + CACHEsayNO; + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "") + ); + /* Try grabbing another A and see if it helps. */ + cur_curlyx->u.curlyx.lastloc = locinput; + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, + maxopenparen); + REGCP_SET(ST.lastcp); + PUSH_STATE_GOTO(WHILEM_A_min, + /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, + locinput); + assert(0); /* NOTREACHED */ + +#undef ST +#define ST st->u.branch + + case BRANCHJ: /* /(...|A|...)/ with long next pointer */ + next = scan + ARG(scan); + if (next == scan) + next = NULL; + scan = NEXTOPER(scan); + /* FALLTHROUGH */ + + case BRANCH: /* /(...|A|...)/ */ + scan = NEXTOPER(scan); /* scan now points to inner node */ + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + ST.next_branch = next; + REGCP_SET(ST.cp); + + /* Now go into the branch */ + if (has_cutgroup) { + PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput); + } else { + PUSH_STATE_GOTO(BRANCH_next, scan, locinput); + } + assert(0); /* NOTREACHED */ + + case CUTGROUP: /* /(*THEN)/ */ + sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL : + MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + PUSH_STATE_GOTO(CUTGROUP_next, next, locinput); + assert(0); /* NOTREACHED */ + + case CUTGROUP_next_fail: + do_cutgroup = 1; + no_final = 1; + if (st->u.mark.mark_name) + sv_commit = st->u.mark.mark_name; + sayNO; + assert(0); /* NOTREACHED */ + + case BRANCH_next: + sayYES; + assert(0); /* NOTREACHED */ + + case BRANCH_next_fail: /* that branch failed; try the next, if any */ + if (do_cutgroup) { + do_cutgroup = 0; + no_final = 0; + } + REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + scan = ST.next_branch; + /* no more branches? */ + if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sBRANCH failed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + PL_colors[5] ); + }); + sayNO_SILENT; + } + continue; /* execute next BRANCH[J] op */ + assert(0); /* NOTREACHED */ + + case MINMOD: /* next op will be non-greedy, e.g. A*? */ + minmod = 1; + break; + +#undef ST +#define ST st->u.curlym + + case CURLYM: /* /A{m,n}B/ where A is fixed-length */ + + /* This is an optimisation of CURLYX that enables us to push + * only a single backtracking state, no matter how many matches + * there are in {m,n}. It relies on the pattern being constant + * length, with no parens to influence future backrefs + */ + + ST.me = scan; + scan = NEXTOPER(scan) + NODE_STEP_REGNODE; + + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + + /* if paren positive, emulate an OPEN/CLOSE around A */ + if (ST.me->flags) { + U32 paren = ST.me->flags; + if (paren > maxopenparen) + maxopenparen = paren; + scan += NEXT_OFF(scan); /* Skip former OPEN. */ + } + ST.A = scan; + ST.B = next; + ST.alen = 0; + ST.count = 0; + ST.minmod = minmod; + minmod = 0; + ST.c1 = CHRTEST_UNINIT; + REGCP_SET(ST.cp); + + if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */ + goto curlym_do_B; + + curlym_do_A: /* execute the A in /A{m,n}B/ */ + PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */ + assert(0); /* NOTREACHED */ + + case CURLYM_A: /* we've just matched an A */ + ST.count++; + /* after first match, determine A's length: u.curlym.alen */ + if (ST.count == 1) { + if (reginfo->is_utf8_target) { + char *s = st->locinput; + while (s < locinput) { + ST.alen++; + s += UTF8SKIP(s); + } + } + else { + ST.alen = locinput - st->locinput; + } + if (ST.alen == 0) + ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me); + } + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n", + (int)(REPORT_CODE_OFF+(depth*2)), "", + (IV) ST.count, (IV)ST.alen) + ); + + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.me->flags) + goto fake_end; + + { + I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)); + if ( max == REG_INFTY || ST.count < max ) + goto curlym_do_A; /* try to match another A */ + } + goto curlym_do_B; /* try to match B */ + + case CURLYM_A_fail: /* just failed to match an A */ + REGCP_UNWIND(ST.cp); + + if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ + || (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.me->flags)) + sayNO; + + curlym_do_B: /* execute the B in /A{m,n}B/ */ + if (ST.c1 == CHRTEST_UNINIT) { + /* calculate c1 and c2 for possible match of 1st char + * following curly */ + ST.c1 = ST.c2 = CHRTEST_VOID; + assert(ST.B); + if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { + regnode *text_node = ST.B; + if (! HAS_TEXT(text_node)) + FIND_NEXT_IMPT(text_node); + /* this used to be + + (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT) + + But the former is redundant in light of the latter. + + if this changes back then the macro for + IS_TEXT and friends need to change. + */ + if (PL_regkind[OP(text_node)] == EXACT) { + if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ + text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, + reginfo)) + { + sayNO; + } + } + } + } + + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM trying tail with matches=%"IVdf"...\n", + (int)(REPORT_CODE_OFF+(depth*2)), + "", (IV)ST.count) + ); + if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) { + if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) { + if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) + && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + { + /* simulate B failing */ + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", + (int)(REPORT_CODE_OFF+(depth*2)),"", + valid_utf8_to_uvchr((U8 *) locinput, NULL), + valid_utf8_to_uvchr(ST.c1_utf8, NULL), + valid_utf8_to_uvchr(ST.c2_utf8, NULL)) + ); + state_num = CURLYM_B_fail; + goto reenter_switch; + } + } + else if (nextchr != ST.c1 && nextchr != ST.c2) { + /* simulate B failing */ + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", + (int)(REPORT_CODE_OFF+(depth*2)),"", + (int) nextchr, ST.c1, ST.c2) + ); + state_num = CURLYM_B_fail; + goto reenter_switch; + } + } + + if (ST.me->flags) { + /* emulate CLOSE: mark current A as captured */ + I32 paren = ST.me->flags; + if (ST.count) { + rex->offs[paren].start + = HOPc(locinput, -ST.alen) - reginfo->strbeg; + rex->offs[paren].end = locinput - reginfo->strbeg; + if ((U32)paren > rex->lastparen) + rex->lastparen = paren; + rex->lastcloseparen = paren; + } + else + rex->offs[paren].end = -1; + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.me->flags) + { + if (ST.count) + goto fake_end; + else + sayNO; + } + } + + PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */ + assert(0); /* NOTREACHED */ + + case CURLYM_B_fail: /* just failed to match a B */ + REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + if (ST.minmod) { + I32 max = ARG2(ST.me); + if (max != REG_INFTY && ST.count == max) + sayNO; + goto curlym_do_A; /* try to match a further A */ + } + /* backtrack one A */ + if (ST.count == ARG1(ST.me) /* min */) + sayNO; + ST.count--; + SET_locinput(HOPc(locinput, -ST.alen)); + goto curlym_do_B; /* try to match B */ + +#undef ST +#define ST st->u.curly + +#define CURLY_SETPAREN(paren, success) \ + if (paren) { \ + if (success) { \ + rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \ + rex->offs[paren].end = locinput - reginfo->strbeg; \ + if (paren > rex->lastparen) \ + rex->lastparen = paren; \ + rex->lastcloseparen = paren; \ + } \ + else { \ + rex->offs[paren].end = -1; \ + rex->lastparen = ST.lastparen; \ + rex->lastcloseparen = ST.lastcloseparen; \ + } \ + } + + case STAR: /* /A*B/ where A is width 1 char */ + ST.paren = 0; + ST.min = 0; + ST.max = REG_INFTY; + scan = NEXTOPER(scan); + goto repeat; + + case PLUS: /* /A+B/ where A is width 1 char */ + ST.paren = 0; + ST.min = 1; + ST.max = REG_INFTY; + scan = NEXTOPER(scan); + goto repeat; + + case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ + ST.paren = scan->flags; /* Which paren to set */ + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + if (ST.paren > maxopenparen) + maxopenparen = ST.paren; + ST.min = ARG1(scan); /* min to match */ + ST.max = ARG2(scan); /* max to match */ + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + ST.min=1; + ST.max=1; + } + scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); + goto repeat; + + case CURLY: /* /A{m,n}B/ where A is width 1 char */ + ST.paren = 0; + ST.min = ARG1(scan); /* min to match */ + ST.max = ARG2(scan); /* max to match */ + scan = NEXTOPER(scan) + NODE_STEP_REGNODE; + repeat: + /* + * Lookahead to avoid useless match attempts + * when we know what character comes next. + * + * Used to only do .*x and .*?x, but now it allows + * for )'s, ('s and (?{ ... })'s to be in the way + * of the quantifier and the EXACT-like node. -- japhy + */ + + assert(ST.min <= ST.max); + if (! HAS_TEXT(next) && ! JUMPABLE(next)) { + ST.c1 = ST.c2 = CHRTEST_VOID; + } + else { + regnode *text_node = next; + + if (! HAS_TEXT(text_node)) + FIND_NEXT_IMPT(text_node); + + if (! HAS_TEXT(text_node)) + ST.c1 = ST.c2 = CHRTEST_VOID; + else { + if ( PL_regkind[OP(text_node)] != EXACT ) { + ST.c1 = ST.c2 = CHRTEST_VOID; + } + else { + + /* Currently we only get here when + + PL_rekind[OP(text_node)] == EXACT + + if this changes back then the macro for IS_TEXT and + friends need to change. */ + if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ + text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, + reginfo)) + { + sayNO; + } + } + } + } + + ST.A = scan; + ST.B = next; + if (minmod) { + char *li = locinput; + minmod = 0; + if (ST.min && + regrepeat(rex, &li, ST.A, reginfo, ST.min, depth) + < ST.min) + sayNO; + SET_locinput(li); + ST.count = ST.min; + REGCP_SET(ST.cp); + if (ST.c1 == CHRTEST_VOID) + goto curly_try_B_min; + + ST.oldloc = locinput; + + /* set ST.maxpos to the furthest point along the + * string that could possibly match */ + if (ST.max == REG_INFTY) { + ST.maxpos = reginfo->strend - 1; + if (utf8_target) + while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) + ST.maxpos--; + } + else if (utf8_target) { + int m = ST.max - ST.min; + for (ST.maxpos = locinput; + m >0 && ST.maxpos < reginfo->strend; m--) + ST.maxpos += UTF8SKIP(ST.maxpos); + } + else { + ST.maxpos = locinput + ST.max - ST.min; + if (ST.maxpos >= reginfo->strend) + ST.maxpos = reginfo->strend - 1; + } + goto curly_try_B_min_known; + + } + else { + /* avoid taking address of locinput, so it can remain + * a register var */ + char *li = locinput; + ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth); + if (ST.count < ST.min) + sayNO; + SET_locinput(li); + if ((ST.count > ST.min) + && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL)) + { + /* A{m,n} must come at the end of the string, there's + * no point in backing off ... */ + ST.min = ST.count; + /* ...except that $ and \Z can match before *and* after + newline at the end. Consider "\n\n" =~ /\n+\Z\n/. + We may back off by one in this case. */ + if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS) + ST.min--; + } + REGCP_SET(ST.cp); + goto curly_try_B_max; + } + assert(0); /* NOTREACHED */ + + + case CURLY_B_min_known_fail: + /* failed to find B in a non-greedy match where c1,c2 valid */ + + REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + /* Couldn't or didn't -- move forward. */ + ST.oldloc = locinput; + if (utf8_target) + locinput += UTF8SKIP(locinput); + else + locinput++; + ST.count++; + curly_try_B_min_known: + /* find the next place where 'B' could work, then call B */ + { + int n; + if (utf8_target) { + n = (ST.oldloc == locinput) ? 0 : 1; + if (ST.c1 == ST.c2) { + /* set n to utf8_distance(oldloc, locinput) */ + while (locinput <= ST.maxpos + && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))) + { + locinput += UTF8SKIP(locinput); + n++; + } + } + else { + /* set n to utf8_distance(oldloc, locinput) */ + while (locinput <= ST.maxpos + && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) + && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + { + locinput += UTF8SKIP(locinput); + n++; + } + } + } + else { /* Not utf8_target */ + if (ST.c1 == ST.c2) { + while (locinput <= ST.maxpos && + UCHARAT(locinput) != ST.c1) + locinput++; + } + else { + while (locinput <= ST.maxpos + && UCHARAT(locinput) != ST.c1 + && UCHARAT(locinput) != ST.c2) + locinput++; + } + n = locinput - ST.oldloc; + } + if (locinput > ST.maxpos) + sayNO; + if (n) { + /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is + * at b; check that everything between oldloc and + * locinput matches */ + char *li = ST.oldloc; + ST.count += n; + if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n) + sayNO; + assert(n == REG_INFTY || locinput == li); + } + CURLY_SETPAREN(ST.paren, ST.count); + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + goto fake_end; + } + PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput); + } + assert(0); /* NOTREACHED */ + + + case CURLY_B_min_fail: + /* failed to find B in a non-greedy match where c1,c2 invalid */ + + REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + /* failed -- move forward one */ + { + char *li = locinput; + if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) { + sayNO; + } + locinput = li; + } + { + ST.count++; + if (ST.count <= ST.max || (ST.max == REG_INFTY && + ST.count > 0)) /* count overflow ? */ + { + curly_try_B_min: + CURLY_SETPAREN(ST.paren, ST.count); + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + goto fake_end; + } + PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput); + } + } + sayNO; + assert(0); /* NOTREACHED */ + + + curly_try_B_max: + /* a successful greedy match: now try to match B */ + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + goto fake_end; + } + { + bool could_match = locinput < reginfo->strend; + + /* If it could work, try it. */ + if (ST.c1 != CHRTEST_VOID && could_match) { + if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) + { + could_match = memEQ(locinput, + ST.c1_utf8, + UTF8SKIP(locinput)) + || memEQ(locinput, + ST.c2_utf8, + UTF8SKIP(locinput)); + } + else { + could_match = UCHARAT(locinput) == ST.c1 + || UCHARAT(locinput) == ST.c2; + } + } + if (ST.c1 == CHRTEST_VOID || could_match) { + CURLY_SETPAREN(ST.paren, ST.count); + PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput); + assert(0); /* NOTREACHED */ + } + } + /* FALLTHROUGH */ + + case CURLY_B_max_fail: + /* failed to find B in a greedy match */ + + REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + /* back up. */ + if (--ST.count < ST.min) + sayNO; + locinput = HOPc(locinput, -1); + goto curly_try_B_max; + +#undef ST + + case END: /* last op of main pattern */ + fake_end: + if (cur_eval) { + /* we've just finished A in /(??{A})B/; now continue with B */ + + st->u.eval.prev_rex = rex_sv; /* inner */ + + /* Save *all* the positions. */ + st->u.eval.cp = regcppush(rex, 0, maxopenparen); + rex_sv = cur_eval->u.eval.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); + SET_reg_curpm(rex_sv); + rex = ReANY(rex_sv); + rexi = RXi_GET(rex); + cur_curlyx = cur_eval->u.eval.prev_curlyx; + + REGCP_SET(st->u.eval.lastcp); + + /* Restore parens of the outer rex without popping the + * savestack */ + S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp, + &maxopenparen); + + st->u.eval.prev_eval = cur_eval; + cur_eval = cur_eval->u.eval.prev_eval; + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n", + REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval));); + if ( nochange_depth ) + nochange_depth--; + + PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B, + locinput); /* match B */ + } + + if (locinput < reginfo->till) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n", + PL_colors[4], + (long)(locinput - startpos), + (long)(reginfo->till - startpos), + PL_colors[5])); + + sayNO_SILENT; /* Cannot match: too short. */ + } + sayYES; /* Success! */ + + case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %ssubpattern success...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])); + sayYES; /* Success! */ + +#undef ST +#define ST st->u.ifmatch + + { + char *newstart; + + case SUSPEND: /* (?>A) */ + ST.wanted = 1; + newstart = locinput; + goto do_ifmatch; + + case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?flags) { + char * const s = HOPBACKc(locinput, scan->flags); + if (!s) { + /* trivial fail */ + if (logical) { + logical = 0; + sw = 1 - cBOOL(ST.wanted); + } + else if (ST.wanted) + sayNO; + next = scan + ARG(scan); + if (next == scan) + next = NULL; + break; + } + newstart = s; + } + else + newstart = locinput; + + do_ifmatch: + ST.me = scan; + ST.logical = logical; + logical = 0; /* XXX: reset state of logical once it has been saved into ST */ + + /* execute body of (?...A) */ + PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart); + assert(0); /* NOTREACHED */ + } + + case IFMATCH_A_fail: /* body of (?...A) failed */ + ST.wanted = !ST.wanted; + /* FALLTHROUGH */ + + case IFMATCH_A: /* body of (?...A) succeeded */ + if (ST.logical) { + sw = cBOOL(ST.wanted); + } + else if (!ST.wanted) + sayNO; + + if (OP(ST.me) != SUSPEND) { + /* restore old position except for (?>...) */ + locinput = st->locinput; + } + scan = ST.me + ARG(ST.me); + if (scan == ST.me) + scan = NULL; + continue; /* execute B */ + +#undef ST + + case LONGJMP: /* alternative with many branches compiles to + * (BRANCHJ; EXACT ...; LONGJMP ) x N */ + next = scan + ARG(scan); + if (next == scan) + next = NULL; + break; + + case COMMIT: /* (*COMMIT) */ + reginfo->cutpoint = reginfo->strend; + /* FALLTHROUGH */ + + case PRUNE: /* (*PRUNE) */ + if (!scan->flags) + sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + PUSH_STATE_GOTO(COMMIT_next, next, locinput); + assert(0); /* NOTREACHED */ + + case COMMIT_next_fail: + no_final = 1; + /* FALLTHROUGH */ + + case OPFAIL: /* (*FAIL) */ + sayNO; + assert(0); /* NOTREACHED */ + +#define ST st->u.mark + case MARKPOINT: /* (*MARK:foo) */ + ST.prev_mark = mark_state; + ST.mark_name = sv_commit = sv_yes_mark + = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + mark_state = st; + ST.mark_loc = locinput; + PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput); + assert(0); /* NOTREACHED */ + + case MARKPOINT_next: + mark_state = ST.prev_mark; + sayYES; + assert(0); /* NOTREACHED */ + + case MARKPOINT_next_fail: + if (popmark && sv_eq(ST.mark_name,popmark)) + { + if (ST.mark_loc > startpoint) + reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); + popmark = NULL; /* we found our mark */ + sv_commit = ST.mark_name; + + DEBUG_EXECUTE_r({ + PerlIO_printf(Perl_debug_log, + "%*s %ssetting cutpoint to mark:%"SVf"...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], SVfARG(sv_commit), PL_colors[5]); + }); + } + mark_state = ST.prev_mark; + sv_yes_mark = mark_state ? + mark_state->u.mark.mark_name : NULL; + sayNO; + assert(0); /* NOTREACHED */ + + case SKIP: /* (*SKIP) */ + if (scan->flags) { + /* (*SKIP) : if we fail we cut here*/ + ST.mark_name = NULL; + ST.mark_loc = locinput; + PUSH_STATE_GOTO(SKIP_next,next, locinput); + } else { + /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, + otherwise do nothing. Meaning we need to scan + */ + regmatch_state *cur = mark_state; + SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + + while (cur) { + if ( sv_eq( cur->u.mark.mark_name, + find ) ) + { + ST.mark_name = find; + PUSH_STATE_GOTO( SKIP_next, next, locinput); + } + cur = cur->u.mark.prev_mark; + } + } + /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ + break; + + case SKIP_next_fail: + if (ST.mark_name) { + /* (*CUT:NAME) - Set up to search for the name as we + collapse the stack*/ + popmark = ST.mark_name; + } else { + /* (*CUT) - No name, we cut here.*/ + if (ST.mark_loc > startpoint) + reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); + /* but we set sv_commit to latest mark_name if there + is one so they can test to see how things lead to this + cut */ + if (mark_state) + sv_commit=mark_state->u.mark.mark_name; + } + no_final = 1; + sayNO; + assert(0); /* NOTREACHED */ +#undef ST + + case LNBREAK: /* \R */ + if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) { + locinput += n; + } else + sayNO; + break; + + default: + PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", + PTR2UV(scan), OP(scan)); + Perl_croak(aTHX_ "regexp memory corruption"); + + /* this is a point to jump to in order to increment + * locinput by one character */ + increment_locinput: + assert(!NEXTCHR_IS_EOS); + if (utf8_target) { + locinput += PL_utf8skip[nextchr]; + /* locinput is allowed to go 1 char off the end, but not 2+ */ + if (locinput > reginfo->strend) + sayNO; + } + else + locinput++; + break; + + } /* end switch */ + + /* switch break jumps here */ + scan = next; /* prepare to execute the next op and ... */ + continue; /* ... jump back to the top, reusing st */ + assert(0); /* NOTREACHED */ + + push_yes_state: + /* push a state that backtracks on success */ + st->u.yes.prev_yes_state = yes_state; + yes_state = st; + /* FALLTHROUGH */ + push_state: + /* push a new regex state, then continue at scan */ + { + regmatch_state *newst; + + DEBUG_STACK_r({ + regmatch_state *cur = st; + regmatch_state *curyes = yes_state; + int curd = depth; + regmatch_slab *slab = PL_regmatch_slab; + for (;curd > -1;cur--,curd--) { + if (cur < SLAB_FIRST(slab)) { + slab = slab->prev; + cur = SLAB_LAST(slab); + } + PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n", + REPORT_CODE_OFF + 2 + depth * 2,"", + curd, PL_reg_name[cur->resume_state], + (curyes == cur) ? "yes" : "" + ); + if (curyes == cur) + curyes = cur->u.yes.prev_yes_state; + } + } else + DEBUG_STATE_pp("push") + ); + depth++; + st->locinput = locinput; + newst = st+1; + if (newst > SLAB_LAST(PL_regmatch_slab)) + newst = S_push_slab(aTHX); + PL_regmatch_state = newst; + + locinput = pushinput; + st = newst; + continue; + assert(0); /* NOTREACHED */ + } + } + + /* + * We get here only if there's trouble -- normally "case END" is + * the terminating point. + */ + Perl_croak(aTHX_ "corrupted regexp pointers"); + /*NOTREACHED*/ + sayNO; + +yes: + if (yes_state) { + /* we have successfully completed a subexpression, but we must now + * pop to the state marked by yes_state and continue from there */ + assert(st != yes_state); +#ifdef DEBUGGING + while (st != yes_state) { + st--; + if (st < SLAB_FIRST(PL_regmatch_slab)) { + PL_regmatch_slab = PL_regmatch_slab->prev; + st = SLAB_LAST(PL_regmatch_slab); + } + DEBUG_STATE_r({ + if (no_final) { + DEBUG_STATE_pp("pop (no final)"); + } else { + DEBUG_STATE_pp("pop (yes)"); + } + }); + depth--; + } +#else + while (yes_state < SLAB_FIRST(PL_regmatch_slab) + || yes_state > SLAB_LAST(PL_regmatch_slab)) + { + /* not in this slab, pop slab */ + depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1); + PL_regmatch_slab = PL_regmatch_slab->prev; + st = SLAB_LAST(PL_regmatch_slab); + } + depth -= (st - yes_state); +#endif + st = yes_state; + yes_state = st->u.yes.prev_yes_state; + PL_regmatch_state = st; + + if (no_final) + locinput= st->locinput; + state_num = st->resume_state + no_final; + goto reenter_switch; + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", + PL_colors[4], PL_colors[5])); + + if (reginfo->info_aux_eval) { + /* each successfully executed (?{...}) block does the equivalent of + * local $^R = do {...} + * When popping the save stack, all these locals would be undone; + * bypass this by setting the outermost saved $^R to the latest + * value */ + /* I dont know if this is needed or works properly now. + * see code related to PL_replgv elsewhere in this file. + * Yves + */ + if (oreplsv != GvSV(PL_replgv)) + sv_setsv(oreplsv, GvSV(PL_replgv)); + } + result = 1; + goto final_exit; + +no: + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], PL_colors[5]) + ); + +no_silent: + if (no_final) { + if (yes_state) { + goto yes; + } else { + goto final_exit; + } + } + if (depth) { + /* there's a previous state to backtrack to */ + st--; + if (st < SLAB_FIRST(PL_regmatch_slab)) { + PL_regmatch_slab = PL_regmatch_slab->prev; + st = SLAB_LAST(PL_regmatch_slab); + } + PL_regmatch_state = st; + locinput= st->locinput; + + DEBUG_STATE_pp("pop"); + depth--; + if (yes_state == st) + yes_state = st->u.yes.prev_yes_state; + + state_num = st->resume_state + 1; /* failure = success + 1 */ + goto reenter_switch; + } + result = 0; + + final_exit: + if (rex->intflags & PREGf_VERBARG_SEEN) { + SV *sv_err = get_sv("REGERROR", 1); + SV *sv_mrk = get_sv("REGMARK", 1); + if (result) { + sv_commit = &PL_sv_no; + if (!sv_yes_mark) + sv_yes_mark = &PL_sv_yes; + } else { + if (!sv_commit) + sv_commit = &PL_sv_yes; + sv_yes_mark = &PL_sv_no; + } + assert(sv_err); + assert(sv_mrk); + sv_setsv(sv_err, sv_commit); + sv_setsv(sv_mrk, sv_yes_mark); + } + + + if (last_pushed_cv) { + dSP; + POP_MULTICALL; + PERL_UNUSED_VAR(SP); + } + + assert(!result || locinput - reginfo->strbeg >= 0); + return result ? locinput - reginfo->strbeg : -1; +} + +/* + - regrepeat - repeatedly match something simple, report how many + * + * What 'simple' means is a node which can be the operand of a quantifier like + * '+', or {1,3} + * + * startposp - pointer a pointer to the start position. This is updated + * to point to the byte following the highest successful + * match. + * p - the regnode to be repeatedly matched against. + * reginfo - struct holding match state, such as strend + * max - maximum number of things to match. + * depth - (for debugging) backtracking depth. + */ +STATIC I32 +S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, + regmatch_info *const reginfo, I32 max, int depth) +{ + char *scan; /* Pointer to current position in target string */ + I32 c; + char *loceol = reginfo->strend; /* local version */ + I32 hardcount = 0; /* How many matches so far */ + bool utf8_target = reginfo->is_utf8_target; + int to_complement = 0; /* Invert the result? */ + UV utf8_flags; + _char_class_number classnum; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + PERL_ARGS_ASSERT_REGREPEAT; + + scan = *startposp; + if (max == REG_INFTY) + max = I32_MAX; + else if (! utf8_target && loceol - scan > max) + loceol = scan + max; + + /* Here, for the case of a non-UTF-8 target we have adjusted down + * to the maximum of how far we should go in it (leaving it set to the real + * end, if the maximum permissible would take us beyond that). This allows + * us to make the loop exit condition that we haven't gone past to + * also mean that we haven't exceeded the max permissible count, saving a + * test each time through the loop. But it assumes that the OP matches a + * single byte, which is true for most of the OPs below when applied to a + * non-UTF-8 target. Those relatively few OPs that don't have this + * characteristic will have to compensate. + * + * There is no adjustment for UTF-8 targets, as the number of bytes per + * character varies. OPs will have to test both that the count is less + * than the max permissible (using to keep track), and that we + * are still within the bounds of the string (using . A few OPs + * match a single byte no matter what the encoding. They can omit the max + * test if, for the UTF-8 case, they do the adjustment that was skipped + * above. + * + * Thus, the code above sets things up for the common case; and exceptional + * cases need extra work; the common case is to make sure doesn't + * go past , and for UTF-8 to also use to make sure the + * count doesn't exceed the maximum permissible */ + + switch (OP(p)) { + case REG_ANY: + if (utf8_target) { + while (scan < loceol && hardcount < max && *scan != '\n') { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && *scan != '\n') + scan++; + } + break; + case SANY: + if (utf8_target) { + while (scan < loceol && hardcount < max) { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else + scan = loceol; + break; + case CANY: /* Move forward bytes, unless goes off end */ + if (utf8_target && loceol - scan > max) { + + /* hadn't been adjusted in the UTF-8 case */ + scan += max; + } + else { + scan = loceol; + } + break; + case EXACT: + assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + + c = (U8)*STRING(p); + + /* Can use a simple loop if the pattern char to match on is invariant + * under UTF-8, or both target and pattern aren't UTF-8. Note that we + * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's + * true iff it doesn't matter if the argument is in UTF-8 or not */ + if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) { + if (utf8_target && loceol - scan > max) { + /* We didn't adjust because is UTF-8, but ok to do so, + * since here, to match at all, 1 char == 1 byte */ + loceol = scan + max; + } + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } + else if (reginfo->is_utf8_pat) { + if (utf8_target) { + STRLEN scan_char_len; + + /* When both target and pattern are UTF-8, we have to do + * string EQ */ + while (hardcount < max + && scan < loceol + && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p) + && memEQ(scan, STRING(p), scan_char_len)) + { + scan += scan_char_len; + hardcount++; + } + } + else if (! UTF8_IS_ABOVE_LATIN1(c)) { + + /* Target isn't utf8; convert the character in the UTF-8 + * pattern to non-UTF8, and do a simple loop */ + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } /* else pattern char is above Latin1, can't possibly match the + non-UTF-8 target */ + } + else { + + /* Here, the string must be utf8; pattern isn't, and is + * different in utf8 than not, so can't compare them directly. + * Outside the loop, find the two utf8 bytes that represent c, and + * then look for those in sequence in the utf8 string */ + U8 high = UTF8_TWO_BYTE_HI(c); + U8 low = UTF8_TWO_BYTE_LO(c); + + while (hardcount < max + && scan + 1 < loceol + && UCHARAT(scan) == high + && UCHARAT(scan + 1) == low) + { + scan += 2; + hardcount++; + } + } + break; + + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf; + + case EXACTFL: + utf8_flags = FOLDEQ_LOCALE; + goto do_exactf; + + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + utf8_flags = 0; + goto do_exactf; + + case EXACTFU_SS: + case EXACTFU: + utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; + + do_exactf: { + int c1, c2; + U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; + + assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + + if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8, + reginfo)) + { + if (c1 == CHRTEST_VOID) { + /* Use full Unicode fold matching */ + char *tmpeol = reginfo->strend; + STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1; + while (hardcount < max + && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, + STRING(p), NULL, pat_len, + reginfo->is_utf8_pat, utf8_flags)) + { + scan = tmpeol; + tmpeol = reginfo->strend; + hardcount++; + } + } + else if (utf8_target) { + if (c1 == c2) { + while (scan < loceol + && hardcount < max + && memEQ(scan, c1_utf8, UTF8SKIP(scan))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + while (scan < loceol + && hardcount < max + && (memEQ(scan, c1_utf8, UTF8SKIP(scan)) + || memEQ(scan, c2_utf8, UTF8SKIP(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + } + else if (c1 == c2) { + while (scan < loceol && UCHARAT(scan) == c1) { + scan++; + } + } + else { + while (scan < loceol && + (UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) + { + scan++; + } + } + } + break; + } + case ANYOF: + if (utf8_target) { + while (hardcount < max + && scan < loceol + && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && REGINCLASS(prog, p, (U8*)scan)) + scan++; + } + break; + + /* The argument (FLAGS) to all the POSIX node types is the class number */ + + case NPOSIXL: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: + if (! utf8_target) { + while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), + *scan))) + { + scan++; + } + } else { + while (hardcount < max && scan < loceol + && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p), + (U8 *) scan))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case POSIXD: + if (utf8_target) { + goto utf8_posix; + } + /* FALLTHROUGH */ + + case POSIXA: + if (utf8_target && loceol - scan > max) { + + /* We didn't adjust at the beginning of this routine + * because is UTF-8, but it is actually ok to do so, since here, to + * match, 1 char == 1 byte. */ + loceol = scan + max; + } + while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + break; + + case NPOSIXD: + if (utf8_target) { + to_complement = 1; + goto utf8_posix; + } + /* FALLTHROUGH */ + + case NPOSIXA: + if (! utf8_target) { + while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + } + else { + + /* The complement of something that matches only ASCII matches all + * non-ASCII, plus everything in ASCII that isn't in the class. */ + while (hardcount < max && scan < loceol + && (! isASCII_utf8(scan) + || ! _generic_isCC_A((U8) *scan, FLAGS(p)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case NPOSIXU: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: + if (! utf8_target) { + while (scan < loceol && to_complement + ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p)))) + { + scan++; + } + } + else { + utf8_posix: + classnum = (_char_class_number) FLAGS(p); + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Here, a swash is needed for above-Latin1 code points. + * Process as many Latin1 code points using the built-in rules. + * Go to another loop to finish processing upon encountering + * the first Latin1 code point. We could do that in this loop + * as well, but the other way saves having to test if the swash + * has been loaded every time through the loop: extra space to + * save a test. */ + while (hardcount < max && scan < loceol) { + if (UTF8_IS_INVARIANT(*scan)) { + if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan, + classnum)))) + { + break; + } + scan++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { + if (! (to_complement + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), + classnum)))) + { + break; + } + scan += 2; + } + else { + goto found_above_latin1; + } + + hardcount++; + } + } + else { + /* For these character classes, the knowledge of how to handle + * every code point is compiled in to Perl via a macro. This + * code is written for making the loops as tight as possible. + * It could be refactored to save space instead */ + switch (classnum) { + case _CC_ENUM_SPACE: /* XXX would require separate code + if we revert the change of \v + matching this */ + /* FALLTHROUGH */ + case _CC_ENUM_PSXSPC: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isSPACE_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_BLANK: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isBLANK_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_XDIGIT: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isXDIGIT_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_VERTSPACE: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isVERTWS_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_CNTRL: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isCNTRL_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + default: + Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum); + } + } + } + break; + + found_above_latin1: /* Continuation of POSIXU and NPOSIXU */ + + /* Load the swash if not already present */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = _core_swash_init( + "utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + + while (hardcount < max && scan < loceol + && to_complement ^ cBOOL(_generic_utf8( + classnum, + scan, + swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) scan, + TRUE)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + + case LNBREAK: + if (utf8_target) { + while (hardcount < max && scan < loceol && + (c=is_LNBREAK_utf8_safe(scan, loceol))) { + scan += c; + hardcount++; + } + } else { + /* LNBREAK can match one or two latin chars, which is ok, but we + * have to use hardcount in this situation, and throw away the + * adjustment to done before the switch statement */ + loceol = reginfo->strend; + while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) { + scan+=c; + hardcount++; + } + } + break; + + case BOUND: + case BOUNDA: + case BOUNDL: + case BOUNDU: + case EOS: + case GPOS: + case KEEPS: + case NBOUND: + case NBOUNDA: + case NBOUNDL: + case NBOUNDU: + case OPFAIL: + case SBOL: + case SEOL: + /* These are all 0 width, so match right here or not at all. */ + break; + + default: + Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]); + assert(0); /* NOTREACHED */ + + } + + if (hardcount) + c = hardcount; + else + c = scan - *startposp; + *startposp = scan; + + DEBUG_r({ + GET_RE_DEBUG_FLAGS_DECL; + DEBUG_EXECUTE_r({ + SV * const prop = sv_newmortal(); + regprop(prog, prop, p, reginfo); + PerlIO_printf(Perl_debug_log, + "%*s %s can match %"IVdf" times out of %"IVdf"...\n", + REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); + }); + }); + + return(c); +} + + +#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) +/* +- regclass_swash - prepare the utf8 swash. Wraps the shared core version to +create a copy so that changes the caller makes won't change the shared one. +If is non-null, will return NULL in it, for back-compat. + */ +SV * +Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp) +{ + PERL_ARGS_ASSERT_REGCLASS_SWASH; + + if (altsvp) { + *altsvp = NULL; + } + + return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL)); +} + +SV * +Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, + const regnode* node, + bool doinit, + SV** listsvp, + SV** only_utf8_locale_ptr) +{ + /* For internal core use only. + * Returns the swash for the input 'node' in the regex 'prog'. + * If is 'true', will attempt to create the swash if not already + * done. + * If is non-null, will return the printable contents of the + * swash. This can be used to get debugging information even before the + * swash exists, by calling this function with 'doinit' set to false, in + * which case the components that will be used to eventually create the + * swash are returned (in a printable form). + * Tied intimately to how regcomp.c sets up the data structure */ + + SV *sw = NULL; + SV *si = NULL; /* Input swash initialization string */ + SV* invlist = NULL; + + RXi_GET_DECL(prog,progi); + const struct reg_data * const data = prog ? progi->data : NULL; + + PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; + + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + if (data && data->count) { + const U32 n = ARG(node); + + if (data->what[n] == 's') { + SV * const rv = MUTABLE_SV(data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + + si = *ary; /* ary[0] = the string to initialize the swash with */ + + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if + * that inversion list has any user-defined properties in it. */ + if (av_tindex(av) >= 2) { + if (only_utf8_locale_ptr + && ary[2] + && ary[2] != &PL_sv_undef) + { + *only_utf8_locale_ptr = ary[2]; + } + else { + assert(only_utf8_locale_ptr); + *only_utf8_locale_ptr = NULL; + } + + if (av_tindex(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } + } + else { + invlist = NULL; + } + } + + /* Element [1] is reserved for the set-up swash. If already there, + * return it; if not, create it and store it there */ + if (ary[1] && SvROK(ary[1])) { + sw = ary[1]; + } + else if (doinit && ((si && si != &PL_sv_undef) + || (invlist && invlist != &PL_sv_undef))) { + assert(si); + sw = _core_swash_init("utf8", /* the utf8 package */ + "", /* nameless */ + si, + 1, /* binary */ + 0, /* not from tr/// */ + invlist, + &swash_init_flags); + (void)av_store(av, 1, sw); + } + } + } + + /* If requested, return a printable version of what this swash matches */ + if (listsvp) { + SV* matches_string = newSVpvs(""); + + /* The swash should be used, if possible, to get the data, as it + * contains the resolved data. But this function can be called at + * compile-time, before everything gets resolved, in which case we + * return the currently best available information, which is the string + * that will eventually be used to do that resolving, 'si' */ + if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) + && (si && si != &PL_sv_undef)) + { + sv_catsv(matches_string, si); + } + + /* Add the inversion list to whatever we have. This may have come from + * the swash, or from an input parameter */ + if (invlist) { + sv_catsv(matches_string, _invlist_contents(invlist)); + } + *listsvp = matches_string; + } + + return sw; +} +#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ + +/* + - reginclass - determine if a character falls into a character class + + n is the ANYOF regnode + p is the target string + p_end points to one byte beyond the end of the target string + utf8_target tells whether p is in UTF-8. + + Returns true if matched; false otherwise. + + Note that this can be a synthetic start class, a combination of various + nodes, so things you think might be mutually exclusive, such as locale, + aren't. It can match both locale and non-locale + + */ + +STATIC bool +S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) +{ + dVAR; + const char flags = ANYOF_FLAGS(n); + bool match = FALSE; + UV c = *p; + + PERL_ARGS_ASSERT_REGINCLASS; + + /* If c is not already the code point, get it. Note that + * UTF8_IS_INVARIANT() works even if not in UTF-8 */ + if (! UTF8_IS_INVARIANT(c) && utf8_target) { + STRLEN c_len = 0; + c = utf8n_to_uvchr(p, p_end - p, &c_len, + (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) + | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); + /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for + * UTF8_ALLOW_FFFF */ + if (c_len == (STRLEN)-1) + Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); + } + + /* If this character is potentially in the bitmap, check it */ + if (c < 256) { + if (ANYOF_BITMAP_TEST(n, c)) + match = TRUE; + else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL + && ! utf8_target + && ! isASCII(c)) + { + match = TRUE; + } + else if (flags & ANYOF_LOCALE_FLAGS) { + if (flags & ANYOF_LOC_FOLD) { + if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { + match = TRUE; + } + } + if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { + + /* The data structure is arranged so bits 0, 2, 4, ... are set + * if the class includes the Posix character class given by + * bit/2; and 1, 3, 5, ... are set if the class includes the + * complemented Posix class given by int(bit/2). So we loop + * through the bits, each time changing whether we complement + * the result or not. Suppose for the sake of illustration + * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0 + * is set, it means there is a match for this ANYOF node if the + * character is in the class given by the expression (0 / 2 = 0 + * = \w). If it is in that class, isFOO_lc() will return 1, + * and since 'to_complement' is 0, the result will stay TRUE, + * and we exit the loop. Suppose instead that bit 0 is 0, but + * bit 1 is 1. That means there is a match if the character + * matches \W. We won't bother to call isFOO_lc() on bit 0, + * but will on bit 1. On the second iteration 'to_complement' + * will be 1, so the exclusive or will reverse things, so we + * are testing for \W. On the third iteration, 'to_complement' + * will be 0, and we would be testing for \s; the fourth + * iteration would test for \S, etc. + * + * Note that this code assumes that all the classes are closed + * under folding. For example, if a character matches \w, then + * its fold does too; and vice versa. This should be true for + * any well-behaved locale for all the currently defined Posix + * classes, except for :lower: and :upper:, which are handled + * by the pseudo-class :cased: which matches if either of the + * other two does. To get rid of this assumption, an outer + * loop could be used below to iterate over both the source + * character, and its fold (if different) */ + + int count = 0; + int to_complement = 0; + + while (count < ANYOF_MAX) { + if (ANYOF_POSIXL_TEST(n, count) + && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) + { + match = TRUE; + break; + } + count++; + to_complement ^= 1; + } + } + } + } + + + /* If the bitmap didn't (or couldn't) match, and something outside the + * bitmap could match, try that. */ + if (!match) { + if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { + match = TRUE; /* Everything above 255 matches */ + } + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && (flags & ANYOF_UTF8)) + || ((flags & ANYOF_LOC_FOLD) + && IN_UTF8_CTYPE_LOCALE + && ARG(n) != ANYOF_NONBITMAP_EMPTY)) + { + SV* only_utf8_locale = NULL; + SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale); + if (sw) { + U8 utf8_buffer[2]; + U8 * utf8_p; + if (utf8_target) { + utf8_p = (U8 *) p; + } else { /* Convert to utf8 */ + utf8_p = utf8_buffer; + append_utf8_from_native_byte(*p, &utf8_p); + utf8_p = utf8_buffer; + } + + if (swash_fetch(sw, utf8_p, TRUE)) { + match = TRUE; + } + } + if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { + match = _invlist_contains_cp(only_utf8_locale, c); + } + } + + if (UNICODE_IS_SUPER(c) + && (flags & ANYOF_WARN_SUPER) + && ckWARN_d(WARN_NON_UNICODE)) + { + Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), + "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c); + } + } + +#if ANYOF_INVERT != 1 + /* Depending on compiler optimization cBOOL takes time, so if don't have to + * use it, don't */ +# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, +#endif + + /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ + return (flags & ANYOF_INVERT) ^ match; +} + +STATIC U8 * +S_reghop3(U8 *s, SSize_t off, const U8* lim) +{ + /* return the position 'off' UTF-8 characters away from 's', forward if + * 'off' >= 0, backwards if negative. But don't go outside of position + * 'lim', which better be < s if off < 0 */ + + PERL_ARGS_ASSERT_REGHOP3; + + if (off >= 0) { + while (off-- && s < lim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + } + else { + while (off++ && s > lim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > lim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + } + return s; +} + +STATIC U8 * +S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) +{ + PERL_ARGS_ASSERT_REGHOP4; + + if (off >= 0) { + while (off-- && s < rlim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + } + else { + while (off++ && s > llim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > llim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + } + return s; +} + +/* like reghop3, but returns NULL on overrun, rather than returning last + * char pos */ + +STATIC U8 * +S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) +{ + PERL_ARGS_ASSERT_REGHOPMAYBE3; + + if (off >= 0) { + while (off-- && s < lim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + if (off >= 0) + return NULL; + } + else { + while (off++ && s > lim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > lim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + if (off <= 0) + return NULL; + } + return s; +} + + +/* when executing a regex that may have (?{}), extra stuff needs setting + up that will be visible to the called code, even before the current + match has finished. In particular: + + * $_ is localised to the SV currently being matched; + * pos($_) is created if necessary, ready to be updated on each call-out + to code; + * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm + isn't set until the current pattern is successfully finished), so that + $1 etc of the match-so-far can be seen; + * save the old values of subbeg etc of the current regex, and set then + to the current string (again, this is normally only done at the end + of execution) +*/ + +static void +S_setup_eval_state(pTHX_ regmatch_info *const reginfo) +{ + MAGIC *mg; + regexp *const rex = ReANY(reginfo->prog); + regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; + + eval_state->rex = rex; + + if (reginfo->sv) { + /* Make $_ available to executed code. */ + if (reginfo->sv != DEFSV) { + SAVE_DEFSV; + DEFSV_set(reginfo->sv); + } + + if (!(mg = mg_find_mglob(reginfo->sv))) { + /* prepare for quick setting of pos */ + mg = sv_magicext_mglob(reginfo->sv); + mg->mg_len = -1; + } + eval_state->pos_magic = mg; + eval_state->pos = mg->mg_len; + eval_state->pos_flags = mg->mg_flags; + } + else + eval_state->pos_magic = NULL; + + if (!PL_reg_curpm) { + /* PL_reg_curpm is a fake PMOP that we can attach the current + * regex to and point PL_curpm at, so that $1 et al are visible + * within a /(?{})/. It's just allocated once per interpreter the + * first time its needed */ + Newxz(PL_reg_curpm, 1, PMOP); +#ifdef USE_ITHREADS + { + SV* const repointer = &PL_sv_undef; + /* this regexp is also owned by the new PL_reg_curpm, which + will try to free it. */ + av_push(PL_regex_padav, repointer); + PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); + PL_regex_pad = AvARRAY(PL_regex_padav); + } +#endif + } + SET_reg_curpm(reginfo->prog); + eval_state->curpm = PL_curpm; + PL_curpm = PL_reg_curpm; + if (RXp_MATCH_COPIED(rex)) { + /* Here is a serious problem: we cannot rewrite subbeg, + since it may be needed if this match fails. Thus + $` inside (?{}) could fail... */ + eval_state->subbeg = rex->subbeg; + eval_state->sublen = rex->sublen; + eval_state->suboffset = rex->suboffset; + eval_state->subcoffset = rex->subcoffset; +#ifdef PERL_ANY_COW + eval_state->saved_copy = rex->saved_copy; +#endif + RXp_MATCH_COPIED_off(rex); + } + else + eval_state->subbeg = NULL; + rex->subbeg = (char *)reginfo->strbeg; + rex->suboffset = 0; + rex->subcoffset = 0; + rex->sublen = reginfo->strend - reginfo->strbeg; +} + + +/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */ + +static void +S_cleanup_regmatch_info_aux(pTHX_ void *arg) +{ + regmatch_info_aux *aux = (regmatch_info_aux *) arg; + regmatch_info_aux_eval *eval_state = aux->info_aux_eval; + regmatch_slab *s; + + Safefree(aux->poscache); + + if (eval_state) { + + /* undo the effects of S_setup_eval_state() */ + + if (eval_state->subbeg) { + regexp * const rex = eval_state->rex; + rex->subbeg = eval_state->subbeg; + rex->sublen = eval_state->sublen; + rex->suboffset = eval_state->suboffset; + rex->subcoffset = eval_state->subcoffset; +#ifdef PERL_ANY_COW + rex->saved_copy = eval_state->saved_copy; +#endif + RXp_MATCH_COPIED_on(rex); + } + if (eval_state->pos_magic) + { + eval_state->pos_magic->mg_len = eval_state->pos; + eval_state->pos_magic->mg_flags = + (eval_state->pos_magic->mg_flags & ~MGf_BYTES) + | (eval_state->pos_flags & MGf_BYTES); + } + + PL_curpm = eval_state->curpm; + } + + PL_regmatch_state = aux->old_regmatch_state; + PL_regmatch_slab = aux->old_regmatch_slab; + + /* free all slabs above current one - this must be the last action + * of this function, as aux and eval_state are allocated within + * slabs and may be freed here */ + + s = PL_regmatch_slab->next; + if (s) { + PL_regmatch_slab->next = NULL; + while (s) { + regmatch_slab * const osl = s; + s = s->next; + Safefree(osl); + } + } +} + + +STATIC void +S_to_utf8_substr(pTHX_ regexp *prog) +{ + /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile + * on the converted value */ + + int i = 1; + + PERL_ARGS_ASSERT_TO_UTF8_SUBSTR; + + do { + if (prog->substrs->data[i].substr + && !prog->substrs->data[i].utf8_substr) { + SV* const sv = newSVsv(prog->substrs->data[i].substr); + prog->substrs->data[i].utf8_substr = sv; + sv_utf8_upgrade(sv); + if (SvVALID(prog->substrs->data[i].substr)) { + if (SvTAIL(prog->substrs->data[i].substr)) { + /* Trim the trailing \n that fbm_compile added last + time. */ + SvCUR_set(sv, SvCUR(sv) - 1); + /* Whilst this makes the SV technically "invalid" (as its + buffer is no longer followed by "\0") when fbm_compile() + adds the "\n" back, a "\0" is restored. */ + fbm_compile(sv, FBMcf_TAIL); + } else + fbm_compile(sv, 0); + } + if (prog->substrs->data[i].substr == prog->check_substr) + prog->check_utf8 = sv; + } + } while (i--); +} + +STATIC bool +S_to_byte_substr(pTHX_ regexp *prog) +{ + /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile + * on the converted value; returns FALSE if can't be converted. */ + + int i = 1; + + PERL_ARGS_ASSERT_TO_BYTE_SUBSTR; + + do { + if (prog->substrs->data[i].utf8_substr + && !prog->substrs->data[i].substr) { + SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); + if (! sv_utf8_downgrade(sv, TRUE)) { + return FALSE; + } + if (SvVALID(prog->substrs->data[i].utf8_substr)) { + if (SvTAIL(prog->substrs->data[i].utf8_substr)) { + /* Trim the trailing \n that fbm_compile added last + time. */ + SvCUR_set(sv, SvCUR(sv) - 1); + fbm_compile(sv, FBMcf_TAIL); + } else + fbm_compile(sv, 0); + } + prog->substrs->data[i].substr = sv; + if (prog->substrs->data[i].utf8_substr == prog->check_utf8) + prog->check_substr = sv; + } + } while (i--); + + return TRUE; +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * ex: set ts=8 sts=4 sw=4 et: + */ diff --git a/src/5021002/regcomp.c b/src/5021002/regcomp.c new file mode 100644 index 0000000..265f278 --- /dev/null +++ b/src/5021002/regcomp.c @@ -0,0 +1,16885 @@ +/* regcomp.c + */ + +/* + * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee + * + * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] + */ + +/* This file contains functions for compiling a regular expression. See + * also regexec.c which funnily enough, contains functions for executing + * a regular expression. + * + * This file is also copied at build time to ext/re/re_comp.c, where + * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. + * This causes the main functions to be compiled under new names and with + * debugging support added, which makes "use re 'debug'" work. + */ + +/* NOTE: this is derived from Henry Spencer's regexp code, and should not + * confused with the original package (see point 3 below). Thanks, Henry! + */ + +/* Additional note: this code is very heavily munged from Henry's version + * in places. In some spots I've traded clarity for efficiency, so don't + * blame Henry for some of the lack of readability. + */ + +/* The names of the functions have been changed from regcomp and + * regexec to pregcomp and pregexec in order to avoid conflicts + * with the POSIX routines of the same names. +*/ + +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +/* + * pregcomp and pregexec -- regsub and regerror are not used in perl + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. + * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * + * + **** Alterations to Henry's code are... + **** + **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + **** by Larry Wall and others + **** + **** You may distribute under the terms of either the GNU General Public + **** License or the Artistic License, as specified in the README file. + + * + * Beware that some of this code is subtly aware of the way operator + * precedence is structured in regular expressions. Serious changes in + * regular-expression syntax might require a total rethink. + */ +#include "EXTERN.h" +#define PERL_IN_REGCOMP_C +#include "perl.h" + +#ifndef PERL_IN_XSUB_RE +#include "re_defs.h" +#endif + +#define REG_COMP_C +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +EXTERN_C const struct regexp_engine my_reg_engine; +#else +# include "regcomp.h" +#endif + +#include "dquote_static.c" +#include "charclass_invlists.h" +#include "inline_invlist.c" +#include "unicode_constants.h" + +#define HAS_NONLATIN1_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) +#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) + +#ifndef STATIC +#define STATIC static +#endif + + +struct RExC_state_t { + U32 flags; /* RXf_* are we folding, multilining? */ + U32 pm_flags; /* PMf_* stuff from the calling PMOP */ + char *precomp; /* uncompiled string. */ + REGEXP *rx_sv; /* The SV that is the regexp. */ + regexp *rx; /* perl core regexp structure */ + regexp_internal *rxi; /* internal data for regexp object + pprivate field */ + char *start; /* Start of input for compile */ + char *end; /* End of input for compile */ + char *parse; /* Input-scan pointer. */ + SSize_t whilem_seen; /* number of WHILEM in this expr */ + regnode *emit_start; /* Start of emitted-code area */ + regnode *emit_bound; /* First regnode outside of the + allocated space */ + regnode *emit; /* Code-emit pointer; if = &emit_dummy, + implies compiling, so don't emit */ + regnode_ssc emit_dummy; /* placeholder for emit to point to; + large enough for the largest + non-EXACTish node, so can use it as + scratch in pass1 */ + I32 naughty; /* How bad is this pattern? */ + I32 sawback; /* Did we see \1, ...? */ + U32 seen; + SSize_t size; /* Code size. */ + I32 npar; /* Capture buffer count, (OPEN) plus + one. ("par" 0 is the whole + pattern)*/ + I32 nestroot; /* root parens we are in - used by + accept */ + I32 extralen; + I32 seen_zerolen; + regnode **open_parens; /* pointers to open parens */ + regnode **close_parens; /* pointers to close parens */ + regnode *opend; /* END node in program */ + I32 utf8; /* whether the pattern is utf8 or not */ + I32 orig_utf8; /* whether the pattern was originally in utf8 */ + /* XXX use this for future optimisation of case + * where pattern must be upgraded to utf8. */ + I32 uni_semantics; /* If a d charset modifier should use unicode + rules, even if the pattern is not in + utf8 */ + HV *paren_names; /* Paren names */ + + regnode **recurse; /* Recurse regops */ + I32 recurse_count; /* Number of recurse regops */ + U8 *study_chunk_recursed; /* bitmap of which parens we have moved + through */ + U32 study_chunk_recursed_bytes; /* bytes in bitmap */ + I32 in_lookbehind; + I32 contains_locale; + I32 contains_i; + I32 override_recoding; + I32 in_multi_char_class; + struct reg_code_block *code_blocks; /* positions of literal (?{}) + within pattern */ + int num_code_blocks; /* size of code_blocks[] */ + int code_index; /* next code_blocks[] slot */ + SSize_t maxlen; /* mininum possible number of chars in string to match */ +#ifdef ADD_TO_REGEXEC + char *starttry; /* -Dr: where regtry was called. */ +#define RExC_starttry (pRExC_state->starttry) +#endif + SV *runtime_code_qr; /* qr with the runtime code blocks */ +#ifdef DEBUGGING + const char *lastparse; + I32 lastnum; + AV *paren_name_list; /* idx -> name */ +#define RExC_lastparse (pRExC_state->lastparse) +#define RExC_lastnum (pRExC_state->lastnum) +#define RExC_paren_name_list (pRExC_state->paren_name_list) +#endif +}; + +#define RExC_flags (pRExC_state->flags) +#define RExC_pm_flags (pRExC_state->pm_flags) +#define RExC_precomp (pRExC_state->precomp) +#define RExC_rx_sv (pRExC_state->rx_sv) +#define RExC_rx (pRExC_state->rx) +#define RExC_rxi (pRExC_state->rxi) +#define RExC_start (pRExC_state->start) +#define RExC_end (pRExC_state->end) +#define RExC_parse (pRExC_state->parse) +#define RExC_whilem_seen (pRExC_state->whilem_seen) +#ifdef RE_TRACK_PATTERN_OFFSETS +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the + others */ +#endif +#define RExC_emit (pRExC_state->emit) +#define RExC_emit_dummy (pRExC_state->emit_dummy) +#define RExC_emit_start (pRExC_state->emit_start) +#define RExC_emit_bound (pRExC_state->emit_bound) +#define RExC_naughty (pRExC_state->naughty) +#define RExC_sawback (pRExC_state->sawback) +#define RExC_seen (pRExC_state->seen) +#define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) +#define RExC_npar (pRExC_state->npar) +#define RExC_nestroot (pRExC_state->nestroot) +#define RExC_extralen (pRExC_state->extralen) +#define RExC_seen_zerolen (pRExC_state->seen_zerolen) +#define RExC_utf8 (pRExC_state->utf8) +#define RExC_uni_semantics (pRExC_state->uni_semantics) +#define RExC_orig_utf8 (pRExC_state->orig_utf8) +#define RExC_open_parens (pRExC_state->open_parens) +#define RExC_close_parens (pRExC_state->close_parens) +#define RExC_opend (pRExC_state->opend) +#define RExC_paren_names (pRExC_state->paren_names) +#define RExC_recurse (pRExC_state->recurse) +#define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) +#define RExC_study_chunk_recursed_bytes \ + (pRExC_state->study_chunk_recursed_bytes) +#define RExC_in_lookbehind (pRExC_state->in_lookbehind) +#define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_contains_i (pRExC_state->contains_i) +#define RExC_override_recoding (pRExC_state->override_recoding) +#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) + + +#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') +#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ + ((*s) == '{' && regcurly(s))) + +/* + * Flags to be passed up and down. + */ +#define WORST 0 /* Worst case. */ +#define HASWIDTH 0x01 /* Known to match non-null strings. */ + +/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single + * character. (There needs to be a case: in the switch statement in regexec.c + * for any node marked SIMPLE.) Note that this is not the same thing as + * REGNODE_SIMPLE */ +#define SIMPLE 0x02 +#define SPSTART 0x04 /* Starts with * or + */ +#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ +#define TRYAGAIN 0x10 /* Weeded out a declaration. */ +#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */ + +#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) + +/* whether trie related optimizations are enabled */ +#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION +#define TRIE_STUDY_OPT +#define FULL_TRIE_STUDY +#define TRIE_STCLASS +#endif + + + +#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] +#define PBITVAL(paren) (1 << ((paren) & 7)) +#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) +#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) +#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) + +#define REQUIRE_UTF8 STMT_START { \ + if (!UTF) { \ + *flagp = RESTART_UTF8; \ + return NULL; \ + } \ + } STMT_END + +/* This converts the named class defined in regcomp.h to its equivalent class + * number defined in handy.h. */ +#define namedclass_to_classnum(class) ((int) ((class) / 2)) +#define classnum_to_namedclass(classnum) ((classnum) * 2) + +#define _invlist_union_complement_2nd(a, b, output) \ + _invlist_union_maybe_complement_2nd(a, b, TRUE, output) +#define _invlist_intersection_complement_2nd(a, b, output) \ + _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) + +/* About scan_data_t. + + During optimisation we recurse through the regexp program performing + various inplace (keyhole style) optimisations. In addition study_chunk + and scan_commit populate this data structure with information about + what strings MUST appear in the pattern. We look for the longest + string that must appear at a fixed location, and we look for the + longest string that may appear at a floating location. So for instance + in the pattern: + + /FOO[xX]A.*B[xX]BAR/ + + Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating + strings (because they follow a .* construct). study_chunk will identify + both FOO and BAR as being the longest fixed and floating strings respectively. + + The strings can be composites, for instance + + /(f)(o)(o)/ + + will result in a composite fixed substring 'foo'. + + For each string some basic information is maintained: + + - offset or min_offset + This is the position the string must appear at, or not before. + It also implicitly (when combined with minlenp) tells us how many + characters must match before the string we are searching for. + Likewise when combined with minlenp and the length of the string it + tells us how many characters must appear after the string we have + found. + + - max_offset + Only used for floating strings. This is the rightmost point that + the string can appear at. If set to SSize_t_MAX it indicates that the + string can occur infinitely far to the right. + + - minlenp + A pointer to the minimum number of characters of the pattern that the + string was found inside. This is important as in the case of positive + lookahead or positive lookbehind we can have multiple patterns + involved. Consider + + /(?=FOO).*F/ + + The minimum length of the pattern overall is 3, the minimum length + of the lookahead part is 3, but the minimum length of the part that + will actually match is 1. So 'FOO's minimum length is 3, but the + minimum length for the F is 1. This is important as the minimum length + is used to determine offsets in front of and behind the string being + looked for. Since strings can be composites this is the length of the + pattern at the time it was committed with a scan_commit. Note that + the length is calculated by study_chunk, so that the minimum lengths + are not known until the full pattern has been compiled, thus the + pointer to the value. + + - lookbehind + + In the case of lookbehind the string being searched for can be + offset past the start point of the final matching string. + If this value was just blithely removed from the min_offset it would + invalidate some of the calculations for how many chars must match + before or after (as they are derived from min_offset and minlen and + the length of the string being searched for). + When the final pattern is compiled and the data is moved from the + scan_data_t structure into the regexp structure the information + about lookbehind is factored in, with the information that would + have been lost precalculated in the end_shift field for the + associated string. + + The fields pos_min and pos_delta are used to store the minimum offset + and the delta to the maximum offset at the current point in the pattern. + +*/ + +typedef struct scan_data_t { + /*I32 len_min; unused */ + /*I32 len_delta; unused */ + SSize_t pos_min; + SSize_t pos_delta; + SV *last_found; + SSize_t last_end; /* min value, <0 unless valid. */ + SSize_t last_start_min; + SSize_t last_start_max; + SV **longest; /* Either &l_fixed, or &l_float. */ + SV *longest_fixed; /* longest fixed string found in pattern */ + SSize_t offset_fixed; /* offset where it starts */ + SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ + I32 lookbehind_fixed; /* is the position of the string modfied by LB */ + SV *longest_float; /* longest floating string found in pattern */ + SSize_t offset_float_min; /* earliest point in string it can appear */ + SSize_t offset_float_max; /* latest point in string it can appear */ + SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ + SSize_t lookbehind_float; /* is the pos of the string modified by LB */ + I32 flags; + I32 whilem_c; + SSize_t *last_closep; + regnode_ssc *start_class; +} scan_data_t; + +/* The below is perhaps overboard, but this allows us to save a test at the + * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' + * and 'a' differ by a single bit; the same with the upper and lower case of + * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; + * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and + * then inverts it to form a mask, with just a single 0, in the bit position + * where the upper- and lowercase differ. XXX There are about 40 other + * instances in the Perl core where this micro-optimization could be used. + * Should decide if maintenance cost is worse, before changing those + * + * Returns a boolean as to whether or not 'v' is either a lowercase or + * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a + * compile-time constant, the generated code is better than some optimizing + * compilers figure out, amounting to a mask and test. The results are + * meaningless if 'c' is not one of [A-Za-z] */ +#define isARG2_lower_or_UPPER_ARG1(c, v) \ + (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) + +/* + * Forward declarations for pregcomp()'s friends. + */ + +static const scan_data_t zero_scan_data = + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0}; + +#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) +#define SF_BEFORE_SEOL 0x0001 +#define SF_BEFORE_MEOL 0x0002 +#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) +#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) + +#define SF_FIX_SHIFT_EOL (+2) +#define SF_FL_SHIFT_EOL (+4) + +#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) +#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) + +#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) +#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ +#define SF_IS_INF 0x0040 +#define SF_HAS_PAR 0x0080 +#define SF_IN_PAR 0x0100 +#define SF_HAS_EVAL 0x0200 +#define SCF_DO_SUBSTR 0x0400 +#define SCF_DO_STCLASS_AND 0x0800 +#define SCF_DO_STCLASS_OR 0x1000 +#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) +#define SCF_WHILEM_VISITED_POS 0x2000 + +#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ +#define SCF_SEEN_ACCEPT 0x8000 +#define SCF_TRIE_DOING_RESTUDY 0x10000 + +#define UTF cBOOL(RExC_utf8) + +/* The enums for all these are ordered so things work out correctly */ +#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ + == REGEX_DEPENDS_CHARSET) +#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ + >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_MORE_RESTRICTED_CHARSET) + +#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) + +/* For programs that want to be strictly Unicode compatible by dying if any + * attempt is made to match a non-Unicode code point against a Unicode + * property. */ +#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) + +#define OOB_NAMEDCLASS -1 + +/* There is no code point that is out-of-bounds, so this is problematic. But + * its only current use is to initialize a variable that is always set before + * looked at. */ +#define OOB_UNICODE 0xDEADBEEF + +#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) +#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) + + +/* length of regex to show in messages that don't mark a position within */ +#define RegexLengthToShowInErrorMessages 127 + +/* + * If MARKER[12] are adjusted, be sure to adjust the constants at the top + * of t/op/regmesg.t, the tests in t/op/re_tests, and those in + * op/pragma/warn/regcomp. + */ +#define MARKER1 "<-- HERE" /* marker as it appears in the description */ +#define MARKER2 " <-- HERE " /* marker as it appears within the regex */ + +#define REPORT_LOCATION " in regex; marked by " MARKER1 \ + " in m/%"UTF8f MARKER2 "%"UTF8f"/" + +#define REPORT_LOCATION_ARGS(offset) \ + UTF8fARG(UTF, offset, RExC_precomp), \ + UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * arg. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define _FAIL(code) STMT_START { \ + const char *ellipses = ""; \ + IV len = RExC_end - RExC_precomp; \ + \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + code; \ +} STMT_END + +#define FAIL(msg) _FAIL( \ + Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ + msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +#define FAIL2(msg,arg) _FAIL( \ + Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ + arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +/* + * Simple_vFAIL -- like FAIL, but marks the current location in the scan + */ +#define Simple_vFAIL(m) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() + */ +#define vFAIL(m) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL(m); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts two arguments. + */ +#define Simple_vFAIL2(m,a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). + */ +#define vFAIL2(m,a1) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL2(m, a1); \ +} STMT_END + + +/* + * Like Simple_vFAIL(), but accepts three arguments. + */ +#define Simple_vFAIL3(m, a1, a2) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). + */ +#define vFAIL3(m,a1,a2) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL3(m, a1, a2); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts four arguments. + */ +#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vFAIL4(m,a1,a2,a3) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL4(m, a1, a2, a3); \ +} STMT_END + +/* A specialized version of vFAIL2 that works with UTF8f */ +#define vFAIL2utf8f(m, a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + +/* m is not necessarily a "literal string", in this macro */ +#define reg_warn_non_literal_string(loc, m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARNreg(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN_dep(loc, m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARNdep(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARNregdep(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN2reg_d(loc,m, a1) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN2reg(loc, m, a1) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN3(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN4(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + +/* Allow for side effects in s */ +#define REGC(c,s) STMT_START { \ + if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ +} STMT_END + +/* Macros for recording node offsets. 20001227 mjd@plover.com + * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in + * element 2*n-1 of the array. Element #2n holds the byte length node #n. + * Element 0 holds the number n. + * Position is 1 indexed. + */ +#ifndef RE_TRACK_PATTERN_OFFSETS +#define Set_Node_Offset_To_R(node,byte) +#define Set_Node_Offset(node,byte) +#define Set_Cur_Node_Offset +#define Set_Node_Length_To_R(node,len) +#define Set_Node_Length(node,len) +#define Set_Node_Cur_Length(node,start) +#define Node_Offset(n) +#define Node_Length(n) +#define Set_Node_Offset_Length(node,offset,len) +#define ProgLen(ri) ri->u.proglen +#define SetProgLen(ri,x) ri->u.proglen = x +#else +#define ProgLen(ri) ri->u.offsets[0] +#define SetProgLen(ri,x) ri->u.offsets[0] = x +#define Set_Node_Offset_To_R(node,byte) STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ + __LINE__, (int)(node), (int)(byte))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", \ + (int)(node)); \ + } else { \ + RExC_offsets[2*(node)-1] = (byte); \ + } \ + } \ +} STMT_END + +#define Set_Node_Offset(node,byte) \ + Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) +#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) + +#define Set_Node_Length_To_R(node,len) STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ + __LINE__, (int)(node), (int)(len))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Length macro", \ + (int)(node)); \ + } else { \ + RExC_offsets[2*(node)] = (len); \ + } \ + } \ +} STMT_END + +#define Set_Node_Length(node,len) \ + Set_Node_Length_To_R((node)-RExC_emit_start, len) +#define Set_Node_Cur_Length(node, start) \ + Set_Node_Length(node, RExC_parse - start) + +/* Get offsets and lengths */ +#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) +#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) + +#define Set_Node_Offset_Length(node,offset,len) STMT_START { \ + Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ + Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ +} STMT_END +#endif + +#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS +#define EXPERIMENTAL_INPLACESCAN +#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ + +#define DEBUG_RExC_seen() \ + DEBUG_OPTIMISE_MORE_r({ \ + PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ + \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ + \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ + \ + if (RExC_seen & REG_GPOS_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ + \ + if (RExC_seen & REG_CANY_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ + \ + if (RExC_seen & REG_RECURSE_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ + \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ + \ + if (RExC_seen & REG_VERBARG_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ + \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ + \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ + \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ + \ + if (RExC_seen & REG_GOSTART_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ + PerlIO_printf(Perl_debug_log,"\n"); \ + }); + +#define DEBUG_STUDYDATA(str,data,depth) \ +DEBUG_OPTIMISE_MORE_r(if(data){ \ + PerlIO_printf(Perl_debug_log, \ + "%*s" str "Pos:%"IVdf"/%"IVdf \ + " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ + (int)(depth)*2, "", \ + (IV)((data)->pos_min), \ + (IV)((data)->pos_delta), \ + (UV)((data)->flags), \ + (IV)((data)->whilem_c), \ + (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ + is_inf ? "INF " : "" \ + ); \ + if ((data)->last_found) \ + PerlIO_printf(Perl_debug_log, \ + "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ + " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ + SvPVX_const((data)->last_found), \ + (IV)((data)->last_end), \ + (IV)((data)->last_start_min), \ + (IV)((data)->last_start_max), \ + ((data)->longest && \ + (data)->longest==&((data)->longest_fixed)) ? "*" : "", \ + SvPVX_const((data)->longest_fixed), \ + (IV)((data)->offset_fixed), \ + ((data)->longest && \ + (data)->longest==&((data)->longest_float)) ? "*" : "", \ + SvPVX_const((data)->longest_float), \ + (IV)((data)->offset_float_min), \ + (IV)((data)->offset_float_max) \ + ); \ + PerlIO_printf(Perl_debug_log,"\n"); \ +}); + +/* Mark that we cannot extend a found fixed substring at this point. + Update the longest found anchored substring and the longest found + floating substrings if needed. */ + +STATIC void +S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, + SSize_t *minlenp, int is_inf) +{ + const STRLEN l = CHR_SVLEN(data->last_found); + const STRLEN old_l = CHR_SVLEN(*data->longest); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_SCAN_COMMIT; + + if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { + SvSetMagicSV(*data->longest, data->last_found); + if (*data->longest == data->longest_fixed) { + data->offset_fixed = l ? data->last_start_min : data->pos_min; + if (data->flags & SF_BEFORE_EOL) + data->flags + |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); + else + data->flags &= ~SF_FIX_BEFORE_EOL; + data->minlen_fixed=minlenp; + data->lookbehind_fixed=0; + } + else { /* *data->longest == data->longest_float */ + data->offset_float_min = l ? data->last_start_min : data->pos_min; + data->offset_float_max = (l + ? data->last_start_max + : (data->pos_delta == SSize_t_MAX + ? SSize_t_MAX + : data->pos_min + data->pos_delta)); + if (is_inf + || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) + data->offset_float_max = SSize_t_MAX; + if (data->flags & SF_BEFORE_EOL) + data->flags + |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); + else + data->flags &= ~SF_FL_BEFORE_EOL; + data->minlen_float=minlenp; + data->lookbehind_float=0; + } + } + SvCUR_set(data->last_found, 0); + { + SV * const sv = data->last_found; + if (SvUTF8(sv) && SvMAGICAL(sv)) { + MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); + if (mg) + mg->mg_len = 0; + } + } + data->last_end = -1; + data->flags &= ~SF_BEFORE_EOL; + DEBUG_STUDYDATA("commit: ",data,0); +} + +/* An SSC is just a regnode_charclass_posix with an extra field: the inversion + * list that describes which code points it matches */ + +STATIC void +S_ssc_anything(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to match an empty string or any code point */ + + PERL_ARGS_ASSERT_SSC_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ + _append_range_to_invlist(ssc->invlist, 0, UV_MAX); + ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ +} + +STATIC int +S_ssc_is_anything(const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' can match the empty string and any code + * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys + * us anything: if the function returns TRUE, 'ssc' hasn't been restricted + * in any way, so there's no point in using it */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { + return FALSE; + } + + /* See if the list consists solely of the range 0 - Infinity */ + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (ret) { + return TRUE; + } + + /* If e.g., both \w and \W are set, matches everything */ + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { + return TRUE; + } + } + } + + return FALSE; +} + +STATIC void +S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* Initializes the SSC 'ssc'. This includes setting it to match an empty + * string, any code point, or any posix class under locale */ + + PERL_ARGS_ASSERT_SSC_INIT; + + Zero(ssc, 1, regnode_ssc); + set_ANYOF_SYNTHETIC(ssc); + ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); + ssc_anything(ssc); + + /* If any portion of the regex is to operate under locale rules, + * initialization includes it. The reason this isn't done for all regexes + * is that the optimizer was written under the assumption that locale was + * all-or-nothing. Given the complexity and lack of documentation in the + * optimizer, and that there are inadequate test cases for locale, many + * parts of it may not work properly, it is safest to avoid locale unless + * necessary. */ + if (RExC_contains_locale) { + ANYOF_POSIXL_SETALL(ssc); + } + else { + ANYOF_POSIXL_ZERO(ssc); + } +} + +STATIC int +S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, + const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only + * to the list of code points matched, and locale posix classes; hence does + * not check its flags) */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (! ret) { + return FALSE; + } + + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { + return FALSE; + } + + return TRUE; +} + +STATIC SV* +S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, + const regnode_charclass* const node) +{ + /* Returns a mortal inversion list defining which code points are matched + * by 'node', which is of type ANYOF. Handles complementing the result if + * appropriate. If some code points aren't knowable at this time, the + * returned list must, and will, contain every code point that is a + * possibility. */ + + SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; + unsigned int i; + const U32 n = ARG(node); + bool new_node_has_latin1 = FALSE; + + PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; + + /* Look at the data structure created by S_set_ANYOF_arg() */ + if (n != ANYOF_NONBITMAP_EMPTY) { + SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + assert(RExC_rxi->data->what[n] == 's'); + + if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ + invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]))); + } + else if (ary[0] && ary[0] != &PL_sv_undef) { + + /* Here, no compile-time swash, and there are things that won't be + * known until runtime -- we have to assume it could be anything */ + return _add_range_to_invlist(invlist, 0, UV_MAX); + } + else if (ary[3] && ary[3] != &PL_sv_undef) { + + /* Here no compile-time swash, and no run-time only data. Use the + * node's inversion list */ + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; + } + } + + /* An ANYOF node contains a bitmap for the first 256 code points, and an + * inversion list for the others, but if there are code points that should + * match only conditionally on the target string being UTF-8, those are + * placed in the inversion list, and not the bitmap. Since there are + * circumstances under which they could match, they are included in the + * SSC. But if the ANYOF node is to be inverted, we have to exclude them + * here, so that when we invert below, the end result actually does include + * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here + * before we add the unconditionally matched code points */ + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_intersection_complement_2nd(invlist, + PL_UpperLatin1, + &invlist); + } + + /* Add in the points from the bit map */ + for (i = 0; i < 256; i++) { + if (ANYOF_BITMAP_TEST(node, i)) { + invlist = add_cp_to_invlist(invlist, i); + new_node_has_latin1 = TRUE; + } + } + + /* If this can match all upper Latin1 code points, have to add them + * as well */ + if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + _invlist_union(invlist, PL_UpperLatin1, &invlist); + } + + /* Similarly for these */ + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + } + + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_invert(invlist); + } + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + + /* Under /li, any 0-255 could fold to any other 0-255, depending on the + * locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { + _invlist_union_maybe_complement_2nd(invlist, + only_utf8_locale_invlist, + ANYOF_FLAGS(node) & ANYOF_INVERT, + &invlist); + } + + return invlist; +} + +/* These two functions currently do the exact same thing */ +#define ssc_init_zero ssc_init + +#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) +#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) + +/* 'AND' a given class with another one. Can create false positives. 'ssc' + * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ + +STATIC void +S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *and_with) +{ + /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either + * another SSC or a regular ANYOF class. Can create false positives. */ + + SV* anded_cp_list; + U8 anded_flags; + + PERL_ARGS_ASSERT_SSC_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(and_with)) { + anded_cp_list = ((regnode_ssc *)and_with)->invlist; + anded_flags = ANYOF_FLAGS(and_with); + + /* XXX This is a kludge around what appears to be deficiencies in the + * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, + * there are paths through the optimizer where it doesn't get weeded + * out when it should. And if we don't make some extra provision for + * it like the code just below, it doesn't get added when it should. + * This solution is to add it only when AND'ing, which is here, and + * only when what is being AND'ed is the pristine, original node + * matching anything. Thus it is like adding it to ssc_anything() but + * only when the result is to be AND'ed. Probably the same solution + * could be adopted for the same problem we have with /l matching, + * which is solved differently in S_ssc_init(), and that would lead to + * fewer false positives than that solution has. But if this solution + * creates bugs, the consequences are only that a warning isn't raised + * that should be; while the consequences for having /l bugs is + * incorrect matches */ + if (ssc_is_anything((regnode_ssc *)and_with)) { + anded_flags |= ANYOF_WARN_SUPER; + } + } + else { + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } + + ANYOF_FLAGS(ssc) &= anded_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'and-with'; P2, its posix classes. + * 'and_with' may be inverted. When not inverted, we have the situation of + * computing: + * (C1 | P1) & (C2 | P2) + * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) + * <= ((C1 & C2) | P1 | P2) + * Alternatively, the last few steps could be: + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) + * <= (C1 | C2 | (P1 & P2)) + * We favor the second approach if either P1 or P2 is non-empty. This is + * because these components are a barrier to doing optimizations, as what + * they match cannot be known until the moment of matching as they are + * dependent on the current locale, 'AND"ing them likely will reduce or + * eliminate them. + * But we can do better if we know that C1,P1 are in their initial state (a + * frequent occurrence), each matching everything: + * () & (C2 | P2) = C2 | P2 + * Similarly, if C2,P2 are in their initial state (again a frequent + * occurrence), the result is a no-op + * (C1 | P1) & () = C1 | P1 + * + * Inverted, we have + * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) + * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) + * <= (C1 & ~C2) | (P1 & ~P2) + * */ + + if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(and_with)) + { + unsigned int i; + + ssc_intersection(ssc, + anded_cp_list, + FALSE /* Has already been inverted */ + ); + + /* If either P1 or P2 is empty, the intersection will be also; can skip + * the loop */ + if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { + ANYOF_POSIXL_ZERO(ssc); + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + + /* Note that the Posix class component P from 'and_with' actually + * looks like: + * P = Pa | Pb | ... | Pn + * where each component is one posix class, such as in [\w\s]. + * Thus + * ~P = ~(Pa | Pb | ... | Pn) + * = ~Pa & ~Pb & ... & ~Pn + * <= ~Pa | ~Pb | ... | ~Pn + * The last is something we can easily calculate, but unfortunately + * is likely to have many false positives. We could do better + * in some (but certainly not all) instances if two classes in + * P have known relationships. For example + * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: + * So + * :lower: & :print: = :lower: + * And similarly for classes that must be disjoint. For example, + * since \s and \w can have no elements in common based on rules in + * the POSIX standard, + * \w & ^\S = nothing + * Unfortunately, some vendor locales do not meet the Posix + * standard, in particular almost everything by Microsoft. + * The loop below just changes e.g., \w into \W and vice versa */ + + regnode_charclass_posixl temp; + int add = 1; /* To calculate the index of the complement */ + + ANYOF_POSIXL_ZERO(&temp); + for (i = 0; i < ANYOF_MAX; i++) { + assert(i % 2 != 0 + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); + + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { + ANYOF_POSIXL_SET(&temp, i + add); + } + add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ + } + ANYOF_POSIXL_AND(&temp, ssc); + + } /* else ssc already has no posixes */ + } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC + in its initial state */ + else if (! is_ANYOF_SYNTHETIC(and_with) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) + { + /* But if 'ssc' is in its initial state, the result is just 'and_with'; + * copy it over 'ssc' */ + if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { + if (is_ANYOF_SYNTHETIC(and_with)) { + StructCopy(and_with, ssc, regnode_ssc); + } + else { + ssc->invlist = anded_cp_list; + ANYOF_POSIXL_ZERO(ssc); + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); + } + } + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + { + /* One or the other of P1, P2 is non-empty. */ + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } + ssc_union(ssc, anded_cp_list, FALSE); + } + else { /* P1 = P2 = empty */ + ssc_intersection(ssc, anded_cp_list, FALSE); + } + } +} + +STATIC void +S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *or_with) +{ + /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either + * another SSC or a regular ANYOF class. Can create false positives if + * 'or_with' is to be inverted. */ + + SV* ored_cp_list; + U8 ored_flags; + + PERL_ARGS_ASSERT_SSC_OR; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(or_with)) { + ored_cp_list = ((regnode_ssc*) or_with)->invlist; + ored_flags = ANYOF_FLAGS(or_with); + } + else { + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); + ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + } + + ANYOF_FLAGS(ssc) |= ored_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'or-with'; P2, its posix classes. + * 'or_with' may be inverted. When not inverted, we have the simple + * situation of computing: + * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) + * If P1|P2 yields a situation with both a class and its complement are + * set, like having both \w and \W, this matches all code points, and we + * can delete these from the P component of the ssc going forward. XXX We + * might be able to delete all the P components, but I (khw) am not certain + * about this, and it is better to be safe. + * + * Inverted, we have + * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) + * <= (C1 | P1) | ~C2 + * <= (C1 | ~C2) | P1 + * (which results in actually simpler code than the non-inverted case) + * */ + + if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(or_with)) + { + /* We ignore P2, leaving P1 going forward */ + } /* else Not inverted */ + else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + unsigned int i; + for (i = 0; i < ANYOF_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) + { + ssc_match_all_cp(ssc); + ANYOF_POSIXL_CLEAR(ssc, i); + ANYOF_POSIXL_CLEAR(ssc, i+1); + } + } + } + } + + ssc_union(ssc, + ored_cp_list, + FALSE /* Already has been inverted */ + ); +} + +PERL_STATIC_INLINE void +S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_UNION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_union_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_intersection(pTHX_ regnode_ssc *ssc, + SV* const invlist, + const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_INTERSECTION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_intersection_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) +{ + PERL_ARGS_ASSERT_SSC_ADD_RANGE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); +} + +PERL_STATIC_INLINE void +S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) +{ + /* AND just the single code point 'cp' into the SSC 'ssc' */ + + SV* cp_list = _new_invlist(2); + + PERL_ARGS_ASSERT_SSC_CP_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + cp_list = add_cp_to_invlist(cp_list, cp); + ssc_intersection(ssc, cp_list, + FALSE /* Not inverted */ + ); + SvREFCNT_dec_NN(cp_list); +} + +PERL_STATIC_INLINE void +S_ssc_clear_locale(regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to not match any locale things */ + PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ANYOF_POSIXL_ZERO(ssc); + ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; +} + +STATIC void +S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* The inversion list in the SSC is marked mortal; now we need a more + * permanent copy, which is stored the same way that is done in a regular + * ANYOF node, with the first 256 code points in a bit map */ + + SV* invlist = invlist_clone(ssc->invlist); + + PERL_ARGS_ASSERT_SSC_FINALIZE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* The code in this file assumes that all but these flags aren't relevant + * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the + * time we reach here */ + assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + + populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); + + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); + + /* Make sure is clone-safe */ + ssc->invlist = NULL; + + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; + } + + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); +} + +#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] +#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) +#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) +#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ + ? (TRIE_LIST_CUR( idx ) - 1) \ + : 0 ) + + +#ifdef DEBUGGING +/* + dump_trie(trie,widecharmap,revcharmap) + dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) + dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) + + These routines dump out a trie in a somewhat readable format. + The _interim_ variants are used for debugging the interim + tables that are used to generate the final compressed + representation which is what dump_trie expects. + + Part of the reason for their existence is to provide a form + of documentation as to how the different representations function. + +*/ + +/* + Dumps the final compressed table form of the trie to Perl_debug_log. + Used for debugging make_trie(). +*/ + +STATIC void +S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, + AV *revcharmap, U32 depth) +{ + U32 state; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + U16 word; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE; + + PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", + (int)depth * 2 + 2,"", + "Match","Base","Ofs" ); + + for( state = 0 ; state < trie->uniquecharcount ; state++ ) { + SV ** const tmp = av_fetch( revcharmap, state, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + } + } + PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------", + (int)depth * 2 + 2,""); + + for( state = 0 ; state < trie->uniquecharcount ; state++ ) + PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); + PerlIO_printf( Perl_debug_log, "\n"); + + for( state = 1 ; state < trie->statecount ; state++ ) { + const U32 base = trie->states[ state ].trans.base; + + PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", + (int)depth * 2 + 2,"", (UV)state); + + if ( trie->states[ state ].wordnum ) { + PerlIO_printf( Perl_debug_log, " W%4X", + trie->states[ state ].wordnum ); + } else { + PerlIO_printf( Perl_debug_log, "%6s", "" ); + } + + PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base ); + + if ( base ) { + U32 ofs = 0; + + while( ( base + ofs < trie->uniquecharcount ) || + ( base + ofs - trie->uniquecharcount < trie->lasttrans + && trie->trans[ base + ofs - trie->uniquecharcount ].check + != state)) + ofs++; + + PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); + + for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { + if ( ( base + ofs >= trie->uniquecharcount ) + && ( base + ofs - trie->uniquecharcount + < trie->lasttrans ) + && trie->trans[ base + ofs + - trie->uniquecharcount ].check == state ) + { + PerlIO_printf( Perl_debug_log, "%*"UVXf, + colwidth, + (UV)trie->trans[ base + ofs + - trie->uniquecharcount ].next ); + } else { + PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); + } + } + + PerlIO_printf( Perl_debug_log, "]"); + + } + PerlIO_printf( Perl_debug_log, "\n" ); + } + PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", + (int)depth*2, ""); + for (word=1; word <= trie->wordcount; word++) { + PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", + (int)word, (int)(trie->wordinfo[word].prev), + (int)(trie->wordinfo[word].len)); + } + PerlIO_printf(Perl_debug_log, "\n" ); +} +/* + Dumps a fully constructed but uncompressed trie in list form. + List tries normally only are used for construction when the number of + possible chars (trie->uniquecharcount) is very high. + Used for debugging make_trie(). +*/ +STATIC void +S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, + HV *widecharmap, AV *revcharmap, U32 next_alloc, + U32 depth) +{ + U32 state; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; + + /* print out the table precompression. */ + PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", + (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", + "------:-----+-----------------\n" ); + + for( state=1 ; state < next_alloc ; state ++ ) { + U16 charid; + + PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", + (int)depth * 2 + 2,"", (UV)state ); + if ( ! trie->states[ state ].wordnum ) { + PerlIO_printf( Perl_debug_log, "%5s| ",""); + } else { + PerlIO_printf( Perl_debug_log, "W%4x| ", + trie->states[ state ].wordnum + ); + } + for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { + SV ** const tmp = av_fetch( revcharmap, + TRIE_LIST_ITEM(state,charid).forid, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), + colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + | PERL_PV_ESCAPE_FIRSTCHAR + ) , + TRIE_LIST_ITEM(state,charid).forid, + (UV)TRIE_LIST_ITEM(state,charid).newstate + ); + if (!(charid % 10)) + PerlIO_printf(Perl_debug_log, "\n%*s| ", + (int)((depth * 2) + 14), ""); + } + } + PerlIO_printf( Perl_debug_log, "\n"); + } +} + +/* + Dumps a fully constructed but uncompressed trie in table form. + This is the normal DFA style state transition table, with a few + twists to facilitate compression later. + Used for debugging make_trie(). +*/ +STATIC void +S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, + HV *widecharmap, AV *revcharmap, U32 next_alloc, + U32 depth) +{ + U32 state; + U16 charid; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; + + /* + print out the table precompression so that we can do a visual check + that they are identical. + */ + + PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); + + for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { + SV ** const tmp = av_fetch( revcharmap, charid, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + } + } + + PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" ); + + for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { + PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------"); + } + + PerlIO_printf( Perl_debug_log, "\n" ); + + for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { + + PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", + (int)depth * 2 + 2,"", + (UV)TRIE_NODENUM( state ) ); + + for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { + UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); + if (v) + PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v ); + else + PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); + } + if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { + PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", + (UV)trie->trans[ state ].check ); + } else { + PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", + (UV)trie->trans[ state ].check, + trie->states[ TRIE_NODENUM( state ) ].wordnum ); + } + } +} + +#endif + + +/* make_trie(startbranch,first,last,tail,word_count,flags,depth) + startbranch: the first branch in the whole branch sequence + first : start branch of sequence of branch-exact nodes. + May be the same as startbranch + last : Thing following the last branch. + May be the same as tail. + tail : item following the branch sequence + count : words in the sequence + flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/ + depth : indent depth + +Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. + +A trie is an N'ary tree where the branches are determined by digital +decomposition of the key. IE, at the root node you look up the 1st character and +follow that branch repeat until you find the end of the branches. Nodes can be +marked as "accepting" meaning they represent a complete word. Eg: + + /he|she|his|hers/ + +would convert into the following structure. Numbers represent states, letters +following numbers represent valid transitions on the letter from that state, if +the number is in square brackets it represents an accepting state, otherwise it +will be in parenthesis. + + +-h->+-e->[3]-+-r->(8)-+-s->[9] + | | + | (2) + | | + (1) +-i->(6)-+-s->[7] + | + +-s->(3)-+-h->(4)-+-e->[5] + + Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) + +This shows that when matching against the string 'hers' we will begin at state 1 +read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, +then read 'r' and go to state 8 followed by 's' which takes us to state 9 which +is also accepting. Thus we know that we can match both 'he' and 'hers' with a +single traverse. We store a mapping from accepting to state to which word was +matched, and then when we have multiple possibilities we try to complete the +rest of the regex in the order in which they occured in the alternation. + +The only prior NFA like behaviour that would be changed by the TRIE support is +the silent ignoring of duplicate alternations which are of the form: + + / (DUPE|DUPE) X? (?{ ... }) Y /x + +Thus EVAL blocks following a trie may be called a different number of times with +and without the optimisation. With the optimisations dupes will be silently +ignored. This inconsistent behaviour of EVAL type nodes is well established as +the following demonstrates: + + 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ + +which prints out 'word' three times, but + + 'words'=~/(word|word|word)(?{ print $1 })S/ + +which doesnt print it out at all. This is due to other optimisations kicking in. + +Example of what happens on a structural level: + +The regexp /(ac|ad|ab)+/ will produce the following debug output: + + 1: CURLYM[1] {1,32767}(18) + 5: BRANCH(8) + 6: EXACT (16) + 8: BRANCH(11) + 9: EXACT (16) + 11: BRANCH(14) + 12: EXACT (16) + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +This would be optimizable with startbranch=5, first=5, last=16, tail=16 +and should turn into: + + 1: CURLYM[1] {1,32767}(18) + 5: TRIE(16) + [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] + + + + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +Cases where tail != last would be like /(?foo|bar)baz/: + + 1: BRANCH(4) + 2: EXACT (8) + 4: BRANCH(7) + 5: EXACT (8) + 7: TAIL(8) + 8: EXACT (10) + 10: END(0) + +which would be optimizable with startbranch=1, first=1, last=7, tail=8 +and would end up looking like: + + 1: TRIE(8) + [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] + + + 7: TAIL(8) + 8: EXACT (10) + 10: END(0) + + d = uvchr_to_utf8_flags(d, uv, 0); + +is the recommended Unicode-aware way of saying + + *(d++) = uv; +*/ + +#define TRIE_STORE_REVCHAR(val) \ + STMT_START { \ + if (UTF) { \ + SV *zlopp = newSV(7); /* XXX: optimize me */ \ + unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ + unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ + SvCUR_set(zlopp, kapow - flrbbbbb); \ + SvPOK_on(zlopp); \ + SvUTF8_on(zlopp); \ + av_push(revcharmap, zlopp); \ + } else { \ + char ooooff = (char)val; \ + av_push(revcharmap, newSVpvn(&ooooff, 1)); \ + } \ + } STMT_END + +/* This gets the next character from the input, folding it if not already + * folded. */ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need \ + * folding */ \ + uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* This folder implies Unicode rules, which in the range expressible \ + * by not UTF is the lower case, with the two exceptions, one of \ + * which should have been taken care of before calling this */ \ + assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ + uvc = toLOWER_L1(*uc); \ + if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ + len = 1; \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ +} STMT_END + + + +#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ + if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ + U32 ging = TRIE_LIST_LEN( state ) *= 2; \ + Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ + } \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ + TRIE_LIST_CUR( state )++; \ +} STMT_END + +#define TRIE_LIST_NEW(state) STMT_START { \ + Newxz( trie->states[ state ].trans.list, \ + 4, reg_trie_trans_le ); \ + TRIE_LIST_CUR( state ) = 1; \ + TRIE_LIST_LEN( state ) = 4; \ +} STMT_END + +#define TRIE_HANDLE_WORD(state) STMT_START { \ + U16 dupe= trie->states[ state ].wordnum; \ + regnode * const noper_next = regnext( noper ); \ + \ + DEBUG_r({ \ + /* store the word for dumping */ \ + SV* tmp; \ + if (OP(noper) != NOTHING) \ + tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ + else \ + tmp = newSVpvn_utf8( "", 0, UTF ); \ + av_push( trie_words, tmp ); \ + }); \ + \ + curword++; \ + trie->wordinfo[curword].prev = 0; \ + trie->wordinfo[curword].len = wordlen; \ + trie->wordinfo[curword].accept = state; \ + \ + if ( noper_next < tail ) { \ + if (!trie->jump) \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ + trie->jump[curword] = (U16)(noper_next - convert); \ + if (!jumper) \ + jumper = noper_next; \ + if (!nextbranch) \ + nextbranch= regnext(cur); \ + } \ + \ + if ( dupe ) { \ + /* It's a dupe. Pre-insert into the wordinfo[].prev */\ + /* chain, so that when the bits of chain are later */\ + /* linked together, the dups appear in the chain */\ + trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ + trie->wordinfo[dupe].prev = curword; \ + } else { \ + /* we haven't inserted this word yet. */ \ + trie->states[ state ].wordnum = curword; \ + } \ +} STMT_END + + +#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ + ( ( base + charid >= ucharcount \ + && base + charid < ubound \ + && state == trie->trans[ base - ucharcount + charid ].check \ + && trie->trans[ base - ucharcount + charid ].next ) \ + ? trie->trans[ base - ucharcount + charid ].next \ + : ( state==1 ? special : 0 ) \ + ) + +#define MADE_TRIE 1 +#define MADE_JUMP_TRIE 2 +#define MADE_EXACT_TRIE 4 + +STATIC I32 +S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + regnode *first, regnode *last, regnode *tail, + U32 word_count, U32 flags, U32 depth) +{ + /* first pass, loop through and scan words */ + reg_trie_data *trie; + HV *widecharmap = NULL; + AV *revcharmap = newAV(); + regnode *cur; + STRLEN len = 0; + UV uvc = 0; + U16 curword = 0; + U32 next_alloc = 0; + regnode *jumper = NULL; + regnode *nextbranch = NULL; + regnode *convert = NULL; + U32 *prev_states; /* temp array mapping each state to previous one */ + /* we just use folder as a flag in utf8 */ + const U8 * folder = NULL; + +#ifdef DEBUGGING + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu")); + AV *trie_words = NULL; + /* along with revcharmap, this only used during construction but both are + * useful during debugging so we store them in the struct when debugging. + */ +#else + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); + STRLEN trie_charcount=0; +#endif + SV *re_trie_maxbuff; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + switch (flags) { + case EXACT: break; + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: folder = PL_fold_latin1; break; + case EXACTF: folder = PL_fold; break; + default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); + } + + trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); + trie->refcount = 1; + trie->startstate = 1; + trie->wordcount = word_count; + RExC_rxi->data->data[ data_slot ] = (void*)trie; + trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); + if (flags == EXACT) + trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); + trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( + trie->wordcount+1, sizeof(reg_trie_wordinfo)); + + DEBUG_r({ + trie_words = newAV(); + }); + + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + assert(re_trie_maxbuff); + if (!SvIOK(re_trie_maxbuff)) { + sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); + } + DEBUG_TRIE_COMPILE_r({ + PerlIO_printf( Perl_debug_log, + "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + (int)depth * 2 + 2, "", + REG_NODE_NUM(startbranch),REG_NODE_NUM(first), + REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); + }); + + /* Find the node we are going to overwrite */ + if ( first == startbranch && OP( last ) != BRANCH ) { + /* whole branch chain */ + convert = first; + } else { + /* branch sub-chain */ + convert = NEXTOPER( first ); + } + + /* -- First loop and Setup -- + + We first traverse the branches and scan each word to determine if it + contains widechars, and how many unique chars there are, this is + important as we have to build a table with at least as many columns as we + have unique chars. + + We use an array of integers to represent the character codes 0..255 + (trie->charmap) and we use a an HV* to store Unicode characters. We use + the native representation of the character value as the key and IV's for + the coded index. + + *TODO* If we keep track of how many times each character is used we can + remap the columns so that the table compression later on is more + efficient in terms of memory by ensuring the most common value is in the + middle and the least common are on the outside. IMO this would be better + than a most to least common mapping as theres a decent chance the most + common letter will share a node with the least common, meaning the node + will not be compressible. With a middle is most common approach the worst + case is when we have the least common nodes twice. + + */ + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + regnode *noper = NEXTOPER( cur ); + const U8 *uc = (U8*)STRING( noper ); + const U8 *e = uc + STR_LEN( noper ); + int foldlen = 0; + U32 wordlen = 0; /* required init */ + STRLEN minchars = 0; + STRLEN maxchars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the + bitmap?*/ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + trie->minlen= STR_LEN(noper); + } else { + trie->minlen= 0; + continue; + } + } + + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ + TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte + regardless of encoding */ + if (OP( noper ) == EXACTFU_SS) { + /* false positives are ok, so just set this */ + TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); + } + } + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ + TRIE_CHARCOUNT(trie)++; + TRIE_READ_CHAR; + + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; + } + else { + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because for + * non-UTF, the latin1_safe macro is smart enough to account + * for all the unfolded characters, and because for UTF, the + * string will already have been folded earlier in the + * compilation process */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); + } + } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } + } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ + if ( uvc < 256 ) { + if ( folder ) { + U8 folded= folder[ (U8) uvc ]; + if ( !trie->charmap[ folded ] ) { + trie->charmap[ folded ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( folded ); + } + } + if ( !trie->charmap[ uvc ] ) { + trie->charmap[ uvc ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( uvc ); + } + if ( set_bit ) { + /* store the codepoint in the bitmap, and its folded + * equivalent. */ + TRIE_BITMAP_SET(trie, uvc); + + /* store the folded codepoint */ + if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); + + if ( !UTF ) { + /* store first byte of utf8 representation of + variant codepoints */ + if (! UVCHR_IS_INVARIANT(uvc)) { + TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); + } + } + set_bit = 0; /* We've done our bit :-) */ + } + } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + + SV** svpp; + if ( !widecharmap ) + widecharmap = newHV(); + + svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); + + if ( !svpp ) + Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc ); + + if ( !SvTRUE( *svpp ) ) { + sv_setiv( *svpp, ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR(uvc); + } + } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ + if( cur == first ) { + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; + } + } /* end first pass */ + DEBUG_TRIE_COMPILE_r( + PerlIO_printf( Perl_debug_log, + "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + (int)depth * 2 + 2,"", + ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, + (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, + (int)trie->minlen, (int)trie->maxlen ) + ); + + /* + We now know what we are dealing with in terms of unique chars and + string sizes so we can calculate how much memory a naive + representation using a flat table will take. If it's over a reasonable + limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory + conservative but potentially much slower representation using an array + of lists. + + At the end we convert both representations into the same compressed + form that will be used in regexec.c for matching with. The latter + is a form that cannot be used to construct with but has memory + properties similar to the list form and access properties similar + to the table form making it both suitable for fast searches and + small enough that its feasable to store for the duration of a program. + + See the comment in the code where the compressed table is produced + inplace from the flat tabe representation for an explanation of how + the compression works. + + */ + + + Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); + prev_states[1] = 0; + + if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) + > SvIV(re_trie_maxbuff) ) + { + /* + Second Pass -- Array Of Lists Representation + + Each state will be represented by a list of charid:state records + (reg_trie_trans_le) the first such element holds the CUR and LEN + points of the allocated array. (See defines above). + + We build the initial structure using the lists, and then convert + it into the compressed table form which allows faster lookups + (but cant be modified once converted). + */ + + STRLEN transcount = 1; + + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + "%*sCompiling trie using list compiler\n", + (int)depth * 2 + 2, "")); + + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); + TRIE_LIST_NEW(1); + next_alloc = 2; + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + + regnode *noper = NEXTOPER( cur ); + U8 *uc = (U8*)STRING( noper ); + const U8 *e = uc + STR_LEN( noper ); + U32 state = 1; /* required init */ + U16 charid = 0; /* sanity init */ + U32 wordlen = 0; /* required init */ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } + } + + if (OP(noper) != NOTHING) { + for ( ; uc < e ; uc += len ) { + + TRIE_READ_CHAR; + + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV** const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); + if ( !svpp ) { + charid = 0; + } else { + charid=(U16)SvIV( *svpp ); + } + } + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ + if ( charid ) { + + U16 check; + U32 newstate = 0; + + charid--; + if ( !trie->states[ state ].trans.list ) { + TRIE_LIST_NEW( state ); + } + for ( check = 1; + check <= TRIE_LIST_USED( state ); + check++ ) + { + if ( TRIE_LIST_ITEM( state, check ).forid + == charid ) + { + newstate = TRIE_LIST_ITEM( state, check ).newstate; + break; + } + } + if ( ! newstate ) { + newstate = next_alloc++; + prev_states[newstate] = state; + TRIE_LIST_PUSH( state, charid, newstate ); + transcount++; + } + state = newstate; + } else { + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); + } + } + } + TRIE_HANDLE_WORD(state); + + } /* end second pass */ + + /* next alloc is the NEXT state to be allocated */ + trie->statecount = next_alloc; + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, + next_alloc + * sizeof(reg_trie_state) ); + + /* and now dump it out before we compress it */ + DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, + revcharmap, next_alloc, + depth+1) + ); + + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); + { + U32 state; + U32 tp = 0; + U32 zp = 0; + + + for( state=1 ; state < next_alloc ; state ++ ) { + U32 base=0; + + /* + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp) + ); + */ + + if (trie->states[state].trans.list) { + U16 minid=TRIE_LIST_ITEM( state, 1).forid; + U16 maxid=minid; + U16 idx; + + for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { + const U16 forid = TRIE_LIST_ITEM( state, idx).forid; + if ( forid < minid ) { + minid=forid; + } else if ( forid > maxid ) { + maxid=forid; + } + } + if ( transcount < tp + maxid - minid + 1) { + transcount *= 2; + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, + transcount + * sizeof(reg_trie_trans) ); + Zero( trie->trans + (transcount / 2), + transcount / 2, + reg_trie_trans ); + } + base = trie->uniquecharcount + tp - minid; + if ( maxid == minid ) { + U32 set = 0; + for ( ; zp < tp ; zp++ ) { + if ( ! trie->trans[ zp ].next ) { + base = trie->uniquecharcount + zp - minid; + trie->trans[ zp ].next = TRIE_LIST_ITEM( state, + 1).newstate; + trie->trans[ zp ].check = state; + set = 1; + break; + } + } + if ( !set ) { + trie->trans[ tp ].next = TRIE_LIST_ITEM( state, + 1).newstate; + trie->trans[ tp ].check = state; + tp++; + zp = tp; + } + } else { + for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { + const U32 tid = base + - trie->uniquecharcount + + TRIE_LIST_ITEM( state, idx ).forid; + trie->trans[ tid ].next = TRIE_LIST_ITEM( state, + idx ).newstate; + trie->trans[ tid ].check = state; + } + tp += ( maxid - minid + 1 ); + } + Safefree(trie->states[ state ].trans.list); + } + /* + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, " base: %d\n",base); + ); + */ + trie->states[ state ].trans.base=base; + } + trie->lasttrans = tp + 1; + } + } else { + /* + Second Pass -- Flat Table Representation. + + we dont use the 0 slot of either trans[] or states[] so we add 1 to + each. We know that we will need Charcount+1 trans at most to store + the data (one row per char at worst case) So we preallocate both + structures assuming worst case. + + We then construct the trie using only the .next slots of the entry + structs. + + We use the .check field of the first entry of the node temporarily + to make compression both faster and easier by keeping track of how + many non zero fields are in the node. + + Since trans are numbered from 1 any 0 pointer in the table is a FAIL + transition. + + There are two terms at use here: state as a TRIE_NODEIDX() which is + a number representing the first entry of the node, and state as a + TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) + and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) + if there are 2 entrys per node. eg: + + A B A B + 1. 2 4 1. 3 7 + 2. 0 3 3. 0 5 + 3. 0 0 5. 0 0 + 4. 0 0 7. 0 0 + + The table is internally in the right hand, idx form. However as we + also have to deal with the states array which is indexed by nodenum + we have to use TRIE_NODENUM() to convert. + + */ + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + "%*sCompiling trie using table compiler\n", + (int)depth * 2 + 2, "")); + + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) + * trie->uniquecharcount + 1, + sizeof(reg_trie_trans) ); + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); + next_alloc = trie->uniquecharcount + 1; + + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + + regnode *noper = NEXTOPER( cur ); + const U8 *uc = (U8*)STRING( noper ); + const U8 *e = uc + STR_LEN( noper ); + + U32 state = 1; /* required init */ + + U16 charid = 0; /* sanity init */ + U32 accept_state = 0; /* sanity init */ + + U32 wordlen = 0; /* required init */ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } + } + + if ( OP(noper) != NOTHING ) { + for ( ; uc < e ; uc += len ) { + + TRIE_READ_CHAR; + + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV* const * const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); + charid = svpp ? (U16)SvIV(*svpp) : 0; + } + if ( charid ) { + charid--; + if ( !trie->trans[ state + charid ].next ) { + trie->trans[ state + charid ].next = next_alloc; + trie->trans[ state ].check++; + prev_states[TRIE_NODENUM(next_alloc)] + = TRIE_NODENUM(state); + next_alloc += trie->uniquecharcount; + } + state = trie->trans[ state + charid ].next; + } else { + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); + } + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ + } + } + accept_state = TRIE_NODENUM( state ); + TRIE_HANDLE_WORD(accept_state); + + } /* end second pass */ + + /* and now dump it out before we compress it */ + DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, + revcharmap, + next_alloc, depth+1)); + + { + /* + * Inplace compress the table.* + + For sparse data sets the table constructed by the trie algorithm will + be mostly 0/FAIL transitions or to put it another way mostly empty. + (Note that leaf nodes will not contain any transitions.) + + This algorithm compresses the tables by eliminating most such + transitions, at the cost of a modest bit of extra work during lookup: + + - Each states[] entry contains a .base field which indicates the + index in the state[] array wheres its transition data is stored. + + - If .base is 0 there are no valid transitions from that node. + + - If .base is nonzero then charid is added to it to find an entry in + the trans array. + + -If trans[states[state].base+charid].check!=state then the + transition is taken to be a 0/Fail transition. Thus if there are fail + transitions at the front of the node then the .base offset will point + somewhere inside the previous nodes data (or maybe even into a node + even earlier), but the .check field determines if the transition is + valid. + + XXX - wrong maybe? + The following process inplace converts the table to the compressed + table: We first do not compress the root node 1,and mark all its + .check pointers as 1 and set its .base pointer as 1 as well. This + allows us to do a DFA construction from the compressed table later, + and ensures that any .base pointers we calculate later are greater + than 0. + + - We set 'pos' to indicate the first entry of the second node. + + - We then iterate over the columns of the node, finding the first and + last used entry at l and m. We then copy l..m into pos..(pos+m-l), + and set the .check pointers accordingly, and advance pos + appropriately and repreat for the next node. Note that when we copy + the next pointers we have to convert them from the original + NODEIDX form to NODENUM form as the former is not valid post + compression. + + - If a node has no transitions used we mark its base as 0 and do not + advance the pos pointer. + + - If a node only has one transition we use a second pointer into the + structure to fill in allocated fail transitions from other states. + This pointer is independent of the main pointer and scans forward + looking for null transitions that are allocated to a state. When it + finds one it writes the single transition into the "hole". If the + pointer doesnt find one the single transition is appended as normal. + + - Once compressed we can Renew/realloc the structures to release the + excess space. + + See "Table-Compression Methods" in sec 3.9 of the Red Dragon, + specifically Fig 3.47 and the associated pseudocode. + + demq + */ + const U32 laststate = TRIE_NODENUM( next_alloc ); + U32 state, charid; + U32 pos = 0, zp=0; + trie->statecount = laststate; + + for ( state = 1 ; state < laststate ; state++ ) { + U8 flag = 0; + const U32 stateidx = TRIE_NODEIDX( state ); + const U32 o_used = trie->trans[ stateidx ].check; + U32 used = trie->trans[ stateidx ].check; + trie->trans[ stateidx ].check = 0; + + for ( charid = 0; + used && charid < trie->uniquecharcount; + charid++ ) + { + if ( flag || trie->trans[ stateidx + charid ].next ) { + if ( trie->trans[ stateidx + charid ].next ) { + if (o_used == 1) { + for ( ; zp < pos ; zp++ ) { + if ( ! trie->trans[ zp ].next ) { + break; + } + } + trie->states[ state ].trans.base + = zp + + trie->uniquecharcount + - charid ; + trie->trans[ zp ].next + = SAFE_TRIE_NODENUM( trie->trans[ stateidx + + charid ].next ); + trie->trans[ zp ].check = state; + if ( ++zp > pos ) pos = zp; + break; + } + used--; + } + if ( !flag ) { + flag = 1; + trie->states[ state ].trans.base + = pos + trie->uniquecharcount - charid ; + } + trie->trans[ pos ].next + = SAFE_TRIE_NODENUM( + trie->trans[ stateidx + charid ].next ); + trie->trans[ pos ].check = state; + pos++; + } + } + } + trie->lasttrans = pos + 1; + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, laststate + * sizeof(reg_trie_state) ); + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, + "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + (int)depth * 2 + 2,"", + (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + + 1 ), + (IV)next_alloc, + (IV)pos, + ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); + ); + + } /* end table compress */ + } + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf(Perl_debug_log, + "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", + (int)depth * 2 + 2, "", + (UV)trie->statecount, + (UV)trie->lasttrans) + ); + /* resize the trans array to remove unused space */ + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, trie->lasttrans + * sizeof(reg_trie_trans) ); + + { /* Modify the program and insert the new TRIE node */ + U8 nodetype =(U8)(flags & 0xFF); + char *str=NULL; + +#ifdef DEBUGGING + regnode *optimize = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + + U32 mjd_offset = 0; + U32 mjd_nodelen = 0; +#endif /* RE_TRACK_PATTERN_OFFSETS */ +#endif /* DEBUGGING */ + /* + This means we convert either the first branch or the first Exact, + depending on whether the thing following (in 'last') is a branch + or not and whther first is the startbranch (ie is it a sub part of + the alternation or is it the whole thing.) + Assuming its a sub part we convert the EXACT otherwise we convert + the whole branch sequence, including the first. + */ + /* Find the node we are going to overwrite */ + if ( first != startbranch || OP( last ) == BRANCH ) { + /* branch sub-chain */ + NEXT_OFF( first ) = (U16)(last - first); +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_r({ + mjd_offset= Node_Offset((convert)); + mjd_nodelen= Node_Length((convert)); + }); +#endif + /* whole branch chain */ + } +#ifdef RE_TRACK_PATTERN_OFFSETS + else { + DEBUG_r({ + const regnode *nop = NEXTOPER( convert ); + mjd_offset= Node_Offset((nop)); + mjd_nodelen= Node_Length((nop)); + }); + } + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", + (int)depth * 2 + 2, "", + (UV)mjd_offset, (UV)mjd_nodelen) + ); +#endif + /* But first we check to see if there is a common prefix we can + split out as an EXACT and put in front of the TRIE node. */ + trie->startstate= 1; + if ( trie->bitmap && !widecharmap && !trie->jump ) { + U32 state; + for ( state = 1 ; state < trie->statecount-1 ; state++ ) { + U32 ofs = 0; + I32 idx = -1; + U32 count = 0; + const U32 base = trie->states[ state ].trans.base; + + if ( trie->states[state].wordnum ) + count = 1; + + for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { + if ( ( base + ofs >= trie->uniquecharcount ) && + ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && + trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + { + if ( ++count > 1 ) { + SV **tmp = av_fetch( revcharmap, ofs, 0); + const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); + if ( state == 1 ) break; + if ( count == 2 ) { + Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*sNew Start State=%"UVuf" Class: [", + (int)depth * 2 + 2, "", + (UV)state)); + if (idx >= 0) { + SV ** const tmp = av_fetch( revcharmap, idx, 0); + const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); + + TRIE_BITMAP_SET(trie,*ch); + if ( folder ) + TRIE_BITMAP_SET(trie, folder[ *ch ]); + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, "%s", (char*)ch) + ); + } + } + TRIE_BITMAP_SET(trie,*ch); + if ( folder ) + TRIE_BITMAP_SET(trie,folder[ *ch ]); + DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch)); + } + idx = ofs; + } + } + if ( count == 1 ) { + SV **tmp = av_fetch( revcharmap, idx, 0); + STRLEN len; + char *ch = SvPV( *tmp, len ); + DEBUG_OPTIMISE_r({ + SV *sv=sv_newmortal(); + PerlIO_printf( Perl_debug_log, + "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", + (int)depth * 2 + 2, "", + (UV)state, (UV)idx, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + }); + if ( state==1 ) { + OP( convert ) = nodetype; + str=STRING(convert); + STR_LEN(convert)=0; + } + STR_LEN(convert) += len; + while (len--) + *str++ = *ch++; + } else { +#ifdef DEBUGGING + if (state>1) + DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); +#endif + break; + } + } + trie->prefixlen = (state-1); + if (str) { + regnode *n = convert+NODE_SZ_STR(convert); + NEXT_OFF(convert) = NODE_SZ_STR(convert); + trie->startstate = state; + trie->minlen -= (state - 1); + trie->maxlen -= (state - 1); +#ifdef DEBUGGING + /* At least the UNICOS C compiler choked on this + * being argument to DEBUG_r(), so let's just have + * it right here. */ + if ( +#ifdef PERL_EXT_RE_BUILD + 1 +#else + DEBUG_r_TEST +#endif + ) { + regnode *fix = convert; + U32 word = trie->wordcount; + mjd_nodelen++; + Set_Node_Offset_Length(convert, mjd_offset, state - 1); + while( ++fix < n ) { + Set_Node_Offset_Length(fix, 0, 0); + } + while (word--) { + SV ** const tmp = av_fetch( trie_words, word, 0 ); + if (tmp) { + if ( STR_LEN(convert) <= SvCUR(*tmp) ) + sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); + else + sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); + } + } + } +#endif + if (trie->maxlen) { + convert = n; + } else { + NEXT_OFF(convert) = (U16)(tail - convert); + DEBUG_r(optimize= n); + } + } + } + if (!jumper) + jumper = last; + if ( trie->maxlen ) { + NEXT_OFF( convert ) = (U16)(tail - convert); + ARG_SET( convert, data_slot ); + /* Store the offset to the first unabsorbed branch in + jump[0], which is otherwise unused by the jump logic. + We use this when dumping a trie and during optimisation. */ + if (trie->jump) + trie->jump[0] = (U16)(nextbranch - convert); + + /* If the start state is not accepting (meaning there is no empty string/NOTHING) + * and there is a bitmap + * and the first "jump target" node we found leaves enough room + * then convert the TRIE node into a TRIEC node, with the bitmap + * embedded inline in the opcode - this is hypothetically faster. + */ + if ( !trie->states[trie->startstate].wordnum + && trie->bitmap + && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) + { + OP( convert ) = TRIEC; + Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); + PerlMemShared_free(trie->bitmap); + trie->bitmap= NULL; + } else + OP( convert ) = TRIE; + + /* store the type in the flags */ + convert->flags = nodetype; + DEBUG_r({ + optimize = convert + + NODE_STEP_REGNODE + + regarglen[ OP( convert ) ]; + }); + /* XXX We really should free up the resource in trie now, + as we won't use them - (which resources?) dmq */ + } + /* needed for dumping*/ + DEBUG_r(if (optimize) { + regnode *opt = convert; + + while ( ++opt < optimize) { + Set_Node_Offset_Length(opt,0,0); + } + /* + Try to clean up some of the debris left after the + optimisation. + */ + while( optimize < jumper ) { + mjd_nodelen += Node_Length((optimize)); + OP( optimize ) = OPTIMIZED; + Set_Node_Offset_Length(optimize,0,0); + optimize++; + } + Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen); + }); + } /* end node insert */ + REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert); + + /* Finish populating the prev field of the wordinfo array. Walk back + * from each accept state until we find another accept state, and if + * so, point the first word's .prev field at the second word. If the + * second already has a .prev field set, stop now. This will be the + * case either if we've already processed that word's accept state, + * or that state had multiple words, and the overspill words were + * already linked up earlier. + */ + { + U16 word; + U32 state; + U16 prev; + + for (word=1; word <= trie->wordcount; word++) { + prev = 0; + if (trie->wordinfo[word].prev) + continue; + state = trie->wordinfo[word].accept; + while (state) { + state = prev_states[state]; + if (!state) + break; + prev = trie->states[state].wordnum; + if (prev) + break; + } + trie->wordinfo[word].prev = prev; + } + Safefree(prev_states); + } + + + /* and now dump out the compressed format */ + DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); + + RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; +#ifdef DEBUGGING + RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; + RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; +#else + SvREFCNT_dec_NN(revcharmap); +#endif + return trie->jump + ? MADE_JUMP_TRIE + : trie->startstate>1 + ? MADE_EXACT_TRIE + : MADE_TRIE; +} + +STATIC regnode * +S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) +{ +/* The Trie is constructed and compressed now so we can build a fail array if + * it's needed + + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and + 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, + Ullman 1985/88 + ISBN 0-201-10088-6 + + We find the fail state for each state in the trie, this state is the longest + proper suffix of the current state's 'word' that is also a proper prefix of + another word in our trie. State 1 represents the word '' and is thus the + default fail state. This allows the DFA not to have to restart after its + tried and failed a word at a given point, it simply continues as though it + had been matching the other word in the first place. + Consider + 'abcdgu'=~/abcdefg|cdgu/ + When we get to 'd' we are still matching the first word, we would encounter + 'g' which would fail, which would bring us to the state representing 'd' in + the second word where we would try 'g' and succeed, proceeding to match + 'cdgu'. + */ + /* add a fail transition */ + const U32 trie_offset = ARG(source); + reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; + U32 *q; + const U32 ucharcount = trie->uniquecharcount; + const U32 numstates = trie->statecount; + const U32 ubound = trie->lasttrans + ucharcount; + U32 q_read = 0; + U32 q_write = 0; + U32 charid; + U32 base = trie->states[ 1 ].trans.base; + U32 *fail; + reg_ac_data *aho; + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); + regnode *stclass; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; + PERL_UNUSED_CONTEXT; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + if ( OP(source) == TRIE ) { + struct regnode_1 *op = (struct regnode_1 *) + PerlMemShared_calloc(1, sizeof(struct regnode_1)); + StructCopy(source,op,struct regnode_1); + stclass = (regnode *)op; + } else { + struct regnode_charclass *op = (struct regnode_charclass *) + PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); + StructCopy(source,op,struct regnode_charclass); + stclass = (regnode *)op; + } + OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */ + + ARG_SET( stclass, data_slot ); + aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); + RExC_rxi->data->data[ data_slot ] = (void*)aho; + aho->trie=trie_offset; + aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); + Copy( trie->states, aho->states, numstates, reg_trie_state ); + Newxz( q, numstates, U32); + aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); + aho->refcount = 1; + fail = aho->fail; + /* initialize fail[0..1] to be 1 so that we always have + a valid final fail state */ + fail[ 0 ] = fail[ 1 ] = 1; + + for ( charid = 0; charid < ucharcount ; charid++ ) { + const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); + if ( newstate ) { + q[ q_write ] = newstate; + /* set to point at the root */ + fail[ q[ q_write++ ] ]=1; + } + } + while ( q_read < q_write) { + const U32 cur = q[ q_read++ % numstates ]; + base = trie->states[ cur ].trans.base; + + for ( charid = 0 ; charid < ucharcount ; charid++ ) { + const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); + if (ch_state) { + U32 fail_state = cur; + U32 fail_base; + do { + fail_state = fail[ fail_state ]; + fail_base = aho->states[ fail_state ].trans.base; + } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); + + fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); + fail[ ch_state ] = fail_state; + if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) + { + aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; + } + q[ q_write++ % numstates] = ch_state; + } + } + } + /* restore fail[0..1] to 0 so that we "fall out" of the AC loop + when we fail in state 1, this allows us to use the + charclass scan to find a valid start char. This is based on the principle + that theres a good chance the string being searched contains lots of stuff + that cant be a start char. + */ + fail[ 0 ] = fail[ 1 ] = 0; + DEBUG_TRIE_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%*sStclass Failtable (%"UVuf" states): 0", + (int)(depth * 2), "", (UV)numstates + ); + for( q_read=1; q_read%3d: %s (%d)\n", \ + (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ + Next ? (REG_NODE_NUM(Next)) : 0 ); \ + }}); + + +/* The below joins as many adjacent EXACTish nodes as possible into a single + * one. The regop may be changed if the node(s) contain certain sequences that + * require special handling. The joining is only done if: + * 1) there is room in the current conglomerated node to entirely contain the + * next one. + * 2) they are the exact same node type + * + * The adjacent nodes actually may be separated by NOTHING-kind nodes, and + * these get optimized out + * + * If a node is to match under /i (folded), the number of characters it matches + * can be different than its character length if it contains a multi-character + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. + * + * And *unfolded_multi_char is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when whether the fold is valid or + * not won't be known until runtime; namely for EXACTF nodes that contain LATIN + * SMALL LETTER SHARP S, as only if the target string being matched against + * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose + * folding rules depend on the locale in force at runtime. (Multi-char folds + * whose components are all above the Latin1 range are not run-time locale + * dependent, and have already been folded by the time this function is + * called.) + * + * This is as good a place as any to discuss the design of handling these + * multi-character fold sequences. It's been wrong in Perl for a very long + * time. There are three code points in Unicode whose multi-character folds + * were long ago discovered to mess things up. The previous designs for + * dealing with these involved assigning a special node for them. This + * approach doesn't always work, as evidenced by this example: + * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches + * Both sides fold to "sss", but if the pattern is parsed to create a node that + * would match just the \xDF, it won't be able to handle the case where a + * successful match would have to cross the node's boundary. The new approach + * that hopefully generally solves the problem generates an EXACTFU_SS node + * that is "sss" in this case. + * + * It turns out that there are problems with all multi-character folds, and not + * just these three. Now the code is general, for all such cases. The + * approach taken is: + * 1) This routine examines each EXACTFish node that could contain multi- + * character folded sequences. Since a single character can fold into + * such a sequence, the minimum match length for this node is less than + * the number of characters in the node. This routine returns in + * *min_subtract how many characters to subtract from the the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. + * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS + * is used for an EXACTFU node that contains at least one "ss" sequence in + * it. For non-UTF-8 patterns and strings, this is the only case where + * there is a possible fold length change. That means that a regular + * EXACTFU node without UTF-8 involvement doesn't have to concern itself + * with length changes, and so can be processed faster. regexec.c takes + * advantage of this. Generally, an EXACTFish node that is in UTF-8 is + * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't + * known until runtime). This saves effort in regex matching. However, + * the pre-folding isn't done for non-UTF8 patterns because the fold of + * the MICRO SIGN requires UTF-8, and we don't want to slow things down by + * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, + * again, EXACTFL) nodes fold to isn't known until runtime. The fold + * possibilities for the non-UTF8 patterns are quite simple, except for + * the sharp s. All the ones that don't involve a UTF-8 target string are + * members of a fold-pair, and arrays are set up for all of them so that + * the other member of the pair can be found quickly. Code elsewhere in + * this file makes sure that in EXACTFU nodes, the sharp s gets folded to + * 'ss', even if the pattern isn't UTF-8. This avoids the issues + * described in the next item. + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes + * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL + * nodes, violate the assumption, and they are the only instances where it + * is violated. I'm reluctant to try to change the assumption, as the + * code involved is impenetrable to me (khw), so instead the code here + * punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid + * assumption. Thus, there is no optimization based on string lengths for + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFA where it never does. In an EXACTFA node in + * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) + * + * Similarly, the code that generates tries doesn't currently handle + * not-already-folded multi-char folds, and it looks like a pain to change + * that. Therefore, trie generation of EXACTFA nodes with the sharp s + * doesn't work. Instead, such an EXACTFA is turned into a new regnode, + * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people + * using /iaa matching will be doing so almost entirely with ASCII + * strings, so this should rarely be encountered in practice */ + +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ + if (PL_regkind[OP(scan)] == EXACT) \ + join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) + +STATIC U32 +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, + UV *min_subtract, bool *unfolded_multi_char, + U32 flags,regnode *val, U32 depth) +{ + /* Merge several consecutive EXACTish nodes into one. */ + regnode *n = regnext(scan); + U32 stringok = 1; + regnode *next = scan + NODE_SZ_STR(scan); + U32 merged = 0; + U32 stopnow = 0; +#ifdef DEBUGGING + regnode *stop = scan; + GET_RE_DEBUG_FLAGS_DECL; +#else + PERL_UNUSED_ARG(depth); +#endif + + PERL_ARGS_ASSERT_JOIN_EXACT; +#ifndef EXPERIMENTAL_INPLACESCAN + PERL_UNUSED_ARG(flags); + PERL_UNUSED_ARG(val); +#endif + DEBUG_PEEP("join",scan,depth); + + /* Look through the subsequent nodes in the chain. Skip NOTHING, merge + * EXACT ones that are mergeable to the current one. */ + while (n + && (PL_regkind[OP(n)] == NOTHING + || (stringok && OP(n) == OP(scan))) + && NEXT_OFF(n) + && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) + { + + if (OP(n) == TAIL || n > next) + stringok = 0; + if (PL_regkind[OP(n)] == NOTHING) { + DEBUG_PEEP("skip:",n,depth); + NEXT_OFF(scan) += NEXT_OFF(n); + next = n + NODE_STEP_REGNODE; +#ifdef DEBUGGING + if (stringok) + stop = n; +#endif + n = regnext(n); + } + else if (stringok) { + const unsigned int oldl = STR_LEN(scan); + regnode * const nnext = regnext(n); + + /* XXX I (khw) kind of doubt that this works on platforms (should + * Perl ever run on one) where U8_MAX is above 255 because of lots + * of other assumptions */ + /* Don't join if the sum can't fit into a single node */ + if (oldl + STR_LEN(n) > U8_MAX) + break; + + DEBUG_PEEP("merg",n,depth); + merged++; + + NEXT_OFF(scan) += NEXT_OFF(n); + STR_LEN(scan) += STR_LEN(n); + next = n + NODE_SZ_STR(n); + /* Now we can overwrite *n : */ + Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); +#ifdef DEBUGGING + stop = next - 1; +#endif + n = nnext; + if (stopnow) break; + } + +#ifdef EXPERIMENTAL_INPLACESCAN + if (flags && !NEXT_OFF(n)) { + DEBUG_PEEP("atch", val, depth); + if (reg_off_by_arg[OP(n)]) { + ARG_SET(n, val - n); + } + else { + NEXT_OFF(n) = val - n; + } + stopnow = 1; + } +#endif + } + + *min_subtract = 0; + *unfolded_multi_char = FALSE; + + /* Here, all the adjacent mergeable EXACTish nodes have been merged. We + * can now analyze for sequences of problematic code points. (Prior to + * this final joining, sequences could have been split over boundaries, and + * hence missed). The sequences only happen in folding, hence for any + * non-EXACT EXACTish node */ + if (OP(scan) != EXACT) { + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ + + /* One pass is made over the node's string looking for all the + * possibilities. To avoid some tests in the loop, there are two main + * cases, for UTF-8 patterns (which can't have EXACTF nodes) and + * non-UTF-8 */ + if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *unfolded_multi_char = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ + + /* Examine the string for a multi-character fold sequence. UTF-8 + * patterns have all characters pre-folded by the time this code is + * executed */ + while (s < s_end - 1) /* Can stop 1 before the end, as minimum + length sequence we are looking for is 2 */ + { + int count = 0; /* How many characters in a multi-char fold */ + int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); + if (! len) { /* Not a multi-char fold: get next char */ + s += UTF8SKIP(s); + continue; + } + + /* Nodes with 'ss' require special handling, except for + * EXACTFA-ish for which there is no multi-char fold to this */ + if (len == 2 && *s == 's' && *(s+1) == 's' + && OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE) + { + count = 2; + if (OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; + } + s += 2; + } + else { /* Here is a generic multi-char fold. */ + U8* multi_end = s + len; + + /* Count how many characters are in it. In the case of + * /aa, no folds which contain ASCII code points are + * allowed, so check for those, and skip if found. */ + if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { + count = utf8_length(s, multi_end); + s = multi_end; + } + else { + while (s < multi_end) { + if (isASCII(*s)) { + s++; + goto next_iteration; + } + else { + s += UTF8SKIP(s); + } + count++; + } + } + } + + /* The delta is how long the sequence is minus 1 (1 is how long + * the character that folds to the sequence is) */ + total_count_delta += count - 1; + next_iteration: ; + } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); + } + else if (OP(scan) == EXACTFA) { + + /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char + * fold to the ASCII range (and there are no existing ones in the + * upper latin1 range). But, as outlined in the comments preceding + * this function, we need to flag any occurrences of the sharp s. + * This character forbids trie formation (because of added + * complexity) */ + while (s < s_end) { + if (*s == LATIN_SMALL_LETTER_SHARP_S) { + OP(scan) = EXACTFA_NO_TRIE; + *unfolded_multi_char = TRUE; + break; + } + s++; + continue; + } + } + else { + + /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; + + while (s < upper) { + int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); + if (! len) { /* Not a multi-char fold. */ + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) + { + *unfolded_multi_char = TRUE; + } + s++; + continue; + } + + if (len == 2 + && isARG2_lower_or_UPPER_ARG1('s', *s) + && isARG2_lower_or_UPPER_ARG1('s', *(s+1))) + { + + /* EXACTF nodes need to know that the minimum length + * changed so that a sharp s in the string can match this + * ss in the pattern, but they remain EXACTF nodes, as they + * won't match this unless the target string is is UTF-8, + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; + } + } + + *min_subtract += len - 1; + s += len; + } + } + } + +#ifdef DEBUGGING + /* Allow dumping but overwriting the collection of skipped + * ops and/or strings with fake optimized ops */ + n = scan + NODE_SZ_STR(scan); + while (n <= stop) { + OP(n) = OPTIMIZED; + FLAGS(n) = 0; + NEXT_OFF(n) = 0; + n++; + } +#endif + DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)}); + return stopnow; +} + +/* REx optimizer. Converts nodes into quicker variants "in place". + Finds fixed substrings. */ + +/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set + to the position after last scanned or to NULL. */ + +#define INIT_AND_WITHP \ + assert(!and_withp); \ + Newx(and_withp,1, regnode_ssc); \ + SAVEFREEPV(and_withp) + +/* this is a chain of data about sub patterns we are processing that + need to be handled separately/specially in study_chunk. Its so + we can simulate recursion without losing state. */ +struct scan_frame; +typedef struct scan_frame { + regnode *last; /* last node to process in this frame */ + regnode *next; /* next node to process when last is reached */ + struct scan_frame *prev; /*previous frame*/ + U32 prev_recursed_depth; + I32 stop; /* what stopparen do we use */ +} scan_frame; + + +STATIC SSize_t +S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, + SSize_t *minlenp, SSize_t *deltap, + regnode *last, + scan_data_t *data, + I32 stopparen, + U32 recursed_depth, + regnode_ssc *and_withp, + U32 flags, U32 depth) + /* scanp: Start here (read-write). */ + /* deltap: Write maxlen-minlen here. */ + /* last: Stop before this one. */ + /* data: string data about the pattern */ + /* stopparen: treat close N as END */ + /* recursed: which subroutines have we recursed into */ + /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ +{ + /* There must be at least this number of characters to match */ + SSize_t min = 0; + I32 pars = 0, code; + regnode *scan = *scanp, *next; + SSize_t delta = 0; + int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); + int is_inf_internal = 0; /* The studied chunk is infinite */ + I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; + scan_data_t data_fake; + SV *re_trie_maxbuff = NULL; + regnode *first_non_open = scan; + SSize_t stopmin = SSize_t_MAX; + scan_frame *frame = NULL; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_STUDY_CHUNK; + +#ifdef DEBUGGING + StructCopy(&zero_scan_data, &data_fake, scan_data_t); +#endif + if ( depth == 0 ) { + while (first_non_open && OP(first_non_open) == OPEN) + first_non_open=regnext(first_non_open); + } + + + fake_study_recurse: + while ( scan && OP(scan) != END && scan < last ){ + UV min_subtract = 0; /* How mmany chars to subtract from the minimum + node length to get a real minimum (because + the folded version may be shorter) */ + bool unfolded_multi_char = FALSE; + /* Peephole optimizer: */ + DEBUG_OPTIMISE_MORE_r( + { + PerlIO_printf(Perl_debug_log, + "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ", + ((int) depth*2), "", (long)stopparen, + (unsigned long)depth, (unsigned long)recursed_depth); + if (recursed_depth) { + U32 i; + U32 j; + for ( j = 0 ; j < recursed_depth ; j++ ) { + PerlIO_printf(Perl_debug_log,"["); + for ( i = 0 ; i < (U32)RExC_npar ; i++ ) + PerlIO_printf(Perl_debug_log,"%d", + PAREN_TEST(RExC_study_chunk_recursed + + (j * RExC_study_chunk_recursed_bytes), i) + ? 1 : 0 + ); + PerlIO_printf(Perl_debug_log,"]"); + } + } + PerlIO_printf(Perl_debug_log,"\n"); + } + ); + DEBUG_STUDYDATA("Peep:", data, depth); + DEBUG_PEEP("Peep", scan, depth); + + + /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ + * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled + * by a different invocation of reg() -- Yves + */ + JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); + + /* Follow the next-chain of the current node and optimize + away all the NOTHINGs from it. */ + if (OP(scan) != CURLYX) { + const int max = (reg_off_by_arg[OP(scan)] + ? I32_MAX + /* I32 may be smaller than U16 on CRAYs! */ + : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); + int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); + int noff; + regnode *n = scan; + + /* Skip NOTHING and LONGJMP. */ + while ((n = regnext(n)) + && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) + || ((OP(n) == LONGJMP) && (noff = ARG(n)))) + && off + noff < max) + off += noff; + if (reg_off_by_arg[OP(scan)]) + ARG(scan) = off; + else + NEXT_OFF(scan) = off; + } + + + + /* The principal pseudo-switch. Cannot be a switch, since we + look into several different things. */ + if (OP(scan) == BRANCH || OP(scan) == BRANCHJ + || OP(scan) == IFTHEN) { + next = regnext(scan); + code = OP(scan); + /* demq: the op(next)==code check is to see if we have + * "branch-branch" AFAICT */ + + if (OP(next) == code || code == IFTHEN) { + /* NOTE - There is similar code to this block below for + * handling TRIE nodes on a re-study. If you change stuff here + * check there too. */ + SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; + regnode_ssc accum; + regnode * const startbranch=scan; + + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); + + while (OP(scan) == code) { + SSize_t deltanext, minnext, fake; + I32 f = 0; + regnode_ssc this_class; + + num++; + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + + data_fake.pos_delta = delta; + next = regnext(scan); + scan = NEXTOPER(scan); + if (code != BRANCH) + scan = NEXTOPER(scan); + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f = SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + /* we suppose the run is continuous, last=next...*/ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f,depth+1); + if (min1 > minnext) + min1 = minnext; + if (deltanext == SSize_t_MAX) { + is_inf = is_inf_internal = 1; + max1 = SSize_t_MAX; + } else if (max1 < minnext + deltanext) + max1 = minnext + deltanext; + scan = next; + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > minnext) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); + } + if (code == IFTHEN && num < 2) /* Empty ELSE branch */ + min1 = 0; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) + data->pos_delta = SSize_t_MAX; + else + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->longest = &(data->longest_float); + } + min += min1; + if (delta == SSize_t_MAX + || SSize_t_MAX - delta - (max1 - min1) < 0) + delta = SSize_t_MAX; + else + delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + } + } + + if (PERL_ENABLE_TRIE_OPTIMISATION && + OP( startbranch ) == BRANCH ) + { + /* demq. + + Assuming this was/is a branch we are dealing with: 'scan' + now points at the item that follows the branch sequence, + whatever it is. We now start at the beginning of the + sequence and look for subsequences of + + BRANCH->EXACT=>x1 + BRANCH->EXACT=>x2 + tail + + which would be constructed from a pattern like + /A|LIST|OF|WORDS/ + + If we can find such a subsequence we need to turn the first + element into a trie and then add the subsequent branch exact + strings to the trie. + + We have two cases + + 1. patterns where the whole set of branches can be + converted. + + 2. patterns where only a subset can be converted. + + In case 1 we can replace the whole set with a single regop + for the trie. In case 2 we need to keep the start and end + branches so + + 'BRANCH EXACT; BRANCH EXACT; BRANCH X' + becomes BRANCH TRIE; BRANCH X; + + There is an additional case, that being where there is a + common prefix, which gets split out into an EXACT like node + preceding the TRIE node. + + If x(1..n)==tail then we can do a simple trie, if not we make + a "jump" trie, such that when we match the appropriate word + we "jump" to the appropriate tail node. Essentially we turn + a nested if into a case structure of sorts. + + */ + + int made=0; + if (!re_trie_maxbuff) { + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + if (!SvIOK(re_trie_maxbuff)) + sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); + } + if ( SvIV(re_trie_maxbuff)>=0 ) { + regnode *cur; + regnode *first = (regnode *)NULL; + regnode *last = (regnode *)NULL; + regnode *tail = scan; + U8 trietype = 0; + U32 count=0; + +#ifdef DEBUGGING + SV * const mysv = sv_newmortal(); /* for dumping */ +#endif + /* var tail is used because there may be a TAIL + regop in the way. Ie, the exacts will point to the + thing following the TAIL, but the last branch will + point at the TAIL. So we advance tail. If we + have nested (?:) we may have to move through several + tails. + */ + + while ( OP( tail ) == TAIL ) { + /* this is the TAIL generated by (?:) */ + tail = regnext( tail ); + } + + + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, tail, NULL); + PerlIO_printf( Perl_debug_log, "%*s%s%s\n", + (int)depth * 2 + 2, "", + "Looking for TRIE'able sequences. Tail node is: ", + SvPV_nolen_const( mysv ) + ); + }); + + /* + + Step through the branches + cur represents each branch, + noper is the first thing to be matched as part + of that branch + noper_next is the regnext() of that node. + + We normally handle a case like this + /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also + support building with NOJUMPTRIE, which restricts + the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is + a possible optimization target. If we are building + under NOJUMPTRIE then we require that noper_next is + the same as scan (our current position in the regex + program). + + Once we have two or more consecutive such branches + we can create a trie of the EXACT's contents and + stitch it in place into the program. + + If the sequence represents all of the branches in + the alternation we replace the entire thing with a + single TRIE node. + + Otherwise when it is a subsequence we need to + stitch it in place and replace only the relevant + branches. This means the first branch has to remain + as it is used by the alternation logic, and its + next pointer, and needs to be repointed at the item + on the branch chain following the last branch we + have optimized away. + + This could be either a BRANCH, in which case the + subsequence is internal, or it could be the item + following the branch sequence in which case the + subsequence is at the end (which does not + necessarily mean the first node is the start of the + alternation). + + TRIE_TYPE(X) is a define which maps the optype to a + trietype. + + optype | trietype + ----------------+----------- + NOTHING | NOTHING + EXACT | EXACT + EXACTFU | EXACTFU + EXACTFU_SS | EXACTFU + EXACTFA | EXACTFA + + + */ +#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ + ( EXACT == (X) ) ? EXACT : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ + ( EXACTFA == (X) ) ? EXACTFA : \ + 0 ) + + /* dont use tail as the end marker for this traverse */ + for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { + regnode * const noper = NEXTOPER( cur ); + U8 noper_type = OP( noper ); + U8 noper_trietype = TRIE_TYPE( noper_type ); +#if defined(DEBUGGING) || defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0; +#endif + + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur, NULL); + PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", + (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); + + regprop(RExC_rx, mysv, noper, NULL); + PerlIO_printf( Perl_debug_log, " -> %s", + SvPV_nolen_const(mysv)); + + if ( noper_next ) { + regprop(RExC_rx, mysv, noper_next, NULL); + PerlIO_printf( Perl_debug_log,"\t=> %s\t", + SvPV_nolen_const(mysv)); + } + PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", + REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), + PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] + ); + }); + + /* Is noper a trieable nodetype that can be merged + * with the current trie (if there is one)? */ + if ( noper_trietype + && + ( + ( noper_trietype == NOTHING) + || ( trietype == NOTHING ) + || ( trietype == noper_trietype ) + ) +#ifdef NOJUMPTRIE + && noper_next == tail +#endif + && count < U16_MAX) + { + /* Handle mergable triable node Either we are + * the first node in a new trieable sequence, + * in which case we do some bookkeeping, + * otherwise we update the end pointer. */ + if ( !first ) { + first = cur; + if ( noper_trietype == NOTHING ) { +#if !defined(DEBUGGING) && !defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; +#endif + + if ( noper_next_trietype ) { + trietype = noper_next_trietype; + } else if (noper_next_type) { + /* a NOTHING regop is 1 regop wide. + * We need at least two for a trie + * so we can't merge this in */ + first = NULL; + } + } else { + trietype = noper_trietype; + } + } else { + if ( trietype == NOTHING ) + trietype = noper_trietype; + last = cur; + } + if (first) + count++; + } /* end handle mergable triable node */ + else { + /* handle unmergable node - + * noper may either be a triable node which can + * not be tried together with the current trie, + * or a non triable node */ + if ( last ) { + /* If last is set and trietype is not + * NOTHING then we have found at least two + * triable branch sequences in a row of a + * similar trietype so we can turn them + * into a trie. If/when we allow NOTHING to + * start a trie sequence this condition + * will be required, and it isn't expensive + * so we leave it in for now. */ + if ( trietype && trietype != NOTHING ) + make_trie( pRExC_state, + startbranch, first, cur, tail, + count, trietype, depth+1 ); + last = NULL; /* note: we clear/update + first, trietype etc below, + so we dont do it here */ + } + if ( noper_trietype +#ifdef NOJUMPTRIE + && noper_next == tail +#endif + ){ + /* noper is triable, so we can start a new + * trie sequence */ + count = 1; + first = cur; + trietype = noper_trietype; + } else if (first) { + /* if we already saw a first but the + * current node is not triable then we have + * to reset the first information. */ + count = 0; + first = NULL; + trietype = 0; + } + } /* end handle unmergable node */ + } /* loop over branches */ + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur, NULL); + PerlIO_printf( Perl_debug_log, + "%*s- %s (%d) \n", + (int)depth * 2 + 2, + "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + + }); + if ( last && trietype ) { + if ( trietype != NOTHING ) { + /* the last branch of the sequence was part of + * a trie, so we have to construct it here + * outside of the loop */ + made= make_trie( pRExC_state, startbranch, + first, scan, tail, count, + trietype, depth+1 ); +#ifdef TRIE_STUDY_OPT + if ( ((made == MADE_EXACT_TRIE && + startbranch == first) + || ( first_non_open == first )) && + depth==0 ) { + flags |= SCF_TRIE_RESTUDY; + if ( startbranch == first + && scan == tail ) + { + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; + } + } +#endif + } else { + /* at this point we know whatever we have is a + * NOTHING sequence/branch AND if 'startbranch' + * is 'first' then we can turn the whole thing + * into a NOTHING + */ + if ( startbranch == first ) { + regnode *opt; + /* the entire thing is a NOTHING sequence, + * something like this: (?:|) So we can + * turn it into a plain NOTHING op. */ + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur, NULL); + PerlIO_printf( Perl_debug_log, + "%*s- %s (%d) \n", (int)depth * 2 + 2, + "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + + }); + OP(startbranch)= NOTHING; + NEXT_OFF(startbranch)= tail - startbranch; + for ( opt= startbranch + 1; opt < tail ; opt++ ) + OP(opt)= OPTIMIZED; + } + } + } /* end if ( last) */ + } /* TRIE_MAXBUF is non zero */ + + } /* do trie */ + + } + else if ( code == BRANCHJ ) { /* single branch is optimized. */ + scan = NEXTOPER(NEXTOPER(scan)); + } else /* single branch is optimized. */ + scan = NEXTOPER(scan); + continue; + } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) { + scan_frame *newframe = NULL; + I32 paren; + regnode *start; + regnode *end; + U32 my_recursed_depth= recursed_depth; + + if (OP(scan) != SUSPEND) { + /* set the pointer */ + if (OP(scan) == GOSUB) { + paren = ARG(scan); + RExC_recurse[ARG2L(scan)] = scan; + start = RExC_open_parens[paren-1]; + end = RExC_close_parens[paren-1]; + } else { + paren = 0; + start = RExC_rxi->program + 1; + end = RExC_opend; + } + if (!recursed_depth + || + !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) + ) { + if (!recursed_depth) { + Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); + } else { + Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed_bytes, U8); + } + /* we havent recursed into this paren yet, so recurse into it */ + DEBUG_STUDYDATA("set:", data,depth); + PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); + my_recursed_depth= recursed_depth + 1; + Newx(newframe,1,scan_frame); + } else { + DEBUG_STUDYDATA("inf:", data,depth); + /* some form of infinite recursion, assume infinite length + * */ + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; + } + } else { + Newx(newframe,1,scan_frame); + paren = stopparen; + start = scan+2; + end = regnext(scan); + } + if (newframe) { + assert(start); + assert(end); + SAVEFREEPV(newframe); + newframe->next = regnext(scan); + newframe->last = last; + newframe->stop = stopparen; + newframe->prev = frame; + newframe->prev_recursed_depth = recursed_depth; + + DEBUG_STUDYDATA("frame-new:",data,depth); + DEBUG_PEEP("fnew", scan, depth); + + frame = newframe; + scan = start; + stopparen = paren; + last = end; + depth = depth + 1; + recursed_depth= my_recursed_depth; + + continue; + } + } + else if (OP(scan) == EXACT) { + SSize_t l = STR_LEN(scan); + UV uc; + if (UTF) { + const U8 * const s = (U8*)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + l, NULL); + l = utf8_length(s, s + l); + } else { + uc = *((U8*)STRING(scan)); + } + min += l; + if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ + /* The code below prefers earlier match for fixed + offset, later match for variable offset. */ + if (data->last_end == -1) { /* Update the start info. */ + data->last_start_min = data->pos_min; + data->last_start_max = is_inf + ? SSize_t_MAX : data->pos_min + data->pos_delta; + } + sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); + if (UTF) + SvUTF8_on(data->last_found); + { + SV * const sv = data->last_found; + MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += utf8_length((U8*)STRING(scan), + (U8*)STRING(scan)+STR_LEN(scan)); + } + data->last_end = data->pos_min + l; + data->pos_min += l; /* As in the first entry. */ + data->flags &= ~SF_BEFORE_EOL; + } + + /* ANDing the code point leaves at most it, and not in locale, and + * can't match null string */ + if (flags & SCF_DO_STCLASS_AND) { + ssc_cp_and(data->start_class, uc); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ssc_clear_locale(data->start_class); + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_add_cp(data->start_class, uc); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + } + else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is + EXACTFish */ + SSize_t l = STR_LEN(scan); + UV uc = *((U8*)STRING(scan)); + SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 + separate code points */ + const U8 * s = (U8*)STRING(scan); + + /* Search for fixed substrings supports EXACT only. */ + if (flags & SCF_DO_SUBSTR) { + assert(data); + scan_commit(pRExC_state, data, minlenp, is_inf); + } + if (UTF) { + uc = utf8_to_uvchr_buf(s, s + l, NULL); + l = utf8_length(s, s + l); + } + if (unfolded_multi_char) { + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; + } + min += l - min_subtract; + assert (min >= 0); + delta += min_subtract; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += l - min_subtract; + if (data->pos_min < 0) { + data->pos_min = 0; + } + data->pos_delta += min_subtract; + if (min_subtract) { + data->longest = &(data->longest_float); + } + } + + if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) { + ssc_clear_locale(data->start_class); + } + + if (! UTF) { + + /* We punt and assume can match anything if the node begins + * with a multi-character fold. Things are complicated. For + * example, /ffi/i could match any of: + * "\N{LATIN SMALL LIGATURE FFI}" + * "\N{LATIN SMALL LIGATURE FF}I" + * "F\N{LATIN SMALL LIGATURE FI}" + * plus several other things; and making sure we have all the + * possibilities is hard. */ + if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); + } + else { + + /* Any Latin1 range character can potentially match any + * other depending on the locale */ + if (OP(scan) == EXACTFL) { + _invlist_union(EXACTF_invlist, PL_Latin1, + &EXACTF_invlist); + } + else { + /* But otherwise, it matches at least itself. We can + * quickly tell if it has a distinct fold, and if so, + * it matches that as well */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (IS_IN_SOME_FOLD_L1(uc)) { + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, + PL_fold_latin1[uc]); + } + } + + /* Some characters match above-Latin1 ones under /i. This + * is true of EXACTFL ones when the locale is UTF-8 */ + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) + && (! isASCII(uc) || (OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE))) + { + add_above_Latin1_folds(pRExC_state, + (U8) uc, + &EXACTF_invlist); + } + } + } + else { /* Pattern is UTF-8 */ + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + STRLEN foldlen = UTF8SKIP(s); + const U8* e = s + STR_LEN(scan); + SV** listp; + + /* The only code points that aren't folded in a UTF EXACTFish + * node are are the problematic ones in EXACTFL nodes */ + if (OP(scan) == EXACTFL + && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) + { + /* We need to check for the possibility that this EXACTFL + * node begins with a multi-char fold. Therefore we fold + * the first few characters of it so that we can make that + * check */ + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD(*s); + s++; + } + else { + STRLEN len; + to_utf8_fold(s, d, &len); + d += len; + s += UTF8SKIP(s); + } + } + + /* And set up so the code below that looks in this folded + * buffer instead of the node's string */ + e = d; + foldlen = UTF8SKIP(folded); + s = folded; + } + + /* When we reach here 's' points to the fold of the first + * character(s) of the node; and 'e' points to far enough along + * the folded string to be just past any possible multi-char + * fold. 'foldlen' is the length in bytes of the first + * character in 's' + * + * Unlike the non-UTF-8 case, the macro for determining if a + * string is a multi-char fold requires all the characters to + * already be folded. This is because of all the complications + * if not. Note that they are folded anyway, except in EXACTFL + * nodes. Like the non-UTF case above, we punt if the node + * begins with a multi-char fold */ + + if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); + } + else { /* Single char fold */ + + /* It matches all the things that fold to it, which are + * found in PL_utf8_foldclosures (including itself) */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) s, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE) + && isASCII(c) != isASCII(uc)) + { + continue; + } + + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c); + } + } + } + } + if (flags & SCF_DO_STCLASS_AND) { + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); + } + else if (REGNODE_VARIES(OP(scan))) { + SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; + I32 fl = 0, f = flags; + regnode * const oscan = scan; + regnode_ssc this_class; + regnode_ssc *oclass = NULL; + I32 next_is_eval = 0; + + switch (PL_regkind[OP(scan)]) { + case WHILEM: /* End of (?:...)* . */ + scan = NEXTOPER(scan); + goto finish; + case PLUS: + if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { + next = NEXTOPER(scan); + if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { + mincount = 1; + maxcount = REG_INFTY; + next = regnext(scan); + scan = NEXTOPER(scan); + goto do_curly; + } + } + if (flags & SCF_DO_SUBSTR) + data->pos_min++; + min++; + /* FALLTHROUGH */ + case STAR: + if (flags & SCF_DO_STCLASS) { + mincount = 0; + maxcount = REG_INFTY; + next = regnext(scan); + scan = NEXTOPER(scan); + goto do_curly; + } + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + scan = regnext(scan); + goto optimize_curly_tail; + case CURLY: + if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) + && (scan->flags == stopparen)) + { + mincount = 1; + maxcount = 1; + } else { + mincount = ARG1(scan); + maxcount = ARG2(scan); + } + next = regnext(scan); + if (OP(scan) == CURLYX) { + I32 lp = (data ? *(data->last_closep) : 0); + scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); + } + scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; + next_is_eval = (OP(scan) == EVAL); + do_curly: + if (flags & SCF_DO_SUBSTR) { + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ + pos_before = data->pos_min; + } + if (data) { + fl = data->flags; + data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); + if (is_inf) + data->flags |= SF_IS_INF; + } + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + oclass = data->start_class; + data->start_class = &this_class; + f |= SCF_DO_STCLASS_AND; + f &= ~SCF_DO_STCLASS_OR; + } + /* Exclude from super-linear cache processing any {n,m} + regops for which the combination of input pos and regex + pos is not enough information to determine if a match + will be possible. + + For example, in the regex /foo(bar\s*){4,8}baz/ with the + regex pos at the \s*, the prospects for a match depend not + only on the input position but also on how many (bar\s*) + repeats into the {4,8} we are. */ + if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) + f &= ~SCF_WHILEM_VISITED_POS; + + /* This will finish on WHILEM, setting scan, or on NULL: */ + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, + last, data, stopparen, recursed_depth, NULL, + (mincount == 0 + ? (f & ~SCF_DO_SUBSTR) + : f) + ,depth+1); + + if (flags & SCF_DO_STCLASS) + data->start_class = oclass; + if (mincount == 0 || minnext == 0) { + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + } + else if (flags & SCF_DO_STCLASS_AND) { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&this_class, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + } + } else { /* Non-zero len */ + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + } + else if (flags & SCF_DO_STCLASS_AND) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + flags &= ~SCF_DO_STCLASS; + } + if (!scan) /* It was not CURLYX, but CURLY. */ + scan = next; + if (!(flags & SCF_TRIE_DOING_RESTUDY) + /* ? quantifier ok, except for (?{ ... }) */ + && (next_is_eval || !(mincount == 0 && maxcount == 1)) + && (minnext == 0) && (deltanext == 0) + && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) + && maxcount <= REG_INFTY/3) /* Complement check for big + count */ + { + /* Fatal warnings may leak the regexp without this: */ + SAVEFREESV(RExC_rx_sv); + ckWARNreg(RExC_parse, + "Quantifier unexpected on zero-length expression"); + (void)ReREFCNT_inc(RExC_rx_sv); + } + + min += minnext * mincount; + is_inf_internal |= deltanext == SSize_t_MAX + || (maxcount == REG_INFTY && minnext + deltanext > 0); + is_inf |= is_inf_internal; + if (is_inf) { + delta = SSize_t_MAX; + } else { + delta += (minnext + deltanext) * maxcount + - minnext * mincount; + } + /* Try powerful optimization CURLYX => CURLYN. */ + if ( OP(oscan) == CURLYX && data + && data->flags & SF_IN_PAR + && !(data->flags & SF_HAS_EVAL) + && !deltanext && minnext == 1 ) { + /* Try to optimize to CURLYN. */ + regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; + regnode * const nxt1 = nxt; +#ifdef DEBUGGING + regnode *nxt2; +#endif + + /* Skip open. */ + nxt = regnext(nxt); + if (!REGNODE_SIMPLE(OP(nxt)) + && !(PL_regkind[OP(nxt)] == EXACT + && STR_LEN(nxt) == 1)) + goto nogo; +#ifdef DEBUGGING + nxt2 = nxt; +#endif + nxt = regnext(nxt); + if (OP(nxt) != CLOSE) + goto nogo; + if (RExC_open_parens) { + RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ + } + /* Now we know that nxt2 is the only contents: */ + oscan->flags = (U8)ARG(nxt); + OP(oscan) = CURLYN; + OP(nxt1) = NOTHING; /* was OPEN. */ + +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ +#endif + } + nogo: + + /* Try optimization CURLYX => CURLYM. */ + if ( OP(oscan) == CURLYX && data + && !(data->flags & SF_HAS_PAR) + && !(data->flags & SF_HAS_EVAL) + && !deltanext /* atom is fixed width */ + && minnext != 0 /* CURLYM can't handle zero width */ + + /* Nor characters whose fold at run-time may be + * multi-character */ + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) + ) { + /* XXXX How to optimize if data == 0? */ + /* Optimize to a simpler form. */ + regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ + regnode *nxt2; + + OP(oscan) = CURLYM; + while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ + && (OP(nxt2) != WHILEM)) + nxt = nxt2; + OP(nxt2) = SUCCEED; /* Whas WHILEM */ + /* Need to optimize away parenths. */ + if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { + /* Set the parenth number. */ + regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ + + oscan->flags = (U8)ARG(nxt); + if (RExC_open_parens) { + RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ + } + OP(nxt1) = OPTIMIZED; /* was OPEN. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ +#endif +#if 0 + while ( nxt1 && (OP(nxt1) != WHILEM)) { + regnode *nnxt = regnext(nxt1); + if (nnxt == nxt) { + if (reg_off_by_arg[OP(nxt1)]) + ARG_SET(nxt1, nxt2 - nxt1); + else if (nxt2 - nxt1 < U16_MAX) + NEXT_OFF(nxt1) = nxt2 - nxt1; + else + OP(nxt) = NOTHING; /* Cannot beautify */ + } + nxt1 = nnxt; + } +#endif + /* Optimize again: */ + study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, + NULL, stopparen, recursed_depth, NULL, 0,depth+1); + } + else + oscan->flags = 0; + } + else if ((OP(oscan) == CURLYX) + && (flags & SCF_WHILEM_VISITED_POS) + /* See the comment on a similar expression above. + However, this time it's not a subexpression + we care about, but the expression itself. */ + && (maxcount == REG_INFTY) + && data && ++data->whilem_c < 16) { + /* This stays as CURLYX, we can put the count/of pair. */ + /* Find WHILEM (as in regexec.c) */ + regnode *nxt = oscan + NEXT_OFF(oscan); + + if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ + nxt += ARG(nxt); + PREVOPER(nxt)->flags = (U8)(data->whilem_c + | (RExC_whilem_seen << 4)); /* On WHILEM */ + } + if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (flags & SCF_DO_SUBSTR) { + SV *last_str = NULL; + STRLEN last_chrs = 0; + int counted = mincount != 0; + + if (data->last_end > 0 && mincount != 0) { /* Ends with a + string. */ + SSize_t b = pos_before >= data->last_start_min + ? pos_before : data->last_start_min; + STRLEN l; + const char * const s = SvPV_const(data->last_found, l); + SSize_t old = b - data->last_start_min; + + if (UTF) + old = utf8_hop((U8*)s, old) - (U8*)s; + l -= old; + /* Get the added string: */ + last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; + if (deltanext == 0 && pos_before == b) { + /* What was added is a constant string */ + if (mincount > 1) { + + SvGROW(last_str, (mincount * l) + 1); + repeatcpy(SvPVX(last_str) + l, + SvPVX_const(last_str), l, + mincount - 1); + SvCUR_set(last_str, SvCUR(last_str) * mincount); + /* Add additional parts. */ + SvCUR_set(data->last_found, + SvCUR(data->last_found) - l); + sv_catsv(data->last_found, last_str); + { + SV * sv = data->last_found; + MAGIC *mg = + SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += last_chrs * (mincount-1); + } + last_chrs *= mincount; + data->last_end += l * (mincount - 1); + } + } else { + /* start offset must point into the last copy */ + data->last_start_min += minnext * (mincount - 1); + data->last_start_max += is_inf ? SSize_t_MAX + : (maxcount - 1) * (minnext + data->pos_delta); + } + } + /* It is counted once already... */ + data->pos_min += minnext * (mincount - counted); +#if 0 +PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf + " SSize_t_MAX=%"UVuf" minnext=%"UVuf + " maxcount=%"UVuf" mincount=%"UVuf"\n", + (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, + (UV)mincount); +if (deltanext != SSize_t_MAX) +PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", + (UV)(-counted * deltanext + (minnext + deltanext) * maxcount + - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); +#endif + if (deltanext == SSize_t_MAX + || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) + data->pos_delta = SSize_t_MAX; + else + data->pos_delta += - counted * deltanext + + (minnext + deltanext) * maxcount - minnext * mincount; + if (mincount != maxcount) { + /* Cannot extend fixed substrings found inside + the group. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + if (mincount && last_str) { + SV * const sv = data->last_found; + MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + + if (mg) + mg->mg_len = -1; + sv_setsv(sv, last_str); + data->last_end = data->pos_min; + data->last_start_min = data->pos_min - last_chrs; + data->last_start_max = is_inf + ? SSize_t_MAX + : data->pos_min + data->pos_delta - last_chrs; + } + data->longest = &(data->longest_float); + } + SvREFCNT_dec(last_str); + } + if (data && (fl & SF_HAS_EVAL)) + data->flags |= SF_HAS_EVAL; + optimize_curly_tail: + if (OP(oscan) != CURLYX) { + while (PL_regkind[OP(next = regnext(oscan))] == NOTHING + && NEXT_OFF(next)) + NEXT_OFF(oscan) += NEXT_OFF(next); + } + continue; + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", + OP(scan)); +#endif + case REF: + case CLUMP: + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) { + if (OP(scan) == CLUMP) { + /* Actually is any start char, but very few code points + * aren't start characters */ + ssc_match_all_cp(data->start_class); + } + else { + ssc_anything(data->start_class); + } + } + flags &= ~SCF_DO_STCLASS; + break; + } + } + else if (OP(scan) == LNBREAK) { + if (flags & SCF_DO_STCLASS) { + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], + FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg for + * 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + } + min++; + delta++; /* Because of the 2 char string cr-lf */ + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min += 1; + data->pos_delta += 1; + data->longest = &(data->longest_float); + } + } + else if (REGNODE_SIMPLE(OP(scan))) { + + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min++; + } + min++; + if (flags & SCF_DO_STCLASS) { + bool invert = 0; + SV* my_invlist = sv_2mortal(_new_invlist(0)); + U8 namedclass; + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + + /* Some of the logic below assumes that switching + locale on will only add false positives. */ + switch (OP(scan)) { + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", + OP(scan)); +#endif + case CANY: + case SANY: + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_match_all_cp(data->start_class); + break; + + case REG_ANY: + { + SV* REG_ANY_invlist = _new_invlist(2); + REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, + '\n'); + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert, hence all but \n + */ + ); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert */ + ); + ssc_clear_locale(data->start_class); + } + SvREFCNT_dec_NN(REG_ANY_invlist); + } + break; + + case ANYOF: + if (flags & SCF_DO_STCLASS_AND) + ssc_and(pRExC_state, data->start_class, + (regnode_charclass *) scan); + else + ssc_or(pRExC_state, data->start_class, + (regnode_charclass *) scan); + break; + + case NPOSIXL: + invert = 1; + /* FALLTHROUGH */ + + case POSIXL: + namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; + if (flags & SCF_DO_STCLASS_AND) { + bool was_there = cBOOL( + ANYOF_POSIXL_TEST(data->start_class, + namedclass)); + ANYOF_POSIXL_ZERO(data->start_class); + if (was_there) { /* Do an AND */ + ANYOF_POSIXL_SET(data->start_class, namedclass); + } + /* No individual code points can now match */ + data->start_class->invlist + = sv_2mortal(_new_invlist(0)); + } + else { + int complement = namedclass + ((invert) ? -1 : 1); + + assert(flags & SCF_DO_STCLASS_OR); + + /* If the complement of this class was already there, + * the result is that they match all code points, + * (\d + \D == everything). Remove the classes from + * future consideration. Locale is not relevant in + * this case */ + if (ANYOF_POSIXL_TEST(data->start_class, complement)) { + ssc_match_all_cp(data->start_class); + ANYOF_POSIXL_CLEAR(data->start_class, namedclass); + ANYOF_POSIXL_CLEAR(data->start_class, complement); + } + else { /* The usual case; just add this class to the + existing set */ + ANYOF_POSIXL_SET(data->start_class, namedclass); + } + } + break; + + case NPOSIXA: /* For these, we always know the exact set of + what's matched */ + invert = 1; + /* FALLTHROUGH */ + case POSIXA: + if (FLAGS(scan) == _CC_ASCII) { + my_invlist = PL_XPosix_ptrs[_CC_ASCII]; + } + else { + _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)], + PL_XPosix_ptrs[_CC_ASCII], + &my_invlist); + } + goto join_posix; + + case NPOSIXD: + case NPOSIXU: + invert = 1; + /* FALLTHROUGH */ + case POSIXD: + case POSIXU: + my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); + + /* NPOSIXD matches all upper Latin1 code points unless the + * target string being matched is UTF-8, which is + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); + } + + join_posix: + + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, my_invlist, invert); + ssc_clear_locale(data->start_class); + } + else { + assert(flags & SCF_DO_STCLASS_OR); + ssc_union(data->start_class, my_invlist, invert); + } + } + if (flags & SCF_DO_STCLASS_OR) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { + data->flags |= (OP(scan) == MEOL + ? SF_BEFORE_MEOL + : SF_BEFORE_SEOL); + scan_commit(pRExC_state, data, minlenp, is_inf); + + } + else if ( PL_regkind[OP(scan)] == BRANCHJ + /* Lookbehind, or need to calculate parens/evals/stclass: */ + && (scan->flags || data || (flags & SCF_DO_STCLASS)) + && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { + if ( OP(scan) == UNLESSM && + scan->flags == 0 && + OP(NEXTOPER(NEXTOPER(scan))) == NOTHING && + OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED + ) { + regnode *opt; + regnode *upto= regnext(scan); + DEBUG_PARSE_r({ + SV * const mysv_val=sv_newmortal(); + DEBUG_STUDYDATA("OPFAIL",data,depth); + + /*DEBUG_PARSE_MSG("opfail");*/ + regprop(RExC_rx, mysv_val, upto, NULL); + PerlIO_printf(Perl_debug_log, + "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(upto), + (IV)(upto - scan) + ); + }); + OP(scan) = OPFAIL; + NEXT_OFF(scan) = upto - scan; + for (opt= scan + 1; opt < upto ; opt++) + OP(opt) = OPTIMIZED; + scan= upto; + continue; + } + if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY + || OP(scan) == UNLESSM ) + { + /* Negative Lookahead/lookbehind + In this case we can't do fixed string optimisation. + */ + + SSize_t deltanext, minnext, fake = 0; + regnode *nscan; + regnode_ssc intrnl; + int f = 0; + + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + data_fake.pos_delta = delta; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + ssc_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, + last, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1); + if (scan->flags) { + if (deltanext) { + FAIL("Variable length lookbehind not implemented"); + } + else if (minnext > (I32)U8_MAX) { + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); + } + scan->flags = (U8)minnext; + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (f & SCF_DO_STCLASS_AND) { + if (flags & SCF_DO_STCLASS_OR) { + /* OR before, AND after: ideally we would recurse with + * data_fake to get the AND applied by study of the + * remainder of the pattern, and then derecurse; + * *** HACK *** for now just treat as "no information". + * See [perl #56690]. + */ + ssc_init(pRExC_state, data->start_class); + } else { + /* AND before and after: combine and continue */ + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); + } + } + } +#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY + else { + /* Positive Lookahead/lookbehind + In this case we can do fixed string optimisation, + but we must be careful about it. Note in the case of + lookbehind the positions will be offset by the minimum + length of the pattern, something we won't know about + until after the recurse. + */ + SSize_t deltanext, fake = 0; + regnode *nscan; + regnode_ssc intrnl; + int f = 0; + /* We use SAVEFREEPV so that when the full compile + is finished perl will clean up the allocated + minlens when it's all done. This way we don't + have to worry about freeing them when we know + they wont be used, which would be a pain. + */ + SSize_t *minnextp; + Newx( minnextp, 1, SSize_t ); + SAVEFREEPV(minnextp); + + if (data) { + StructCopy(data, &data_fake, scan_data_t); + if ((flags & SCF_DO_SUBSTR) && data->last_found) { + f |= SCF_DO_SUBSTR; + if (scan->flags) + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); + data_fake.last_found=newSVsv(data->last_found); + } + } + else + data_fake.last_closep = &fake; + data_fake.flags = 0; + data_fake.pos_delta = delta; + if (is_inf) + data_fake.flags |= SF_IS_INF; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + ssc_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, + &deltanext, last, &data_fake, + stopparen, recursed_depth, NULL, + f,depth+1); + if (scan->flags) { + if (deltanext) { + FAIL("Variable length lookbehind not implemented"); + } + else if (*minnextp > (I32)U8_MAX) { + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); + } + scan->flags = (U8)*minnextp; + } + + *minnextp += min; + + if (f & SCF_DO_STCLASS_AND) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { + if (RExC_rx->minlen<*minnextp) + RExC_rx->minlen=*minnextp; + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); + SvREFCNT_dec_NN(data_fake.last_found); + + if ( data_fake.minlen_fixed != minlenp ) + { + data->offset_fixed= data_fake.offset_fixed; + data->minlen_fixed= data_fake.minlen_fixed; + data->lookbehind_fixed+= scan->flags; + } + if ( data_fake.minlen_float != minlenp ) + { + data->minlen_float= data_fake.minlen_float; + data->offset_float_min=data_fake.offset_float_min; + data->offset_float_max=data_fake.offset_float_max; + data->lookbehind_float+= scan->flags; + } + } + } + } +#endif + } + else if (OP(scan) == OPEN) { + if (stopparen != (I32)ARG(scan)) + pars++; + } + else if (OP(scan) == CLOSE) { + if (stopparen == (I32)ARG(scan)) { + break; + } + if ((I32)ARG(scan) == is_par) { + next = regnext(scan); + + if ( next && (OP(next) != WHILEM) && next < last) + is_par = 0; /* Disable optimization */ + } + if (data) + *(data->last_closep) = ARG(scan); + } + else if (OP(scan) == EVAL) { + if (data) + data->flags |= SF_HAS_EVAL; + } + else if ( PL_regkind[OP(scan)] == ENDLIKE ) { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + flags &= ~SCF_DO_SUBSTR; + } + if (data && OP(scan)==ACCEPT) { + data->flags |= SCF_SEEN_ACCEPT; + if (stopmin > min) + stopmin = min; + } + } + else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ + { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; + } + else if (OP(scan) == GPOS) { + if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && + !(delta || is_inf || (data && data->pos_delta))) + { + if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->intflags |= PREGf_ANCH_GPOS; + if (RExC_rx->gofs < (STRLEN)min) + RExC_rx->gofs = min; + } else { + RExC_rx->intflags |= PREGf_GPOS_FLOAT; + RExC_rx->gofs = 0; + } + } +#ifdef TRIE_STUDY_OPT +#ifdef FULL_TRIE_STUDY + else if (PL_regkind[OP(scan)] == TRIE) { + /* NOTE - There is similar code to this block above for handling + BRANCH nodes on the initial study. If you change stuff here + check there too. */ + regnode *trie_node= scan; + regnode *tail= regnext(scan); + reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; + SSize_t max1 = 0, min1 = SSize_t_MAX; + regnode_ssc accum; + + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); + + if (!trie->jump) { + min1= trie->minlen; + max1= trie->maxlen; + } else { + const regnode *nextbranch= NULL; + U32 word; + + for ( word=1 ; word <= trie->wordcount ; word++) + { + SSize_t deltanext=0, minnext=0, f = 0, fake; + regnode_ssc this_class; + + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + data_fake.pos_delta = delta; + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f = SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + if (trie->jump[word]) { + if (!nextbranch) + nextbranch = trie_node + trie->jump[0]; + scan= trie_node + trie->jump[word]; + /* We go from the jump point to the branch that follows + it. Note this means we need the vestigal unused + branches even though they arent otherwise used. */ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, (regnode *)nextbranch, &data_fake, + stopparen, recursed_depth, NULL, f,depth+1); + } + if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) + nextbranch= regnext((regnode*)nextbranch); + + if (min1 > (SSize_t)(minnext + trie->minlen)) + min1 = minnext + trie->minlen; + if (deltanext == SSize_t_MAX) { + is_inf = is_inf_internal = 1; + max1 = SSize_t_MAX; + } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) + max1 = minnext + deltanext + trie->maxlen; + + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > min + min1) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); + } + } + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->longest = &(data->longest_float); + } + min += min1; + delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + } + } + scan= tail; + continue; + } +#else + else if (PL_regkind[OP(scan)] == TRIE) { + reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; + U8*bang=NULL; + + min += trie->minlen; + delta += (trie->maxlen - trie->minlen); + flags &= ~SCF_DO_STCLASS; /* xxx */ + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min += trie->minlen; + data->pos_delta += (trie->maxlen - trie->minlen); + if (trie->maxlen != trie->minlen) + data->longest = &(data->longest_float); + } + if (trie->jump) /* no more substrings -- for now /grr*/ + flags &= ~SCF_DO_SUBSTR; + } +#endif /* old or new */ +#endif /* TRIE_STUDY_OPT */ + + /* Else: zero-length, ignore. */ + scan = regnext(scan); + } + /* If we are exiting a recursion we can unset its recursed bit + * and allow ourselves to enter it again - no danger of an + * infinite loop there. + if (stopparen > -1 && recursed) { + DEBUG_STUDYDATA("unset:", data,depth); + PAREN_UNSET( recursed, stopparen); + } + */ + if (frame) { + DEBUG_STUDYDATA("frame-end:",data,depth); + DEBUG_PEEP("fend", scan, depth); + /* restore previous context */ + last = frame->last; + scan = frame->next; + stopparen = frame->stop; + recursed_depth = frame->prev_recursed_depth; + depth = depth - 1; + + frame = frame->prev; + goto fake_study_recurse; + } + + finish: + assert(!frame); + DEBUG_STUDYDATA("pre-fin:",data,depth); + + *scanp = scan; + *deltap = is_inf_internal ? SSize_t_MAX : delta; + + if (flags & SCF_DO_SUBSTR && is_inf) + data->pos_delta = SSize_t_MAX - data->pos_min; + if (is_par > (I32)U8_MAX) + is_par = 0; + if (is_par && pars==1 && data) { + data->flags |= SF_IN_PAR; + data->flags &= ~SF_HAS_PAR; + } + else if (pars && data) { + data->flags |= SF_HAS_PAR; + data->flags &= ~SF_IN_PAR; + } + if (flags & SCF_DO_STCLASS_OR) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + if (flags & SCF_TRIE_RESTUDY) + data->flags |= SCF_TRIE_RESTUDY; + + DEBUG_STUDYDATA("post-fin:",data,depth); + + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { + RExC_maxlen = final_minlen + delta; + } + return final_minlen; + } + /* not-reached */ +} + +STATIC U32 +S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) +{ + U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; + + PERL_ARGS_ASSERT_ADD_DATA; + + Renewc(RExC_rxi->data, + sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), + char, struct reg_data); + if(count) + Renew(RExC_rxi->data->what, count + n, U8); + else + Newx(RExC_rxi->data->what, n, U8); + RExC_rxi->data->count = count + n; + Copy(s, RExC_rxi->data->what + count, n, U8); + return count; +} + +/*XXX: todo make this not included in a non debugging perl, but appears to be + * used anyway there, in 'use re' */ +#ifndef PERL_IN_XSUB_RE +void +Perl_reginitcolors(pTHX) +{ + const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); + if (s) { + char *t = savepv(s); + int i = 0; + PL_colors[0] = t; + while (++i < 6) { + t = strchr(t, '\t'); + if (t) { + *t = '\0'; + PL_colors[i] = ++t; + } + else + PL_colors[i] = t = (char *)""; + } + } else { + int i = 0; + while (i < 6) + PL_colors[i++] = (char *)""; + } + PL_colorset = 1; +} +#endif + + +#ifdef TRIE_STUDY_OPT +#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \ + STMT_START { \ + if ( \ + (data.flags & SCF_TRIE_RESTUDY) \ + && ! restudied++ \ + ) { \ + dOsomething; \ + goto reStudy; \ + } \ + } STMT_END +#else +#define CHECK_RESTUDY_GOTO_butfirst +#endif + +/* + * pregcomp - compile a regular expression into internal code + * + * Decides which engine's compiler to call based on the hint currently in + * scope + */ + +#ifndef PERL_IN_XSUB_RE + +/* return the currently in-scope regex engine (or the default if none) */ + +regexp_engine const * +Perl_current_re_engine(pTHX) +{ + if (IN_PERL_COMPILETIME) { + HV * const table = GvHV(PL_hintgv); + SV **ptr; + + if (!table || !(PL_hints & HINT_LOCALIZE_HH)) + return &reh_regexp_engine; + ptr = hv_fetchs(table, "regcomp", FALSE); + if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) + return &reh_regexp_engine; + return INT2PTR(regexp_engine*,SvIV(*ptr)); + } + else { + SV *ptr; + if (!PL_curcop->cop_hints_hash) + return &reh_regexp_engine; + ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); + if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) + return &reh_regexp_engine; + return INT2PTR(regexp_engine*,SvIV(ptr)); + } +} + + +REGEXP * +Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) +{ + regexp_engine const *eng = current_re_engine(); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_PREGCOMP; + + /* Dispatch a request to compile a regexp to correct regexp engine. */ + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", + PTR2UV(eng)); + }); + return CALLREGCOMP_ENG(eng, pattern, flags); +} +#endif + +/* public(ish) entry point for the perl core's own regex compiling code. + * It's actually a wrapper for Perl_re_op_compile that only takes an SV + * pattern rather than a list of OPs, and uses the internal engine rather + * than the current one */ + +REGEXP * +Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) +{ + SV *pat = pattern; /* defeat constness! */ + PERL_ARGS_ASSERT_RE_COMPILE; + return Perl_re_op_compile(aTHX_ &pat, 1, NULL, +#ifdef PERL_IN_XSUB_RE + &my_reg_engine, +#else + &reh_regexp_engine, +#endif + NULL, NULL, rx_flags, 0); +} + + +/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code + * blocks, recalculate the indices. Update pat_p and plen_p in-place to + * point to the realloced string and length. + * + * This is essentially a copy of Perl_bytes_to_utf8() with the code index + * stuff added */ + +static void +S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, + char **pat_p, STRLEN *plen_p, int num_code_blocks) +{ + U8 *const src = (U8*)*pat_p; + U8 *dst; + int n=0; + STRLEN s = 0, d = 0; + bool do_end = 0; + GET_RE_DEBUG_FLAGS_DECL; + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); + + Newx(dst, *plen_p * 2 + 1, U8); + + while (s < *plen_p) { + if (NATIVE_BYTE_IS_INVARIANT(src[s])) + dst[d] = src[s]; + else { + dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); + dst[d] = UTF8_EIGHT_BIT_LO(src[s]); + } + if (n < num_code_blocks) { + if (!do_end && pRExC_state->code_blocks[n].start == s) { + pRExC_state->code_blocks[n].start = d; + assert(dst[d] == '('); + do_end = 1; + } + else if (do_end && pRExC_state->code_blocks[n].end == s) { + pRExC_state->code_blocks[n].end = d; + assert(dst[d] == ')'); + do_end = 0; + n++; + } + } + s++; + d++; + } + dst[d] = '\0'; + *plen_p = d; + *pat_p = (char*) dst; + SAVEFREEPV(*pat_p); + RExC_orig_utf8 = RExC_utf8 = 1; +} + + + +/* S_concat_pat(): concatenate a list of args to the pattern string pat, + * while recording any code block indices, and handling overloading, + * nested qr// objects etc. If pat is null, it will allocate a new + * string, or just return the first arg, if there's only one. + * + * Returns the malloced/updated pat. + * patternp and pat_count is the array of SVs to be concatted; + * oplist is the optional list of ops that generated the SVs; + * recompile_p is a pointer to a boolean that will be set if + * the regex will need to be recompiled. + * delim, if non-null is an SV that will be inserted between each element + */ + +static SV* +S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, + SV *pat, SV ** const patternp, int pat_count, + OP *oplist, bool *recompile_p, SV *delim) +{ + SV **svp; + int n = 0; + bool use_delim = FALSE; + bool alloced = FALSE; + + /* if we know we have at least two args, create an empty string, + * then concatenate args to that. For no args, return an empty string */ + if (!pat && pat_count != 1) { + pat = newSVpvs(""); + SAVEFREESV(pat); + alloced = TRUE; + } + + for (svp = patternp; svp < patternp + pat_count; svp++) { + SV *sv; + SV *rx = NULL; + STRLEN orig_patlen = 0; + bool code = 0; + SV *msv = use_delim ? delim : *svp; + if (!msv) msv = &PL_sv_undef; + + /* if we've got a delimiter, we go round the loop twice for each + * svp slot (except the last), using the delimiter the second + * time round */ + if (use_delim) { + svp--; + use_delim = FALSE; + } + else if (delim) + use_delim = TRUE; + + if (SvTYPE(msv) == SVt_PVAV) { + /* we've encountered an interpolated array within + * the pattern, e.g. /...@a..../. Expand the list of elements, + * then recursively append elements. + * The code in this block is based on S_pushav() */ + + AV *const av = (AV*)msv; + const SSize_t maxarg = AvFILL(av) + 1; + SV **array; + + if (oplist) { + assert(oplist->op_type == OP_PADAV + || oplist->op_type == OP_RV2AV); + oplist = OP_SIBLING(oplist); + } + + if (SvRMAGICAL(av)) { + SSize_t i; + + Newx(array, maxarg, SV*); + SAVEFREEPV(array); + for (i=0; i < maxarg; i++) { + SV ** const svp = av_fetch(av, i, FALSE); + array[i] = svp ? *svp : &PL_sv_undef; + } + } + else + array = AvARRAY(av); + + pat = S_concat_pat(aTHX_ pRExC_state, pat, + array, maxarg, NULL, recompile_p, + /* $" */ + GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV)))); + + continue; + } + + + /* we make the assumption here that each op in the list of + * op_siblings maps to one SV pushed onto the stack, + * except for code blocks, with have both an OP_NULL and + * and OP_CONST. + * This allows us to match up the list of SVs against the + * list of OPs to find the next code block. + * + * Note that PUSHMARK PADSV PADSV .. + * is optimised to + * PADRANGE PADSV PADSV .. + * so the alignment still works. */ + + if (oplist) { + if (oplist->op_type == OP_NULL + && (oplist->op_flags & OPf_SPECIAL)) + { + assert(n < pRExC_state->num_code_blocks); + pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0; + pRExC_state->code_blocks[n].block = oplist; + pRExC_state->code_blocks[n].src_regex = NULL; + n++; + code = 1; + oplist = OP_SIBLING(oplist); /* skip CONST */ + assert(oplist); + } + oplist = OP_SIBLING(oplist);; + } + + /* apply magic and QR overloading to arg */ + + SvGETMAGIC(msv); + if (SvROK(msv) && SvAMAGIC(msv)) { + SV *sv = AMG_CALLunary(msv, regexp_amg); + if (sv) { + if (SvROK(sv)) + sv = SvRV(sv); + if (SvTYPE(sv) != SVt_REGEXP) + Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP"); + msv = sv; + } + } + + /* try concatenation overload ... */ + if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) && + (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) + { + sv_setsv(pat, sv); + /* overloading involved: all bets are off over literal + * code. Pretend we haven't seen it */ + pRExC_state->num_code_blocks -= n; + n = 0; + } + else { + /* ... or failing that, try "" overload */ + while (SvAMAGIC(msv) + && (sv = AMG_CALLunary(msv, string_amg)) + && sv != msv + && !( SvROK(msv) + && SvROK(sv) + && SvRV(msv) == SvRV(sv)) + ) { + msv = sv; + SvGETMAGIC(msv); + } + if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) + msv = SvRV(msv); + + if (pat) { + /* this is a partially unrolled + * sv_catsv_nomg(pat, msv); + * that allows us to adjust code block indices if + * needed */ + STRLEN dlen; + char *dst = SvPV_force_nomg(pat, dlen); + orig_patlen = dlen; + if (SvUTF8(msv) && !SvUTF8(pat)) { + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); + sv_setpvn(pat, dst, dlen); + SvUTF8_on(pat); + } + sv_catsv_nomg(pat, msv); + rx = msv; + } + else + pat = msv; + + if (code) + pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; + } + + /* extract any code blocks within any embedded qr//'s */ + if (rx && SvTYPE(rx) == SVt_REGEXP + && RX_ENGINE((REGEXP*)rx)->op_comp) + { + + RXi_GET_DECL(ReANY((REGEXP *)rx), ri); + if (ri->num_code_blocks) { + int i; + /* the presence of an embedded qr// with code means + * we should always recompile: the text of the + * qr// may not have changed, but it may be a + * different closure than last time */ + *recompile_p = 1; + Renew(pRExC_state->code_blocks, + pRExC_state->num_code_blocks + ri->num_code_blocks, + struct reg_code_block); + pRExC_state->num_code_blocks += ri->num_code_blocks; + + for (i=0; i < ri->num_code_blocks; i++) { + struct reg_code_block *src, *dst; + STRLEN offset = orig_patlen + + ReANY((REGEXP *)rx)->pre_prefix; + assert(n < pRExC_state->num_code_blocks); + src = &ri->code_blocks[i]; + dst = &pRExC_state->code_blocks[n]; + dst->start = src->start + offset; + dst->end = src->end + offset; + dst->block = src->block; + dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) + src->src_regex + ? src->src_regex + : (REGEXP*)rx); + n++; + } + } + } + } + /* avoid calling magic multiple times on a single element e.g. =~ $qr */ + if (alloced) + SvSETMAGIC(pat); + + return pat; +} + + + +/* see if there are any run-time code blocks in the pattern. + * False positives are allowed */ + +static bool +S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + int n = 0; + STRLEN s; + + PERL_UNUSED_CONTEXT; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + s = pRExC_state->code_blocks[n].end; + n++; + continue; + } + /* TODO ideally should handle [..], (#..), /#.../x to reduce false + * positives here */ + if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' && + (pat[s+2] == '{' + || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{')) + ) + return 1; + } + return 0; +} + +/* Handle run-time code blocks. We will already have compiled any direct + * or indirect literal code blocks. Now, take the pattern 'pat' and make a + * copy of it, but with any literal code blocks blanked out and + * appropriate chars escaped; then feed it into + * + * eval "qr'modified_pattern'" + * + * For example, + * + * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno + * + * becomes + * + * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno' + * + * After eval_sv()-ing that, grab any new code blocks from the returned qr + * and merge them with any code blocks of the original regexp. + * + * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; + * instead, just save the qr and return FALSE; this tells our caller that + * the original pattern needs upgrading to utf8. + */ + +static bool +S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + SV *qr; + + GET_RE_DEBUG_FLAGS_DECL; + + if (pRExC_state->runtime_code_qr) { + /* this is the second time we've been called; this should + * only happen if the main pattern got upgraded to utf8 + * during compilation; re-use the qr we compiled first time + * round (which should be utf8 too) + */ + qr = pRExC_state->runtime_code_qr; + pRExC_state->runtime_code_qr = NULL; + assert(RExC_utf8 && SvUTF8(qr)); + } + else { + int n = 0; + STRLEN s; + char *p, *newpat; + int newlen = plen + 6; /* allow for "qr''x\0" extra chars */ + SV *sv, *qr_ref; + dSP; + + /* determine how many extra chars we need for ' and \ escaping */ + for (s = 0; s < plen; s++) { + if (pat[s] == '\'' || pat[s] == '\\') + newlen++; + } + + Newx(newpat, newlen, char); + p = newpat; + *p++ = 'q'; *p++ = 'r'; *p++ = '\''; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + /* blank out literal code block */ + assert(pat[s] == '('); + while (s <= pRExC_state->code_blocks[n].end) { + *p++ = '_'; + s++; + } + s--; + n++; + continue; + } + if (pat[s] == '\'' || pat[s] == '\\') + *p++ = '\\'; + *p++ = pat[s]; + } + *p++ = '\''; + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) + *p++ = 'x'; + *p++ = '\0'; + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%sre-parsing pattern for runtime code:%s %s\n", + PL_colors[4],PL_colors[5],newpat); + }); + + sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); + Safefree(newpat); + + ENTER; + SAVETMPS; + save_re_context(); + PUSHSTACKi(PERLSI_REQUIRE); + /* G_RE_REPARSING causes the toker to collapse \\ into \ when + * parsing qr''; normally only q'' does this. It also alters + * hints handling */ + eval_sv(sv, G_SCALAR|G_RE_REPARSING); + SvREFCNT_dec_NN(sv); + SPAGAIN; + qr_ref = POPs; + PUTBACK; + { + SV * const errsv = ERRSV; + if (SvTRUE_NN(errsv)) + { + Safefree(pRExC_state->code_blocks); + /* use croak_sv ? */ + Perl_croak_nocontext("%"SVf, SVfARG(errsv)); + } + } + assert(SvROK(qr_ref)); + qr = SvRV(qr_ref); + assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); + /* the leaving below frees the tmp qr_ref. + * Give qr a life of its own */ + SvREFCNT_inc(qr); + POPSTACK; + FREETMPS; + LEAVE; + + } + + if (!RExC_utf8 && SvUTF8(qr)) { + /* first time through; the pattern got upgraded; save the + * qr for the next time through */ + assert(!pRExC_state->runtime_code_qr); + pRExC_state->runtime_code_qr = qr; + return 0; + } + + + /* extract any code blocks within the returned qr// */ + + + /* merge the main (r1) and run-time (r2) code blocks into one */ + { + RXi_GET_DECL(ReANY((REGEXP *)qr), r2); + struct reg_code_block *new_block, *dst; + RExC_state_t * const r1 = pRExC_state; /* convenient alias */ + int i1 = 0, i2 = 0; + + if (!r2->num_code_blocks) /* we guessed wrong */ + { + SvREFCNT_dec_NN(qr); + return 1; + } + + Newx(new_block, + r1->num_code_blocks + r2->num_code_blocks, + struct reg_code_block); + dst = new_block; + + while ( i1 < r1->num_code_blocks + || i2 < r2->num_code_blocks) + { + struct reg_code_block *src; + bool is_qr = 0; + + if (i1 == r1->num_code_blocks) { + src = &r2->code_blocks[i2++]; + is_qr = 1; + } + else if (i2 == r2->num_code_blocks) + src = &r1->code_blocks[i1++]; + else if ( r1->code_blocks[i1].start + < r2->code_blocks[i2].start) + { + src = &r1->code_blocks[i1++]; + assert(src->end < r2->code_blocks[i2].start); + } + else { + assert( r1->code_blocks[i1].start + > r2->code_blocks[i2].start); + src = &r2->code_blocks[i2++]; + is_qr = 1; + assert(src->end < r1->code_blocks[i1].start); + } + + assert(pat[src->start] == '('); + assert(pat[src->end] == ')'); + dst->start = src->start; + dst->end = src->end; + dst->block = src->block; + dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) + : src->src_regex; + dst++; + } + r1->num_code_blocks += r2->num_code_blocks; + Safefree(r1->code_blocks); + r1->code_blocks = new_block; + } + + SvREFCNT_dec_NN(qr); + return 1; +} + + +STATIC bool +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, + SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, + SSize_t lookbehind, SSize_t offset, SSize_t *minlen, + STRLEN longest_length, bool eol, bool meol) +{ + /* This is the common code for setting up the floating and fixed length + * string data extracted from Perl_re_op_compile() below. Returns a boolean + * as to whether succeeded or not */ + + I32 t; + SSize_t ml; + + if (! (longest_length + || (eol /* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) + ) + /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ + || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) + { + return FALSE; + } + + /* copy the information about the longest from the reg_scan_data + over to the program. */ + if (SvUTF8(sv_longest)) { + *rx_utf8 = sv_longest; + *rx_substr = NULL; + } else { + *rx_substr = sv_longest; + *rx_utf8 = NULL; + } + /* end_shift is how many chars that must be matched that + follow this item. We calculate it ahead of time as once the + lookbehind offset is added in we lose the ability to correctly + calculate it.*/ + ml = minlen ? *(minlen) : (SSize_t)longest_length; + *rx_end_shift = ml - offset + - longest_length + (SvTAIL(sv_longest) != 0) + + lookbehind; + + t = (eol/* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))); + fbm_compile(sv_longest, t ? FBMcf_TAIL : 0); + + return TRUE; +} + +/* + * Perl_re_op_compile - the perl internal RE engine's function to compile a + * regular expression into internal code. + * The pattern may be passed either as: + * a list of SVs (patternp plus pat_count) + * a list of OPs (expr) + * If both are passed, the SV list is used, but the OP list indicates + * which SVs are actually pre-compiled code blocks + * + * The SVs in the list have magic and qr overloading applied to them (and + * the list may be modified in-place with replacement SVs in the latter + * case). + * + * If the pattern hasn't changed from old_re, then old_re will be + * returned. + * + * eng is the current engine. If that engine has an op_comp method, then + * handle directly (i.e. we assume that op_comp was us); otherwise, just + * do the initial concatenation of arguments and pass on to the external + * engine. + * + * If is_bare_re is not null, set it to a boolean indicating whether the + * arg list reduced (after overloading) to a single bare regex which has + * been returned (i.e. /$qr/). + * + * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. + * + * pm_flags contains the PMf_* flags, typically based on those from the + * pm_flags field of the related PMOP. Currently we're only interested in + * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL. + * + * We can't allocate space until we know how big the compiled form will be, + * but we can't compile it (and thus know how big it is) until we've got a + * place to put the code. So we cheat: we compile it twice, once with code + * generation turned off and size counting turned on, and once "for real". + * This also means that we don't allocate space until we are sure that the + * thing really will compile successfully, and we never have to move the + * code and thus invalidate pointers into it. (Note that it has to be in + * one piece because free() must be able to free it all.) [NB: not true in perl] + * + * Beware that the optimization-preparation code in here knows about some + * of the structure of the compiled regexp. [I'll say.] + */ + +REGEXP * +Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, + OP *expr, const regexp_engine* eng, REGEXP *old_re, + bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags) +{ + REGEXP *rx; + struct regexp *r; + regexp_internal *ri; + STRLEN plen; + char *exp; + regnode *scan; + I32 flags; + SSize_t minlen = 0; + U32 rx_flags; + SV *pat; + SV *code_blocksv = NULL; + SV** new_patternp = patternp; + + /* these are all flags - maybe they should be turned + * into a single int with different bit masks */ + I32 sawlookahead = 0; + I32 sawplus = 0; + I32 sawopen = 0; + I32 sawminmod = 0; + + regex_charset initial_charset = get_regex_charset(orig_rx_flags); + bool recompile = 0; + bool runtime_code = 0; + scan_data_t data; + RExC_state_t RExC_state; + RExC_state_t * const pRExC_state = &RExC_state; +#ifdef TRIE_STUDY_OPT + int restudied = 0; + RExC_state_t copyRExC_state; +#endif + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_OP_COMPILE; + + DEBUG_r(if (!PL_colorset) reginitcolors()); + +#ifndef PERL_IN_XSUB_RE + /* Initialize these here instead of as-needed, as is quick and avoids + * having to test them each time otherwise */ + if (! PL_AboveLatin1) { + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); + PL_Latin1 = _new_invlist_C_array(Latin1_invlist); + PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); + PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); + PL_HasMultiCharFold = + _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); + } +#endif + + pRExC_state->code_blocks = NULL; + pRExC_state->num_code_blocks = 0; + + if (is_bare_re) + *is_bare_re = FALSE; + + if (expr && (expr->op_type == OP_LIST || + (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { + /* allocate code_blocks if needed */ + OP *o; + int ncode = 0; + + for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) + if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) + ncode++; /* count of DO blocks */ + if (ncode) { + pRExC_state->num_code_blocks = ncode; + Newx(pRExC_state->code_blocks, ncode, struct reg_code_block); + } + } + + if (!pat_count) { + /* compile-time pattern with just OP_CONSTs and DO blocks */ + + int n; + OP *o; + + /* find how many CONSTs there are */ + assert(expr); + n = 0; + if (expr->op_type == OP_CONST) + n = 1; + else + for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) { + if (o->op_type == OP_CONST) + n++; + } + + /* fake up an SV array */ + + assert(!new_patternp); + Newx(new_patternp, n, SV*); + SAVEFREEPV(new_patternp); + pat_count = n; + + n = 0; + if (expr->op_type == OP_CONST) + new_patternp[n] = cSVOPx_sv(expr); + else + for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) { + if (o->op_type == OP_CONST) + new_patternp[n++] = cSVOPo_sv; + } + + } + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "Assembling pattern from %d elements%s\n", pat_count, + orig_rx_flags & RXf_SPLIT ? " for split" : "")); + + /* set expr to the first arg op */ + + if (pRExC_state->num_code_blocks + && expr->op_type != OP_CONST) + { + expr = cLISTOPx(expr)->op_first; + assert( expr->op_type == OP_PUSHMARK + || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK) + || expr->op_type == OP_PADRANGE); + expr = OP_SIBLING(expr); + } + + pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count, + expr, &recompile, NULL); + + /* handle bare (possibly after overloading) regex: foo =~ $re */ + { + SV *re = pat; + if (SvROK(re)) + re = SvRV(re); + if (SvTYPE(re) == SVt_REGEXP) { + if (is_bare_re) + *is_bare_re = TRUE; + SvREFCNT_inc(re); + Safefree(pRExC_state->code_blocks); + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "Precompiled pattern%s\n", + orig_rx_flags & RXf_SPLIT ? " for split" : "")); + + return (REGEXP*)re; + } + } + + exp = SvPV_nomg(pat, plen); + + if (!eng->op_comp) { + if ((SvUTF8(pat) && IN_BYTES) + || SvGMAGICAL(pat) || SvAMAGIC(pat)) + { + /* make a temporary copy; either to convert to bytes, + * or to avoid repeating get-magic / overloaded stringify */ + pat = newSVpvn_flags(exp, plen, SVs_TEMP | + (IN_BYTES ? 0 : SvUTF8(pat))); + } + Safefree(pRExC_state->code_blocks); + return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); + } + + /* ignore the utf8ness if the pattern is 0 length */ + RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); + RExC_uni_semantics = 0; + RExC_contains_locale = 0; + RExC_contains_i = 0; + pRExC_state->runtime_code_qr = NULL; + + DEBUG_COMPILE_r({ + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", + PL_colors[4],PL_colors[5],s); + }); + + redo_first_pass: + /* we jump here if we upgrade the pattern to utf8 and have to + * recompile */ + + if ((pm_flags & PMf_USE_RE_EVAL) + /* this second condition covers the non-regex literal case, + * i.e. $foo =~ '(?{})'. */ + || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL)) + ) + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); + + /* return old regex if pattern hasn't changed */ + /* XXX: note in the below we have to check the flags as well as the + * pattern. + * + * Things get a touch tricky as we have to compare the utf8 flag + * independently from the compile flags. */ + + if ( old_re + && !recompile + && !!RX_UTF8(old_re) == !!RExC_utf8 + && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) ) + && RX_PRECOMP(old_re) + && RX_PRELEN(old_re) == plen + && memEQ(RX_PRECOMP(old_re), exp, plen) + && !runtime_code /* with runtime code, always recompile */ ) + { + Safefree(pRExC_state->code_blocks); + return old_re; + } + + rx_flags = orig_rx_flags; + + if (rx_flags & PMf_FOLD) { + RExC_contains_i = 1; + } + if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + + /* Set to use unicode semantics if the pattern is in utf8 and has the + * 'depends' charset specified, as it means unicode when utf8 */ + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); + } + + RExC_precomp = exp; + RExC_flags = rx_flags; + RExC_pm_flags = pm_flags; + + if (runtime_code) { + if (TAINTING_get && TAINT_get) + Perl_croak(aTHX_ "Eval-group in insecure regular expression"); + + if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { + /* whoops, we have a non-utf8 pattern, whilst run-time code + * got compiled as utf8. Try again with a utf8 pattern */ + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, + pRExC_state->num_code_blocks); + goto redo_first_pass; + } + } + assert(!pRExC_state->runtime_code_qr); + + RExC_sawback = 0; + + RExC_seen = 0; + RExC_maxlen = 0; + RExC_in_lookbehind = 0; + RExC_seen_zerolen = *exp == '^' ? -1 : 0; + RExC_extralen = 0; + RExC_override_recoding = 0; + RExC_in_multi_char_class = 0; + + /* First pass: determine size, legality. */ + RExC_parse = exp; + RExC_start = exp; + RExC_end = exp + plen; + RExC_naughty = 0; + RExC_npar = 1; + RExC_nestroot = 0; + RExC_size = 0L; + RExC_emit = (regnode *) &RExC_emit_dummy; + RExC_whilem_seen = 0; + RExC_open_parens = NULL; + RExC_close_parens = NULL; + RExC_opend = NULL; + RExC_paren_names = NULL; +#ifdef DEBUGGING + RExC_paren_name_list = NULL; +#endif + RExC_recurse = NULL; + RExC_study_chunk_recursed = NULL; + RExC_study_chunk_recursed_bytes= 0; + RExC_recurse_count = 0; + pRExC_state->code_index = 0; + +#if 0 /* REGC() is (currently) a NOP at the first pass. + * Clever compilers notice this and complain. --jhi */ + REGC((U8)REG_MAGIC, (char*)RExC_emit); +#endif + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); + RExC_lastnum=0; + RExC_lastparse=NULL; + ); + /* reg may croak on us, not giving us a chance to free + pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may + need it to survive as long as the regexp (qr/(?{})/). + We must check that code_blocksv is not already set, because we may + have jumped back to restart the sizing pass. */ + if (pRExC_state->code_blocks && !code_blocksv) { + code_blocksv = newSV_type(SVt_PV); + SAVEFREESV(code_blocksv); + SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks); + SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/ + } + if (reg(pRExC_state, 0, &flags,1) == NULL) { + /* It's possible to write a regexp in ascii that represents Unicode + codepoints outside of the byte range, such as via \x{100}. If we + detect such a sequence we have to convert the entire pattern to utf8 + and then recompile, as our sizing calculation will have been based + on 1 byte == 1 character, but we will need to use utf8 to encode + at least some part of the pattern, and therefore must convert the whole + thing. + -- dmq */ + if (flags & RESTART_UTF8) { + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, + pRExC_state->num_code_blocks); + goto redo_first_pass; + } + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); + } + if (code_blocksv) + SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ + + DEBUG_PARSE_r({ + PerlIO_printf(Perl_debug_log, + "Required size %"IVdf" nodes\n" + "Starting second pass (creation)\n", + (IV)RExC_size); + RExC_lastnum=0; + RExC_lastparse=NULL; + }); + + /* The first pass could have found things that force Unicode semantics */ + if ((RExC_utf8 || RExC_uni_semantics) + && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET) + { + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); + } + + /* Small enough for pointer-storage convention? + If extralen==0, this means that we will not need long jumps. */ + if (RExC_size >= 0x10000L && RExC_extralen) + RExC_size += RExC_extralen; + else + RExC_extralen = 0; + if (RExC_whilem_seen > 15) + RExC_whilem_seen = 15; + + /* Allocate space and zero-initialize. Note, the two step process + of zeroing when in debug mode, thus anything assigned has to + happen after that */ + rx = (REGEXP*) newSV_type(SVt_REGEXP); + r = ReANY(rx); + Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char, regexp_internal); + if ( r == NULL || ri == NULL ) + FAIL("Regexp out of space"); +#ifdef DEBUGGING + /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ + Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char); +#else + /* bulk initialize base fields with 0. */ + Zero(ri, sizeof(regexp_internal), char); +#endif + + /* non-zero initialization begins here */ + RXi_SET( r, ri ); + r->engine= eng; + r->extflags = rx_flags; + RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK; + + if (pm_flags & PMf_IS_QR) { + ri->code_blocks = pRExC_state->code_blocks; + ri->num_code_blocks = pRExC_state->num_code_blocks; + } + else + { + int n; + for (n = 0; n < pRExC_state->num_code_blocks; n++) + if (pRExC_state->code_blocks[n].src_regex) + SAVEFREESV(pRExC_state->code_blocks[n].src_regex); + SAVEFREEPV(pRExC_state->code_blocks); + } + + { + bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_charset = (get_regex_charset(r->extflags) + != REGEX_DEPENDS_CHARSET); + + /* The caret is output if there are any defaults: if not all the STD + * flags are set, or if no character set specifier is needed */ + bool has_default = + (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) + || ! has_charset); + bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) + == REG_RUN_ON_COMMENT_SEEN); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) + >> RXf_PMf_STD_PMMOD_SHIFT); + const char *fptr = STD_PAT_MODS; /*"msix"*/ + char *p; + /* Allocate for the worst case, which is all the std flags are turned + * on. If more precision is desired, we could do a population count of + * the flags set. This could be done with a small lookup table, or by + * shifting, masking and adding, or even, when available, assembly + * language for a machine-language population count. + * We never output a minus, as all those are defaults, so are + * covered by the caret */ + const STRLEN wraplen = plen + has_p + has_runon + + has_default /* If needs a caret */ + + /* If needs a character set specifier */ + + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) + + (sizeof(STD_PAT_MODS) - 1) + + (sizeof("(?:)") - 1); + + Newx(p, wraplen + 1, char); /* +1 for the ending NUL */ + r->xpv_len_u.xpvlenu_pv = p; + if (RExC_utf8) + SvFLAGS(rx) |= SVf_UTF8; + *p++='('; *p++='?'; + + /* If a default, cover it using the caret */ + if (has_default) { + *p++= DEFAULT_PAT_MOD; + } + if (has_charset) { + STRLEN len; + const char* const name = get_regex_charset_name(r->extflags, &len); + Copy(name, p, len, char); + p += len; + } + if (has_p) + *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ + { + char ch; + while((ch = *fptr++)) { + if(reganch & 1) + *p++ = ch; + reganch >>= 1; + } + } + + *p++ = ':'; + Copy(RExC_precomp, p, plen, char); + assert ((RX_WRAPPED(rx) - p) < 16); + r->pre_prefix = p - RX_WRAPPED(rx); + p += plen; + if (has_runon) + *p++ = '\n'; + *p++ = ')'; + *p = 0; + SvCUR_set(rx, p - RX_WRAPPED(rx)); + } + + r->intflags = 0; + r->nparens = RExC_npar - 1; /* set early to validate backrefs */ + + /* setup various meta data about recursion, this all requires + * RExC_npar to be correctly set, and a bit later on we clear it */ + if (RExC_seen & REG_RECURSE_SEEN) { + Newxz(RExC_open_parens, RExC_npar,regnode *); + SAVEFREEPV(RExC_open_parens); + Newxz(RExC_close_parens,RExC_npar,regnode *); + SAVEFREEPV(RExC_close_parens); + } + if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { + /* Note, RExC_npar is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + + ((RExC_npar & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } + + /* Useful during FAIL. */ +#ifdef RE_TRACK_PATTERN_OFFSETS + Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ + DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, + "%s %"UVuf" bytes for offset annotations.\n", + ri->u.offsets ? "Got" : "Couldn't get", + (UV)((2*RExC_size+1) * sizeof(U32)))); +#endif + SetProgLen(ri,RExC_size); + RExC_rx_sv = rx; + RExC_rx = r; + RExC_rxi = ri; + REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx); + + /* Second pass: emit code. */ + RExC_flags = rx_flags; /* don't let top level (?i) bleed */ + RExC_pm_flags = pm_flags; + RExC_parse = exp; + RExC_end = exp + plen; + RExC_naughty = 0; + RExC_npar = 1; + RExC_emit_start = ri->program; + RExC_emit = ri->program; + RExC_emit_bound = ri->program + RExC_size + 1; + pRExC_state->code_index = 0; + + REGC((U8)REG_MAGIC, (char*) RExC_emit++); + if (reg(pRExC_state, 0, &flags,1) == NULL) { + ReREFCNT_dec(rx); + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); + } + /* XXXX To minimize changes to RE engine we always allocate + 3-units-long substrs field. */ + Newx(r->substrs, 1, struct reg_substr_data); + if (RExC_recurse_count) { + Newxz(RExC_recurse,RExC_recurse_count,regnode *); + SAVEFREEPV(RExC_recurse); + } + +reStudy: + r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; + Zero(r->substrs, 1, struct reg_substr_data); + if (RExC_study_chunk_recursed) + Zero(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + +#ifdef TRIE_STUDY_OPT + if (!restudied) { + StructCopy(&zero_scan_data, &data, scan_data_t); + copyRExC_state = RExC_state; + } else { + U32 seen=RExC_seen; + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); + + RExC_state = copyRExC_state; + if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; + else + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; + StructCopy(&zero_scan_data, &data, scan_data_t); + } +#else + StructCopy(&zero_scan_data, &data, scan_data_t); +#endif + + /* Dig out information for optimizations. */ + r->extflags = RExC_flags; /* was pm_op */ + /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ + + if (UTF) + SvUTF8_on(rx); /* Unicode in it? */ + ri->regstclass = NULL; + if (RExC_naughty >= 10) /* Probably an expensive pattern. */ + r->intflags |= PREGf_NAUGHTY; + scan = ri->program + 1; /* First BRANCH. */ + + /* testing for BRANCH here tells us whether there is "must appear" + data in the pattern. If there is then we can use it for optimisations */ + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. + */ + SSize_t fake; + STRLEN longest_float_length, longest_fixed_length; + regnode_ssc ch_class; /* pointed to by data */ + int stclass_flag; + SSize_t last_close = 0; /* pointed to by data */ + regnode *first= scan; + regnode *first_next= regnext(first); + /* + * Skip introductions and multiplicators >= 1 + * so that we can extract the 'meat' of the pattern that must + * match in the large if() sequence following. + * NOTE that EXACT is NOT covered here, as it is normally + * picked up by the optimiser separately. + * + * This is unfortunate as the optimiser isnt handling lookahead + * properly currently. + * + */ + while ((OP(first) == OPEN && (sawopen = 1)) || + /* An OR of *one* alternative - should not happen now. */ + (OP(first) == BRANCH && OP(first_next) != BRANCH) || + /* for now we can't handle lookbehind IFMATCH*/ + (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) || + (OP(first) == PLUS) || + (OP(first) == MINMOD) || + /* An {n,m} with n>0 */ + (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || + (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) + { + /* + * the only op that could be a regnode is PLUS, all the rest + * will be regnode_1 or regnode_2. + * + * (yves doesn't think this is true) + */ + if (OP(first) == PLUS) + sawplus = 1; + else { + if (OP(first) == MINMOD) + sawminmod = 1; + first += regarglen[OP(first)]; + } + first = NEXTOPER(first); + first_next= regnext(first); + } + + /* Starting-point info. */ + again: + DEBUG_PEEP("first:",first,0); + /* Ignore EXACT as we deal with it later. */ + if (PL_regkind[OP(first)] == EXACT) { + if (OP(first) == EXACT) + NOOP; /* Empty, get anchored substr later. */ + else + ri->regstclass = first; + } +#ifdef TRIE_STCLASS + else if (PL_regkind[OP(first)] == TRIE && + ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) + { + /* this can happen only on restudy */ + ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); + } +#endif + else if (REGNODE_SIMPLE(OP(first))) + ri->regstclass = first; + else if (PL_regkind[OP(first)] == BOUND || + PL_regkind[OP(first)] == NBOUND) + ri->regstclass = first; + else if (PL_regkind[OP(first)] == BOL) { + r->intflags |= (OP(first) == MBOL + ? PREGf_ANCH_MBOL + : (OP(first) == SBOL + ? PREGf_ANCH_SBOL + : PREGf_ANCH_BOL)); + first = NEXTOPER(first); + goto again; + } + else if (OP(first) == GPOS) { + r->intflags |= PREGf_ANCH_GPOS; + first = NEXTOPER(first); + goto again; + } + else if ((!sawopen || !RExC_sawback) && + !sawlookahead && + (OP(first) == STAR && + PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && + !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) + { + /* turn .* into ^.* with an implied $*=1 */ + const int type = + (OP(NEXTOPER(first)) == REG_ANY) + ? PREGf_ANCH_MBOL + : PREGf_ANCH_SBOL; + r->intflags |= (type | PREGf_IMPLICIT); + first = NEXTOPER(first); + goto again; + } + if (sawplus && !sawminmod && !sawlookahead + && (!sawopen || !RExC_sawback) + && !pRExC_state->num_code_blocks) /* May examine pos and $& */ + /* x+ must match at the 1st pos of run of x's */ + r->intflags |= PREGf_SKIP; + + /* Scan is after the zeroth branch, first is atomic matcher. */ +#ifdef TRIE_STUDY_OPT + DEBUG_PARSE_r( + if (!restudied) + PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + (IV)(first - scan + 1)) + ); +#else + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + (IV)(first - scan + 1)) + ); +#endif + + + /* + * If there's something expensive in the r.e., find the + * longest literal string that must appear and make it the + * regmust. Resolve ties in favor of later strings, since + * the regstart check works with the beginning of the r.e. + * and avoiding duplication strengthens checking. Not a + * strong reason, but sufficient in the absence of others. + * [Now we resolve ties in favor of the earlier string if + * it happens that c_offset_min has been invalidated, since the + * earlier string may buy us something the later one won't.] + */ + + data.longest_fixed = newSVpvs(""); + data.longest_float = newSVpvs(""); + data.last_found = newSVpvs(""); + data.longest = &(data.longest_fixed); + ENTER_with_name("study_chunk"); + SAVEFREESV(data.longest_fixed); + SAVEFREESV(data.longest_float); + SAVEFREESV(data.last_found); + first = scan; + if (!ri->regstclass) { + ssc_init(pRExC_state, &ch_class); + data.start_class = &ch_class; + stclass_flag = SCF_DO_STCLASS_AND; + } else /* XXXX Check for BOUND? */ + stclass_flag = 0; + data.last_closep = &last_close; + + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, + scan + RExC_size, /* Up to end */ + &data, -1, 0, NULL, + SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag + | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), + 0); + + + CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); + + + if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) + && data.last_start_min == 0 && data.last_end > 0 + && !RExC_seen_zerolen + && !(RExC_seen & REG_VERBARG_SEEN) + && !(RExC_seen & REG_GPOS_SEEN) + ){ + r->extflags |= RXf_CHECK_ALL; + } + scan_commit(pRExC_state, &data,&minlen,0); + + longest_float_length = CHR_SVLEN(data.longest_float); + + if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) + && S_setup_longest (aTHX_ pRExC_state, + data.longest_float, + &(r->float_utf8), + &(r->float_substr), + &(r->float_end_shift), + data.lookbehind_float, + data.offset_float_min, + data.minlen_float, + longest_float_length, + cBOOL(data.flags & SF_FL_BEFORE_EOL), + cBOOL(data.flags & SF_FL_BEFORE_MEOL))) + { + r->float_min_offset = data.offset_float_min - data.lookbehind_float; + r->float_max_offset = data.offset_float_max; + if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ + r->float_max_offset -= data.lookbehind_float; + SvREFCNT_inc_simple_void_NN(data.longest_float); + } + else { + r->float_substr = r->float_utf8 = NULL; + longest_float_length = 0; + } + + longest_fixed_length = CHR_SVLEN(data.longest_fixed); + + if (S_setup_longest (aTHX_ pRExC_state, + data.longest_fixed, + &(r->anchored_utf8), + &(r->anchored_substr), + &(r->anchored_end_shift), + data.lookbehind_fixed, + data.offset_fixed, + data.minlen_fixed, + longest_fixed_length, + cBOOL(data.flags & SF_FIX_BEFORE_EOL), + cBOOL(data.flags & SF_FIX_BEFORE_MEOL))) + { + r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; + SvREFCNT_inc_simple_void_NN(data.longest_fixed); + } + else { + r->anchored_substr = r->anchored_utf8 = NULL; + longest_fixed_length = 0; + } + LEAVE_with_name("study_chunk"); + + if (ri->regstclass + && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) + ri->regstclass = NULL; + + if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) + && stclass_flag + && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && !ssc_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); + StructCopy(data.start_class, + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); + ri->regstclass = (regnode*)RExC_rxi->data->data[n]; + r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); + regprop(r, sv, (regnode*)data.start_class, NULL); + PerlIO_printf(Perl_debug_log, + "synthetic stclass \"%s\".\n", + SvPVX_const(sv));}); + data.start_class = NULL; + } + + /* A temporary algorithm prefers floated substr to fixed one to dig + * more info. */ + if (longest_fixed_length > longest_float_length) { + r->substrs->check_ix = 0; + r->check_end_shift = r->anchored_end_shift; + r->check_substr = r->anchored_substr; + r->check_utf8 = r->anchored_utf8; + r->check_offset_min = r->check_offset_max = r->anchored_offset; + if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) + r->intflags |= PREGf_NOSCAN; + } + else { + r->substrs->check_ix = 1; + r->check_end_shift = r->float_end_shift; + r->check_substr = r->float_substr; + r->check_utf8 = r->float_utf8; + r->check_offset_min = r->float_min_offset; + r->check_offset_max = r->float_max_offset; + } + if ((r->check_substr || r->check_utf8) ) { + r->extflags |= RXf_USE_INTUIT; + if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) + r->extflags |= RXf_INTUIT_TAIL; + } + r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; + + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) + if ( (STRLEN)minlen < longest_float_length ) + minlen= longest_float_length; + if ( (STRLEN)minlen < longest_fixed_length ) + minlen= longest_fixed_length; + */ + } + else { + /* Several toplevels. Best we can is to set minlen. */ + SSize_t fake; + regnode_ssc ch_class; + SSize_t last_close = 0; + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); + + scan = ri->program + 1; + ssc_init(pRExC_state, &ch_class); + data.start_class = &ch_class; + data.last_closep = &last_close; + + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, + &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, + SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied + ? SCF_TRIE_DOING_RESTUDY + : 0), + 0); + + CHECK_RESTUDY_GOTO_butfirst(NOOP); + + r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 + = r->float_substr = r->float_utf8 = NULL; + + if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && ! ssc_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); + StructCopy(data.start_class, + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); + ri->regstclass = (regnode*)RExC_rxi->data->data[n]; + r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); + regprop(r, sv, (regnode*)data.start_class, NULL); + PerlIO_printf(Perl_debug_log, + "synthetic stclass \"%s\".\n", + SvPVX_const(sv));}); + data.start_class = NULL; + } + } + + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + r->maxlen = REG_INFTY; + } + else { + r->maxlen = RExC_maxlen; + } + + /* Guard against an embedded (?=) or (?<=) with a longer minlen than + the "real" pattern. */ + DEBUG_OPTIMISE_r({ + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); + }); + r->minlenret = minlen; + if (r->minlen < minlen) + r->minlen = minlen; + + if (RExC_seen & REG_GPOS_SEEN) + r->intflags |= PREGf_GPOS_SEEN; + if (RExC_seen & REG_LOOKBEHIND_SEEN) + r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the + lookbehind */ + if (pRExC_state->num_code_blocks) + r->extflags |= RXf_EVAL_SEEN; + if (RExC_seen & REG_CANY_SEEN) + r->intflags |= PREGf_CANY_SEEN; + if (RExC_seen & REG_VERBARG_SEEN) + { + r->intflags |= PREGf_VERBARG_SEEN; + r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ + } + if (RExC_seen & REG_CUTGROUP_SEEN) + r->intflags |= PREGf_CUTGROUP_SEEN; + if (pm_flags & PMf_USE_RE_EVAL) + r->intflags |= PREGf_USE_RE_EVAL; + if (RExC_paren_names) + RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); + else + RXp_PAREN_NAMES(r) = NULL; + + /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED + * so it can be used in pp.c */ + if (r->intflags & PREGf_ANCH) + r->extflags |= RXf_IS_ANCHORED; + + + { + /* this is used to identify "special" patterns that might result + * in Perl NOT calling the regex engine and instead doing the match "itself", + * particularly special cases in split//. By having the regex compiler + * do this pattern matching at a regop level (instead of by inspecting the pattern) + * we avoid weird issues with equivalent patterns resulting in different behavior, + * AND we allow non Perl engines to get the same optimizations by the setting the + * flags appropriately - Yves */ + regnode *first = ri->program + 1; + U8 fop = OP(first); + regnode *next = NEXTOPER(first); + U8 nop = OP(next); + + if (PL_regkind[fop] == NOTHING && nop == END) + r->extflags |= RXf_NULL; + else if (PL_regkind[fop] == BOL && nop == END) + r->extflags |= RXf_START_ONLY; + else if (fop == PLUS + && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE + && OP(regnext(first)) == END) + r->extflags |= RXf_WHITE; + else if ( r->extflags & RXf_SPLIT + && fop == EXACT + && STR_LEN(first) == 1 + && *(STRING(first)) == ' ' + && OP(regnext(first)) == END ) + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + + } + + if (RExC_contains_locale) { + RXp_EXTFLAGS(r) |= RXf_TAINTED; + } + +#ifdef DEBUGGING + if (RExC_paren_names) { + ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); + ri->data->data[ri->name_list_idx] + = (void*)SvREFCNT_inc(RExC_paren_name_list); + } else +#endif + ri->name_list_idx = 0; + + if (RExC_recurse_count) { + for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { + const regnode *scan = RExC_recurse[RExC_recurse_count-1]; + ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); + } + } + Newxz(r->offs, RExC_npar, regexp_paren_pair); + /* assume we don't need to swap parens around before we match */ + + DEBUG_DUMP_r({ + DEBUG_RExC_seen(); + PerlIO_printf(Perl_debug_log,"Final program:\n"); + regdump(r); + }); +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_OFFSETS_r(if (ri->u.offsets) { + const STRLEN len = ri->u.offsets[0]; + STRLEN i; + GET_RE_DEBUG_FLAGS_DECL; + PerlIO_printf(Perl_debug_log, + "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); + for (i = 1; i <= len; i++) { + if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) + PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", + (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); + } + PerlIO_printf(Perl_debug_log, "\n"); + }); +#endif + +#ifdef USE_ITHREADS + /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated + * by setting the regexp SV to readonly-only instead. If the + * pattern's been recompiled, the USEDness should remain. */ + if (old_re && SvREADONLY(old_re)) + SvREADONLY_on(rx); +#endif + return rx; +} + + +SV* +Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF; + + PERL_UNUSED_ARG(value); + + if (flags & RXapif_FETCH) { + return reg_named_buff_fetch(rx, key, flags); + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { + Perl_croak_no_modify(); + return NULL; + } else if (flags & RXapif_EXISTS) { + return reg_named_buff_exists(rx, key, flags) + ? &PL_sv_yes + : &PL_sv_no; + } else if (flags & RXapif_REGNAMES) { + return reg_named_buff_all(rx, flags); + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { + return reg_named_buff_scalar(rx, flags); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; + PERL_UNUSED_ARG(lastkey); + + if (flags & RXapif_FIRSTKEY) + return reg_named_buff_firstkey(rx, flags); + else if (flags & RXapif_NEXTKEY) + return reg_named_buff_nextkey(rx, flags); + else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", + (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, + const U32 flags) +{ + AV *retarray = NULL; + SV *ret; + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; + + if (flags & RXapif_ALL) + retarray=newAV(); + + if (rx && RXp_PAREN_NAMES(rx)) { + HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); + if (he_str) { + IV i; + SV* sv_dat=HeVAL(he_str); + I32 *nums=(I32*)SvPVX(sv_dat); + for ( i=0; inparens) >= nums[i] + && rx->offs[nums[i]].start != -1 + && rx->offs[nums[i]].end != -1) + { + ret = newSVpvs(""); + CALLREG_NUMBUF_FETCH(r,nums[i],ret); + if (!retarray) + return ret; + } else { + if (retarray) + ret = newSVsv(&PL_sv_undef); + } + if (retarray) + av_push(retarray, ret); + } + if (retarray) + return newRV_noinc(MUTABLE_SV(retarray)); + } + } + return NULL; +} + +bool +Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, + const U32 flags) +{ + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & RXapif_ALL) { + return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); + } else { + SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); + if (sv) { + SvREFCNT_dec_NN(sv); + return TRUE; + } else { + return FALSE; + } + } + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; + + if ( rx && RXp_PAREN_NAMES(rx) ) { + (void)hv_iterinit(RXp_PAREN_NAMES(rx)); + + return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv = RXp_PAREN_NAMES(rx); + HE *temphe; + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + return newSVhek(HeKEY_hek(temphe)); + } + } + } + return NULL; +} + +SV* +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) +{ + SV *ret; + AV *av; + SSize_t length; + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { + return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); + av = MUTABLE_AV(SvRV(ret)); + length = av_tindex(av); + SvREFCNT_dec_NN(ret); + return newSViv(length + 1); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", + (int)flags); + return NULL; + } + } + return &PL_sv_undef; +} + +SV* +Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + AV *av = newAV(); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv= RXp_PAREN_NAMES(rx); + HE *temphe; + (void)hv_iterinit(hv); + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + av_push(av, newSVhek(HeKEY_hek(temphe))); + } + } + } + + return newRV_noinc(MUTABLE_SV(av)); +} + +void +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, + SV * const sv) +{ + struct regexp *const rx = ReANY(r); + char *s = NULL; + SSize_t i = 0; + SSize_t s1, t1; + I32 n = paren; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; + + if ( n == RX_BUFF_IDX_CARET_PREMATCH + || n == RX_BUFF_IDX_CARET_FULLMATCH + || n == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } + + if (!rx->subbeg) + goto ret_undef; + + if (n == RX_BUFF_IDX_CARET_FULLMATCH) + /* no need to distinguish between them any more */ + n = RX_BUFF_IDX_FULLMATCH; + + if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) + && rx->offs[0].start != -1) + { + /* $`, ${^PREMATCH} */ + i = rx->offs[0].start; + s = rx->subbeg; + } + else + if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) + && rx->offs[0].end != -1) + { + /* $', ${^POSTMATCH} */ + s = rx->subbeg - rx->suboffset + rx->offs[0].end; + i = rx->sublen + rx->suboffset - rx->offs[0].end; + } + else + if ( 0 <= n && n <= (I32)rx->nparens && + (s1 = rx->offs[n].start) != -1 && + (t1 = rx->offs[n].end) != -1) + { + /* $&, ${^MATCH}, $1 ... */ + i = t1 - s1; + s = rx->subbeg + s1 - rx->suboffset; + } else { + goto ret_undef; + } + + assert(s >= rx->subbeg); + assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); + if (i >= 0) { +#ifdef NO_TAINT_SUPPORT + sv_setpvn(sv, s, i); +#else + const int oldtainted = TAINT_get; + TAINT_NOT; + sv_setpvn(sv, s, i); + TAINT_set(oldtainted); +#endif + if ( (rx->intflags & PREGf_CANY_SEEN) + ? (RXp_MATCH_UTF8(rx) + && (!i || is_utf8_string((U8*)s, i))) + : (RXp_MATCH_UTF8(rx)) ) + { + SvUTF8_on(sv); + } + else + SvUTF8_off(sv); + if (TAINTING_get) { + if (RXp_MATCH_TAINTED(rx)) { + if (SvTYPE(sv) >= SVt_PVMG) { + MAGIC* const mg = SvMAGIC(sv); + MAGIC* mgt; + TAINT; + SvMAGIC_set(sv, mg->mg_moremagic); + SvTAINT(sv); + if ((mgt = SvMAGIC(sv))) { + mg->mg_moremagic = mgt; + SvMAGIC_set(sv, mg); + } + } else { + TAINT; + SvTAINT(sv); + } + } else + SvTAINTED_off(sv); + } + } else { + ret_undef: + sv_setsv(sv,&PL_sv_undef); + return; + } +} + +void +Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, + SV const * const value) +{ + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; + + PERL_UNUSED_ARG(rx); + PERL_UNUSED_ARG(paren); + PERL_UNUSED_ARG(value); + + if (!PL_localizing) + Perl_croak_no_modify(); +} + +I32 +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, + const I32 paren) +{ + struct regexp *const rx = ReANY(r); + I32 i; + I32 s1, t1; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + + /* Some of this code was originally in C in F */ + switch (paren) { + case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ + case RX_BUFF_IDX_PREMATCH: /* $` */ + if (rx->offs[0].start != -1) { + i = rx->offs[0].start; + if (i > 0) { + s1 = 0; + t1 = i; + goto getlen; + } + } + return 0; + + case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ + case RX_BUFF_IDX_POSTMATCH: /* $' */ + if (rx->offs[0].end != -1) { + i = rx->sublen - rx->offs[0].end; + if (i > 0) { + s1 = rx->offs[0].end; + t1 = rx->sublen; + goto getlen; + } + } + return 0; + + default: /* $& / ${^MATCH}, $1, $2, ... */ + if (paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + i = t1 - s1; + goto getlen; + } else { + warn_undef: + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit((const SV *)sv); + return 0; + } + } + getlen: + if (i > 0 && RXp_MATCH_UTF8(rx)) { + const char * const s = rx->subbeg - rx->suboffset + s1; + const U8 *ep; + STRLEN el; + + i = t1 - s1; + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; + } + return i; +} + +SV* +Perl_reg_qr_package(pTHX_ REGEXP * const rx) +{ + PERL_ARGS_ASSERT_REG_QR_PACKAGE; + PERL_UNUSED_ARG(rx); + if (0) + return NULL; + else + return newSVpvs("Regexp"); +} + +/* Scans the name of a named buffer from the pattern. + * If flags is REG_RSN_RETURN_NULL returns null. + * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name + * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding + * to the parsed name as looked up in the RExC_paren_names hash. + * If there is an error throws a vFAIL().. type exception. + */ + +#define REG_RSN_RETURN_NULL 0 +#define REG_RSN_RETURN_NAME 1 +#define REG_RSN_RETURN_DATA 2 + +STATIC SV* +S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) +{ + char *name_start = RExC_parse; + + PERL_ARGS_ASSERT_REG_SCAN_NAME; + + assert (RExC_parse <= RExC_end); + if (RExC_parse == RExC_end) NOOP; + else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + /* skip IDFIRST by using do...while */ + if (UTF) + do { + RExC_parse += UTF8SKIP(RExC_parse); + } while (isWORDCHAR_utf8((U8*)RExC_parse)); + else + do { + RExC_parse++; + } while (isWORDCHAR(*RExC_parse)); + } else { + RExC_parse++; /* so the <- from the vFAIL is after the offending + character */ + vFAIL("Group name must start with a non-digit word character"); + } + if ( flags ) { + SV* sv_name + = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), + SVs_TEMP | (UTF ? SVf_UTF8 : 0)); + if ( flags == REG_RSN_RETURN_NAME) + return sv_name; + else if (flags==REG_RSN_RETURN_DATA) { + HE *he_str = NULL; + SV *sv_dat = NULL; + if ( ! sv_name ) /* should not happen*/ + Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); + if (RExC_paren_names) + he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! sv_dat ) + vFAIL("Reference to nonexistent named group"); + return sv_dat; + } + else { + Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", + (unsigned long) flags); + } + assert(0); /* NOT REACHED */ + } + return NULL; +} + +#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ + int rem=(int)(RExC_end - RExC_parse); \ + int cut; \ + int num; \ + int iscut=0; \ + if (rem>10) { \ + rem=10; \ + iscut=1; \ + } \ + cut=10-rem; \ + if (RExC_lastparse!=RExC_parse) \ + PerlIO_printf(Perl_debug_log," >%.*s%-*s", \ + rem, RExC_parse, \ + cut + 4, \ + iscut ? "..." : "<" \ + ); \ + else \ + PerlIO_printf(Perl_debug_log,"%16s",""); \ + \ + if (SIZE_ONLY) \ + num = RExC_size + 1; \ + else \ + num=REG_NODE_NUM(RExC_emit); \ + if (RExC_lastnum!=num) \ + PerlIO_printf(Perl_debug_log,"|%4d",num); \ + else \ + PerlIO_printf(Perl_debug_log,"|%4s",""); \ + PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ + (int)((depth*2)), "", \ + (funcname) \ + ); \ + RExC_lastnum=num; \ + RExC_lastparse=RExC_parse; \ +}) + + + +#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ + DEBUG_PARSE_MSG((funcname)); \ + PerlIO_printf(Perl_debug_log,"%4s","\n"); \ +}) +#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ + DEBUG_PARSE_MSG((funcname)); \ + PerlIO_printf(Perl_debug_log,fmt "\n",args); \ +}) + +/* This section of code defines the inversion list object and its methods. The + * interfaces are highly subject to change, so as much as possible is static to + * this file. An inversion list is here implemented as a malloc'd C UV array + * as an SVt_INVLIST scalar. + * + * An inversion list for Unicode is an array of code points, sorted by ordinal + * number. The zeroth element is the first code point in the list. The 1th + * element is the first element beyond that not in the list. In other words, + * the first range is + * invlist[0]..(invlist[1]-1) + * The other ranges follow. Thus every element whose index is divisible by two + * marks the beginning of a range that is in the list, and every element not + * divisible by two marks the beginning of a range not in the list. A single + * element inversion list that contains the single code point N generally + * consists of two elements + * invlist[0] == N + * invlist[1] == N+1 + * (The exception is when N is the highest representable value on the + * machine, in which case the list containing just it would be a single + * element, itself. By extension, if the last range in the list extends to + * infinity, then the first element of that range will be in the inversion list + * at a position that is divisible by two, and is the final element in the + * list.) + * Taking the complement (inverting) an inversion list is quite simple, if the + * first element is 0, remove it; otherwise add a 0 element at the beginning. + * This implementation reserves an element at the beginning of each inversion + * list to always contain 0; there is an additional flag in the header which + * indicates if the list begins at the 0, or is offset to begin at the next + * element. + * + * More about inversion lists can be found in "Unicode Demystified" + * Chapter 13 by Richard Gillam, published by Addison-Wesley. + * More will be coming when functionality is added later. + * + * The inversion list data structure is currently implemented as an SV pointing + * to an array of UVs that the SV thinks are bytes. This allows us to have an + * array of UV whose memory management is automatically handled by the existing + * facilities for SV's. + * + * Some of the methods should always be private to the implementation, and some + * should eventually be made public */ + +/* The header definitions are in F */ + +PERL_STATIC_INLINE UV* +S__invlist_array_init(SV* const invlist, const bool will_have_0) +{ + /* Returns a pointer to the first element in the inversion list's array. + * This is called upon initialization of an inversion list. Where the + * array begins depends on whether the list has the code point U+0000 in it + * or not. The other parameter tells it whether the code that follows this + * call is about to put a 0 in the inversion list or not. The first + * element is either the element reserved for 0, if TRUE, or the element + * after it, if FALSE */ + + bool* offset = get_invlist_offset_addr(invlist); + UV* zero_addr = (UV *) SvPVX(invlist); + + PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; + + /* Must be empty */ + assert(! _invlist_len(invlist)); + + *zero_addr = 0; + + /* 1^1 = 0; 1^0 = 1 */ + *offset = 1 ^ will_have_0; + return zero_addr + *offset; +} + +PERL_STATIC_INLINE UV* +S_invlist_array(SV* const invlist) +{ + /* Returns the pointer to the inversion list's array. Every time the + * length changes, this needs to be called in case malloc or realloc moved + * it */ + + PERL_ARGS_ASSERT_INVLIST_ARRAY; + + /* Must not be empty. If these fail, you probably didn't check for + * being non-zero before trying to get the array */ + assert(_invlist_len(invlist)); + + /* The very first element always contains zero, The array begins either + * there, or if the inversion list is offset, at the element after it. + * The offset header field determines which; it contains 0 or 1 to indicate + * how much additionally to add */ + assert(0 == *(SvPVX(invlist))); + return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist)); +} + +PERL_STATIC_INLINE void +S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) +{ + /* Sets the current number of elements stored in the inversion list. + * Updates SvCUR correspondingly */ + PERL_UNUSED_CONTEXT; + PERL_ARGS_ASSERT_INVLIST_SET_LEN; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + SvCUR_set(invlist, + (len == 0) + ? 0 + : TO_INTERNAL_SIZE(len + offset)); + assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); +} + +PERL_STATIC_INLINE IV* +S_get_invlist_previous_index_addr(SV* invlist) +{ + /* Return the address of the IV that is reserved to hold the cached index + * */ + PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->prev_index); +} + +PERL_STATIC_INLINE IV +S_invlist_previous_index(SV* const invlist) +{ + /* Returns cached index of previous search */ + + PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; + + return *get_invlist_previous_index_addr(invlist); +} + +PERL_STATIC_INLINE void +S_invlist_set_previous_index(SV* const invlist, const IV index) +{ + /* Caches for later retrieval */ + + PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; + + assert(index == 0 || index < (int) _invlist_len(invlist)); + + *get_invlist_previous_index_addr(invlist) = index; +} + +PERL_STATIC_INLINE UV +S_invlist_max(SV* const invlist) +{ + /* Returns the maximum number of elements storable in the inversion list's + * array, without having to realloc() */ + + PERL_ARGS_ASSERT_INVLIST_MAX; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Assumes worst case, in which the 0 element is not counted in the + * inversion list, so subtracts 1 for that */ + return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ + ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 + : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; +} + +#ifndef PERL_IN_XSUB_RE +SV* +Perl__new_invlist(pTHX_ IV initial_size) +{ + + /* Return a pointer to a newly constructed inversion list, with enough + * space to store 'initial_size' elements. If that number is negative, a + * system default is used instead */ + + SV* new_list; + + if (initial_size < 0) { + initial_size = 10; + } + + /* Allocate the initial space */ + new_list = newSV_type(SVt_INVLIST); + + /* First 1 is in case the zero element isn't in the list; second 1 is for + * trailing NUL */ + SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1); + invlist_set_len(new_list, 0, 0); + + /* Force iterinit() to be used to get iteration to work */ + *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX; + + *get_invlist_previous_index_addr(new_list) = 0; + + return new_list; +} + +SV* +Perl__new_invlist_C_array(pTHX_ const UV* const list) +{ + /* Return a pointer to a newly constructed inversion list, initialized to + * point to , which has to be in the exact correct inversion list + * form, including internal fields. Thus this is a dangerous routine that + * should not be used in the wrong hands. The passed in 'list' contains + * several header fields at the beginning that are not part of the + * inversion list body proper */ + + const STRLEN length = (STRLEN) list[0]; + const UV version_id = list[1]; + const bool offset = cBOOL(list[2]); +#define HEADER_LENGTH 3 + /* If any of the above changes in any way, you must change HEADER_LENGTH + * (if appropriate) and regenerate INVLIST_VERSION_ID by running + * perl -E 'say int(rand 2**31-1)' + */ +#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and + data structure type, so that one being + passed in can be validated to be an + inversion list of the correct vintage. + */ + + SV* invlist = newSV_type(SVt_INVLIST); + + PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; + + if (version_id != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + /* The generated array passed in includes header elements that aren't part + * of the list proper, so start it just after them */ + SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); + + SvLEN_set(invlist, 0); /* Means we own the contents, and the system + shouldn't touch it */ + + *(get_invlist_offset_addr(invlist)) = offset; + + /* The 'length' passed to us is the physical number of elements in the + * inversion list. But if there is an offset the logical number is one + * less than that */ + invlist_set_len(invlist, length - offset, offset); + + invlist_set_previous_index(invlist, 0); + + /* Initialize the iteration pointer. */ + invlist_iterfinish(invlist); + + SvREADONLY_on(invlist); + + return invlist; +} +#endif /* ifndef PERL_IN_XSUB_RE */ + +STATIC void +S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) +{ + /* Grow the maximum size of an inversion list */ + + PERL_ARGS_ASSERT_INVLIST_EXTEND; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Add one to account for the zero element at the beginning which may not + * be counted by the calling parameters */ + SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); +} + +PERL_STATIC_INLINE void +S_invlist_trim(SV* const invlist) +{ + PERL_ARGS_ASSERT_INVLIST_TRIM; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Change the length of the inversion list to how many entries it currently + * has */ + SvPV_shrink_to_cur((SV *) invlist); +} + +STATIC void +S__append_range_to_invlist(pTHX_ SV* const invlist, + const UV start, const UV end) +{ + /* Subject to change or removal. Append the range from 'start' to 'end' at + * the end of the inversion list. The range must be above any existing + * ones. */ + + UV* array; + UV max = invlist_max(invlist); + UV len = _invlist_len(invlist); + bool offset; + + PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; + + if (len == 0) { /* Empty lists must be initialized */ + offset = start != 0; + array = _invlist_array_init(invlist, ! offset); + } + else { + /* Here, the existing list is non-empty. The current max entry in the + * list is generally the first value not in the set, except when the + * set extends to the end of permissible values, in which case it is + * the first entry in that final set, and so this call is an attempt to + * append out-of-order */ + + UV final_element = len - 1; + array = invlist_array(invlist); + if (array[final_element] > start + || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) + { + Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", + array[final_element], start, + ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); + } + + /* Here, it is a legal append. If the new range begins with the first + * value not in the set, it is extending the set, so the new first + * value not in the set is one greater than the newly extended range. + * */ + offset = *get_invlist_offset_addr(invlist); + if (array[final_element] == start) { + if (end != UV_MAX) { + array[final_element] = end + 1; + } + else { + /* But if the end is the maximum representable on the machine, + * just let the range that this would extend to have no end */ + invlist_set_len(invlist, len - 1, offset); + } + return; + } + } + + /* Here the new range doesn't extend any existing set. Add it */ + + len += 2; /* Includes an element each for the start and end of range */ + + /* If wll overflow the existing space, extend, which may cause the array to + * be moved */ + if (max < len) { + invlist_extend(invlist, len); + + /* Have to set len here to avoid assert failure in invlist_array() */ + invlist_set_len(invlist, len, offset); + + array = invlist_array(invlist); + } + else { + invlist_set_len(invlist, len, offset); + } + + /* The next item on the list starts the range, the one after that is + * one past the new range. */ + array[len - 2] = start; + if (end != UV_MAX) { + array[len - 1] = end + 1; + } + else { + /* But if the end is the maximum representable on the machine, just let + * the range have no end */ + invlist_set_len(invlist, len - 1, offset); + } +} + +#ifndef PERL_IN_XSUB_RE + +IV +Perl__invlist_search(SV* const invlist, const UV cp) +{ + /* Searches the inversion list for the entry that contains the input code + * point . If is not in the list, -1 is returned. Otherwise, the + * return value is the index into the list's array of the range that + * contains */ + + IV low = 0; + IV mid; + IV high = _invlist_len(invlist); + const IV highest_element = high - 1; + const UV* array; + + PERL_ARGS_ASSERT__INVLIST_SEARCH; + + /* If list is empty, return failure. */ + if (high == 0) { + return -1; + } + + /* (We can't get the array unless we know the list is non-empty) */ + array = invlist_array(invlist); + + mid = invlist_previous_index(invlist); + assert(mid >=0 && mid <= highest_element); + + /* contains the cache of the result of the previous call to this + * function (0 the first time). See if this call is for the same result, + * or if it is for mid-1. This is under the theory that calls to this + * function will often be for related code points that are near each other. + * And benchmarks show that caching gives better results. We also test + * here if the code point is within the bounds of the list. These tests + * replace others that would have had to be made anyway to make sure that + * the array bounds were not exceeded, and these give us extra information + * at the same time */ + if (cp >= array[mid]) { + if (cp >= array[highest_element]) { + return highest_element; + } + + /* Here, array[mid] <= cp < array[highest_element]. This means that + * the final element is not the answer, so can exclude it; it also + * means that is not the final element, so can refer to 'mid + 1' + * safely */ + if (cp < array[mid + 1]) { + return mid; + } + high--; + low = mid + 1; + } + else { /* cp < aray[mid] */ + if (cp < array[0]) { /* Fail if outside the array */ + return -1; + } + high = mid; + if (cp >= array[mid - 1]) { + goto found_entry; + } + } + + /* Binary search. What we are looking for is such that + * array[i] <= cp < array[i+1] + * The loop below converges on the i+1. Note that there may not be an + * (i+1)th element in the array, and things work nonetheless */ + while (low < high) { + mid = (low + high) / 2; + assert(mid <= highest_element); + if (array[mid] <= cp) { /* cp >= array[mid] */ + low = mid + 1; + + /* We could do this extra test to exit the loop early. + if (cp < array[low]) { + return mid; + } + */ + } + else { /* cp < array[mid] */ + high = mid; + } + } + + found_entry: + high--; + invlist_set_previous_index(invlist, high); + return high; +} + +void +Perl__invlist_populate_swatch(SV* const invlist, + const UV start, const UV end, U8* swatch) +{ + /* populates a swatch of a swash the same way swatch_get() does in utf8.c, + * but is used when the swash has an inversion list. This makes this much + * faster, as it uses a binary search instead of a linear one. This is + * intimately tied to that function, and perhaps should be in utf8.c, + * except it is intimately tied to inversion lists as well. It assumes + * that is all 0's on input */ + + UV current = start; + const IV len = _invlist_len(invlist); + IV i; + const UV * array; + + PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH; + + if (len == 0) { /* Empty inversion list */ + return; + } + + array = invlist_array(invlist); + + /* Find which element it is */ + i = _invlist_search(invlist, start); + + /* We populate from to */ + while (current < end) { + UV upper; + + /* The inversion list gives the results for every possible code point + * after the first one in the list. Only those ranges whose index is + * even are ones that the inversion list matches. For the odd ones, + * and if the initial code point is not in the list, we have to skip + * forward to the next element */ + if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) { + i++; + if (i >= len) { /* Finished if beyond the end of the array */ + return; + } + current = array[i]; + if (current >= end) { /* Finished if beyond the end of what we + are populating */ + if (LIKELY(end < UV_MAX)) { + return; + } + + /* We get here when the upper bound is the maximum + * representable on the machine, and we are looking for just + * that code point. Have to special case it */ + i = len; + goto join_end_of_list; + } + } + assert(current >= start); + + /* The current range ends one below the next one, except don't go past + * */ + i++; + upper = (i < len && array[i] < end) ? array[i] : end; + + /* Here we are in a range that matches. Populate a bit in the 3-bit U8 + * for each code point in it */ + for (; current < upper; current++) { + const STRLEN offset = (STRLEN)(current - start); + swatch[offset >> 3] |= 1 << (offset & 7); + } + + join_end_of_list: + + /* Quit if at the end of the list */ + if (i >= len) { + + /* But first, have to deal with the highest possible code point on + * the platform. The previous code assumes that is one + * beyond where we want to populate, but that is impossible at the + * platform's infinity, so have to handle it specially */ + if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1))) + { + const STRLEN offset = (STRLEN)(end - start); + swatch[offset >> 3] |= 1 << (offset & 7); + } + return; + } + + /* Advance to the next range, which will be for code points not in the + * inversion list */ + current = array[i]; + } + + return; +} + +void +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** output) +{ + /* Take the union of two inversion lists and point to it. *output + * SHOULD BE DEFINED upon input, and if it points to one of the two lists, + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *output will be made correspondingly + * mortal. The first list, , may be NULL, in which case a copy of the + * second list is returned. If is TRUE, the union is taken + * of the complement (inversion) of instead of b itself. + * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. + * + * The algorithm is like a merge sort. + * + * XXX A potential performance improvement is to keep track as we go along + * if only one of the inputs contributes to the result, meaning the other + * is a subset of that one. In that case, we can skip the final copy and + * return the larger of the input lists, but then outside code might need + * to keep track of whether to free the input list or not */ + + const UV* array_a; /* a's array */ + const UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; + + SV* u; /* the resulting union */ + UV* array_u; + UV len_u; + + UV i_a = 0; /* current index into a's array */ + UV i_b = 0; + UV i_u = 0; + + /* running count, as explained in the algorithm source book; items are + * stopped accumulating and are output when the count changes to/from 0. + * The count is incremented when we start a range that's in the set, and + * decremented when we start a range that's not in the set. So its range + * is 0 to 2. Only when the count is zero is something not in the set. + */ + UV count = 0; + + PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + + /* If either one is empty, the union is the other one */ + if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + bool make_temp = FALSE; /* Should we mortalize the result? */ + + if (*output == a) { + if (a != NULL) { + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } + } + } + if (*output != b) { + *output = invlist_clone(b); + if (complement_b) { + _invlist_invert(*output); + } + } /* else *output already = b; */ + + if (make_temp) { + sv_2mortal(*output); + } + return; + } + else if ((len_b = _invlist_len(b)) == 0) { + bool make_temp = FALSE; + if (*output == b) { + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } + } + + /* The complement of an empty list is a list that has everything in it, + * so the union with includes everything too */ + if (complement_b) { + if (a == *output) { + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } + } + *output = _new_invlist(1); + _append_range_to_invlist(*output, 0, UV_MAX); + } + else if (*output != a) { + *output = invlist_clone(a); + } + /* else *output already = a; */ + + if (make_temp) { + sv_2mortal(*output); + } + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the union of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Size the union for the worst case: that the sets are completely + * disjoint */ + u = _new_invlist(len_a + len_b); + + /* Will contain U+0000 if either component does */ + array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0) + || (len_b > 0 && array_b[0] == 0)); + + /* Go through each list item by item, stopping when exhausted one of + * them */ + while (i_a < len_a && i_b < len_b) { + UV cp; /* The element to potentially add to the union's array */ + bool cp_in_set; /* is it in the the input list's set or not */ + + /* We need to take one or the other of the two inputs for the union. + * Since we are merging two sorted lists, we take the smaller of the + * next items. In case of a tie, we take the one that is in its set + * first. If we took one not in the set first, it would decrement the + * count, possibly to 0 which would cause it to be output as ending the + * range, and the next time through we would take the same number, and + * output it again as beginning the next range. By doing it the + * opposite way, there is no possibility that the count will be + * momentarily decremented to 0, and thus the two adjoining ranges will + * be seamlessly merged. (In a tie and both are in the set or both not + * in the set, it doesn't matter which we take first.) */ + if (array_a[i_a] < array_b[i_b] + || (array_a[i_a] == array_b[i_b] + && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) + { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); + cp= array_a[i_a++]; + } + else { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); + cp = array_b[i_b++]; + } + + /* Here, have chosen which of the two inputs to look at. Only output + * if the running count changes to/from 0, which marks the + * beginning/end of a range in that's in the set */ + if (cp_in_set) { + if (count == 0) { + array_u[i_u++] = cp; + } + count++; + } + else { + count--; + if (count == 0) { + array_u[i_u++] = cp; + } + } + } + + /* Here, we are finished going through at least one of the lists, which + * means there is something remaining in at most one. We check if the list + * that hasn't been exhausted is positioned such that we are in the middle + * of a range in its set or not. (i_a and i_b point to the element beyond + * the one we care about.) If in the set, we decrement 'count'; if 0, there + * is potentially more to output. + * There are four cases: + * 1) Both weren't in their sets, count is 0, and remains 0. What's left + * in the union is entirely from the non-exhausted set. + * 2) Both were in their sets, count is 2. Nothing further should + * be output, as everything that remains will be in the exhausted + * list's set, hence in the union; decrementing to 1 but not 0 insures + * that + * 3) the exhausted was in its set, non-exhausted isn't, count is 1. + * Nothing further should be output because the union includes + * everything from the exhausted set. Not decrementing ensures that. + * 4) the exhausted wasn't in its set, non-exhausted is, count is 1; + * decrementing to 0 insures that we look at the remainder of the + * non-exhausted set */ + if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + { + count--; + } + + /* The final length is what we've output so far, plus what else is about to + * be output. (If 'count' is non-zero, then the input list we exhausted + * has everything remaining up to the machine's limit in its set, and hence + * in the union, so there will be no further output. */ + len_u = i_u; + if (count == 0) { + /* At most one of the subexpressions will be non-zero */ + len_u += (len_a - i_a) + (len_b - i_b); + } + + /* Set result to final length, which can change the pointer to array_u, so + * re-find it */ + if (len_u != _invlist_len(u)) { + invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); + invlist_trim(u); + array_u = invlist_array(u); + } + + /* When 'count' is 0, the list that was exhausted (if one was shorter than + * the other) ended with everything above it not in its set. That means + * that the remaining part of the union is precisely the same as the + * non-exhausted list, so can just copy it unchanged. (If both list were + * exhausted at the same time, then the operations below will be both 0.) + */ + if (count == 0) { + IV copy_count; /* At most one will have a non-zero copy count */ + if ((copy_count = len_a - i_a) > 0) { + Copy(array_a + i_a, array_u + i_u, copy_count, UV); + } + else if ((copy_count = len_b - i_b) > 0) { + Copy(array_b + i_b, array_u + i_u, copy_count, UV); + } + } + + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ + if (a == *output || b == *output) { + assert(! invlist_is_iterating(*output)); + if ((SvTEMP(*output))) { + sv_2mortal(u); + } + else { + SvREFCNT_dec_NN(*output); + } + } + + *output = u; + + return; +} + +void +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** i) +{ + /* Take the intersection of two inversion lists and point to it. *i + * SHOULD BE DEFINED upon input, and if it points to one of the two lists, + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *i will be made correspondingly mortal. + * The first list, , may be NULL, in which case an empty list is + * returned. If is TRUE, the result will be the + * intersection of and the complement (or inversion) of instead of + * directly. + * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. In fact, it had bugs + * + * The algorithm is like a merge sort, and is essentially the same as the + * union above + */ + + const UV* array_a; /* a's array */ + const UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; + + SV* r; /* the resulting intersection */ + UV* array_r; + UV len_r; + + UV i_a = 0; /* current index into a's array */ + UV i_b = 0; + UV i_r = 0; + + /* running count, as explained in the algorithm source book; items are + * stopped accumulating and are output when the count changes to/from 2. + * The count is incremented when we start a range that's in the set, and + * decremented when we start a range that's not in the set. So its range + * is 0 to 2. Only when the count is 2 is something in the intersection. + */ + UV count = 0; + + PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + + /* Special case if either one is empty */ + len_a = (a == NULL) ? 0 : _invlist_len(a); + if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { + bool make_temp = FALSE; + + if (len_a != 0 && complement_b) { + + /* Here, 'a' is not empty, therefore from the above 'if', 'b' must + * be empty. Here, also we are using 'b's complement, which hence + * must be every possible code point. Thus the intersection is + * simply 'a'. */ + if (*i != a) { + if (*i == b) { + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } + } + + *i = invlist_clone(a); + } + /* else *i is already 'a' */ + + if (make_temp) { + sv_2mortal(*i); + } + return; + } + + /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The + * intersection must be empty */ + if (*i == a) { + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } + } + else if (*i == b) { + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } + } + *i = _new_invlist(0); + if (make_temp) { + sv_2mortal(*i); + } + + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the intersection of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Size the intersection for the worst case: that the intersection ends up + * fragmenting everything to be completely disjoint */ + r= _new_invlist(len_a + len_b); + + /* Will contain U+0000 iff both components do */ + array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 + && len_b > 0 && array_b[0] == 0); + + /* Go through each list item by item, stopping when exhausted one of + * them */ + while (i_a < len_a && i_b < len_b) { + UV cp; /* The element to potentially add to the intersection's + array */ + bool cp_in_set; /* Is it in the input list's set or not */ + + /* We need to take one or the other of the two inputs for the + * intersection. Since we are merging two sorted lists, we take the + * smaller of the next items. In case of a tie, we take the one that + * is not in its set first (a difference from the union algorithm). If + * we took one in the set first, it would increment the count, possibly + * to 2 which would cause it to be output as starting a range in the + * intersection, and the next time through we would take that same + * number, and output it again as ending the set. By doing it the + * opposite of this, there is no possibility that the count will be + * momentarily incremented to 2. (In a tie and both are in the set or + * both not in the set, it doesn't matter which we take first.) */ + if (array_a[i_a] < array_b[i_b] + || (array_a[i_a] == array_b[i_b] + && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) + { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); + cp= array_a[i_a++]; + } + else { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); + cp= array_b[i_b++]; + } + + /* Here, have chosen which of the two inputs to look at. Only output + * if the running count changes to/from 2, which marks the + * beginning/end of a range that's in the intersection */ + if (cp_in_set) { + count++; + if (count == 2) { + array_r[i_r++] = cp; + } + } + else { + if (count == 2) { + array_r[i_r++] = cp; + } + count--; + } + } + + /* Here, we are finished going through at least one of the lists, which + * means there is something remaining in at most one. We check if the list + * that has been exhausted is positioned such that we are in the middle + * of a range in its set or not. (i_a and i_b point to elements 1 beyond + * the ones we care about.) There are four cases: + * 1) Both weren't in their sets, count is 0, and remains 0. There's + * nothing left in the intersection. + * 2) Both were in their sets, count is 2 and perhaps is incremented to + * above 2. What should be output is exactly that which is in the + * non-exhausted set, as everything it has is also in the intersection + * set, and everything it doesn't have can't be in the intersection + * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and + * gets incremented to 2. Like the previous case, the intersection is + * everything that remains in the non-exhausted set. + * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and + * remains 1. And the intersection has nothing more. */ + if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + { + count++; + } + + /* The final length is what we've output so far plus what else is in the + * intersection. At most one of the subexpressions below will be non-zero + * */ + len_r = i_r; + if (count >= 2) { + len_r += (len_a - i_a) + (len_b - i_b); + } + + /* Set result to final length, which can change the pointer to array_r, so + * re-find it */ + if (len_r != _invlist_len(r)) { + invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); + invlist_trim(r); + array_r = invlist_array(r); + } + + /* Finish outputting any remaining */ + if (count >= 2) { /* At most one will have a non-zero copy count */ + IV copy_count; + if ((copy_count = len_a - i_a) > 0) { + Copy(array_a + i_a, array_r + i_r, copy_count, UV); + } + else if ((copy_count = len_b - i_b) > 0) { + Copy(array_b + i_b, array_r + i_r, copy_count, UV); + } + } + + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ + if (a == *i || b == *i) { + assert(! invlist_is_iterating(*i)); + if (SvTEMP(*i)) { + sv_2mortal(r); + } + else { + SvREFCNT_dec_NN(*i); + } + } + + *i = r; + + return; +} + +SV* +Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) +{ + /* Add the range from 'start' to 'end' inclusive to the inversion list's + * set. A pointer to the inversion list is returned. This may actually be + * a new list, in which case the passed in one has been destroyed. The + * passed in inversion list can be NULL, in which case a new one is created + * with just the one range in it */ + + SV* range_invlist; + UV len; + + if (invlist == NULL) { + invlist = _new_invlist(2); + len = 0; + } + else { + len = _invlist_len(invlist); + } + + /* If comes after the final entry actually in the list, can just append it + * to the end, */ + if (len == 0 + || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1) + && start >= invlist_array(invlist)[len - 1])) + { + _append_range_to_invlist(invlist, start, end); + return invlist; + } + + /* Here, can't just append things, create and return a new inversion list + * which is the union of this range and the existing inversion list */ + range_invlist = _new_invlist(2); + _append_range_to_invlist(range_invlist, start, end); + + _invlist_union(invlist, range_invlist, &invlist); + + /* The temporary can be freed */ + SvREFCNT_dec_NN(range_invlist); + + return invlist; +} + +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, + UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + _append_range_to_invlist(invlist, element0, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; +} + +#endif + +PERL_STATIC_INLINE SV* +S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { + return _add_range_to_invlist(invlist, cp, cp); +} + +#ifndef PERL_IN_XSUB_RE +void +Perl__invlist_invert(pTHX_ SV* const invlist) +{ + /* Complement the input inversion list. This adds a 0 if the list didn't + * have a zero; removes it otherwise. As described above, the data + * structure is set up so that this is very efficient */ + + PERL_ARGS_ASSERT__INVLIST_INVERT; + + assert(! invlist_is_iterating(invlist)); + + /* The inverse of matching nothing is matching everything */ + if (_invlist_len(invlist) == 0) { + _append_range_to_invlist(invlist, 0, UV_MAX); + return; + } + + *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); +} + +#endif + +PERL_STATIC_INLINE SV* +S_invlist_clone(pTHX_ SV* const invlist) +{ + + /* Return a new inversion list that is a copy of the input one, which is + * unchanged. The new list will not be mortal even if the old one was. */ + + /* Need to allocate extra space to accommodate Perl's addition of a + * trailing NUL to SvPV's, since it thinks they are always strings */ + SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); + STRLEN physical_length = SvCUR(invlist); + bool offset = *(get_invlist_offset_addr(invlist)); + + PERL_ARGS_ASSERT_INVLIST_CLONE; + + *(get_invlist_offset_addr(new_invlist)) = offset; + invlist_set_len(new_invlist, _invlist_len(invlist), offset); + Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); + + return new_invlist; +} + +PERL_STATIC_INLINE STRLEN* +S_get_invlist_iter_addr(SV* invlist) +{ + /* Return the address of the UV that contains the current iteration + * position */ + + PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->iterator); +} + +PERL_STATIC_INLINE void +S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */ +{ + PERL_ARGS_ASSERT_INVLIST_ITERINIT; + + *get_invlist_iter_addr(invlist) = 0; +} + +PERL_STATIC_INLINE void +S_invlist_iterfinish(SV* invlist) +{ + /* Terminate iterator for invlist. This is to catch development errors. + * Any iteration that is interrupted before completed should call this + * function. Functions that add code points anywhere else but to the end + * of an inversion list assert that they are not in the middle of an + * iteration. If they were, the addition would make the iteration + * problematical: if the iteration hadn't reached the place where things + * were being added, it would be ok */ + + PERL_ARGS_ASSERT_INVLIST_ITERFINISH; + + *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; +} + +STATIC bool +S_invlist_iternext(SV* invlist, UV* start, UV* end) +{ + /* An C call on must be used to set this up. + * This call sets in <*start> and <*end>, the next range in . + * Returns if successful and the next call will return the next + * range; if was already at the end of the list. If the latter, + * <*start> and <*end> are unchanged, and the next call to this function + * will start over at the beginning of the list */ + + STRLEN* pos = get_invlist_iter_addr(invlist); + UV len = _invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_ITERNEXT; + + if (*pos >= len) { + *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ + return FALSE; + } + + array = invlist_array(invlist); + + *start = array[(*pos)++]; + + if (*pos >= len) { + *end = UV_MAX; + } + else { + *end = array[(*pos)++] - 1; + } + + return TRUE; +} + +PERL_STATIC_INLINE bool +S_invlist_is_iterating(SV* const invlist) +{ + PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; + + return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; +} + +PERL_STATIC_INLINE UV +S_invlist_highest(SV* const invlist) +{ + /* Returns the highest code point that matches an inversion list. This API + * has an ambiguity, as it returns 0 under either the highest is actually + * 0, or if the list is empty. If this distinction matters to you, check + * for emptiness before calling this function */ + + UV len = _invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_HIGHEST; + + if (len == 0) { + return 0; + } + + array = invlist_array(invlist); + + /* The last element in the array in the inversion list always starts a + * range that goes to infinity. That range may be for code points that are + * matched in the inversion list, or it may be for ones that aren't + * matched. In the latter case, the highest code point in the set is one + * less than the beginning of this range; otherwise it is the final element + * of this range: infinity */ + return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) + ? UV_MAX + : array[len - 1] - 1; +} + +#ifndef PERL_IN_XSUB_RE +SV * +Perl__invlist_contents(pTHX_ SV* const invlist) +{ + /* Get the contents of an inversion list into a string SV so that they can + * be printed out. It uses the format traditionally done for debug tracing + */ + + UV start, end; + SV* output = newSVpvs("\n"); + + PERL_ARGS_ASSERT__INVLIST_CONTENTS; + + assert(! invlist_is_iterating(invlist)); + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start); + } + else if (end != start) { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n", + start, end); + } + else { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start); + } + } + + return output; +} +#endif + +#ifndef PERL_IN_XSUB_RE +void +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, + const char * const indent, SV* const invlist) +{ + /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the + * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by + * the string 'indent'. The output looks like this: + [0] 0x000A .. 0x000D + [2] 0x0085 + [4] 0x2028 .. 0x2029 + [6] 0x3104 .. INFINITY + * This means that the first range of code points matched by the list are + * 0xA through 0xD; the second range contains only the single code point + * 0x85, etc. An inversion list is an array of UVs. Two array elements + * are used to define each range (except if the final range extends to + * infinity, only a single element is needed). The array index of the + * first element for the corresponding range is given in brackets. */ + + UV start, end; + STRLEN count = 0; + + PERL_ARGS_ASSERT__INVLIST_DUMP; + + if (invlist_is_iterating(invlist)) { + Perl_dump_indent(aTHX_ level, file, + "%sCan't dump inversion list because is in middle of iterating\n", + indent); + return; + } + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", + indent, (UV)count, start); + } + else if (end != start) { + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", + indent, (UV)count, start, end); + } + else { + Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", + indent, (UV)count, start); + } + count += 2; + } +} + +void +Perl__load_PL_utf8_foldclosures (pTHX) +{ + assert(! PL_utf8_foldclosures); + + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* This string is just a short named one above \xff */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ + } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); +} +#endif + +#ifdef PERL_ARGS_ASSERT__INVLISTEQ +bool +S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) +{ + /* Return a boolean as to if the two passed in inversion lists are + * identical. The final argument, if TRUE, says to take the complement of + * the second inversion list before doing the comparison */ + + const UV* array_a = invlist_array(a); + const UV* array_b = invlist_array(b); + UV len_a = _invlist_len(a); + UV len_b = _invlist_len(b); + + UV i = 0; /* current index into the arrays */ + bool retval = TRUE; /* Assume are identical until proven otherwise */ + + PERL_ARGS_ASSERT__INVLISTEQ; + + /* If are to compare 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* The complement of nothing is everything, so would have to have + * just one element, starting at zero (ending at infinity) */ + if (len_b == 0) { + return (len_a == 1 && array_a[0] == 0); + } + else if (array_b[0] == 0) { + + /* Otherwise, to complement, we invert. Here, the first element is + * 0, just remove it. To do this, we just pretend the array starts + * one later */ + + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Make sure that the lengths are the same, as well as the final element + * before looping through the remainder. (Thus we test the length, final, + * and first elements right off the bat) */ + if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) { + retval = FALSE; + } + else for (i = 0; i < len_a - 1; i++) { + if (array_a[i] != array_b[i]) { + retval = FALSE; + break; + } + } + + return retval; +} +#endif + +#undef HEADER_LENGTH +#undef TO_INTERNAL_SIZE +#undef FROM_INTERNAL_SIZE +#undef INVLIST_VERSION_ID + +/* End of inversion list object */ + +STATIC void +S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) +{ + /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' + * constructs, and updates RExC_flags with them. On input, RExC_parse + * should point to the first flag; it is updated on output to point to the + * final ')' or ':'. There needs to be at least one flag, or this will + * abort */ + + /* for (?g), (?gc), and (?o) warnings; warning + about (?c) will warn about (?g) -- japhy */ + +#define WASTED_O 0x01 +#define WASTED_G 0x02 +#define WASTED_C 0x04 +#define WASTED_GC (WASTED_G|WASTED_C) + I32 wastedflags = 0x00; + U32 posflags = 0, negflags = 0; + U32 *flagsp = &posflags; + char has_charset_modifier = '\0'; + regex_charset cs; + bool has_use_defaults = FALSE; + const char* const seqstart = RExC_parse - 1; /* Point to the '?' */ + + PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS; + + /* '^' as an initial flag sets certain defaults */ + if (UCHARAT(RExC_parse) == '^') { + RExC_parse++; + has_use_defaults = TRUE; + STD_PMMOD_FLAGS_CLEAR(&RExC_flags); + set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics) + ? REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET); + } + + cs = get_regex_charset(RExC_flags); + if (cs == REGEX_DEPENDS_CHARSET + && (RExC_utf8 || RExC_uni_semantics)) + { + cs = REGEX_UNICODE_CHARSET; + } + + while (*RExC_parse) { + /* && strchr("iogcmsx", *RExC_parse) */ + /* (?g), (?gc) and (?o) are useless here + and must be globally applied -- japhy */ + switch (*RExC_parse) { + + /* Code for the imsx flags */ + CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + + case LOCALE_PAT_MOD: + if (has_charset_modifier) { + goto excess_modifier; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + cs = REGEX_LOCALE_CHARSET; + has_charset_modifier = LOCALE_PAT_MOD; + break; + case UNICODE_PAT_MOD: + if (has_charset_modifier) { + goto excess_modifier; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + cs = REGEX_UNICODE_CHARSET; + has_charset_modifier = UNICODE_PAT_MOD; + break; + case ASCII_RESTRICT_PAT_MOD: + if (flagsp == &negflags) { + goto neg_modifier; + } + if (has_charset_modifier) { + if (cs != REGEX_ASCII_RESTRICTED_CHARSET) { + goto excess_modifier; + } + /* Doubled modifier implies more restricted */ + cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; + } + else { + cs = REGEX_ASCII_RESTRICTED_CHARSET; + } + has_charset_modifier = ASCII_RESTRICT_PAT_MOD; + break; + case DEPENDS_PAT_MOD: + if (has_use_defaults) { + goto fail_modifiers; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + else if (has_charset_modifier) { + goto excess_modifier; + } + + /* The dual charset means unicode semantics if the + * pattern (or target, not known until runtime) are + * utf8, or something in the pattern indicates unicode + * semantics */ + cs = (RExC_utf8 || RExC_uni_semantics) + ? REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET; + has_charset_modifier = DEPENDS_PAT_MOD; + break; + excess_modifier: + RExC_parse++; + if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { + vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); + } + else if (has_charset_modifier == *(RExC_parse - 1)) { + vFAIL2("Regexp modifier \"%c\" may not appear twice", + *(RExC_parse - 1)); + } + else { + vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); + } + /*NOTREACHED*/ + neg_modifier: + RExC_parse++; + vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", + *(RExC_parse - 1)); + /*NOTREACHED*/ + case ONCE_PAT_MOD: /* 'o' */ + case GLOBAL_PAT_MOD: /* 'g' */ + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + const I32 wflagbit = *RExC_parse == 'o' + ? WASTED_O + : WASTED_G; + if (! (wastedflags & wflagbit) ) { + wastedflags |= wflagbit; + /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ + vWARN5( + RExC_parse + 1, + "Useless (%s%c) - %suse /%c modifier", + flagsp == &negflags ? "?-" : "?", + *RExC_parse, + flagsp == &negflags ? "don't " : "", + *RExC_parse + ); + } + } + break; + + case CONTINUE_PAT_MOD: /* 'c' */ + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + if (! (wastedflags & WASTED_C) ) { + wastedflags |= WASTED_GC; + /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ + vWARN3( + RExC_parse + 1, + "Useless (%sc) - %suse /gc modifier", + flagsp == &negflags ? "?-" : "?", + flagsp == &negflags ? "don't " : "" + ); + } + } + break; + case KEEPCOPY_PAT_MOD: /* 'p' */ + if (flagsp == &negflags) { + if (SIZE_ONLY) + ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); + } else { + *flagsp |= RXf_PMf_KEEPCOPY; + } + break; + case '-': + /* A flag is a default iff it is following a minus, so + * if there is a minus, it means will be trying to + * re-specify a default which is an error */ + if (has_use_defaults || flagsp == &negflags) { + goto fail_modifiers; + } + flagsp = &negflags; + wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case ':': + case ')': + RExC_flags |= posflags; + RExC_flags &= ~negflags; + set_regex_charset(&RExC_flags, cs); + if (RExC_flags & RXf_PMf_FOLD) { + RExC_contains_i = 1; + } + return; + /*NOTREACHED*/ + default: + fail_modifiers: + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); + /*NOTREACHED*/ + } + + ++RExC_parse; + } +} + +/* + - reg - regular expression, i.e. main body or parenthesized thing + * + * Caller must absorb opening parenthesis. + * + * Combining parenthesis handling with the base level of regular expression + * is a trifle forced, but the need to tie the tails of the branches to what + * follows makes it hard to avoid. + */ +#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) +#ifdef DEBUGGING +#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) +#else +#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) +#endif + +/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets + flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan + needs to be restarted. + Otherwise would only return NULL if regbranch() returns NULL, which + cannot happen. */ +STATIC regnode * +S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) + /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. + * 2 is like 1, but indicates that nextchar() has been called to advance + * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and + * this flag alerts us to the need to check for that */ +{ + regnode *ret; /* Will be the head of the group. */ + regnode *br; + regnode *lastbr; + regnode *ender = NULL; + I32 parno = 0; + I32 flags; + U32 oregflags = RExC_flags; + bool have_branch = 0; + bool is_open = 0; + I32 freeze_paren = 0; + I32 after_freeze = 0; + I32 num; /* numeric backreferences */ + + char * parse_start = RExC_parse; /* MJD */ + char * const oregcomp_parse = RExC_parse; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG; + DEBUG_PARSE("reg "); + + *flagp = 0; /* Tentatively. */ + + + /* Make an OPEN node, if parenthesized. */ + if (paren) { + + /* Under /x, space and comments can be gobbled up between the '(' and + * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such + * intervening space, as the sequence is a token, and a token should be + * indivisible */ + bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '('; + + if ( *RExC_parse == '*') { /* (*VERB:ARG) */ + char *start_verb = RExC_parse; + STRLEN verb_len = 0; + char *start_arg = NULL; + unsigned char op = 0; + int argok = 1; + int internal_argval = 0; /* internal_argval is only useful if + !argok */ + + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); + } + while ( *RExC_parse && *RExC_parse != ')' ) { + if ( *RExC_parse == ':' ) { + start_arg = RExC_parse + 1; + break; + } + RExC_parse++; + } + ++start_verb; + verb_len = RExC_parse - start_verb; + if ( start_arg ) { + RExC_parse++; + while ( *RExC_parse && *RExC_parse != ')' ) + RExC_parse++; + if ( *RExC_parse != ')' ) + vFAIL("Unterminated verb pattern argument"); + if ( RExC_parse == start_arg ) + start_arg = NULL; + } else { + if ( *RExC_parse != ')' ) + vFAIL("Unterminated verb pattern"); + } + + switch ( *start_verb ) { + case 'A': /* (*ACCEPT) */ + if ( memEQs(start_verb,verb_len,"ACCEPT") ) { + op = ACCEPT; + internal_argval = RExC_nestroot; + } + break; + case 'C': /* (*COMMIT) */ + if ( memEQs(start_verb,verb_len,"COMMIT") ) + op = COMMIT; + break; + case 'F': /* (*FAIL) */ + if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { + op = OPFAIL; + argok = 0; + } + break; + case ':': /* (*:NAME) */ + case 'M': /* (*MARK:NAME) */ + if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { + op = MARKPOINT; + argok = -1; + } + break; + case 'P': /* (*PRUNE) */ + if ( memEQs(start_verb,verb_len,"PRUNE") ) + op = PRUNE; + break; + case 'S': /* (*SKIP) */ + if ( memEQs(start_verb,verb_len,"SKIP") ) + op = SKIP; + break; + case 'T': /* (*THEN) */ + /* [19:06] :: is then */ + if ( memEQs(start_verb,verb_len,"THEN") ) { + op = CUTGROUP; + RExC_seen |= REG_CUTGROUP_SEEN; + } + break; + } + if ( ! op ) { + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL2utf8f( + "Unknown verb pattern '%"UTF8f"'", + UTF8fARG(UTF, verb_len, start_verb)); + } + if ( argok ) { + if ( start_arg && internal_argval ) { + vFAIL3("Verb pattern '%.*s' may not have an argument", + verb_len, start_verb); + } else if ( argok < 0 && !start_arg ) { + vFAIL3("Verb pattern '%.*s' has a mandatory argument", + verb_len, start_verb); + } else { + ret = reganode(pRExC_state, op, internal_argval); + if ( ! internal_argval && ! SIZE_ONLY ) { + if (start_arg) { + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); + RExC_rxi->data->data[ARG(ret)]=(void*)sv; + ret->flags = 0; + } else { + ret->flags = 1; + } + } + } + if (!internal_argval) + RExC_seen |= REG_VERBARG_SEEN; + } else if ( start_arg ) { + vFAIL3("Verb pattern '%.*s' may not have an argument", + verb_len, start_verb); + } else { + ret = reg_node(pRExC_state, op); + } + nextchar(pRExC_state); + return ret; + } + else if (*RExC_parse == '?') { /* (?...) */ + bool is_logical = 0; + const char * const seqstart = RExC_parse; + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(?...)', the '(' and '?' must be adjacent"); + } + + RExC_parse++; + paren = *RExC_parse++; + ret = NULL; /* For look-ahead/behind. */ + switch (paren) { + + case 'P': /* (?P...) variants for those used to PCRE/Python */ + paren = *RExC_parse++; + if ( paren == '<') /* (?P<...>) named capture */ + goto named_capture; + else if (paren == '>') { /* (?P>name) named recursion */ + goto named_recursion; + } + else if (paren == '=') { /* (?P=...) named backref */ + /* this pretty much dupes the code for \k in + * regatom(), if you change this make sure you change that + * */ + char* name_start = RExC_parse; + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + if (RExC_parse == name_start || *RExC_parse != ')') + /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.3s... not terminated",parse_start); + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (ASCII_FOLD_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); + *flagp |= HASWIDTH; + + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + + nextchar(pRExC_state); + return ret; + } + RExC_parse++; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL3("Sequence (%.*s...) not recognized", + RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + case '<': /* (?<...) */ + if (*RExC_parse == '!') + paren = ','; + else if (*RExC_parse != '=') + named_capture: + { /* (?<...>) */ + char *name_start; + SV *svname; + paren= '>'; + case '\'': /* (?'...') */ + name_start= RExC_parse; + svname = reg_scan_name(pRExC_state, + SIZE_ONLY /* reverse test from the others */ + ? REG_RSN_RETURN_NAME + : REG_RSN_RETURN_NULL); + if (RExC_parse == name_start || *RExC_parse != paren) + vFAIL2("Sequence (?%c... not terminated", + paren=='>' ? '<' : paren); + if (SIZE_ONLY) { + HE *he_str; + SV *sv_dat = NULL; + if (!svname) /* shouldn't happen */ + Perl_croak(aTHX_ + "panic: reg_scan_name returned NULL"); + if (!RExC_paren_names) { + RExC_paren_names= newHV(); + sv_2mortal(MUTABLE_SV(RExC_paren_names)); +#ifdef DEBUGGING + RExC_paren_name_list= newAV(); + sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); +#endif + } + he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! sv_dat ) { + /* croak baby croak */ + Perl_croak(aTHX_ + "panic: paren_name hash element allocation failed"); + } else if ( SvPOK(sv_dat) ) { + /* (?|...) can mean we have dupes so scan to check + its already been stored. Maybe a flag indicating + we are inside such a construct would be useful, + but the arrays are likely to be quite small, so + for now we punt -- dmq */ + IV count = SvIV(sv_dat); + I32 *pv = (I32*)SvPVX(sv_dat); + IV i; + for ( i = 0 ; i < count ; i++ ) { + if ( pv[i] == RExC_npar ) { + count = 0; + break; + } + } + if ( count ) { + pv = (I32*)SvGROW(sv_dat, + SvCUR(sv_dat) + sizeof(I32)+1); + SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); + pv[count] = RExC_npar; + SvIV_set(sv_dat, SvIVX(sv_dat) + 1); + } + } else { + (void)SvUPGRADE(sv_dat,SVt_PVNV); + sv_setpvn(sv_dat, (char *)&(RExC_npar), + sizeof(I32)); + SvIOK_on(sv_dat); + SvIV_set(sv_dat, 1); + } +#ifdef DEBUGGING + /* Yes this does cause a memory leak in debugging Perls + * */ + if (!av_store(RExC_paren_name_list, + RExC_npar, SvREFCNT_inc(svname))) + SvREFCNT_dec_NN(svname); +#endif + + /*sv_dump(sv_dat);*/ + } + nextchar(pRExC_state); + paren = 1; + goto capturing_parens; + } + RExC_seen |= REG_LOOKBEHIND_SEEN; + RExC_in_lookbehind++; + RExC_parse++; + /* FALLTHROUGH */ + case '=': /* (?=...) */ + RExC_seen_zerolen++; + break; + case '!': /* (?!...) */ + RExC_seen_zerolen++; + if (*RExC_parse == ')') { + ret=reg_node(pRExC_state, OPFAIL); + nextchar(pRExC_state); + return ret; + } + break; + case '|': /* (?|...) */ + /* branch reset, behave like a (?:...) except that + buffers in alternations share the same numbers */ + paren = ':'; + after_freeze = freeze_paren = RExC_npar; + break; + case ':': /* (?:...) */ + case '>': /* (?>...) */ + break; + case '$': /* (?$...) */ + case '@': /* (?@...) */ + vFAIL2("Sequence (?%c...) not implemented", (int)paren); + break; + case '0' : /* (?0) */ + case 'R' : /* (?R) */ + if (*RExC_parse != ')') + FAIL("Sequence (?R) not terminated"); + ret = reg_node(pRExC_state, GOSTART); + RExC_seen |= REG_GOSTART_SEEN; + *flagp |= POSTPONED; + nextchar(pRExC_state); + return ret; + /*notreached*/ + /* named and numeric backreferences */ + case '&': /* (?&NAME) */ + parse_start = RExC_parse - 1; + named_recursion: + { + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + } + if (RExC_parse == RExC_end || *RExC_parse != ')') + vFAIL("Sequence (?&... not terminated"); + goto gen_recurse_regop; + assert(0); /* NOT REACHED */ + case '+': + if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { + RExC_parse++; + vFAIL("Illegal pattern"); + } + goto parse_recursion; + /* NOT REACHED*/ + case '-': /* (?-1) */ + if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { + RExC_parse--; /* rewind to let it be handled later */ + goto parse_flags; + } + /* FALLTHROUGH */ + case '1': case '2': case '3': case '4': /* (?1) */ + case '5': case '6': case '7': case '8': case '9': + RExC_parse--; + parse_recursion: + num = atoi(RExC_parse); + parse_start = RExC_parse - 1; /* MJD */ + if (*RExC_parse == '-') + RExC_parse++; + while (isDIGIT(*RExC_parse)) + RExC_parse++; + if (*RExC_parse!=')') + vFAIL("Expecting close bracket"); + + gen_recurse_regop: + if ( paren == '-' ) { + /* + Diagram of capture buffer numbering. + Top line is the normal capture buffer numbers + Bottom line is the negative indexing as from + the X (the (?-2)) + + + 1 2 3 4 5 X 6 7 + /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ + - 5 4 3 2 1 X x x + + */ + num = RExC_npar + num; + if (num < 1) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + } else if ( paren == '+' ) { + num = RExC_npar + num - 1; + } + + ret = reganode(pRExC_state, GOSUB, num); + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + ARG2L_SET( ret, RExC_recurse_count++); + RExC_emit++; + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Recurse #%"UVuf" to %"IVdf"\n", + (UV)ARG(ret), (IV)ARG2L(ret))); + } else { + RExC_size++; + } + RExC_seen |= REG_RECURSE_SEEN; + Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ + Set_Node_Offset(ret, parse_start); /* MJD */ + + *flagp |= POSTPONED; + nextchar(pRExC_state); + return ret; + + assert(0); /* NOT REACHED */ + + case '?': /* (??...) */ + is_logical = 1; + if (*RExC_parse != '{') { + RExC_parse++; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f( + "Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); + /*NOTREACHED*/ + } + *flagp |= POSTPONED; + paren = *RExC_parse++; + /* FALLTHROUGH */ + case '{': /* (?{...}) */ + { + U32 n = 0; + struct reg_code_block *cb; + + RExC_seen_zerolen++; + + if ( !pRExC_state->num_code_blocks + || pRExC_state->code_index >= pRExC_state->num_code_blocks + || pRExC_state->code_blocks[pRExC_state->code_index].start + != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) + - RExC_start) + ) { + if (RExC_pm_flags & PMf_USE_RE_EVAL) + FAIL("panic: Sequence (?{...}): no code block found\n"); + FAIL("Eval-group not allowed at runtime, use re 'eval'"); + } + /* this is a pre-compiled code block (?{...}) */ + cb = &pRExC_state->code_blocks[pRExC_state->code_index]; + RExC_parse = RExC_start + cb->end; + if (!SIZE_ONLY) { + OP *o = cb->block; + if (cb->src_regex) { + n = add_data(pRExC_state, STR_WITH_LEN("rl")); + RExC_rxi->data->data[n] = + (void*)SvREFCNT_inc((SV*)cb->src_regex); + RExC_rxi->data->data[n+1] = (void*)o; + } + else { + n = add_data(pRExC_state, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); + RExC_rxi->data->data[n] = (void*)o; + } + } + pRExC_state->code_index++; + nextchar(pRExC_state); + + if (is_logical) { + regnode *eval; + ret = reg_node(pRExC_state, LOGICAL); + eval = reganode(pRExC_state, EVAL, n); + if (!SIZE_ONLY) { + ret->flags = 2; + /* for later propagation into (??{}) return value */ + eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME); + } + REGTAIL(pRExC_state, ret, eval); + /* deal with the length of this later - MJD */ + return ret; + } + ret = reganode(pRExC_state, EVAL, n); + Set_Node_Length(ret, RExC_parse - parse_start + 1); + Set_Node_Offset(ret, parse_start); + return ret; + } + case '(': /* (?(?{...})...) and (?(?=...)...) */ + { + int is_define= 0; + if (RExC_parse[0] == '?') { /* (?(?...)) */ + if (RExC_parse[1] == '=' || RExC_parse[1] == '!' + || RExC_parse[1] == '<' + || RExC_parse[1] == '{') { /* Lookahead or eval. */ + I32 flag; + regnode *tail; + + ret = reg_node(pRExC_state, LOGICAL); + if (!SIZE_ONLY) + ret->flags = 1; + + tail = reg(pRExC_state, 1, &flag, depth+1); + if (flag & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + REGTAIL(pRExC_state, ret, tail); + goto insert_if; + } + } + else if ( RExC_parse[0] == '<' /* (?()...) */ + || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ + { + char ch = RExC_parse[0] == '<' ? '>' : '\''; + char *name_start= RExC_parse++; + U32 num = 0; + SV *sv_dat=reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + if (RExC_parse == name_start || *RExC_parse != ch) + vFAIL2("Sequence (?(%c... not terminated", + (ch == '>' ? '<' : ch)); + RExC_parse++; + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + ret = reganode(pRExC_state,NGROUPP,num); + goto insert_if_check_paren; + } + else if (RExC_parse[0] == 'D' && + RExC_parse[1] == 'E' && + RExC_parse[2] == 'F' && + RExC_parse[3] == 'I' && + RExC_parse[4] == 'N' && + RExC_parse[5] == 'E') + { + ret = reganode(pRExC_state,DEFINEP,0); + RExC_parse +=6 ; + is_define = 1; + goto insert_if_check_paren; + } + else if (RExC_parse[0] == 'R') { + RExC_parse++; + parno = 0; + if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + parno = atoi(RExC_parse++); + while (isDIGIT(*RExC_parse)) + RExC_parse++; + } else if (RExC_parse[0] == '&') { + SV *sv_dat; + RExC_parse++; + sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY + ? REG_RSN_RETURN_NULL + : REG_RSN_RETURN_DATA); + parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + } + ret = reganode(pRExC_state,INSUBP,parno); + goto insert_if_check_paren; + } + else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + /* (?(1)...) */ + char c; + char *tmp; + parno = atoi(RExC_parse++); + + while (isDIGIT(*RExC_parse)) + RExC_parse++; + ret = reganode(pRExC_state, GROUPP, parno); + + insert_if_check_paren: + if (*(tmp = nextchar(pRExC_state)) != ')') { + /* nextchar also skips comments, so undo its work + * and skip over the the next character. + */ + RExC_parse = tmp; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Switch condition not recognized"); + } + insert_if: + REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); + br = regbranch(pRExC_state, &flags, 1,depth+1); + if (br == NULL) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", + (UV) flags); + } else + REGTAIL(pRExC_state, br, reganode(pRExC_state, + LONGJMP, 0)); + c = *nextchar(pRExC_state); + if (flags&HASWIDTH) + *flagp |= HASWIDTH; + if (c == '|') { + if (is_define) + vFAIL("(?(DEFINE)....) does not allow branches"); + + /* Fake one for optimizer. */ + lastbr = reganode(pRExC_state, IFTHEN, 0); + + if (!regbranch(pRExC_state, &flags, 1,depth+1)) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", + (UV) flags); + } + REGTAIL(pRExC_state, ret, lastbr); + if (flags&HASWIDTH) + *flagp |= HASWIDTH; + c = *nextchar(pRExC_state); + } + else + lastbr = NULL; + if (c != ')') + vFAIL("Switch (?(condition)... contains too many branches"); + ender = reg_node(pRExC_state, TAIL); + REGTAIL(pRExC_state, br, ender); + if (lastbr) { + REGTAIL(pRExC_state, lastbr, ender); + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); + } + else + REGTAIL(pRExC_state, ret, ender); + RExC_size++; /* XXX WHY do we need this?!! + For large programs it seems to be required + but I can't figure out why. -- dmq*/ + return ret; + } + else { + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unknown switch condition (?(...))"); + } + } + case '[': /* (?[ ... ]) */ + return handle_regex_sets(pRExC_state, NULL, flagp, depth, + oregcomp_parse); + case 0: + RExC_parse--; /* for vFAIL to print correctly */ + vFAIL("Sequence (? incomplete"); + break; + default: /* e.g., (?i) */ + --RExC_parse; + parse_flags: + parse_lparen_question_flags(pRExC_state); + if (UCHARAT(RExC_parse) != ':') { + nextchar(pRExC_state); + *flagp = TRYAGAIN; + return NULL; + } + paren = ':'; + nextchar(pRExC_state); + ret = NULL; + goto parse_rest; + } /* end switch */ + } + else { /* (...) */ + capturing_parens: + parno = RExC_npar; + RExC_npar++; + + ret = reganode(pRExC_state, OPEN, parno); + if (!SIZE_ONLY ){ + if (!RExC_nestroot) + RExC_nestroot = parno; + if (RExC_seen & REG_RECURSE_SEEN + && !RExC_open_parens[parno-1]) + { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Setting open paren #%"IVdf" to %d\n", + (IV)parno, REG_NODE_NUM(ret))); + RExC_open_parens[parno-1]= ret; + } + } + Set_Node_Length(ret, 1); /* MJD */ + Set_Node_Offset(ret, RExC_parse); /* MJD */ + is_open = 1; + } + } + else /* ! paren */ + ret = NULL; + + parse_rest: + /* Pick up the branches, linking them together. */ + parse_start = RExC_parse; /* MJD */ + br = regbranch(pRExC_state, &flags, 1,depth+1); + + /* branch_len = (paren != 0); */ + + if (br == NULL) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + } + if (*RExC_parse == '|') { + if (!SIZE_ONLY && RExC_extralen) { + reginsert(pRExC_state, BRANCHJ, br, depth+1); + } + else { /* MJD */ + reginsert(pRExC_state, BRANCH, br, depth+1); + Set_Node_Length(br, paren != 0); + Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); + } + have_branch = 1; + if (SIZE_ONLY) + RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ + } + else if (paren == ':') { + *flagp |= flags&SIMPLE; + } + if (is_open) { /* Starts with OPEN. */ + REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */ + } + else if (paren != '?') /* Not Conditional */ + ret = br; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); + lastbr = br; + while (*RExC_parse == '|') { + if (!SIZE_ONLY && RExC_extralen) { + ender = reganode(pRExC_state, LONGJMP,0); + + /* Append to the previous. */ + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); + } + if (SIZE_ONLY) + RExC_extralen += 2; /* Account for LONGJMP. */ + nextchar(pRExC_state); + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } + br = regbranch(pRExC_state, &flags, 0, depth+1); + + if (br == NULL) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + } + REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ + lastbr = br; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); + } + + if (have_branch || paren != ':') { + /* Make a closing node, and hook it on the end. */ + switch (paren) { + case ':': + ender = reg_node(pRExC_state, TAIL); + break; + case 1: case 2: + ender = reganode(pRExC_state, CLOSE, parno); + if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Setting close paren #%"IVdf" to %d\n", + (IV)parno, REG_NODE_NUM(ender))); + RExC_close_parens[parno-1]= ender; + if (RExC_nestroot == parno) + RExC_nestroot = 0; + } + Set_Node_Offset(ender,RExC_parse+1); /* MJD */ + Set_Node_Length(ender,1); /* MJD */ + break; + case '<': + case ',': + case '=': + case '!': + *flagp &= ~HASWIDTH; + /* FALLTHROUGH */ + case '>': + ender = reg_node(pRExC_state, SUCCEED); + break; + case 0: + ender = reg_node(pRExC_state, END); + if (!SIZE_ONLY) { + assert(!RExC_opend); /* there can only be one! */ + RExC_opend = ender; + } + break; + } + DEBUG_PARSE_r(if (!SIZE_ONLY) { + SV * const mysv_val1=sv_newmortal(); + SV * const mysv_val2=sv_newmortal(); + DEBUG_PARSE_MSG("lsbr"); + regprop(RExC_rx, mysv_val1, lastbr, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); + PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val1), + (IV)REG_NODE_NUM(lastbr), + SvPV_nolen_const(mysv_val2), + (IV)REG_NODE_NUM(ender), + (IV)(ender - lastbr) + ); + }); + REGTAIL(pRExC_state, lastbr, ender); + + if (have_branch && !SIZE_ONLY) { + char is_nothing= 1; + if (depth==1) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; + + /* Hook the tails of the branches to the closing node. */ + for (br = ret; br; br = regnext(br)) { + const U8 op = PL_regkind[OP(br)]; + if (op == BRANCH) { + REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); + if ( OP(NEXTOPER(br)) != NOTHING + || regnext(NEXTOPER(br)) != ender) + is_nothing= 0; + } + else if (op == BRANCHJ) { + REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); + /* for now we always disable this optimisation * / + if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING + || regnext(NEXTOPER(NEXTOPER(br))) != ender) + */ + is_nothing= 0; + } + } + if (is_nothing) { + br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret; + DEBUG_PARSE_r(if (!SIZE_ONLY) { + SV * const mysv_val1=sv_newmortal(); + SV * const mysv_val2=sv_newmortal(); + DEBUG_PARSE_MSG("NADA"); + regprop(RExC_rx, mysv_val1, ret, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); + PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val1), + (IV)REG_NODE_NUM(ret), + SvPV_nolen_const(mysv_val2), + (IV)REG_NODE_NUM(ender), + (IV)(ender - ret) + ); + }); + OP(br)= NOTHING; + if (OP(ender) == TAIL) { + NEXT_OFF(br)= 0; + RExC_emit= br + 1; + } else { + regnode *opt; + for ( opt= br + 1; opt < ender ; opt++ ) + OP(opt)= OPTIMIZED; + NEXT_OFF(br)= ender - br; + } + } + } + } + + { + const char *p; + static const char parens[] = "=!<,>"; + + if (paren && (p = strchr(parens, paren))) { + U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; + int flag = (p - parens) > 1; + + if (paren == '>') + node = SUSPEND, flag = 0; + reginsert(pRExC_state, node,ret, depth+1); + Set_Node_Cur_Length(ret, parse_start); + Set_Node_Offset(ret, parse_start + 1); + ret->flags = flag; + REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); + } + } + + /* Check for proper termination. */ + if (paren) { + /* restore original flags, but keep (?p) */ + RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); + if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ("); + } + } + else if (!paren && RExC_parse < RExC_end) { + if (*RExC_parse == ')') { + RExC_parse++; + vFAIL("Unmatched )"); + } + else + FAIL("Junk on end of regexp"); /* "Can't happen". */ + assert(0); /* NOTREACHED */ + } + + if (RExC_in_lookbehind) { + RExC_in_lookbehind--; + } + if (after_freeze > RExC_npar) + RExC_npar = after_freeze; + return(ret); +} + +/* + - regbranch - one alternative of an | operator + * + * Implements the concatenation operator. + * + * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be + * restarted. + */ +STATIC regnode * +S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) +{ + regnode *ret; + regnode *chain = NULL; + regnode *latest; + I32 flags = 0, c = 0; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGBRANCH; + + DEBUG_PARSE("brnc"); + + if (first) + ret = NULL; + else { + if (!SIZE_ONLY && RExC_extralen) + ret = reganode(pRExC_state, BRANCHJ,0); + else { + ret = reg_node(pRExC_state, BRANCH); + Set_Node_Length(ret, 1); + } + } + + if (!first && SIZE_ONLY) + RExC_extralen += 1; /* BRANCHJ */ + + *flagp = WORST; /* Tentatively. */ + + RExC_parse--; + nextchar(pRExC_state); + while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { + flags &= ~TRYAGAIN; + latest = regpiece(pRExC_state, &flags,depth+1); + if (latest == NULL) { + if (flags & TRYAGAIN) + continue; + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); + } + else if (ret == NULL) + ret = latest; + *flagp |= flags&(HASWIDTH|POSTPONED); + if (chain == NULL) /* First piece. */ + *flagp |= flags&SPSTART; + else { + RExC_naughty++; + REGTAIL(pRExC_state, chain, latest); + } + chain = latest; + c++; + } + if (chain == NULL) { /* Loop ran zero times. */ + chain = reg_node(pRExC_state, NOTHING); + if (ret == NULL) + ret = chain; + } + if (c == 1) { + *flagp |= flags&SIMPLE; + } + + return ret; +} + +/* + - regpiece - something followed by possible [*+?] + * + * Note that the branching code sequences used for ? and the general cases + * of * and + are somewhat optimized: they use the same NOTHING node as + * both the endmarker for their branch list and the body of the last branch. + * It might seem that this node could be dispensed with entirely, but the + * endmarker role is not redundant. + * + * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with + * TRYAGAIN. + * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be + * restarted. + */ +STATIC regnode * +S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) +{ + regnode *ret; + char op; + char *next; + I32 flags; + const char * const origparse = RExC_parse; + I32 min; + I32 max = REG_INFTY; +#ifdef RE_TRACK_PATTERN_OFFSETS + char *parse_start; +#endif + const char *maxpos = NULL; + + /* Save the original in case we change the emitted regop to a FAIL. */ + regnode * const orig_emit = RExC_emit; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPIECE; + + DEBUG_PARSE("piec"); + + ret = regatom(pRExC_state, &flags,depth+1); + if (ret == NULL) { + if (flags & (TRYAGAIN|RESTART_UTF8)) + *flagp |= flags & (TRYAGAIN|RESTART_UTF8); + else + FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); + return(NULL); + } + + op = *RExC_parse; + + if (op == '{' && regcurly(RExC_parse)) { + maxpos = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + parse_start = RExC_parse; /* MJD */ +#endif + next = RExC_parse + 1; + while (isDIGIT(*next) || *next == ',') { + if (*next == ',') { + if (maxpos) + break; + else + maxpos = next; + } + next++; + } + if (*next == '}') { /* got one */ + if (!maxpos) + maxpos = next; + RExC_parse++; + min = atoi(RExC_parse); + if (*maxpos == ',') + maxpos++; + else + maxpos = RExC_parse; + max = atoi(maxpos); + if (!max && *maxpos != '0') + max = REG_INFTY; /* meaning "infinity" */ + else if (max >= REG_INFTY) + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + RExC_parse = next; + nextchar(pRExC_state); + if (max < min) { /* If can't match, warn and optimize to fail + unconditionally */ + if (SIZE_ONLY) { + ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); + + /* We can't back off the size because we have to reserve + * enough space for all the things we are about to throw + * away, but we can shrink it by the ammount we are about + * to re-use here */ + RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; + } + else { + RExC_emit = orig_emit; + } + ret = reg_node(pRExC_state, OPFAIL); + return ret; + } + else if (min == max + && RExC_parse < RExC_end + && (*RExC_parse == '?' || *RExC_parse == '+')) + { + if (SIZE_ONLY) { + ckWARN2reg(RExC_parse + 1, + "Useless use of greediness modifier '%c'", + *RExC_parse); + } + /* Absorb the modifier, so later code doesn't see nor use + * it */ + nextchar(pRExC_state); + } + + do_curly: + if ((flags&SIMPLE)) { + RExC_naughty += 2 + RExC_naughty / 2; + reginsert(pRExC_state, CURLY, ret, depth+1); + Set_Node_Offset(ret, parse_start+1); /* MJD */ + Set_Node_Cur_Length(ret, parse_start); + } + else { + regnode * const w = reg_node(pRExC_state, WHILEM); + + w->flags = 0; + REGTAIL(pRExC_state, ret, w); + if (!SIZE_ONLY && RExC_extralen) { + reginsert(pRExC_state, LONGJMP,ret, depth+1); + reginsert(pRExC_state, NOTHING,ret, depth+1); + NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ + } + reginsert(pRExC_state, CURLYX,ret, depth+1); + /* MJD hk */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Length(ret, + op == '{' ? (RExC_parse - parse_start) : 1); + + if (!SIZE_ONLY && RExC_extralen) + NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ + REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); + if (SIZE_ONLY) + RExC_whilem_seen++, RExC_extralen += 3; + RExC_naughty += 4 + RExC_naughty; /* compound interest */ + } + ret->flags = 0; + + if (min > 0) + *flagp = WORST; + if (max > 0) + *flagp |= HASWIDTH; + if (!SIZE_ONLY) { + ARG1_SET(ret, (U16)min); + ARG2_SET(ret, (U16)max); + } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + + goto nest_check; + } + } + + if (!ISMULT1(op)) { + *flagp = flags; + return(ret); + } + +#if 0 /* Now runtime fix should be reliable. */ + + /* if this is reinstated, don't forget to put this back into perldiag: + + =item Regexp *+ operand could be empty at {#} in regex m/%s/ + + (F) The part of the regexp subject to either the * or + quantifier + could match an empty string. The {#} shows in the regular + expression about where the problem was discovered. + + */ + + if (!(flags&HASWIDTH) && op != '?') + vFAIL("Regexp *+ operand could be empty"); +#endif + +#ifdef RE_TRACK_PATTERN_OFFSETS + parse_start = RExC_parse; +#endif + nextchar(pRExC_state); + + *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); + + if (op == '*' && (flags&SIMPLE)) { + reginsert(pRExC_state, STAR, ret, depth+1); + ret->flags = 0; + RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + } + else if (op == '*') { + min = 0; + goto do_curly; + } + else if (op == '+' && (flags&SIMPLE)) { + reginsert(pRExC_state, PLUS, ret, depth+1); + ret->flags = 0; + RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + } + else if (op == '+') { + min = 1; + goto do_curly; + } + else if (op == '?') { + min = 0; max = 1; + goto do_curly; + } + nest_check: + if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { + SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ + ckWARN2reg(RExC_parse, + "%"UTF8f" matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); + (void)ReREFCNT_inc(RExC_rx_sv); + } + + if (RExC_parse < RExC_end && *RExC_parse == '?') { + nextchar(pRExC_state); + reginsert(pRExC_state, MINMOD, ret, depth+1); + REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); + } + else + if (RExC_parse < RExC_end && *RExC_parse == '+') { + regnode *ender; + nextchar(pRExC_state); + ender = reg_node(pRExC_state, SUCCEED); + REGTAIL(pRExC_state, ret, ender); + reginsert(pRExC_state, SUSPEND, ret, depth+1); + ret->flags = 0; + ender = reg_node(pRExC_state, TAIL); + REGTAIL(pRExC_state, ret, ender); + } + + if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { + RExC_parse++; + vFAIL("Nested quantifiers"); + } + + return(ret); +} + +STATIC bool +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, + UV *valuep, I32 *flagp, U32 depth, bool in_char_class, + const bool strict /* Apply stricter parsing rules? */ + ) +{ + + /* This is expected to be called by a parser routine that has recognized '\N' + and needs to handle the rest. RExC_parse is expected to point at the first + char following the N at the time of the call. On successful return, + RExC_parse has been updated to point to just after the sequence identified + by this routine, and <*flagp> has been updated. + + The \N may be inside (indicated by the boolean ) or outside a + character class. + + \N may begin either a named sequence, or if outside a character class, mean + to match a non-newline. For non single-quoted regexes, the tokenizer has + attempted to decide which, and in the case of a named sequence, converted it + into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, + where c1... are the characters in the sequence. For single-quoted regexes, + the tokenizer passes the \N sequence through unchanged; this code will not + attempt to determine this nor expand those, instead raising a syntax error. + The net effect is that if the beginning of the passed-in pattern isn't '{U+' + or there is no '}', it signals that this \N occurrence means to match a + non-newline. + + Only the \N{U+...} form should occur in a character class, for the same + reason that '.' inside a character class means to just match a period: it + just doesn't make sense. + + The function raises an error (via vFAIL), and doesn't return for various + syntax errors. Otherwise it returns TRUE and sets or on + success; it returns FALSE otherwise. Returns FALSE, setting *flagp to + RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is + only possible if node_p is non-NULL. + + + If is non-null, it means the caller can accept an input sequence + consisting of a just a single code point; <*valuep> is set to that value + if the input is such. + + If is non-null it signifies that the caller can accept any other + legal sequence (i.e., one that isn't just a single code point). <*node_p> + is set as follows: + 1) \N means not-a-NL: points to a newly created REG_ANY node; + 2) \N{}: points to a new NOTHING node; + 3) otherwise: points to a new EXACT node containing the resolved + string. + Note that FALSE is returned for single code point sequences if is + null. + */ + + char * endbrace; /* '}' following the name */ + char* p; + char *endchar; /* Points to '.' or '}' ending cur char in the input + stream */ + bool has_multiple_chars; /* true if the input stream contains a sequence of + more than one character */ + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_GROK_BSLASH_N; + + GET_RE_DEBUG_FLAGS; + + assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ + + /* The [^\n] meaning of \N ignores spaces and comments under the /x + * modifier. The other meaning does not, so use a temporary until we find + * out which we are being called with */ + p = (RExC_flags & RXf_PMf_EXTENDED) + ? regpatws(pRExC_state, RExC_parse, + TRUE) /* means recognize comments */ + : RExC_parse; + + /* Disambiguate between \N meaning a named character versus \N meaning + * [^\n]. The former is assumed when it can't be the latter. */ + if (*p != '{' || regcurly(p)) { + RExC_parse = p; + if (! node_p) { + /* no bare \N allowed in a charclass */ + if (in_char_class) { + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + return FALSE; + } + RExC_parse--; /* Need to back off so nextchar() doesn't skip the + current char */ + nextchar(pRExC_state); + *node_p = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + Set_Node_Length(*node_p, 1); /* MJD */ + return TRUE; + } + + /* Here, we have decided it should be a named character or sequence */ + + /* The test above made sure that the next real character is a '{', but + * under the /x modifier, it could be separated by space (or a comment and + * \n) and this is not allowed (for consistency with \x{...} and the + * tokenizer handling of \N{NAME}). */ + if (*RExC_parse != '{') { + vFAIL("Missing braces on \\N{}"); + } + + RExC_parse++; /* Skip past the '{' */ + + if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ + || ! (endbrace == RExC_parse /* nothing between the {} */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below + */ + && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) + */ + { + if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ + vFAIL("\\N{NAME} must be resolved by the lexer"); + } + + if (endbrace == RExC_parse) { /* empty: \N{} */ + bool ret = TRUE; + if (node_p) { + *node_p = reg_node(pRExC_state,NOTHING); + } + else if (in_char_class) { + if (SIZE_ONLY && in_char_class) { + if (strict) { + RExC_parse++; /* Position after the "}" */ + vFAIL("Zero length \\N{}"); + } + else { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class"); + } + } + ret = FALSE; + } + else { + return FALSE; + } + nextchar(pRExC_state); + return ret; + } + + RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ + RExC_parse += 2; /* Skip past the 'U+' */ + + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + + /* Code points are separated by dots. If none, there is only one code + * point, and is terminated by the brace */ + has_multiple_chars = (endchar < endbrace); + + if (valuep && (! has_multiple_chars || in_char_class)) { + /* We only pay attention to the first char of + multichar strings being returned in char classes. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. XXX Solution is to recharacterize as + [rest-of-class]|multi1|multi2... */ + + STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); + I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); + + *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); + + /* The tokenizer should have guaranteed validity, but it's possible to + * bypass it by using single quoting, so check */ + if (length_of_hex == 0 + || length_of_hex != (STRLEN)(endchar - RExC_parse) ) + { + RExC_parse += length_of_hex; /* Includes all the valid */ + RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ + ? UTF8SKIP(RExC_parse) + : 1; + /* Guard against malformed utf8 */ + if (RExC_parse >= endchar) { + RExC_parse = endchar; + } + vFAIL("Invalid hexadecimal number in \\N{U+...}"); + } + + if (in_char_class && has_multiple_chars) { + if (strict) { + RExC_parse = endbrace; + vFAIL("\\N{} in character class restricted to one character"); + } + else { + ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + } + } + + RExC_parse = endbrace + 1; + } + else if (! node_p || ! has_multiple_chars) { + + /* Here, the input is legal, but not according to the caller's + * options. We fail without advancing the parse, so that the + * caller can try again */ + RExC_parse = p; + return FALSE; + } + else { + + /* What is done here is to convert this to a sub-pattern of the form + * (?:\x{char1}\x{char2}...) + * and then call reg recursively. That way, it retains its atomicness, + * while not having to worry about special handling that some code + * points may have. toke.c has converted the original Unicode values + * to native, so that we can just pass on the hex values unchanged. We + * do have to set a flag to keep recoding from happening in the + * recursion */ + + SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); + STRLEN len; + char *orig_end = RExC_end; + I32 flags; + + while (RExC_parse < endbrace) { + + /* Convert to notation the rest of the code understands */ + sv_catpv(substitute_parse, "\\x{"); + sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); + sv_catpv(substitute_parse, "}"); + + /* Point to the beginning of the next character in the sequence. */ + RExC_parse = endchar + 1; + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + } + sv_catpv(substitute_parse, ")"); + + RExC_parse = SvPV(substitute_parse, len); + + /* Don't allow empty number */ + if (len < 8) { + vFAIL("Invalid hexadecimal number in \\N{U+...}"); + } + RExC_end = RExC_parse + len; + + /* The values are Unicode, and therefore not subject to recoding */ + RExC_override_recoding = 1; + + if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return FALSE; + } + FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", + (UV) flags); + } + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); + + RExC_parse = endbrace; + RExC_end = orig_end; + RExC_override_recoding = 0; + + nextchar(pRExC_state); + } + + return TRUE; +} + + +/* + * reg_recode + * + * It returns the code point in utf8 for the value in *encp. + * value: a code value in the source encoding + * encp: a pointer to an Encode object + * + * If the result from Encode is not a single character, + * it returns U+FFFD (Replacement character) and sets *encp to NULL. + */ +STATIC UV +S_reg_recode(pTHX_ const char value, SV **encp) +{ + STRLEN numlen = 1; + SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); + const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); + const STRLEN newlen = SvCUR(sv); + UV uv = UNICODE_REPLACEMENT; + + PERL_ARGS_ASSERT_REG_RECODE; + + if (newlen) + uv = SvUTF8(sv) + ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) + : *(U8*)s; + + if (!newlen || numlen != newlen) { + uv = UNICODE_REPLACEMENT; + *encp = NULL; + } + return uv; +} + +PERL_STATIC_INLINE U8 +S_compute_EXACTish(RExC_state_t *pRExC_state) +{ + U8 op; + + PERL_ARGS_ASSERT_COMPUTE_EXACTISH; + + if (! FOLD) { + return EXACT; + } + + op = get_regex_charset(RExC_flags); + if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { + op--; /* /a is same as /u, and map /aa's offset to what /a's would have + been, so there is no hole */ + } + + return op + EXACTF; +} + +PERL_STATIC_INLINE void +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, + regnode *node, I32* flagp, STRLEN len, UV code_point, + bool downgradable) +{ + /* This knows the details about sizing an EXACTish node, setting flags for + * it (by setting <*flagp>, and potentially populating it with a single + * character. + * + * If (the length in bytes) is non-zero, this function assumes that + * the node has already been populated, and just does the sizing. In this + * case should be the final code point that has already been + * placed into the node. This value will be ignored except that under some + * circumstances <*flagp> is set based on it. + * + * If is zero, the function assumes that the node is to contain only + * the single character given by and calculates what + * should be. In pass 1, it sizes the node appropriately. In pass 2, it + * additionally will populate the node's STRING with or its + * fold if folding. + * + * In both cases <*flagp> is appropriately set + * + * It knows that under FOLD, the Latin Sharp S and UTF characters above + * 255, must be folded (the former only when the rules indicate it can + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ + + bool len_passed_in = cBOOL(len != 0); + U8 character[UTF8_MAXBYTES_CASE+1]; + + PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + + /* Don't bother to check for downgrading in PASS1, as it doesn't make any + * sizing difference, and is extra work that is thrown away */ + if (downgradable && ! PASS2) { + downgradable = FALSE; + } + + if (! len_passed_in) { + if (UTF) { + if (UNI_IS_INVARIANT(code_point)) { + if (LOC || ! FOLD) { /* /l defers folding until runtime */ + *character = (U8) code_point; + } + else { /* Here is /i and not /l (toFOLD() is defined on just + ASCII, which isn't the same thing as INVARIANT on + EBCDIC, but it works there, as the extra invariants + fold to themselves) */ + *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } + } + len = 1; + } + else if (FOLD && (! LOC + || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) + { /* Folding, and ok to do so now */ + UV folded = _to_uni_fold_flags( + code_point, + character, + &len, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } + } + else if (code_point <= MAX_UTF8_TWO_BYTE) { + + /* Not folding this cp, and can output it directly */ + *character = UTF8_TWO_BYTE_HI(code_point); + *(character + 1) = UTF8_TWO_BYTE_LO(code_point); + len = 2; + } + else { + uvchr_to_utf8( character, code_point); + len = UTF8SKIP(character); + } + } /* Else pattern isn't UTF8. */ + else if (! FOLD) { + *character = (U8) code_point; + len = 1; + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { + *character = 's'; + *(character + 1) = 's'; + len = 2; + } + else { + *character = LATIN_SMALL_LETTER_SHARP_S; + len = 1; + } + } + + if (SIZE_ONLY) { + RExC_size += STR_SZ(len); + } + else { + RExC_emit += STR_SZ(len); + STR_LEN(node) = len; + if (! len_passed_in) { + Copy((char *) character, STRING(node), len, char); + } + } + + *flagp |= HASWIDTH; + + /* A single character node is SIMPLE, except for the special-cased SHARP S + * under /di. */ + if ((len == 1 || (UTF && len == UNISKIP(code_point))) + && (code_point != LATIN_SMALL_LETTER_SHARP_S + || ! FOLD || ! DEPENDS_SEMANTICS)) + { + *flagp |= SIMPLE; + } + + /* The OP may not be well defined in PASS1 */ + if (PASS2 && OP(node) == EXACTFL) { + RExC_contains_locale = 1; + } +} + + +/* return atoi(p), unless it's too big to sensibly be a backref, + * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ + +static I32 +S_backref_value(char *p) +{ + char *q = p; + + for (;isDIGIT(*q); q++) {} /* calculate length of num */ + if (q - p == 0 || q - p > 9) + return I32_MAX; + return atoi(p); +} + + +/* + - regatom - the lowest level + + Try to identify anything special at the start of the pattern. If there + is, then handle it as required. This may involve generating a single regop, + such as for an assertion; or it may involve recursing, such as to + handle a () structure. + + If the string doesn't start with something special then we gobble up + as much literal text as we can. + + Once we have been able to handle whatever type of thing started the + sequence, we return. + + Note: we have to be careful with escapes, as they can be both literal + and special, and in the case of \10 and friends, context determines which. + + A summary of the code structure is: + + switch (first_byte) { + cases for each special: + handle this special; + break; + case '\\': + switch (2nd byte) { + cases for each unambiguous special: + handle this special; + break; + cases for each ambigous special/literal: + disambiguate; + if (special) handle here + else goto defchar; + default: // unambiguously literal: + goto defchar; + } + default: // is a literal char + // FALL THROUGH + defchar: + create EXACTish node for literal; + while (more input and node isn't full) { + switch (input_byte) { + cases for each special; + make sure parse pointer is set so that the next call to + regatom will see this special first + goto loopdone; // EXACTish node terminated by prev. char + default: + append char to EXACTISH node; + } + get next input byte; + } + loopdone: + } + return the generated node; + + Specifically there are two separate switches for handling + escape sequences, with the one for handling literal escapes requiring + a dummy entry for all of the special escapes that are actually handled + by the other. + + Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with + TRYAGAIN. + Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be + restarted. + Otherwise does not return NULL. +*/ + +STATIC regnode * +S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) +{ + regnode *ret = NULL; + I32 flags = 0; + char *parse_start = RExC_parse; + U8 op; + int invert = 0; + U8 arg; + + GET_RE_DEBUG_FLAGS_DECL; + + *flagp = WORST; /* Tentatively. */ + + DEBUG_PARSE("atom"); + + PERL_ARGS_ASSERT_REGATOM; + +tryagain: + switch ((U8)*RExC_parse) { + case '^': + RExC_seen_zerolen++; + nextchar(pRExC_state); + if (RExC_flags & RXf_PMf_MULTILINE) + ret = reg_node(pRExC_state, MBOL); + else if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SBOL); + else + ret = reg_node(pRExC_state, BOL); + Set_Node_Length(ret, 1); /* MJD */ + break; + case '$': + nextchar(pRExC_state); + if (*RExC_parse) + RExC_seen_zerolen++; + if (RExC_flags & RXf_PMf_MULTILINE) + ret = reg_node(pRExC_state, MEOL); + else if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SEOL); + else + ret = reg_node(pRExC_state, EOL); + Set_Node_Length(ret, 1); /* MJD */ + break; + case '.': + nextchar(pRExC_state); + if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SANY); + else + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + Set_Node_Length(ret, 1); /* MJD */ + break; + case '[': + { + char * const oregcomp_parse = ++RExC_parse; + ret = regclass(pRExC_state, flagp,depth+1, + FALSE, /* means parse the whole char class */ + TRUE, /* allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. */ + NULL); + if (*RExC_parse != ']') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ["); + } + if (ret == NULL) { + if (*flagp & RESTART_UTF8) + return NULL; + FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", + (UV) *flagp); + } + nextchar(pRExC_state); + Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ + break; + } + case '(': + nextchar(pRExC_state); + ret = reg(pRExC_state, 2, &flags,depth+1); + if (ret == NULL) { + if (flags & TRYAGAIN) { + if (RExC_parse == RExC_end) { + /* Make parent create an empty node if needed. */ + *flagp |= TRYAGAIN; + return(NULL); + } + goto tryagain; + } + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", + (UV) flags); + } + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); + break; + case '|': + case ')': + if (flags & TRYAGAIN) { + *flagp |= TRYAGAIN; + return NULL; + } + vFAIL("Internal urp"); + /* Supposed to be caught earlier. */ + break; + case '?': + case '+': + case '*': + RExC_parse++; + vFAIL("Quantifier follows nothing"); + break; + case '\\': + /* Special Escapes + + This switch handles escape sequences that resolve to some kind + of special regop and not to literal text. Escape sequnces that + resolve to literal text are handled below in the switch marked + "Literal Escapes". + + Every entry in this switch *must* have a corresponding entry + in the literal escape switch. However, the opposite is not + required, as the default for this switch is to jump to the + literal text handling code. + */ + switch ((U8)*++RExC_parse) { + /* Special Escapes */ + case 'A': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, SBOL); + *flagp |= SIMPLE; + goto finish_meta_pat; + case 'G': + ret = reg_node(pRExC_state, GPOS); + RExC_seen |= REG_GPOS_SEEN; + *flagp |= SIMPLE; + goto finish_meta_pat; + case 'K': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, KEEPS); + *flagp |= SIMPLE; + /* XXX:dmq : disabling in-place substitution seems to + * be necessary here to avoid cases of memory corruption, as + * with: C<$_="x" x 80; s/x\K/y/> -- rgs + */ + RExC_seen |= REG_LOOKBEHIND_SEEN; + goto finish_meta_pat; + case 'Z': + ret = reg_node(pRExC_state, SEOL); + *flagp |= SIMPLE; + RExC_seen_zerolen++; /* Do not optimize RE away */ + goto finish_meta_pat; + case 'z': + ret = reg_node(pRExC_state, EOS); + *flagp |= SIMPLE; + RExC_seen_zerolen++; /* Do not optimize RE away */ + goto finish_meta_pat; + case 'C': + ret = reg_node(pRExC_state, CANY); + RExC_seen |= REG_CANY_SEEN; + *flagp |= HASWIDTH|SIMPLE; + if (SIZE_ONLY) { + ckWARNdep(RExC_parse+1, "\\C is deprecated"); + } + goto finish_meta_pat; + case 'X': + ret = reg_node(pRExC_state, CLUMP); + *flagp |= HASWIDTH; + goto finish_meta_pat; + + case 'W': + invert = 1; + /* FALLTHROUGH */ + case 'w': + arg = ANYOF_WORDCHAR; + goto join_posix; + + case 'b': + RExC_seen_zerolen++; + RExC_seen |= REG_LOOKBEHIND_SEEN; + op = BOUND + get_regex_charset(RExC_flags); + if (op > BOUNDA) { /* /aa is same as /a */ + op = BOUNDA; + } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } + ret = reg_node(pRExC_state, op); + FLAGS(ret) = get_regex_charset(RExC_flags); + *flagp |= SIMPLE; + if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); + } + goto finish_meta_pat; + case 'B': + RExC_seen_zerolen++; + RExC_seen |= REG_LOOKBEHIND_SEEN; + op = NBOUND + get_regex_charset(RExC_flags); + if (op > NBOUNDA) { /* /aa is same as /a */ + op = NBOUNDA; + } + else if (op == NBOUNDL) { + RExC_contains_locale = 1; + } + ret = reg_node(pRExC_state, op); + FLAGS(ret) = get_regex_charset(RExC_flags); + *flagp |= SIMPLE; + if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); + } + goto finish_meta_pat; + + case 'D': + invert = 1; + /* FALLTHROUGH */ + case 'd': + arg = ANYOF_DIGIT; + goto join_posix; + + case 'R': + ret = reg_node(pRExC_state, LNBREAK); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + + case 'H': + invert = 1; + /* FALLTHROUGH */ + case 'h': + arg = ANYOF_BLANK; + op = POSIXU; + goto join_posix_op_known; + + case 'V': + invert = 1; + /* FALLTHROUGH */ + case 'v': + arg = ANYOF_VERTWS; + op = POSIXU; + goto join_posix_op_known; + + case 'S': + invert = 1; + /* FALLTHROUGH */ + case 's': + arg = ANYOF_SPACE; + + join_posix: + + op = POSIXD + get_regex_charset(RExC_flags); + if (op > POSIXA) { /* /aa is same as /a */ + op = POSIXA; + } + else if (op == POSIXL) { + RExC_contains_locale = 1; + } + + join_posix_op_known: + + if (invert) { + op += NPOSIXD - POSIXD; + } + + ret = reg_node(pRExC_state, op); + if (! SIZE_ONLY) { + FLAGS(ret) = namedclass_to_classnum(arg); + } + + *flagp |= HASWIDTH|SIMPLE; + /* FALLTHROUGH */ + + finish_meta_pat: + nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ + break; + case 'p': + case 'P': + { +#ifdef DEBUGGING + char* parse_start = RExC_parse - 2; +#endif + + RExC_parse--; + + ret = regclass(pRExC_state, flagp,depth+1, + TRUE, /* means just parse this element */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. + It would be a bug if these returned + non-portables */ + NULL); + /* regclass() can only return RESTART_UTF8 if multi-char folds + are allowed. */ + if (!ret) + FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", + (UV) *flagp); + + RExC_parse--; + + Set_Node_Offset(ret, parse_start + 2); + Set_Node_Cur_Length(ret, parse_start); + nextchar(pRExC_state); + } + break; + case 'N': + /* Handle \N and \N{NAME} with multiple code points here and not + * below because it can be multicharacter. join_exact() will join + * them up later on. Also this makes sure that things like + * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq. + * The options to the grok function call causes it to fail if the + * sequence is just a single code point. We then go treat it as + * just another character in the current EXACT node, and hence it + * gets uniform treatment with all the other characters. The + * special treatment for quantifiers is not needed for such single + * character sequences */ + ++RExC_parse; + if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE, + FALSE /* not strict */ )) { + if (*flagp & RESTART_UTF8) + return NULL; + RExC_parse--; + goto defchar; + } + break; + case 'k': /* Handle \k and \k'NAME' */ + parse_named_seq: + { + char ch= RExC_parse[1]; + if (ch != '<' && ch != '\'' && ch != '{') { + RExC_parse++; + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.2s... not terminated",parse_start); + } else { + /* this pretty much dupes the code for (?P=...) in reg(), if + you change this make sure you change that */ + char* name_start = (RExC_parse += 2); + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; + if (RExC_parse == name_start || *RExC_parse != ch) + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.3s... not terminated",parse_start); + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (ASCII_FOLD_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); + *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + nextchar(pRExC_state); + + } + break; + } + case 'g': + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + { + I32 num; + bool hasbrace = 0; + + if (*RExC_parse == 'g') { + bool isrel = 0; + + RExC_parse++; + if (*RExC_parse == '{') { + RExC_parse++; + hasbrace = 1; + } + if (*RExC_parse == '-') { + RExC_parse++; + isrel = 1; + } + if (hasbrace && !isDIGIT(*RExC_parse)) { + if (isrel) RExC_parse--; + RExC_parse -= 2; + goto parse_named_seq; + } + + num = S_backref_value(RExC_parse); + if (num == 0) + vFAIL("Reference to invalid group 0"); + else if (num == I32_MAX) { + if (isDIGIT(*RExC_parse)) + vFAIL("Reference to nonexistent group"); + else + vFAIL("Unterminated \\g... pattern"); + } + + if (isrel) { + num = RExC_npar - num; + if (num < 1) + vFAIL("Reference to nonexistent or unclosed group"); + } + } + else { + num = S_backref_value(RExC_parse); + /* bare \NNN might be backref or octal - if it is larger than or equal + * RExC_npar then it is assumed to be and octal escape. + * Note RExC_npar is +1 from the actual number of parens*/ + if (num == I32_MAX || (num > 9 && num >= RExC_npar + && *RExC_parse != '8' && *RExC_parse != '9')) + { + /* Probably a character specified in octal, e.g. \35 */ + goto defchar; + } + } + + /* at this point RExC_parse definitely points to a backref + * number */ + { +#ifdef RE_TRACK_PATTERN_OFFSETS + char * const parse_start = RExC_parse - 1; /* MJD */ +#endif + while (isDIGIT(*RExC_parse)) + RExC_parse++; + if (hasbrace) { + if (*RExC_parse != '}') + vFAIL("Unterminated \\g{...} pattern"); + RExC_parse++; + } + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) + vFAIL("Reference to nonexistent group"); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? REF + : (ASCII_FOLD_RESTRICTED) + ? REFFA + : (AT_LEAST_UNI_SEMANTICS) + ? REFFU + : (LOC) + ? REFFL + : REFF), + num); + *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + RExC_parse--; + nextchar(pRExC_state); + } + } + break; + case '\0': + if (RExC_parse >= RExC_end) + FAIL("Trailing \\"); + /* FALLTHROUGH */ + default: + /* Do not generate "unrecognized" warnings here, we fall + back into the quick-grab loop below */ + parse_start--; + goto defchar; + } + break; + + case '#': + if (RExC_flags & RXf_PMf_EXTENDED) { + RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); + if (RExC_parse < RExC_end) + goto tryagain; + } + /* FALLTHROUGH */ + + default: + + parse_start = RExC_parse - 1; + + RExC_parse++; + + defchar: { + STRLEN len = 0; + UV ender = 0; + char *p; + char *s; +#define MAX_NODE_STRING_SIZE 127 + char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; + char *s0; + U8 upper_parse = MAX_NODE_STRING_SIZE; + U8 node_type = compute_EXACTish(pRExC_state); + bool next_is_quantifier; + char * oldp = NULL; + + /* We can convert EXACTF nodes to EXACTFU if they contain only + * characters that match identically regardless of the target + * string's UTF8ness. The reason to do this is that EXACTF is not + * trie-able, EXACTFU is. + * + * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * contain only above-Latin1 characters (hence must be in UTF8), + * which don't participate in folds with Latin1-range characters, + * as the latter's folds aren't known until runtime. (We don't + * need to figure this out until pass 2) */ + bool maybe_exactfu = PASS2 + && (node_type == EXACTF || node_type == EXACTFL); + + /* If a folding node contains only code points that don't + * participate in folds, it can be changed into an EXACT node, + * which allows the optimizer more things to look for */ + bool maybe_exact; + + ret = reg_node(pRExC_state, node_type); + + /* In pass1, folded, we use a temporary buffer instead of the + * actual node, as the node doesn't exist yet */ + s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret); + + s0 = s; + + reparse: + + /* We do the EXACTFish to EXACT node only if folding. (And we + * don't need to figure this out until pass 2) */ + maybe_exact = FOLD && PASS2; + + /* XXX The node can hold up to 255 bytes, yet this only goes to + * 127. I (khw) do not know why. Keeping it somewhat less than + * 255 allows us to not have to worry about overflow due to + * converting to utf8 and fold expansion, but that value is + * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes + * split up by this limit into a single one using the real max of + * 255. Even at 127, this breaks under rare circumstances. If + * folding, we do not want to split a node at a character that is a + * non-final in a multi-char fold, as an input string could just + * happen to want to match across the node boundary. The join + * would solve that problem if the join actually happens. But a + * series of more than two nodes in a row each of 127 would cause + * the first join to succeed to get to 254, but then there wouldn't + * be room for the next one, which could at be one of those split + * multi-char folds. I don't know of any fool-proof solution. One + * could back off to end with only a code point that isn't such a + * non-final, but it is possible for there not to be any in the + * entire node. */ + for (p = RExC_parse - 1; + len < upper_parse && p < RExC_end; + len++) + { + oldp = p; + + if (RExC_flags & RXf_PMf_EXTENDED) + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ + switch ((U8)*p) { + case '^': + case '$': + case '.': + case '[': + case '(': + case ')': + case '|': + goto loopdone; + case '\\': + /* Literal Escapes Switch + + This switch is meant to handle escape sequences that + resolve to a literal character. + + Every escape sequence that represents something + else, like an assertion or a char class, is handled + in the switch marked 'Special Escapes' above in this + routine, but also has an entry here as anything that + isn't explicitly mentioned here will be treated as + an unescaped equivalent literal. + */ + + switch ((U8)*++p) { + /* These are all the special escapes. */ + case 'A': /* Start assertion */ + case 'b': case 'B': /* Word-boundary assertion*/ + case 'C': /* Single char !DANGEROUS! */ + case 'd': case 'D': /* digit class */ + case 'g': case 'G': /* generic-backref, pos assertion */ + case 'h': case 'H': /* HORIZWS */ + case 'k': case 'K': /* named backref, keep marker */ + case 'p': case 'P': /* Unicode property */ + case 'R': /* LNBREAK */ + case 's': case 'S': /* space class */ + case 'v': case 'V': /* VERTWS */ + case 'w': case 'W': /* word class */ + case 'X': /* eXtended Unicode "combining + character sequence" */ + case 'z': case 'Z': /* End of line/string assertion */ + --p; + goto loopdone; + + /* Anything after here is an escape that resolves to a + literal. (Except digits, which may or may not) + */ + case 'n': + ender = '\n'; + p++; + break; + case 'N': /* Handle a single-code point named character. */ + /* The options cause it to fail if a multiple code + * point sequence. Handle those in the switch() above + * */ + RExC_parse = p + 1; + if (! grok_bslash_N(pRExC_state, NULL, &ender, + flagp, depth, FALSE, + FALSE /* not strict */ )) + { + if (*flagp & RESTART_UTF8) + FAIL("panic: grok_bslash_N set RESTART_UTF8"); + RExC_parse = p = oldp; + goto loopdone; + } + p = RExC_parse; + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + case 'r': + ender = '\r'; + p++; + break; + case 't': + ender = '\t'; + p++; + break; + case 'f': + ender = '\f'; + p++; + break; + case 'e': + ender = ASCII_TO_NATIVE('\033'); + p++; + break; + case 'a': + ender = '\a'; + p++; + break; + case 'o': + { + UV result; + const char* error_msg; + + bool valid = grok_bslash_o(&p, + &result, + &error_msg, + TRUE, /* out warnings */ + FALSE, /* not strict */ + TRUE, /* Output warnings + for non- + portables */ + UTF); + if (! valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); + } + ender = result; + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + } + case 'x': + { + UV result = UV_MAX; /* initialize to erroneous + value */ + const char* error_msg; + + bool valid = grok_bslash_x(&p, + &result, + &error_msg, + TRUE, /* out warnings */ + FALSE, /* not strict */ + TRUE, /* Output warnings + for non- + portables */ + UTF); + if (! valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); + } + ender = result; + + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + } + case 'c': + p++; + ender = grok_bslash_c(*p++, SIZE_ONLY); + break; + case '8': case '9': /* must be a backreference */ + --p; + goto loopdone; + case '1': case '2': case '3':case '4': + case '5': case '6': case '7': + /* When we parse backslash escapes there is ambiguity + * between backreferences and octal escapes. Any escape + * from \1 - \9 is a backreference, any multi-digit + * escape which does not start with 0 and which when + * evaluated as decimal could refer to an already + * parsed capture buffer is a backslash. Anything else + * is octal. + * + * Note this implies that \118 could be interpreted as + * 118 OR as "\11" . "8" depending on whether there + * were 118 capture buffers defined already in the + * pattern. */ + + /* NOTE, RExC_npar is 1 more than the actual number of + * parens we have seen so far, hence the < RExC_npar below. */ + + if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) + { /* Not to be treated as an octal constant, go + find backref */ + --p; + goto loopdone; + } + /* FALLTHROUGH */ + case '0': + { + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; + STRLEN numlen = 3; + ender = grok_oct(p, &numlen, &flags, NULL); + if (ender > 0xff) { + REQUIRE_UTF8; + } + p += numlen; + if (SIZE_ONLY /* like \08, \178 */ + && numlen < 3 + && p < RExC_end + && isDIGIT(*p) && ckWARN(WARN_REGEXP)) + { + reg_warn_non_literal_string( + p + 1, + form_short_octal_warning(p, numlen)); + } + } + if (PL_encoding && ender < 0x100) + goto recode_encoding; + break; + recode_encoding: + if (! RExC_override_recoding) { + SV* enc = PL_encoding; + ender = reg_recode((const char)(U8)ender, &enc); + if (!enc && SIZE_ONLY) + ckWARNreg(p, "Invalid escape in the specified encoding"); + REQUIRE_UTF8; + } + break; + case '\0': + if (p >= RExC_end) + FAIL("Trailing \\"); + /* FALLTHROUGH */ + default: + if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { + /* Include any { following the alpha to emphasize + * that it could be part of an escape at some point + * in the future */ + int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1; + ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p); + } + goto normal_default; + } /* End of switch on '\' */ + break; + case '{': + /* Currently we don't warn when the lbrace is at the start + * of a construct. This catches it in the middle of a + * literal string, or when its the first thing after + * something like "\b" */ + if (! SIZE_ONLY + && (len || (p > RExC_start && isALPHA_A(*(p -1))))) + { + ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through"); + } + /*FALLTHROUGH*/ + default: /* A literal character */ + normal_default: + if (UTF8_IS_START(*p) && UTF) { + STRLEN numlen; + ender = utf8n_to_uvchr((U8*)p, RExC_end - p, + &numlen, UTF8_ALLOW_DEFAULT); + p += numlen; + } + else + ender = (U8) *p++; + break; + } /* End of switch on the literal */ + + /* Here, have looked at the literal character and + * contains its ordinal,

points to the character after it + */ + + if ( RExC_flags & RXf_PMf_EXTENDED) + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ + + /* If the next thing is a quantifier, it applies to this + * character only, which means that this character has to be in + * its own node and can't just be appended to the string in an + * existing node, so if there are already other characters in + * the node, close the node with just them, and set up to do + * this character again next time through, when it will be the + * only thing in its new node */ + if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len) + { + p = oldp; + goto loopdone; + } + + if (! FOLD /* The simple case, just append the literal */ + || (LOC /* Also don't fold for tricky chars under /l */ + && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) + { + if (UTF) { + const STRLEN unilen = reguni(pRExC_state, ender, s); + if (unilen > 0) { + s += unilen; + len += unilen; + } + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len--; + } + else { + REGC((char)ender, s++); + } + + /* Can get here if folding only if is one of the /l + * characters whose fold depends on the locale. The + * occurrence of any of these indicate that we can't + * simplify things */ + if (FOLD) { + maybe_exact = FALSE; + maybe_exactfu = FALSE; + } + } + else /* FOLD */ + if (! ( UTF + /* See comments for join_exact() as to why we fold this + * non-UTF at compile time */ + || (node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S))) + { + /* Here, are folding and are not UTF-8 encoded; therefore + * the character must be in the range 0-255, and is not /l + * (Not /l because we already handled these under /l in + * is_PROBLEMATIC_LOCALE_FOLD_cp */ + if (IS_IN_SOME_FOLD_L1(ender)) { + maybe_exact = FALSE; + + /* See if the character's fold differs between /d and + * /u. This includes the multi-char fold SHARP S to + * 'ss' */ + if (maybe_exactfu + && (PL_fold[ender] != PL_fold_latin1[ender] + || ender == LATIN_SMALL_LETTER_SHARP_S + || (len > 0 + && isARG2_lower_or_UPPER_ARG1('s', ender) + && isARG2_lower_or_UPPER_ARG1('s', + *(s-1))))) + { + maybe_exactfu = FALSE; + } + } + + /* Even when folding, we store just the input character, as + * we have an array that finds its fold quickly */ + *(s++) = (char) ender; + } + else { /* FOLD and UTF */ + /* Unlike the non-fold case, we do actually have to + * calculate the results here in pass 1. This is for two + * reasons, the folded length may be longer than the + * unfolded, and we have to calculate how many EXACTish + * nodes it will take; and we may run out of room in a node + * in the middle of a potential multi-char fold, and have + * to back off accordingly. (Hence we can't use REGC for + * the simple case just below.) */ + + UV folded; + if (isASCII(ender)) { + folded = toFOLD(ender); + *(s)++ = (U8) folded; + } + else { + STRLEN foldlen; + + folded = _to_uni_fold_flags( + ender, + (U8 *) s, + &foldlen, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + s += foldlen; + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len += foldlen - 1; + } + /* If this node only contains non-folding code points so + * far, see if this new one is also non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have to check + * further to see if anything folds to it */ + if (_invlist_contains_cp(PL_utf8_foldable, + ender)) + { + maybe_exact = FALSE; + } + } + } + ender = folded; + } + + if (next_is_quantifier) { + + /* Here, the next input is a quantifier, and to get here, + * the current character is the only one in the node. + * Also, here doesn't include the final byte for this + * character */ + len++; + goto loopdone; + } + + } /* End of loop through literal characters */ + + /* Here we have either exhausted the input or ran out of room in + * the node. (If we encountered a character that can't be in the + * node, transfer is made directly to , and so we + * wouldn't have fallen off the end of the loop.) In the latter + * case, we artificially have to split the node into two, because + * we just don't have enough space to hold everything. This + * creates a problem if the final character participates in a + * multi-character fold in the non-final position, as a match that + * should have occurred won't, due to the way nodes are matched, + * and our artificial boundary. So back off until we find a non- + * problematic character -- one that isn't at the beginning or + * middle of such a fold. (Either it doesn't participate in any + * folds, or appears only in the final position of all the folds it + * does participate in.) A better solution with far fewer false + * positives, and that would fill the nodes more completely, would + * be to actually have available all the multi-character folds to + * test against, and to back-off only far enough to be sure that + * this node isn't ending with a partial one. is set + * further below (if we need to reparse the node) to include just + * up through that final non-problematic character that this code + * identifies, so when it is set to less than the full node, we can + * skip the rest of this */ + if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { + + const STRLEN full_len = len; + + assert(len >= MAX_NODE_STRING_SIZE); + + /* Here, points to the final byte of the final character. + * Look backwards through the string until find a non- + * problematic character */ + + if (! UTF) { + + /* This has no multi-char folds to non-UTF characters */ + if (ASCII_FOLD_RESTRICTED) { + goto loopdone; + } + + while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { } + len = s - s0 + 1; + } + else { + if (! PL_NonL1NonFinalFold) { + PL_NonL1NonFinalFold = _new_invlist_C_array( + NonL1_Perl_Non_Final_Folds_invlist); + } + + /* Point to the first byte of the final character */ + s = (char *) utf8_hop((U8 *) s, -1); + + while (s >= s0) { /* Search backwards until find + non-problematic char */ + if (UTF8_IS_INVARIANT(*s)) { + + /* There are no ascii characters that participate + * in multi-char folds under /aa. In EBCDIC, the + * non-ascii invariants are all control characters, + * so don't ever participate in any folds. */ + if (ASCII_FOLD_RESTRICTED + || ! IS_NON_FINAL_FOLD(*s)) + { + break; + } + } + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + *s, *(s+1)))) + { + break; + } + } + else if (! _invlist_contains_cp( + PL_NonL1NonFinalFold, + valid_utf8_to_uvchr((U8 *) s, NULL))) + { + break; + } + + /* Here, the current character is problematic in that + * it does occur in the non-final position of some + * fold, so try the character before it, but have to + * special case the very first byte in the string, so + * we don't read outside the string */ + s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); + } /* End of loop backwards through the string */ + + /* If there were only problematic characters in the string, + * will point to before s0, in which case the length + * should be 0, otherwise include the length of the + * non-problematic character just found */ + len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); + } + + /* Here, have found the final character, if any, that is + * non-problematic as far as ending the node without splitting + * it across a potential multi-char fold. contains the + * number of bytes in the node up-to and including that + * character, or is 0 if there is no such character, meaning + * the whole node contains only problematic characters. In + * this case, give up and just take the node as-is. We can't + * do any better */ + if (len == 0) { + len = full_len; + + /* If the node ends in an 's' we make sure it stays EXACTF, + * as if it turns into an EXACTFU, it could later get + * joined with another 's' that would then wrongly match + * the sharp s */ + if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) + { + maybe_exactfu = FALSE; + } + } else { + + /* Here, the node does contain some characters that aren't + * problematic. If one such is the final character in the + * node, we are done */ + if (len == full_len) { + goto loopdone; + } + else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) { + + /* If the final character is problematic, but the + * penultimate is not, back-off that last character to + * later start a new node with it */ + p = oldp; + goto loopdone; + } + + /* Here, the final non-problematic character is earlier + * in the input than the penultimate character. What we do + * is reparse from the beginning, going up only as far as + * this final ok one, thus guaranteeing that the node ends + * in an acceptable character. The reason we reparse is + * that we know how far in the character is, but we don't + * know how to correlate its position with the input parse. + * An alternate implementation would be to build that + * correlation as we go along during the original parse, + * but that would entail extra work for every node, whereas + * this code gets executed only when the string is too + * large for the node, and the final two characters are + * problematic, an infrequent occurrence. Yet another + * possible strategy would be to save the tail of the + * string, and the next time regatom is called, initialize + * with that. The problem with this is that unless you + * back off one more character, you won't be guaranteed + * regatom will get called again, unless regbranch, + * regpiece ... are also changed. If you do back off that + * extra character, so that there is input guaranteed to + * force calling regatom, you can't handle the case where + * just the first character in the node is acceptable. I + * (khw) decided to try this method which doesn't have that + * pitfall; if performance issues are found, we can do a + * combination of the current approach plus that one */ + upper_parse = len; + len = 0; + s = s0; + goto reparse; + } + } /* End of verifying node ends with an appropriate char */ + + loopdone: /* Jumped to when encounters something that shouldn't be in + the node */ + + /* I (khw) don't know if you can get here with zero length, but the + * old code handled this situation by creating a zero-length EXACT + * node. Might as well be NOTHING instead */ + if (len == 0) { + OP(ret) = NOTHING; + } + else { + if (FOLD) { + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds; + * similarly for 'maybe_exactfu' and code points that match + * differently depending on UTF8ness of the target string + * (for /u), or depending on locale for /l */ + if (maybe_exact) { + OP(ret) = EXACT; + } + else if (maybe_exactfu) { + OP(ret) = EXACTFU; + } + } + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); + } + + RExC_parse = p - 1; + Set_Node_Cur_Length(ret, parse_start); + nextchar(pRExC_state); + { + /* len is STRLEN which is unsigned, need to copy to signed */ + IV iv = len; + if (iv < 0) + vFAIL("Internal disaster"); + } + + } /* End of label 'defchar:' */ + break; + } /* End of giant switch on input character */ + + return(ret); +} + +STATIC char * +S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) +{ + /* Returns the next non-pattern-white space, non-comment character (the + * latter only if 'recognize_comment is true) in the string p, which is + * ended by RExC_end. See also reg_skipcomment */ + const char *e = RExC_end; + + PERL_ARGS_ASSERT_REGPATWS; + + while (p < e) { + STRLEN len; + if ((len = is_PATWS_safe(p, e, UTF))) { + p += len; + } + else if (recognize_comment && *p == '#') { + p = reg_skipcomment(pRExC_state, p); + } + else + break; + } + return p; +} + +STATIC void +S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) +{ + /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It + * sets up the bitmap and any flags, removing those code points from the + * inversion list, setting it to NULL should it become completely empty */ + + PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; + assert(PL_regkind[OP(node)] == ANYOF); + + ANYOF_BITMAP_ZERO(node); + if (*invlist_ptr) { + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through *invlist_ptr */ + invlist_iterinit(*invlist_ptr); + while (invlist_iternext(*invlist_ptr, &start, &end)) { + UV high; + int i; + + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; + } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } + + /* Quit if are above what we should change */ + if (start > 255) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < 255) ? end : 255; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(node, i)) { + ANYOF_BITMAP_SET(node, i); + } + } + } + invlist_iterfinish(*invlist_ptr); + + /* Done with loop; remove any code points that are in the bitmap from + * *invlist_ptr; similarly for code points above latin1 if we have a + * flag to match all of them anyways */ + if (change_invlist) { + _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + } + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + } + + /* If have completely emptied it, remove it completely */ + if (_invlist_len(*invlist_ptr) == 0) { + SvREFCNT_dec_NN(*invlist_ptr); + *invlist_ptr = NULL; + } + } +} + +/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. + Character classes ([:foo:]) can also be negated ([:^foo:]). + Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. + Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, + but trigger failures because they are currently unimplemented. */ + +#define POSIXCC_DONE(c) ((c) == ':') +#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') +#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) + +PERL_STATIC_INLINE I32 +S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) +{ + I32 namedclass = OOB_NAMEDCLASS; + + PERL_ARGS_ASSERT_REGPPOSIXCC; + + if (value == '[' && RExC_parse + 1 < RExC_end && + /* I smell either [: or [= or [. -- POSIX has been here, right? */ + POSIXCC(UCHARAT(RExC_parse))) + { + const char c = UCHARAT(RExC_parse); + char* const s = RExC_parse++; + + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) + RExC_parse++; + if (RExC_parse == RExC_end) { + if (strict) { + + /* Try to give a better location for the error (than the end of + * the string) by looking for the matching ']' */ + RExC_parse = s; + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { + RExC_parse++; + } + vFAIL2("Unmatched '%c' in POSIX class", c); + } + /* Grandfather lone [:, [=, [. */ + RExC_parse = s; + } + else { + const char* const t = RExC_parse++; /* skip over the c */ + assert(*t == c); + + if (UCHARAT(RExC_parse) == ']') { + const char *posixcc = s + 1; + RExC_parse++; /* skip over the ending ] */ + + if (*s == ':') { + const I32 complement = *posixcc == '^' ? *posixcc++ : 0; + const I32 skip = t - posixcc; + + /* Initially switch on the length of the name. */ + switch (skip) { + case 4: + if (memEQ(posixcc, "word", 4)) /* this is not POSIX, + this is the Perl \w + */ + namedclass = ANYOF_WORDCHAR; + break; + case 5: + /* Names all of length 5. */ + /* alnum alpha ascii blank cntrl digit graph lower + print punct space upper */ + /* Offset 4 gives the best switch position. */ + switch (posixcc[4]) { + case 'a': + if (memEQ(posixcc, "alph", 4)) /* alpha */ + namedclass = ANYOF_ALPHA; + break; + case 'e': + if (memEQ(posixcc, "spac", 4)) /* space */ + namedclass = ANYOF_PSXSPC; + break; + case 'h': + if (memEQ(posixcc, "grap", 4)) /* graph */ + namedclass = ANYOF_GRAPH; + break; + case 'i': + if (memEQ(posixcc, "asci", 4)) /* ascii */ + namedclass = ANYOF_ASCII; + break; + case 'k': + if (memEQ(posixcc, "blan", 4)) /* blank */ + namedclass = ANYOF_BLANK; + break; + case 'l': + if (memEQ(posixcc, "cntr", 4)) /* cntrl */ + namedclass = ANYOF_CNTRL; + break; + case 'm': + if (memEQ(posixcc, "alnu", 4)) /* alnum */ + namedclass = ANYOF_ALPHANUMERIC; + break; + case 'r': + if (memEQ(posixcc, "lowe", 4)) /* lower */ + namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; + else if (memEQ(posixcc, "uppe", 4)) /* upper */ + namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; + break; + case 't': + if (memEQ(posixcc, "digi", 4)) /* digit */ + namedclass = ANYOF_DIGIT; + else if (memEQ(posixcc, "prin", 4)) /* print */ + namedclass = ANYOF_PRINT; + else if (memEQ(posixcc, "punc", 4)) /* punct */ + namedclass = ANYOF_PUNCT; + break; + } + break; + case 6: + if (memEQ(posixcc, "xdigit", 6)) + namedclass = ANYOF_XDIGIT; + break; + } + + if (namedclass == OOB_NAMEDCLASS) + vFAIL2utf8f( + "POSIX class [:%"UTF8f":] unknown", + UTF8fARG(UTF, t - s - 1, s + 1)); + + /* The #defines are structured so each complement is +1 to + * the normal one */ + if (complement) { + namedclass++; + } + assert (posixcc[skip] == ':'); + assert (posixcc[skip+1] == ']'); + } else if (!SIZE_ONLY) { + /* [[=foo=]] and [[.foo.]] are still future. */ + + /* adjust RExC_parse so the warning shows after + the class closes */ + while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') + RExC_parse++; + vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + } + } else { + /* Maternal grandfather: + * "[:" ending in ":" but not in ":]" */ + if (strict) { + vFAIL("Unmatched '[' in POSIX class"); + } + + /* Grandfather lone [:, [=, [. */ + RExC_parse = s; + } + } + } + + return namedclass; +} + +STATIC bool +S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state) +{ + /* This applies some heuristics at the current parse position (which should + * be at a '[') to see if what follows might be intended to be a [:posix:] + * class. It returns true if it really is a posix class, of course, but it + * also can return true if it thinks that what was intended was a posix + * class that didn't quite make it. + * + * It will return true for + * [:alphanumerics: + * [:alphanumerics] (as long as the ] isn't followed immediately by a + * ')' indicating the end of the (?[ + * [:any garbage including %^&$ punctuation:] + * + * This is designed to be called only from S_handle_regex_sets; it could be + * easily adapted to be called from the spot at the beginning of regclass() + * that checks to see in a normal bracketed class if the surrounding [] + * have been omitted ([:word:] instead of [[:word:]]). But doing so would + * change long-standing behavior, so I (khw) didn't do that */ + char* p = RExC_parse + 1; + char first_char = *p; + + PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS; + + assert(*(p - 1) == '['); + + if (! POSIXCC(first_char)) { + return FALSE; + } + + p++; + while (p < RExC_end && isWORDCHAR(*p)) p++; + + if (p >= RExC_end) { + return FALSE; + } + + if (p - RExC_parse > 2 /* Got at least 1 word character */ + && (*p == first_char + || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')'))) + { + return TRUE; + } + + p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse); + + return (p + && p - RExC_parse > 2 /* [:] evaluates to colon; + [::] is a bad posix class. */ + && first_char == *(p - 1)); +} + +STATIC regnode * +S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, + I32 *flagp, U32 depth, + char * const oregcomp_parse) +{ + /* Handle the (?[...]) construct to do set operations */ + + U8 curchar; + UV start, end; /* End points of code point ranges */ + SV* result_string; + char *save_end, *save_parse; + SV* final; + STRLEN len; + regnode* node; + AV* stack; + const bool save_fold = FOLD; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; + + if (LOC) { + vFAIL("(?[...]) not valid in locale"); + } + RExC_uni_semantics = 1; + + /* This will return only an ANYOF regnode, or (unlikely) something smaller + * (such as EXACT). Thus we can skip most everything if just sizing. We + * call regclass to handle '[]' so as to not have to reinvent its parsing + * rules here (throwing away the size it computes each time). And, we exit + * upon an unescaped ']' that isn't one ending a regclass. To do both + * these things, we need to realize that something preceded by a backslash + * is escaped, so we have to keep track of backslashes */ + if (SIZE_ONLY) { + UV depth = 0; /* how many nested (?[...]) constructs */ + + Perl_ck_warner_d(aTHX_ + packWARN(WARN_EXPERIMENTAL__REGEX_SETS), + "The regex_sets feature is experimental" REPORT_LOCATION, + UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), + UTF8fARG(UTF, + RExC_end - RExC_start - (RExC_parse - RExC_precomp), + RExC_precomp + (RExC_parse - RExC_precomp))); + + while (RExC_parse < RExC_end) { + SV* current = NULL; + RExC_parse = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + switch (*RExC_parse) { + case '?': + if (RExC_parse[1] == '[') depth++, RExC_parse++; + /* FALLTHROUGH */ + default: + break; + case '\\': + /* Skip the next byte (which could cause us to end up in + * the middle of a UTF-8 character, but since none of those + * are confusable with anything we currently handle in this + * switch (invariants all), it's safe. We'll just hit the + * default: case next time and keep on incrementing until + * we find one of the invariants we do handle. */ + RExC_parse++; + break; + case '[': + { + /* If this looks like it is a [:posix:] class, leave the + * parse pointer at the '[' to fool regclass() into + * thinking it is part of a '[[:posix:]]'. That function + * will use strict checking to force a syntax error if it + * doesn't work out to a legitimate class */ + bool is_posix_class + = could_it_be_a_POSIX_class(pRExC_state); + if (! is_posix_class) { + RExC_parse++; + } + + /* regclass() can only return RESTART_UTF8 if multi-char + folds are allowed. */ + if (!regclass(pRExC_state, flagp,depth+1, + is_posix_class, /* parse the whole char + class only if not a + posix class */ + FALSE, /* don't allow multi-char folds */ + TRUE, /* silence non-portable warnings. */ + ¤t)) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", + (UV) *flagp); + + /* function call leaves parse pointing to the ']', except + * if we faked it */ + if (is_posix_class) { + RExC_parse--; + } + + SvREFCNT_dec(current); /* In case it returned something */ + break; + } + + case ']': + if (depth--) break; + RExC_parse++; + if (RExC_parse < RExC_end + && *RExC_parse == ')') + { + node = reganode(pRExC_state, ANYOF, 0); + RExC_size += ANYOF_SKIP; + nextchar(pRExC_state); + Set_Node_Length(node, + RExC_parse - oregcomp_parse + 1); /* MJD */ + return node; + } + goto no_close; + } + RExC_parse++; + } + + no_close: + FAIL("Syntax error in (?[...])"); + } + + /* Pass 2 only after this. Everything in this construct is a + * metacharacter. Operands begin with either a '\' (for an escape + * sequence), or a '[' for a bracketed character class. Any other + * character should be an operator, or parenthesis for grouping. Both + * types of operands are handled by calling regclass() to parse them. It + * is called with a parameter to indicate to return the computed inversion + * list. The parsing here is implemented via a stack. Each entry on the + * stack is a single character representing one of the operators, or the + * '('; or else a pointer to an operand inversion list. */ + +#define IS_OPERAND(a) (! SvIOK(a)) + + /* The stack starts empty. It is a syntax error if the first thing parsed + * is a binary operator; everything else is pushed on the stack. When an + * operand is parsed, the top of the stack is examined. If it is a binary + * operator, the item before it should be an operand, and both are replaced + * by the result of doing that operation on the new operand and the one on + * the stack. Thus a sequence of binary operands is reduced to a single + * one before the next one is parsed. + * + * A unary operator may immediately follow a binary in the input, for + * example + * [a] + ! [b] + * When an operand is parsed and the top of the stack is a unary operator, + * the operation is performed, and then the stack is rechecked to see if + * this new operand is part of a binary operation; if so, it is handled as + * above. + * + * A '(' is simply pushed on the stack; it is valid only if the stack is + * empty, or the top element of the stack is an operator or another '(' + * (for which the parenthesized expression will become an operand). By the + * time the corresponding ')' is parsed everything in between should have + * been parsed and evaluated to a single operand (or else is a syntax + * error), and is handled as a regular operand */ + + sv_2mortal((SV *)(stack = newAV())); + + while (RExC_parse < RExC_end) { + I32 top_index = av_tindex(stack); + SV** top_ptr; + SV* current = NULL; + + /* Skip white space */ + RExC_parse = regpatws(pRExC_state, RExC_parse, + TRUE /* means recognize comments */ ); + if (RExC_parse >= RExC_end) { + Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); + } + if ((curchar = UCHARAT(RExC_parse)) == ']') { + break; + } + + switch (curchar) { + + case '?': + if (av_tindex(stack) >= 0 /* This makes sure that we can + safely subtract 1 from + RExC_parse in the next clause. + If we have something on the + stack, we have parsed something + */ + && UCHARAT(RExC_parse - 1) == '(' + && RExC_parse < RExC_end) + { + /* If is a '(?', could be an embedded '(?flags:(?[...])'. + * This happens when we have some thing like + * + * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; + * ... + * qr/(?[ \p{Digit} & $thai_or_lao ])/; + * + * Here we would be handling the interpolated + * '$thai_or_lao'. We handle this by a recursive call to + * ourselves which returns the inversion list the + * interpolated expression evaluates to. We use the flags + * from the interpolated pattern. */ + U32 save_flags = RExC_flags; + const char * const save_parse = ++RExC_parse; + + parse_lparen_question_flags(pRExC_state); + + if (RExC_parse == save_parse /* Makes sure there was at + least one flag (or this + embedding wasn't compiled) + */ + || RExC_parse >= RExC_end - 4 + || UCHARAT(RExC_parse) != ':' + || UCHARAT(++RExC_parse) != '(' + || UCHARAT(++RExC_parse) != '?' + || UCHARAT(++RExC_parse) != '[') + { + + /* In combination with the above, this moves the + * pointer to the point just after the first erroneous + * character (or if there are no flags, to where they + * should have been) */ + if (RExC_parse >= RExC_end - 4) { + RExC_parse = RExC_end; + } + else if (RExC_parse != save_parse) { + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + } + vFAIL("Expecting '(?flags:(?[...'"); + } + RExC_parse++; + (void) handle_regex_sets(pRExC_state, ¤t, flagp, + depth+1, oregcomp_parse); + + /* Here, 'current' contains the embedded expression's + * inversion list, and RExC_parse points to the trailing + * ']'; the next character should be the ')' which will be + * paired with the '(' that has been put on the stack, so + * the whole embedded expression reduces to '(operand)' */ + RExC_parse++; + + RExC_flags = save_flags; + goto handle_operand; + } + /* FALLTHROUGH */ + + default: + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unexpected character"); + + case '\\': + /* regclass() can only return RESTART_UTF8 if multi-char + folds are allowed. */ + if (!regclass(pRExC_state, flagp,depth+1, + TRUE, /* means parse just the next thing */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. */ + ¤t)) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", + (UV) *flagp); + /* regclass() will return with parsing just the \ sequence, + * leaving the parse pointer at the next thing to parse */ + RExC_parse--; + goto handle_operand; + + case '[': /* Is a bracketed character class */ + { + bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state); + + if (! is_posix_class) { + RExC_parse++; + } + + /* regclass() can only return RESTART_UTF8 if multi-char + folds are allowed. */ + if(!regclass(pRExC_state, flagp,depth+1, + is_posix_class, /* parse the whole char class + only if not a posix class */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. */ + ¤t)) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", + (UV) *flagp); + /* function call leaves parse pointing to the ']', except if we + * faked it */ + if (is_posix_class) { + RExC_parse--; + } + + goto handle_operand; + } + + case '&': + case '|': + case '+': + case '-': + case '^': + if (top_index < 0 + || ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) + || ! IS_OPERAND(*top_ptr)) + { + RExC_parse++; + vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar); + } + av_push(stack, newSVuv(curchar)); + break; + + case '!': + av_push(stack, newSVuv(curchar)); + break; + + case '(': + if (top_index >= 0) { + top_ptr = av_fetch(stack, top_index, FALSE); + assert(top_ptr); + if (IS_OPERAND(*top_ptr)) { + RExC_parse++; + vFAIL("Unexpected '(' with no preceding operator"); + } + } + av_push(stack, newSVuv(curchar)); + break; + + case ')': + { + SV* lparen; + if (top_index < 1 + || ! (current = av_pop(stack)) + || ! IS_OPERAND(current) + || ! (lparen = av_pop(stack)) + || IS_OPERAND(lparen) + || SvUV(lparen) != '(') + { + SvREFCNT_dec(current); + RExC_parse++; + vFAIL("Unexpected ')'"); + } + top_index -= 2; + SvREFCNT_dec_NN(lparen); + + /* FALLTHROUGH */ + } + + handle_operand: + + /* Here, we have an operand to process, in 'current' */ + + if (top_index < 0) { /* Just push if stack is empty */ + av_push(stack, current); + } + else { + SV* top = av_pop(stack); + SV *prev = NULL; + char current_operator; + + if (IS_OPERAND(top)) { + SvREFCNT_dec_NN(top); + SvREFCNT_dec_NN(current); + vFAIL("Operand with no preceding operator"); + } + current_operator = (char) SvUV(top); + switch (current_operator) { + case '(': /* Push the '(' back on followed by the new + operand */ + av_push(stack, top); + av_push(stack, current); + SvREFCNT_inc(top); /* Counters the '_dec' done + just after the 'break', so + it doesn't get wrongly freed + */ + break; + + case '!': + _invlist_invert(current); + + /* Unlike binary operators, the top of the stack, + * now that this unary one has been popped off, may + * legally be an operator, and we now have operand + * for it. */ + top_index--; + SvREFCNT_dec_NN(top); + goto handle_operand; + + case '&': + prev = av_pop(stack); + _invlist_intersection(prev, + current, + ¤t); + av_push(stack, current); + break; + + case '|': + case '+': + prev = av_pop(stack); + _invlist_union(prev, current, ¤t); + av_push(stack, current); + break; + + case '-': + prev = av_pop(stack);; + _invlist_subtract(prev, current, ¤t); + av_push(stack, current); + break; + + case '^': /* The union minus the intersection */ + { + SV* i = NULL; + SV* u = NULL; + SV* element; + + prev = av_pop(stack); + _invlist_union(prev, current, &u); + _invlist_intersection(prev, current, &i); + /* _invlist_subtract will overwrite current + without freeing what it already contains */ + element = current; + _invlist_subtract(u, i, ¤t); + av_push(stack, current); + SvREFCNT_dec_NN(i); + SvREFCNT_dec_NN(u); + SvREFCNT_dec_NN(element); + break; + } + + default: + Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); + } + SvREFCNT_dec_NN(top); + SvREFCNT_dec(prev); + } + } + + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + } + + if (av_tindex(stack) < 0 /* Was empty */ + || ((final = av_pop(stack)) == NULL) + || ! IS_OPERAND(final) + || av_tindex(stack) >= 0) /* More left on stack */ + { + vFAIL("Incomplete expression within '(?[ ])'"); + } + + /* Here, 'final' is the resultant inversion list from evaluating the + * expression. Return it if so requested */ + if (return_invlist) { + *return_invlist = final; + return END; + } + + /* Otherwise generate a resultant node, based on 'final'. regclass() is + * expecting a string of ranges and individual code points */ + invlist_iterinit(final); + result_string = newSVpvs(""); + while (invlist_iternext(final, &start, &end)) { + if (start == end) { + Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start); + } + else { + Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}", + start, end); + } + } + + save_parse = RExC_parse; + RExC_parse = SvPV(result_string, len); + save_end = RExC_end; + RExC_end = RExC_parse + len; + + /* We turn off folding around the call, as the class we have constructed + * already has all folding taken into consideration, and we don't want + * regclass() to add to that */ + RExC_flags &= ~RXf_PMf_FOLD; + /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. + */ + node = regclass(pRExC_state, flagp,depth+1, + FALSE, /* means parse the whole char class */ + FALSE, /* don't allow multi-char folds */ + TRUE, /* silence non-portable warnings. The above may very + well have generated non-portable code points, but + they're valid on this machine */ + NULL); + if (!node) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, + PTR2UV(flagp)); + if (save_fold) { + RExC_flags |= RXf_PMf_FOLD; + } + RExC_parse = save_parse + 1; + RExC_end = save_end; + SvREFCNT_dec_NN(final); + SvREFCNT_dec_NN(result_string); + + nextchar(pRExC_state); + Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ + return node; +} +#undef IS_OPERAND + +STATIC void +S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) +{ + /* This hard-codes the Latin1/above-Latin1 folding rules, so that an + * innocent-looking character class, like /[ks]/i won't have to go out to + * disk to find the possible matches. + * + * This should be called only for a Latin1-range code points, cp, which is + * known to be involved in a simple fold with other code points above + * Latin1. It would give false results if /aa has been specified. + * Multi-char folds are outside the scope of this, and must be handled + * specially. + * + * XXX It would be better to generate these via regen, in case a new + * version of the Unicode standard adds new mappings, though that is not + * really likely, and may be caught by the default: case of the switch + * below. */ + + PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; + + assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp)); + + switch (cp) { + case 'k': + case 'K': + *invlist = + add_cp_to_invlist(*invlist, KELVIN_SIGN); + break; + case 's': + case 'S': + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); + break; + case MICRO_SIGN: + *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); + *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist = add_cp_to_invlist(*invlist, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); + break; + default: + /* Use deprecated warning to increase the chances of this being + * output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); + break; + } +} + +/* The names of properties whose definitions are not known at compile time are + * stored in this SV, after a constant heading. So if the length has been + * changed since initialization, then there is a run-time definition. */ +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ + (SvCUR(listsv) != initial_listsv_len) + +STATIC regnode * +S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, + const bool stop_at_1, /* Just parse the next thing, don't + look for a full character class */ + bool allow_multi_folds, + const bool silence_non_portable, /* Don't output warnings + about too large + characters */ + SV** ret_invlist) /* Return an inversion list, not a node */ +{ + /* parse a bracketed class specification. Most of these will produce an + * ANYOF node; but something like [a] will produce an EXACT node; [aA], an + * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex + * under /i with multi-character folds: it will be rewritten following the + * paradigm of this example, where the s are characters which + * fold to multiple character sequences: + * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i + * gets effectively rewritten as: + * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i + * reg() gets called (recursively) on the rewritten version, and this + * function will return what it constructs. (Actually the s + * aren't physically removed from the [abcdefghi], it's just that they are + * ignored in the recursion by means of a flag: + * .) + * + * ANYOF nodes contain a bit map for the first 256 characters, with the + * corresponding bit set if that character is in the list. For characters + * above 255, a range list or swash is used. There are extra bits for \w, + * etc. in locale ANYOFs, as what these match is not determinable at + * compile time + * + * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs + * to be restarted. This can only happen if ret_invlist is non-NULL. + */ + + UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; + IV range = 0; + UV value = OOB_UNICODE, save_value = OOB_UNICODE; + regnode *ret; + STRLEN numlen; + IV namedclass = OOB_NAMEDCLASS; + char *rangebegin = NULL; + bool need_class = 0; + SV *listsv = NULL; + STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more + than just initialized. */ + SV* properties = NULL; /* Code points that match \p{} \P{} */ + SV* posixes = NULL; /* Code points that match classes like [:word:], + extended beyond the Latin1 range. These have to + be kept separate from other code points for much + of this function because their handling is + different under /i, and for most classes under + /d as well */ + SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept + separate for a while from the non-complemented + versions because of complications with /d + matching */ + UV element_count = 0; /* Number of distinct elements in the class. + Optimizations may be possible if this is tiny */ + AV * multi_char_matches = NULL; /* Code points that fold to more than one + character; used under /i */ + UV n; + char * stop_ptr = RExC_end; /* where to stop parsing */ + const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white + space? */ + const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */ + + /* Unicode properties are stored in a swash; this holds the current one + * being parsed. If this swash is the only above-latin1 component of the + * character class, an optimization is to pass it directly on to the + * execution engine. Otherwise, it is set to NULL to indicate that there + * are other things in the class that have to be dealt with at execution + * time */ + SV* swash = NULL; /* Code points that match \p{} \P{} */ + + /* Set if a component of this character class is user-defined; just passed + * on to the engine */ + bool has_user_defined_property = FALSE; + + /* inversion list of code points this node matches only when the target + * string is in UTF-8. (Because is under /d) */ + SV* depends_list = NULL; + + /* Inversion list of code points this node matches regardless of things + * like locale, folding, utf8ness of the target string */ + SV* cp_list = NULL; + + /* Like cp_list, but code points on this list need to be checked for things + * that fold to/from them under /i */ + SV* cp_foldable_list = NULL; + + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + +#ifdef EBCDIC + /* In a range, counts how many 0-2 of the ends of it came from literals, + * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ + UV literal_endpoint = 0; +#endif + bool invert = FALSE; /* Is this class to be complemented */ + + bool warn_super = ALWAYS_WARN_SUPER; + + regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in + case we need to change the emitted regop to an EXACT. */ + const char * orig_parse = RExC_parse; + const SSize_t orig_size = RExC_size; + bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCLASS; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + DEBUG_PARSE("clas"); + + /* Assume we are going to generate an ANYOF node. */ + ret = reganode(pRExC_state, ANYOF, 0); + + if (SIZE_ONLY) { + RExC_size += ANYOF_SKIP; + listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ + } + else { + ANYOF_FLAGS(ret) = 0; + + RExC_emit += ANYOF_SKIP; + listsv = newSVpvs_flags("# comment\n", SVs_TEMP); + initial_listsv_len = SvCUR(listsv); + SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ + } + + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + + if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ + RExC_parse++; + invert = TRUE; + allow_multi_folds = FALSE; + RExC_naughty++; + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + } + + /* Check that they didn't say [:posix:] instead of [[:posix:]] */ + if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) { + const char *s = RExC_parse; + const char c = *s++; + + while (isWORDCHAR(*s)) + s++; + if (*s && c == *s && s[1] == ']') { + SAVEFREESV(RExC_rx_sv); + ckWARN3reg(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); + (void)ReREFCNT_inc(RExC_rx_sv); + } + } + + /* If the caller wants us to just parse a single element, accomplish this + * by faking the loop ending condition */ + if (stop_at_1 && RExC_end > RExC_parse) { + stop_ptr = RExC_parse + 1; + } + + /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */ + if (UCHARAT(RExC_parse) == ']') + goto charclassloop; + +parseit: + while (1) { + if (RExC_parse >= stop_ptr) { + break; + } + + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + + if (UCHARAT(RExC_parse) == ']') { + break; + } + + charclassloop: + + namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ + save_value = value; + save_prevvalue = prevvalue; + + if (!range) { + rangebegin = RExC_parse; + element_count++; + } + if (UTF) { + value = utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); + + if (value == '[' + && RExC_parse < RExC_end + && POSIXCC(UCHARAT(RExC_parse))) + { + namedclass = regpposixcc(pRExC_state, value, strict); + } + else if (value == '\\') { + if (UTF) { + value = utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); + + /* Some compilers cannot handle switching on 64-bit integer + * values, therefore value cannot be an UV. Yes, this will + * be a problem later if we want switch on Unicode. + * A similar issue a little bit later when switching on + * namedclass. --jhi */ + + /* If the \ is escaping white space when white space is being + * skipped, it means that that white space is wanted literally, and + * is already in 'value'. Otherwise, need to translate the escape + * into what it signifies. */ + if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) { + + case 'w': namedclass = ANYOF_WORDCHAR; break; + case 'W': namedclass = ANYOF_NWORDCHAR; break; + case 's': namedclass = ANYOF_SPACE; break; + case 'S': namedclass = ANYOF_NSPACE; break; + case 'd': namedclass = ANYOF_DIGIT; break; + case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; + case 'N': /* Handle \N{NAME} in class */ + { + /* We only pay attention to the first char of + multichar strings being returned. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. */ + if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, + TRUE, /* => charclass */ + strict)) + { + if (*flagp & RESTART_UTF8) + FAIL("panic: grok_bslash_N set RESTART_UTF8"); + goto parseit; + } + } + break; + case 'p': + case 'P': + { + char *e; + + /* We will handle any undefined properties ourselves */ + U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF + /* And we actually would prefer to get + * the straight inversion list of the + * swash, since we will be accessing it + * anyway, to save a little time */ + |_CORE_SWASH_INIT_ACCEPT_INVLIST; + + if (RExC_parse >= RExC_end) + vFAIL2("Empty \\%c{}", (U8)value); + if (*RExC_parse == '{') { + const U8 c = (U8)value; + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL2("Missing right brace on \\%c{}", c); + while (isSPACE(*RExC_parse)) + RExC_parse++; + if (e == RExC_parse) + vFAIL2("Empty \\%c{}", c); + n = e - RExC_parse; + while (isSPACE(*(RExC_parse + n - 1))) + n--; + } + else { + e = RExC_parse; + n = 1; + } + if (!SIZE_ONLY) { + SV* invlist; + char* name; + + if (UCHARAT(RExC_parse) == '^') { + RExC_parse++; + n--; + /* toggle. (The rhs xor gets the single bit that + * differs between P and p; the other xor inverts just + * that bit) */ + value ^= 'P' ^ 'p'; + + while (isSPACE(*RExC_parse)) { + RExC_parse++; + n--; + } + } + /* Try to get the definition of the property into + * . If /i is in effect, the effective property + * will have its name be <__NAME_i>. The design is + * discussed in commit + * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ + name = savepv(Perl_form(aTHX_ + "%s%.*s%s\n", + (FOLD) ? "__" : "", + (int)n, + RExC_parse, + (FOLD) ? "_i" : "" + )); + + /* Look up the property name, and get its swash and + * inversion list, if the property is found */ + if (swash) { + SvREFCNT_dec_NN(swash); + } + swash = _core_swash_init("utf8", name, &PL_sv_undef, + 1, /* binary */ + 0, /* not tr/// */ + NULL, /* No inversion list */ + &swash_init_flags + ); + if (! swash || ! (invlist = _get_swash_invlist(swash))) { + HV* curpkg = (IN_PERL_COMPILETIME) + ? PL_curstash + : CopSTASH(PL_curcop); + if (swash) { + SvREFCNT_dec_NN(swash); + swash = NULL; + } + + /* Here didn't find it. It could be a user-defined + * property that will be available at run-time. If we + * accept only compile-time properties, is an error; + * otherwise add it to the list for run-time look up */ + if (ret_invlist) { + RExC_parse = e + 1; + vFAIL2utf8f( + "Property '%"UTF8f"' is unknown", + UTF8fARG(UTF, n, name)); + } + + /* If the property name doesn't already have a package + * name, add the current one to it so that it can be + * referred to outside it. [perl #121777] */ + if (curpkg && ! instr(name, "::")) { + char* pkgname = HvNAME(curpkg); + if (strNE(pkgname, "main")) { + char* full_name = Perl_form(aTHX_ + "%s::%s", + pkgname, + name); + n = strlen(full_name); + Safefree(name); + name = savepvn(full_name, n); + } + } + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", + (value == 'p' ? '+' : '!'), + UTF8fARG(UTF, n, name)); + has_user_defined_property = TRUE; + + /* We don't know yet, so have to assume that the + * property could match something in the Latin1 range, + * hence something that isn't utf8. Note that this + * would cause things in to match + * inappropriately, except that any \p{}, including + * this one forces Unicode semantics, which means there + * is no */ + ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; + } + else { + + /* Here, did get the swash and its inversion list. If + * the swash is from a user-defined property, then this + * whole character class should be regarded as such */ + if (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) + { + has_user_defined_property = TRUE; + } + else if + /* We warn on matching an above-Unicode code point + * if the match would return true, except don't + * warn for \p{All}, which has exactly one element + * = 0 */ + (_invlist_contains_cp(invlist, 0x110000) + && (! (_invlist_len(invlist) == 1 + && *invlist_array(invlist) == 0))) + { + warn_super = TRUE; + } + + + /* Invert if asking for the complement */ + if (value == 'P') { + _invlist_union_complement_2nd(properties, + invlist, + &properties); + + /* The swash can't be used as-is, because we've + * inverted things; delay removing it to here after + * have copied its invlist above */ + SvREFCNT_dec_NN(swash); + swash = NULL; + } + else { + _invlist_union(properties, invlist, &properties); + } + } + Safefree(name); + } + RExC_parse = e + 1; + namedclass = ANYOF_UNIPROP; /* no official name, but it's + named */ + + /* \p means they want Unicode semantics */ + RExC_uni_semantics = 1; + } + break; + case 'n': value = '\n'; break; + case 'r': value = '\r'; break; + case 't': value = '\t'; break; + case 'f': value = '\f'; break; + case 'b': value = '\b'; break; + case 'e': value = ASCII_TO_NATIVE('\033');break; + case 'a': value = '\a'; break; + case 'o': + RExC_parse--; /* function expects to be pointed at the 'o' */ + { + const char* error_msg; + bool valid = grok_bslash_o(&RExC_parse, + &value, + &error_msg, + SIZE_ONLY, /* warnings in pass + 1 only */ + strict, + silence_non_portable, + UTF); + if (! valid) { + vFAIL(error_msg); + } + } + if (PL_encoding && value < 0x100) { + goto recode_encoding; + } + break; + case 'x': + RExC_parse--; /* function expects to be pointed at the 'x' */ + { + const char* error_msg; + bool valid = grok_bslash_x(&RExC_parse, + &value, + &error_msg, + TRUE, /* Output warnings */ + strict, + silence_non_portable, + UTF); + if (! valid) { + vFAIL(error_msg); + } + } + if (PL_encoding && value < 0x100) + goto recode_encoding; + break; + case 'c': + value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); + break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + { + /* Take 1-3 octal digits */ + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; + numlen = (strict) ? 4 : 3; + value = grok_oct(--RExC_parse, &numlen, &flags, NULL); + RExC_parse += numlen; + if (numlen != 3) { + if (strict) { + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Need exactly 3 octal digits"); + } + else if (! SIZE_ONLY /* like \08, \178 */ + && numlen < 3 + && RExC_parse < RExC_end + && isDIGIT(*RExC_parse) + && ckWARN(WARN_REGEXP)) + { + SAVEFREESV(RExC_rx_sv); + reg_warn_non_literal_string( + RExC_parse + 1, + form_short_octal_warning(RExC_parse, numlen)); + (void)ReREFCNT_inc(RExC_rx_sv); + } + } + if (PL_encoding && value < 0x100) + goto recode_encoding; + break; + } + recode_encoding: + if (! RExC_override_recoding) { + SV* enc = PL_encoding; + value = reg_recode((const char)(U8)value, &enc); + if (!enc) { + if (strict) { + vFAIL("Invalid escape in the specified encoding"); + } + else if (SIZE_ONLY) { + ckWARNreg(RExC_parse, + "Invalid escape in the specified encoding"); + } + } + break; + } + default: + /* Allow \_ to not give an error */ + if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') { + if (strict) { + vFAIL2("Unrecognized escape \\%c in character class", + (int)value); + } + else { + SAVEFREESV(RExC_rx_sv); + ckWARN2reg(RExC_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); + (void)ReREFCNT_inc(RExC_rx_sv); + } + } + break; + } /* End of switch on char following backslash */ + } /* end of handling backslash escape sequences */ +#ifdef EBCDIC + else + literal_endpoint++; +#endif + + /* Here, we have the current token in 'value' */ + + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + U8 classnum; + + /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a + * literal, as is the character that began the false range, i.e. + * the 'a' in the examples */ + if (range) { + if (!SIZE_ONLY) { + const int w = (RExC_parse >= rangebegin) + ? RExC_parse - rangebegin + : 0; + if (strict) { + vFAIL2utf8f( + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); + } + else { + SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ + ckWARN2reg(RExC_parse, + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); + (void)ReREFCNT_inc(RExC_rx_sv); + cp_list = add_cp_to_invlist(cp_list, '-'); + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, + prevvalue); + } + } + + range = 0; /* this was not a true range */ + element_count += 2; /* So counts for three values */ + } + + classnum = namedclass_to_classnum(namedclass); + + if (LOC && namedclass < ANYOF_POSIXL_MAX +#ifndef HAS_ISASCII + && classnum != _CC_ASCII +#endif + ) { + /* What the Posix classes (like \w, [:space:]) match in locale + * isn't knowable under locale until actual match time. Room + * must be reserved (one time per outer bracketed class) to + * store such classes. The space will contain a bit for each + * named class that is to be matched against. This isn't + * needed for \p{} and pseudo-classes, as they are not affected + * by locale, and hence are dealt with separately */ + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_POSIXL_ZERO(ret); + } + + /* Coverity thinks it is possible for this to be negative; both + * jhi and khw think it's not, but be safer */ + assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL) + || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0); + + /* See if it already matches the complement of this POSIX + * class */ + if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) + && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) + ? -1 + : 1))) + { + posixl_matches_all = TRUE; + break; /* No need to continue. Since it matches both + e.g., \w and \W, it matches everything, and the + bracketed class can be optimized into qr/./s */ + } + + /* Add this class to those that should be checked at runtime */ + ANYOF_POSIXL_SET(ret, namedclass); + + /* The above-Latin1 characters are not subject to locale rules. + * Just add them, in the second pass, to the + * unconditionally-matched list */ + if (! SIZE_ONLY) { + SV* scratch_list = NULL; + + /* Get the list of the above-Latin1 code points this + * matches */ + _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, + PL_XPosix_ptrs[classnum], + + /* Odd numbers are complements, like + * NDIGIT, NASCII, ... */ + namedclass % 2 != 0, + &scratch_list); + /* Checking if 'cp_list' is NULL first saves an extra + * clone. Its reference count will be decremented at the + * next union, etc, or if this is the only instance, at the + * end of the routine */ + if (! cp_list) { + cp_list = scratch_list; + } + else { + _invlist_union(cp_list, scratch_list, &cp_list); + SvREFCNT_dec_NN(scratch_list); + } + continue; /* Go get next character */ + } + } + else if (! SIZE_ONLY) { + + /* Here, not in pass1 (in that pass we skip calculating the + * contents of this class), and is /l, or is a POSIX class for + * which /l doesn't matter (or is a Unicode property, which is + * skipped here). */ + if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ + if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ + + /* Here, should be \h, \H, \v, or \V. None of /d, /i + * nor /l make a difference in what these match, + * therefore we just add what they match to cp_list. */ + if (classnum != _CC_VERTSPACE) { + assert( namedclass == ANYOF_HORIZWS + || namedclass == ANYOF_NHORIZWS); + + /* It turns out that \h is just a synonym for + * XPosixBlank */ + classnum = _CC_BLANK; + } + + _invlist_union_maybe_complement_2nd( + cp_list, + PL_XPosix_ptrs[classnum], + namedclass % 2 != 0, /* Complement if odd + (NHORIZWS, NVERTWS) + */ + &cp_list); + } + } + else { /* Garden variety class. If is NASCII, NDIGIT, ... + complement and use nposixes */ + SV** posixes_ptr = namedclass % 2 == 0 + ? &posixes + : &nposixes; + SV** source_ptr = &PL_XPosix_ptrs[classnum]; + _invlist_union_maybe_complement_2nd( + *posixes_ptr, + *source_ptr, + namedclass % 2 != 0, + posixes_ptr); + } + continue; /* Go get next character */ + } + } /* end of namedclass \blah */ + + /* Here, we have a single value. If 'range' is set, it is the ending + * of a range--check its validity. Later, we will handle each + * individual code point in the range. If 'range' isn't set, this + * could be the beginning of a range, so check for that by looking + * ahead to see if the next real character to be processed is the range + * indicator--the minus sign */ + + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + + if (range) { + if (prevvalue > value) /* b-a */ { + const int w = RExC_parse - rangebegin; + vFAIL2utf8f( + "Invalid [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); + range = 0; /* not a valid range */ + } + } + else { + prevvalue = value; /* save the beginning of the potential range */ + if (! stop_at_1 /* Can't be a range if parsing just one thing */ + && *RExC_parse == '-') + { + char* next_char_ptr = RExC_parse + 1; + if (skip_white) { /* Get the next real char after the '-' */ + next_char_ptr = regpatws(pRExC_state, + RExC_parse + 1, + FALSE); /* means don't recognize + comments */ + } + + /* If the '-' is at the end of the class (just before the ']', + * it is a literal minus; otherwise it is a range */ + if (next_char_ptr < RExC_end && *next_char_ptr != ']') { + RExC_parse = next_char_ptr; + + /* a bad range like \w-, [:word:]- ? */ + if (namedclass > OOB_NAMEDCLASS) { + if (strict || ckWARN(WARN_REGEXP)) { + const int w = + RExC_parse >= rangebegin ? + RExC_parse - rangebegin : 0; + if (strict) { + vFAIL4("False [] range \"%*.*s\"", + w, w, rangebegin); + } + else { + vWARN4(RExC_parse, + "False [] range \"%*.*s\"", + w, w, rangebegin); + } + } + if (!SIZE_ONLY) { + cp_list = add_cp_to_invlist(cp_list, '-'); + } + element_count++; + } else + range = 1; /* yeah, it's a range! */ + continue; /* but do it the next time */ + } + } + } + + /* Here, is the beginning of the range, if any; or + * if not */ + + /* non-Latin1 code point implies unicode semantics. Must be set in + * pass1 so is there for the whole of pass 2 */ + if (value > 255) { + RExC_uni_semantics = 1; + } + + /* Ready to process either the single value, or the completed range. + * For single-valued non-inverted ranges, we consider the possibility + * of multi-char folds. (We made a conscious decision to not do this + * for the other cases because it can often lead to non-intuitive + * results. For example, you have the peculiar case that: + * "s s" =~ /^[^\xDF]+$/i => Y + * "ss" =~ /^[^\xDF]+$/i => N + * + * See [perl #89750] */ + if (FOLD && allow_multi_folds && value == prevvalue) { + if (value == LATIN_SMALL_LETTER_SHARP_S + || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, + value))) + { + /* Here is indeed a multi-char fold. Get what it is */ + + U8 foldbuf[UTF8_MAXBYTES_CASE]; + STRLEN foldlen; + + UV folded = _to_uni_fold_flags( + value, + foldbuf, + &foldlen, + FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED + ? FOLD_FLAGS_NOMIX_ASCII + : 0) + ); + + /* Here, should be the first character of the + * multi-char fold of , with containing the + * whole thing. But, if this fold is not allowed (because of + * the flags), will be the same as , and should + * be processed like any other character, so skip the special + * handling */ + if (folded != value) { + + /* Skip if we are recursed, currently parsing the class + * again. Otherwise add this character to the list of + * multi-char folds. */ + if (! RExC_in_multi_char_class) { + AV** this_array_ptr; + AV* this_array; + STRLEN cp_count = utf8_length(foldbuf, + foldbuf + foldlen); + SV* multi_fold = sv_2mortal(newSVpvs("")); + + Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); + + + if (! multi_char_matches) { + multi_char_matches = newAV(); + } + + /* is actually an array of arrays. + * There will be one or two top-level elements: [2], + * and/or [3]. The [2] element is an array, each + * element thereof is a character which folds to TWO + * characters; [3] is for folds to THREE characters. + * (Unicode guarantees a maximum of 3 characters in any + * fold.) When we rewrite the character class below, + * we will do so such that the longest folds are + * written first, so that it prefers the longest + * matching strings first. This is done even if it + * turns out that any quantifier is non-greedy, out of + * programmer laziness. Tom Christiansen has agreed + * that this is ok. This makes the test for the + * ligature 'ffi' come before the test for 'ff' */ + if (av_exists(multi_char_matches, cp_count)) { + this_array_ptr = (AV**) av_fetch(multi_char_matches, + cp_count, FALSE); + this_array = *this_array_ptr; + } + else { + this_array = newAV(); + av_store(multi_char_matches, cp_count, + (SV*) this_array); + } + av_push(this_array, multi_fold); + } + + /* This element should not be processed further in this + * class */ + element_count--; + value = save_value; + prevvalue = save_prevvalue; + continue; + } + } + } + + /* Deal with this element of the class */ + if (! SIZE_ONLY) { +#ifndef EBCDIC + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); +#else + SV* this_range = _new_invlist(1); + _append_range_to_invlist(this_range, prevvalue, value); + + /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous. + * If this range was specified using something like 'i-j', we want + * to include only the 'i' and the 'j', and not anything in + * between, so exclude non-ASCII, non-alphabetics from it. + * However, if the range was specified with something like + * [\x89-\x91] or [\x89-j], all code points within it should be + * included. literal_endpoint==2 means both ends of the range used + * a literal character, not \x{foo} */ + if (literal_endpoint == 2 + && ((prevvalue >= 'a' && value <= 'z') + || (prevvalue >= 'A' && value <= 'Z'))) + { + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII], + &this_range); + + /* Since this above only contains ascii, the intersection of it + * with anything will still yield only ascii */ + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], + &this_range); + } + _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); + literal_endpoint = 0; +#endif + } + + range = 0; /* this range (if it was one) is done now */ + } /* End of loop through all the text within the brackets */ + + /* If anything in the class expands to more than one character, we have to + * deal with them by building up a substitute parse string, and recursively + * calling reg() on it, instead of proceeding */ + if (multi_char_matches) { + SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP); + I32 cp_count; + STRLEN len; + char *save_end = RExC_end; + char *save_parse = RExC_parse; + bool first_time = TRUE; /* First multi-char occurrence doesn't get + a "|" */ + I32 reg_flags; + + assert(! invert); +#if 0 /* Have decided not to deal with multi-char folds in inverted classes, + because too confusing */ + if (invert) { + sv_catpv(substitute_parse, "(?:"); + } +#endif + + /* Look at the longest folds first */ + for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { + + if (av_exists(multi_char_matches, cp_count)) { + AV** this_array_ptr; + SV* this_sequence; + + this_array_ptr = (AV**) av_fetch(multi_char_matches, + cp_count, FALSE); + while ((this_sequence = av_pop(*this_array_ptr)) != + &PL_sv_undef) + { + if (! first_time) { + sv_catpv(substitute_parse, "|"); + } + first_time = FALSE; + + sv_catpv(substitute_parse, SvPVX(this_sequence)); + } + } + } + + /* If the character class contains anything else besides these + * multi-character folds, have to include it in recursive parsing */ + if (element_count) { + sv_catpv(substitute_parse, "|["); + sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); + sv_catpv(substitute_parse, "]"); + } + + sv_catpv(substitute_parse, ")"); +#if 0 + if (invert) { + /* This is a way to get the parse to skip forward a whole named + * sequence instead of matching the 2nd character when it fails the + * first */ + sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)"); + } +#endif + + RExC_parse = SvPV(substitute_parse, len); + RExC_end = RExC_parse + len; + RExC_in_multi_char_class = 1; + RExC_emit = (regnode *)orig_emit; + + ret = reg(pRExC_state, 1, ®_flags, depth+1); + + *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8); + + RExC_parse = save_parse; + RExC_end = save_end; + RExC_in_multi_char_class = 0; + SvREFCNT_dec_NN(multi_char_matches); + return ret; + } + + /* Here, we've gone through the entire class and dealt with multi-char + * folds. We are now in a position that we can do some checks to see if we + * can optimize this ANYOF node into a simpler one, even in Pass 1. + * Currently we only do two checks: + * 1) is in the unlikely event that the user has specified both, eg. \w and + * \W under /l, then the class matches everything. (This optimization + * is done only to make the optimizer code run later work.) + * 2) if the character class contains only a single element (including a + * single range), we see if there is an equivalent node for it. + * Other checks are possible */ + if (! ret_invlist /* Can't optimize if returning the constructed + inversion list */ + && (UNLIKELY(posixl_matches_all) || element_count == 1)) + { + U8 op = END; + U8 arg = 0; + + if (UNLIKELY(posixl_matches_all)) { + op = SANY; + } + else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like + \w or [:digit:] or \p{foo} + */ + + /* All named classes are mapped into POSIXish nodes, with its FLAG + * argument giving which class it is */ + switch ((I32)namedclass) { + case ANYOF_UNIPROP: + break; + + /* These don't depend on the charset modifiers. They always + * match under /u rules */ + case ANYOF_NHORIZWS: + case ANYOF_HORIZWS: + namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS; + /* FALLTHROUGH */ + + case ANYOF_NVERTWS: + case ANYOF_VERTWS: + op = POSIXU; + goto join_posix; + + /* The actual POSIXish node for all the rest depends on the + * charset modifier. The ones in the first set depend only on + * ASCII or, if available on this platform, locale */ + case ANYOF_ASCII: + case ANYOF_NASCII: +#ifdef HAS_ISASCII + op = (LOC) ? POSIXL : POSIXA; +#else + op = POSIXA; +#endif + goto join_posix; + + case ANYOF_NCASED: + case ANYOF_LOWER: + case ANYOF_NLOWER: + case ANYOF_UPPER: + case ANYOF_NUPPER: + /* under /a could be alpha */ + if (FOLD) { + if (ASCII_RESTRICTED) { + namedclass = ANYOF_ALPHA + (namedclass % 2); + } + else if (! LOC) { + break; + } + } + /* FALLTHROUGH */ + + /* The rest have more possibilities depending on the charset. + * We take advantage of the enum ordering of the charset + * modifiers to get the exact node type, */ + default: + op = POSIXD + get_regex_charset(RExC_flags); + if (op > POSIXA) { /* /aa is same as /a */ + op = POSIXA; + } + + join_posix: + /* The odd numbered ones are the complements of the + * next-lower even number one */ + if (namedclass % 2 == 1) { + invert = ! invert; + namedclass--; + } + arg = namedclass_to_classnum(namedclass); + break; + } + } + else if (value == prevvalue) { + + /* Here, the class consists of just a single code point */ + + if (invert) { + if (! LOC && value == '\n') { + op = REG_ANY; /* Optimize [^\n] */ + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + else if (value < 256 || UTF) { + + /* Optimize a single value into an EXACTish node, but not if it + * would require converting the pattern to UTF-8. */ + op = compute_EXACTish(pRExC_state); + } + } /* Otherwise is a range */ + else if (! LOC) { /* locale could vary these */ + if (prevvalue == '0') { + if (value == '9') { + arg = _CC_DIGIT; + op = POSIXA; + } + } + else if (prevvalue == 'A') { + if (value == 'Z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; + op = POSIXA; + } + } + else if (prevvalue == 'a') { + if (value == 'z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; + op = POSIXA; + } + } + } + + /* Here, we have changed away from its initial value iff we found + * an optimization */ + if (op != END) { + + /* Throw away this ANYOF regnode, and emit the calculated one, + * which should correspond to the beginning, not current, state of + * the parse */ + const char * cur_parse = RExC_parse; + RExC_parse = (char *)orig_parse; + if ( SIZE_ONLY) { + if (! LOC) { + + /* To get locale nodes to not use the full ANYOF size would + * require moving the code above that writes the portions + * of it that aren't in other nodes to after this point. + * e.g. ANYOF_POSIXL_SET */ + RExC_size = orig_size; + } + } + else { + RExC_emit = (regnode *)orig_emit; + if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } + if (invert) { + op += NPOSIXD - POSIXD; + } + } + } + + ret = reg_node(pRExC_state, op); + + if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { + if (! SIZE_ONLY) { + FLAGS(ret) = arg; + } + *flagp |= HASWIDTH|SIMPLE; + } + else if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); + } + + RExC_parse = (char *) cur_parse; + + SvREFCNT_dec(posixes); + SvREFCNT_dec(nposixes); + SvREFCNT_dec(cp_list); + SvREFCNT_dec(cp_foldable_list); + return ret; + } + } + + if (SIZE_ONLY) + return ret; + /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/ + + /* If folding, we calculate all characters that could fold to or from the + * ones already on the list */ + if (cp_foldable_list) { + if (FOLD) { + UV start, end; /* End points of code point ranges */ + + SV* fold_intersection = NULL; + SV** use_list; + + /* Our calculated list will be for Unicode rules. For locale + * matching, we have to keep a separate list that is consulted at + * runtime only when the locale indicates Unicode rules. For + * non-locale, we just use to the general list */ + if (LOC) { + use_list = &only_utf8_locale_list; + } + else { + use_list = &cp_list; + } + + /* Only the characters in this class that participate in folds need + * be checked. Get the intersection of this class and all the + * possible characters that are foldable. This can quickly narrow + * down a large class */ + _invlist_intersection(PL_utf8_foldable, cp_foldable_list, + &fold_intersection); + + /* The folds for all the Latin1 characters are hard-coded into this + * program, but we have to go out to disk to get the others. */ + if (invlist_highest(cp_foldable_list) >= 256) { + + /* This is a hash that for a particular fold gives all + * characters that are involved in it */ + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + } + + /* Now look at the foldable characters in this class individually */ + invlist_iterinit(fold_intersection); + while (invlist_iternext(fold_intersection, &start, &end)) { + UV j; + + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + SV** listp; + + if (j < 256) { + + if (IS_IN_SOME_FOLD_L1(j)) { + + /* ASCII is always matched; non-ASCII is matched + * only under Unicode rules (which could happen + * under /l if the locale is a UTF-8 one */ + if (isASCII(j) || ! DEPENDS_SEMANTICS) { + *use_list = add_cp_to_invlist(*use_list, + PL_fold_latin1[j]); + } + else { + depends_list = + add_cp_to_invlist(depends_list, + PL_fold_latin1[j]); + } + } + + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + { + add_above_Latin1_folds(pRExC_state, + (U8) j, + use_list); + } + continue; + } + + /* Here is an above Latin1 character. We don't have the + * rules hard-coded for it. First, get its fold. This is + * the simple fold, as the multi-character folds have been + * handled earlier and separated out */ + _to_uni_fold_flags(j, foldbuf, &foldlen, + (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0); + + /* Single character fold of above Latin1. Add everything in + * its fold closure to the list that this node should match. + * The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j)))) + { + continue; + } + + /* Folds under /l which cross the 255/256 boundary + * are added to a separate list. (These are valid + * only when the locale is UTF-8.) */ + if (c < 256 && LOC) { + *use_list = add_cp_to_invlist(*use_list, c); + continue; + } + + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + /* Similarly folds involving non-ascii Latin1 + * characters under /d are added to their list */ + depends_list = add_cp_to_invlist(depends_list, + c); + } + } + } + } + } + SvREFCNT_dec_NN(fold_intersection); + } + + /* Now that we have finished adding all the folds, there is no reason + * to keep the foldable list separate */ + _invlist_union(cp_list, cp_foldable_list, &cp_list); + SvREFCNT_dec_NN(cp_foldable_list); + } + + /* And combine the result (if any) with any inversion list from posix + * classes. The lists are kept separate up to now because we don't want to + * fold the classes (folding of those is automatically handled by the swash + * fetching code) */ + if (posixes || nposixes) { + if (posixes && AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, nothing above ASCII matches these */ + _invlist_intersection(posixes, + PL_XPosix_ptrs[_CC_ASCII], + &posixes); + } + if (nposixes) { + if (DEPENDS_SEMANTICS) { + /* Under /d, everything in the upper half of the Latin1 range + * matches these complements */ + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; + } + else if (AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, everything above ASCII matches these + * complements */ + _invlist_union_complement_2nd(nposixes, + PL_XPosix_ptrs[_CC_ASCII], + &nposixes); + } + if (posixes) { + _invlist_union(posixes, nposixes, &posixes); + SvREFCNT_dec_NN(nposixes); + } + else { + posixes = nposixes; + } + } + if (! DEPENDS_SEMANTICS) { + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec_NN(posixes); + } + else { + cp_list = posixes; + } + } + else { + /* Under /d, we put into a separate list the Latin1 things that + * match only when the target string is utf8 */ + SV* nonascii_but_latin1_properties = NULL; + _invlist_intersection(posixes, PL_UpperLatin1, + &nonascii_but_latin1_properties); + _invlist_subtract(posixes, nonascii_but_latin1_properties, + &posixes); + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec_NN(posixes); + } + else { + cp_list = posixes; + } + + if (depends_list) { + _invlist_union(depends_list, nonascii_but_latin1_properties, + &depends_list); + SvREFCNT_dec_NN(nonascii_but_latin1_properties); + } + else { + depends_list = nonascii_but_latin1_properties; + } + } + } + + /* And combine the result (if any) with any inversion list from properties. + * The lists are kept separate up to now so that we can distinguish the two + * in regards to matching above-Unicode. A run-time warning is generated + * if a Unicode property is matched against a non-Unicode code point. But, + * we allow user-defined properties to match anything, without any warning, + * and we also suppress the warning if there is a portion of the character + * class that isn't a Unicode property, and which matches above Unicode, \W + * or [\x{110000}] for example. + * (Note that in this case, unlike the Posix one above, there is no + * , because having a Unicode property forces Unicode + * semantics */ + if (properties) { + if (cp_list) { + + /* If it matters to the final outcome, see if a non-property + * component of the class matches above Unicode. If so, the + * warning gets suppressed. This is true even if just a single + * such code point is specified, as though not strictly correct if + * another such code point is matched against, the fact that they + * are using above-Unicode code points indicates they should know + * the issues involved */ + if (warn_super) { + warn_super = ! (invert + ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); + } + + _invlist_union(properties, cp_list, &cp_list); + SvREFCNT_dec_NN(properties); + } + else { + cp_list = properties; + } + + if (warn_super) { + ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; + } + } + + /* Here, we have calculated what code points should be in the character + * class. + * + * Now we can see about various optimizations. Fold calculation (which we + * did above) needs to take place before inversion. Otherwise /[^k]/i + * would invert to include K, which under /i would match k, which it + * shouldn't. Therefore we can't invert folded locale now, as it won't be + * folded until runtime */ + + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching). We know to set the flag if we have a non-NULL list for UTF-8 + * locales, or the class matches at least one 0-255 range code point */ + if (LOC && FOLD) { + if (only_utf8_locale_list) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + else if (cp_list) { /* Look to see if there a 0-255 code point is in + the list */ + UV start, end; + invlist_iterinit(cp_list); + if (invlist_iternext(cp_list, &start, &end) && start < 256) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + invlist_iterfinish(cp_list); + } + } + + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known + * at compile time. Besides not inverting folded locale now, we can't + * invert if there are things such as \w, which aren't known until runtime + * */ + if (cp_list + && invert + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! depends_list + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + { + _invlist_invert(cp_list); + + /* Any swash can't be used as-is, because we've inverted things */ + if (swash) { + SvREFCNT_dec_NN(swash); + swash = NULL; + } + + /* Clear the invert flag since have just done it here */ + invert = FALSE; + } + + if (ret_invlist) { + *ret_invlist = cp_list; + SvREFCNT_dec(swash); + + /* Discard the generated node */ + if (SIZE_ONLY) { + RExC_size = orig_size; + } + else { + RExC_emit = orig_emit; + } + return orig_emit; + } + + /* Some character classes are equivalent to other nodes. Such nodes take + * up less room and generally fewer operations to execute than ANYOF nodes. + * Above, we checked for and optimized into some such equivalents for + * certain common classes that are easy to test. Getting to this point in + * the code means that the class didn't get optimized there. Since this + * code is only executed in Pass 2, it is too late to save space--it has + * been allocated in Pass 1, and currently isn't given back. But turning + * things into an EXACTish node can allow the optimizer to join it to any + * adjacent such nodes. And if the class is equivalent to things like /./, + * expensive run-time swashes can be avoided. Now that we have more + * complete information, we can find things necessarily missed by the + * earlier code. I (khw) am not sure how much to look for here. It would + * be easy, but perhaps too slow, to check any candidates against all the + * node types they could possibly match using _invlistEQ(). */ + + if (cp_list + && ! invert + && ! depends_list + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + + /* We don't optimize if we are supposed to make sure all non-Unicode + * code points raise a warning, as only ANYOF nodes have this check. + * */ + && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) + { + UV start, end; + U8 op = END; /* The optimzation node-type */ + const char * cur_parse= RExC_parse; + + invlist_iterinit(cp_list); + if (! invlist_iternext(cp_list, &start, &end)) { + + /* Here, the list is empty. This happens, for example, when a + * Unicode property is the only thing in the character class, and + * it doesn't match anything. (perluniprops.pod notes such + * properties) */ + op = OPFAIL; + *flagp |= HASWIDTH|SIMPLE; + } + else if (start == end) { /* The range is a single code point */ + if (! invlist_iternext(cp_list, &start, &end) + + /* Don't do this optimization if it would require changing + * the pattern to UTF-8 */ + && (start < 256 || UTF)) + { + /* Here, the list contains a single code point. Can optimize + * into an EXACTish node */ + + value = start; + + if (! FOLD) { + op = EXACT; + } + else if (LOC) { + + /* A locale node under folding with one code point can be + * an EXACTFL, as its fold won't be calculated until + * runtime */ + op = EXACTFL; + } + else { + + /* Here, we are generally folding, but there is only one + * code point to match. If we have to, we use an EXACT + * node, but it would be better for joining with adjacent + * nodes in the optimization pass if we used the same + * EXACTFish node that any such are likely to be. We can + * do this iff the code point doesn't participate in any + * folds. For example, an EXACTF of a colon is the same as + * an EXACT one, since nothing folds to or from a colon. */ + if (value < 256) { + if (IS_IN_SOME_FOLD_L1(value)) { + op = EXACT; + } + } + else { + if (_invlist_contains_cp(PL_utf8_foldable, value)) { + op = EXACT; + } + } + + /* If we haven't found the node type, above, it means we + * can use the prevailing one */ + if (op == END) { + op = compute_EXACTish(pRExC_state); + } + } + } + } + else if (start == 0) { + if (end == UV_MAX) { + op = SANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + else if (end == '\n' - 1 + && invlist_iternext(cp_list, &start, &end) + && start == '\n' + 1 && end == UV_MAX) + { + op = REG_ANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + invlist_iterfinish(cp_list); + + if (op != END) { + RExC_parse = (char *)orig_parse; + RExC_emit = (regnode *)orig_emit; + + ret = reg_node(pRExC_state, op); + + RExC_parse = (char *)cur_parse; + + if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); + } + + SvREFCNT_dec_NN(cp_list); + return ret; + } + } + + /* Here, contains all the code points we can determine at + * compile time that match under all conditions. Go through it, and + * for things that belong in the bitmap, put them there, and delete from + * . While we are at it, see if everything above 255 is in the + * list, and if so, set a flag to speed up execution */ + + populate_ANYOF_from_invlist(ret, &cp_list); + + if (invert) { + ANYOF_FLAGS(ret) |= ANYOF_INVERT; + } + + /* Here, the bitmap has been populated with all the Latin1 code points that + * always match. Can now add to the overall list those that match only + * when the target string is UTF-8 (). */ + if (depends_list) { + if (cp_list) { + _invlist_union(cp_list, depends_list, &cp_list); + SvREFCNT_dec_NN(depends_list); + } + else { + cp_list = depends_list; + } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; + } + + /* If there is a swash and more than one element, we can't use the swash in + * the optimization below. */ + if (swash && element_count > 1) { + SvREFCNT_dec_NN(swash); + swash = NULL; + } + + set_ANYOF_arg(pRExC_state, ret, cp_list, + (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv : NULL, + only_utf8_locale_list, + swash, has_user_defined_property); + + *flagp |= HASWIDTH|SIMPLE; + + if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + + return ret; +} + +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + +STATIC void +S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, + regnode* const node, + SV* const cp_list, + SV* const runtime_defns, + SV* const only_utf8_locale_list, + SV* const swash, + const bool has_user_defined_property) +{ + /* Sets the arg field of an ANYOF-type node 'node', using information about + * the node passed-in. If there is nothing outside the node's bitmap, the + * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to + * the count returned by add_data(), having allocated and stored an array, + * av, that that count references, as follows: + * av[0] stores the character class description in its textual form. + * This is used later (regexec.c:Perl_regclass_swash()) to + * initialize the appropriate swash, and is also useful for dumping + * the regnode. This is set to &PL_sv_undef if the textual + * description is not needed at run-time (as happens if the other + * elements completely define the class) + * av[1] if &PL_sv_undef, is a placeholder to later contain the swash + * computed from av[0]. But if no further computation need be done, + * the swash is stored here now (and av[0] is &PL_sv_undef). + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead + * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. + * (Otherwise everything needed is already in av[0] and av[1]) + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ + + UV n; + + PERL_ARGS_ASSERT_SET_ANYOF_ARG; + + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); + ARG_SET(node, ANYOF_NONBITMAP_EMPTY); + } + else { + AV * const av = newAV(); + SV *rv; + + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + av_store(av, 0, (runtime_defns) + ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); + if (swash) { + assert(cp_list); + av_store(av, 1, swash); + SvREFCNT_dec_NN(cp_list); + } + else { + av_store(av, 1, &PL_sv_undef); + if (cp_list) { + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); + } + } + + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + + rv = newRV_noinc(MUTABLE_SV(av)); + n = add_data(pRExC_state, STR_WITH_LEN("s")); + RExC_rxi->data->data[n] = (void*)rv; + ARG_SET(node, n); + } +} + + +/* reg_skipcomment() + + Absorbs an /x style # comment from the input stream, + returning a pointer to the first character beyond the comment, or if the + comment terminates the pattern without anything following it, this returns + one past the final character of the pattern (in other words, RExC_end) and + sets the REG_RUN_ON_COMMENT_SEEN flag. + + Note it's the callers responsibility to ensure that we are + actually in /x mode + +*/ + +PERL_STATIC_INLINE char* +S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) +{ + PERL_ARGS_ASSERT_REG_SKIPCOMMENT; + + assert(*p == '#'); + + while (p < RExC_end) { + if (*(++p) == '\n') { + return p+1; + } + } + + /* we ran off the end of the pattern without ending the comment, so we have + * to add an \n when wrapping */ + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; + return p; +} + +/* nextchar() + + Advances the parse position, and optionally absorbs + "whitespace" from the inputstream. + + Without /x "whitespace" means (?#...) style comments only, + with /x this means (?#...) and # comments and whitespace proper. + + Returns the RExC_parse point from BEFORE the scan occurs. + + This is the /x friendly way of saying RExC_parse++. +*/ + +STATIC char* +S_nextchar(pTHX_ RExC_state_t *pRExC_state) +{ + char* const retval = RExC_parse++; + + PERL_ARGS_ASSERT_NEXTCHAR; + + for (;;) { + if (RExC_end - RExC_parse >= 3 + && *RExC_parse == '(' + && RExC_parse[1] == '?' + && RExC_parse[2] == '#') + { + while (*RExC_parse != ')') { + if (RExC_parse == RExC_end) + FAIL("Sequence (?#... not terminated"); + RExC_parse++; + } + RExC_parse++; + continue; + } + if (RExC_flags & RXf_PMf_EXTENDED) { + char * p = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + if (p != RExC_parse) { + RExC_parse = p; + continue; + } + } + return retval; + } +} + +/* +- reg_node - emit a node +*/ +STATIC regnode * /* Location. */ +S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) +{ + regnode *ptr; + regnode * const ret = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NODE; + + if (SIZE_ONLY) { + SIZE_ALIGN(RExC_size); + RExC_size += 1; + return(ret); + } + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, (void*)RExC_emit, (void*)RExC_emit_bound); + + NODE_ALIGN_FILL(ret); + ptr = ret; + FILL_ADVANCE_NODE(ptr, op); + REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG( + ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", + "reg_node", __LINE__, + PL_reg_name[op], + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(RExC_emit - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); + } +#endif + RExC_emit = ptr; + return(ret); +} + +/* +- reganode - emit a node with an argument +*/ +STATIC regnode * /* Location. */ +S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) +{ + regnode *ptr; + regnode * const ret = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGANODE; + + if (SIZE_ONLY) { + SIZE_ALIGN(RExC_size); + RExC_size += 2; + /* + We can't do this: + + assert(2==regarglen[op]+1); + + Anything larger than this has to allocate the extra amount. + If we changed this to be: + + RExC_size += (1 + regarglen[op]); + + then it wouldn't matter. Its not clear what side effect + might come from that so its not done so far. + -- dmq + */ + return(ret); + } + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, (void*)RExC_emit, (void*)RExC_emit_bound); + + NODE_ALIGN_FILL(ret); + ptr = ret; + FILL_ADVANCE_NODE_ARG(ptr, op, arg); + REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + "reganode", + __LINE__, + PL_reg_name[op], + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? + "Overwriting end of array!\n" : "OK", + (UV)(RExC_emit - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Cur_Node_Offset; + } +#endif + RExC_emit = ptr; + return(ret); +} + +/* +- reguni - emit (if appropriate) a Unicode character +*/ +PERL_STATIC_INLINE STRLEN +S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) +{ + PERL_ARGS_ASSERT_REGUNI; + + return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); +} + +/* +- reginsert - insert an operator in front of already-emitted operand +* +* Means relocating the operand. +*/ +STATIC void +S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) +{ + regnode *src; + regnode *dst; + regnode *place; + const int offset = regarglen[(U8)op]; + const int size = NODE_STEP_REGNODE + offset; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGINSERT; + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(depth); +/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ + DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); + if (SIZE_ONLY) { + RExC_size += size; + return; + } + + src = RExC_emit; + RExC_emit += size; + dst = RExC_emit; + if (RExC_open_parens) { + int paren; + /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ + for ( paren=0 ; paren < RExC_npar ; paren++ ) { + if ( RExC_open_parens[paren] >= opnd ) { + /*DEBUG_PARSE_FMT("open"," - %d",size);*/ + RExC_open_parens[paren] += size; + } else { + /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ + } + if ( RExC_close_parens[paren] >= opnd ) { + /*DEBUG_PARSE_FMT("close"," - %d",size);*/ + RExC_close_parens[paren] += size; + } else { + /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ + } + } + } + + while (src > opnd) { + StructCopy(--src, --dst, regnode); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD 20010112 */ + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", + "reg_insert", + __LINE__, + PL_reg_name[op], + (UV)(dst - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(src - RExC_emit_start), + (UV)(dst - RExC_emit_start), + (UV)RExC_offsets[0])); + Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); + Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); + } +#endif + } + + + place = opnd; /* Op node, where operand used to be. */ +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + "reginsert", + __LINE__, + PL_reg_name[op], + (UV)(place - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(place - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Node_Offset(place, RExC_parse); + Set_Node_Length(place, 1); + } +#endif + src = NEXTOPER(place); + FILL_ADVANCE_NODE(place, op); + REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1); + Zero(src, offset, regnode); +} + +/* +- regtail - set the next-pointer at the end of a node chain of p to val. +- SEE ALSO: regtail_study +*/ +/* TODO: All three parms should be const */ +STATIC void +S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) +{ + regnode *scan; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + if (SIZE_ONLY) + return; + + /* Find last node. */ + scan = p; + for (;;) { + regnode * const temp = regnext(scan); + DEBUG_PARSE_r({ + SV * const mysv=sv_newmortal(); + DEBUG_PARSE_MSG((scan==p ? "tail" : "")); + regprop(RExC_rx, mysv, scan, NULL); + PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", + SvPV_nolen_const(mysv), REG_NODE_NUM(scan), + (temp == NULL ? "->" : ""), + (temp == NULL ? PL_reg_name[OP(val)] : "") + ); + }); + if (temp == NULL) + break; + scan = temp; + } + + if (reg_off_by_arg[OP(scan)]) { + ARG_SET(scan, val - scan); + } + else { + NEXT_OFF(scan) = val - scan; + } +} + +#ifdef DEBUGGING +/* +- regtail_study - set the next-pointer at the end of a node chain of p to val. +- Look for optimizable sequences at the same time. +- currently only looks for EXACT chains. + +This is experimental code. The idea is to use this routine to perform +in place optimizations on branches and groups as they are constructed, +with the long term intention of removing optimization from study_chunk so +that it is purely analytical. + +Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used +to control which is which. + +*/ +/* TODO: All four parms should be const */ + +STATIC U8 +S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) +{ + dVAR; + regnode *scan; + U8 exact = PSEUDO; +#ifdef EXPERIMENTAL_INPLACESCAN + I32 min = 0; +#endif + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL_STUDY; + + + if (SIZE_ONLY) + return exact; + + /* Find last node. */ + + scan = p; + for (;;) { + regnode * const temp = regnext(scan); +#ifdef EXPERIMENTAL_INPLACESCAN + if (PL_regkind[OP(scan)] == EXACT) { + bool unfolded_multi_char; /* Unexamined in this routine */ + if (join_exact(pRExC_state, scan, &min, + &unfolded_multi_char, 1, val, depth+1)) + return EXACT; + } +#endif + if ( exact ) { + switch (OP(scan)) { + case EXACT: + case EXACTF: + case EXACTFA_NO_TRIE: + case EXACTFA: + case EXACTFU: + case EXACTFU_SS: + case EXACTFL: + if( exact == PSEUDO ) + exact= OP(scan); + else if ( exact != OP(scan) ) + exact= 0; + case NOTHING: + break; + default: + exact= 0; + } + } + DEBUG_PARSE_r({ + SV * const mysv=sv_newmortal(); + DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); + regprop(RExC_rx, mysv, scan, NULL); + PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", + SvPV_nolen_const(mysv), + REG_NODE_NUM(scan), + PL_reg_name[exact]); + }); + if (temp == NULL) + break; + scan = temp; + } + DEBUG_PARSE_r({ + SV * const mysv_val=sv_newmortal(); + DEBUG_PARSE_MSG(""); + regprop(RExC_rx, mysv_val, val, NULL); + PerlIO_printf(Perl_debug_log, + "~ attach to %s (%"IVdf") offset to %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(val), + (IV)(val - scan) + ); + }); + if (reg_off_by_arg[OP(scan)]) { + ARG_SET(scan, val - scan); + } + else { + NEXT_OFF(scan) = val - scan; + } + + return exact; +} +#endif + +/* + - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form + */ +#ifdef DEBUGGING + +static void +S_regdump_intflags(pTHX_ const char *lead, const U32 flags) +{ + int bit; + int set=0; + + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bitprogram, ri->program + 1, NULL, NULL, sv, 0, 0); + + /* Header fields of interest. */ + if (r->anchored_substr) { + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), + RE_SV_DUMPLEN(r->anchored_substr), 30); + PerlIO_printf(Perl_debug_log, + "anchored %s%s at %"IVdf" ", + s, RE_SV_TAIL(r->anchored_substr), + (IV)r->anchored_offset); + } else if (r->anchored_utf8) { + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), + RE_SV_DUMPLEN(r->anchored_utf8), 30); + PerlIO_printf(Perl_debug_log, + "anchored utf8 %s%s at %"IVdf" ", + s, RE_SV_TAIL(r->anchored_utf8), + (IV)r->anchored_offset); + } + if (r->float_substr) { + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), + RE_SV_DUMPLEN(r->float_substr), 30); + PerlIO_printf(Perl_debug_log, + "floating %s%s at %"IVdf"..%"UVuf" ", + s, RE_SV_TAIL(r->float_substr), + (IV)r->float_min_offset, (UV)r->float_max_offset); + } else if (r->float_utf8) { + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), + RE_SV_DUMPLEN(r->float_utf8), 30); + PerlIO_printf(Perl_debug_log, + "floating utf8 %s%s at %"IVdf"..%"UVuf" ", + s, RE_SV_TAIL(r->float_utf8), + (IV)r->float_min_offset, (UV)r->float_max_offset); + } + if (r->check_substr || r->check_utf8) + PerlIO_printf(Perl_debug_log, + (const char *) + (r->check_substr == r->float_substr + && r->check_utf8 == r->float_utf8 + ? "(checking floating" : "(checking anchored")); + if (r->intflags & PREGf_NOSCAN) + PerlIO_printf(Perl_debug_log, " noscan"); + if (r->extflags & RXf_CHECK_ALL) + PerlIO_printf(Perl_debug_log, " isall"); + if (r->check_substr || r->check_utf8) + PerlIO_printf(Perl_debug_log, ") "); + + if (ri->regstclass) { + regprop(r, sv, ri->regstclass, NULL); + PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); + } + if (r->intflags & PREGf_ANCH) { + PerlIO_printf(Perl_debug_log, "anchored"); + if (r->intflags & PREGf_ANCH_BOL) + PerlIO_printf(Perl_debug_log, "(BOL)"); + if (r->intflags & PREGf_ANCH_MBOL) + PerlIO_printf(Perl_debug_log, "(MBOL)"); + if (r->intflags & PREGf_ANCH_SBOL) + PerlIO_printf(Perl_debug_log, "(SBOL)"); + if (r->intflags & PREGf_ANCH_GPOS) + PerlIO_printf(Perl_debug_log, "(GPOS)"); + PerlIO_putc(Perl_debug_log, ' '); + } + if (r->intflags & PREGf_GPOS_SEEN) + PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); + if (r->intflags & PREGf_SKIP) + PerlIO_printf(Perl_debug_log, "plus "); + if (r->intflags & PREGf_IMPLICIT) + PerlIO_printf(Perl_debug_log, "implicit "); + PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen); + if (r->extflags & RXf_EVAL_SEEN) + PerlIO_printf(Perl_debug_log, "with eval "); + PerlIO_printf(Perl_debug_log, "\n"); + DEBUG_FLAGS_r({ + regdump_extflags("r->extflags: ",r->extflags); + regdump_intflags("r->intflags: ",r->intflags); + }); +#else + PERL_ARGS_ASSERT_REGDUMP; + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(r); +#endif /* DEBUGGING */ +} + +/* +- regprop - printable representation of opcode, with run time support +*/ + +void +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) +{ +#ifdef DEBUGGING + dVAR; + int k; + + /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ + static const char * const anyofs[] = { +#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \ + || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \ + || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \ + || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \ + || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \ + || _CC_VERTSPACE != 16 + #error Need to adjust order of anyofs[] +#endif + "\\w", + "\\W", + "\\d", + "\\D", + "[:alpha:]", + "[:^alpha:]", + "[:lower:]", + "[:^lower:]", + "[:upper:]", + "[:^upper:]", + "[:punct:]", + "[:^punct:]", + "[:print:]", + "[:^print:]", + "[:alnum:]", + "[:^alnum:]", + "[:graph:]", + "[:^graph:]", + "[:cased:]", + "[:^cased:]", + "\\s", + "\\S", + "[:blank:]", + "[:^blank:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:ascii:]", + "[:^ascii:]", + "\\v", + "\\V" + }; + RXi_GET_DECL(prog,progi); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPROP; + + sv_setpvs(sv, ""); + + if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ + /* It would be nice to FAIL() here, but this may be called from + regexec.c, and it would be hard to supply pRExC_state. */ + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(o), (int)REGNODE_MAX); + sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ + + k = PL_regkind[OP(o)]; + + if (k == EXACT) { + sv_catpvs(sv, " "); + /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) + * is a crude hack but it may be the best for now since + * we have no flag "this EXACTish node was UTF-8" + * --jhi */ + pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], + PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_ESCAPE_NONASCII | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT | + PERL_PV_PRETTY_NOCLEAR + ); + } else if (k == TRIE) { + /* print the details of the trie in dumpuntil instead, as + * progi->data isn't available here */ + const char op = OP(o); + const U32 n = ARG(o); + const reg_ac_data * const ac = IS_TRIE_AC(op) ? + (reg_ac_data *)progi->data->data[n] : + NULL; + const reg_trie_data * const trie + = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; + + Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); + DEBUG_TRIE_COMPILE_r( + Perl_sv_catpvf(aTHX_ sv, + "", + (UV)trie->startstate, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ); + ); + if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { + sv_catpvs(sv, "["); + (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) + ? ANYOF_BITMAP(o) + : TRIE_BITMAP(trie)); + sv_catpvs(sv, "]"); + } + + } else if (k == CURLY) { + if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ + Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); + } + else if (k == WHILEM && o->flags) /* Ordinal/of */ + Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); + else if (k == REF || k == OPEN || k == CLOSE + || k == GROUPP || OP(o)==ACCEPT) + { + Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ + if ( RXp_PAREN_NAMES(prog) ) { + if ( k != REF || (OP(o) < NREF)) { + AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); + SV **name= av_fetch(list, ARG(o), 0 ); + if (name) + Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); + } + else { + AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); + SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); + I32 *nums=(I32*)SvPVX(sv_dat); + SV **name= av_fetch(list, nums[0], 0 ); + I32 n; + if (name) { + for ( n=0; noffs[n].start; + if (prog->lastparen < n || ln == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } + } else if (k == GOSUB) + /* Paren and offset */ + Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); + else if (k == VERB) { + if (!o->flags) + Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); + } else if (k == LOGICAL) + /* 2: embedded, otherwise 1 */ + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); + else if (k == ANYOF) { + const U8 flags = ANYOF_FLAGS(o); + int do_sep = 0; + + + if (flags & ANYOF_LOCALE_FLAGS) + sv_catpvs(sv, "{loc}"); + if (flags & ANYOF_LOC_FOLD) + sv_catpvs(sv, "{i}"); + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + if (flags & ANYOF_INVERT) + sv_catpvs(sv, "^"); + + /* output what the standard cp 0-255 bitmap matches */ + do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); + + /* output any special charclass tests (used entirely under use + * locale) * */ + if (ANYOF_POSIXL_TEST_ANY_SET(o)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(o,i)) { + sv_catpv(sv, anyofs[i]); + do_sep = 1; + } + } + } + + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) + { + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + if (flags & ANYOF_INVERT) + /*make sure the invert info is in each */ + sv_catpvs(sv, "^"); + } + + if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { + sv_catpvs(sv, "{non-utf8-latin1-all}"); + } + + /* output information about the unicode matching */ + if (flags & ANYOF_ABOVE_LATIN1_ALL) + sv_catpvs(sv, "{unicode_all}"); + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { + SV *lv; /* Set if there is something outside the bit map. */ + bool byte_output = FALSE; /* If something in the bitmap has + been output */ + SV *only_utf8_locale; + + /* Get the stuff that wasn't in the bitmap */ + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); + if (lv && lv != &PL_sv_undef) { + char *s = savesvpv(lv); + char * const origs = s; + + while (*s && *s != '\n') + s++; + + if (*s == '\n') { + const char * const t = ++s; + + if (flags & ANYOF_NONBITMAP_NON_UTF8) { + sv_catpvs(sv, "{outside bitmap}"); + } + else { + sv_catpvs(sv, "{utf8}"); + } + + if (byte_output) { + sv_catpvs(sv, " "); + } + + while (*s) { + if (*s == '\n') { + + /* Truncate very long output */ + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } + *s = ' '; + } + else if (*s == '\t') { + *s = '-'; + } + s++; + } + if (s[-1] == ' ') + s[-1] = 0; + + sv_catpv(sv, t); + } + + out_dump: + + Safefree(origs); + SvREFCNT_dec_NN(lv); + } + + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } + } + invlist_iterfinish(only_utf8_locale); + } + } + } + + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + } + else if (k == POSIXD || k == NPOSIXD) { + U8 index = FLAGS(o) * 2; + if (index < C_ARRAY_LENGTH(anyofs)) { + if (*anyofs[index] != '[') { + sv_catpv(sv, "["); + } + sv_catpv(sv, anyofs[index]); + if (*anyofs[index] != '[') { + sv_catpv(sv, "]"); + } + } + else { + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + } + } + else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) + Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); +#else + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(sv); + PERL_UNUSED_ARG(o); + PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); +#endif /* DEBUGGING */ +} + + + +SV * +Perl_re_intuit_string(pTHX_ REGEXP * const r) +{ /* Assume that RE_INTUIT is set */ + struct regexp *const prog = ReANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_STRING; + PERL_UNUSED_CONTEXT; + + DEBUG_COMPILE_r( + { + const char * const s = SvPV_nolen_const(prog->check_substr + ? prog->check_substr : prog->check_utf8); + + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, + "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", + PL_colors[4], + prog->check_substr ? "" : "utf8 ", + PL_colors[5],PL_colors[0], + s, + PL_colors[1], + (strlen(s) > 60 ? "..." : "")); + } ); + + return prog->check_substr ? prog->check_substr : prog->check_utf8; +} + +/* + pregfree() + + handles refcounting and freeing the perl core regexp structure. When + it is necessary to actually free the structure the first thing it + does is call the 'free' method of the regexp_engine associated to + the regexp, allowing the handling of the void *pprivate; member + first. (This routine is not overridable by extensions, which is why + the extensions free is called first.) + + See regdupe and regdupe_internal if you change anything here. +*/ +#ifndef PERL_IN_XSUB_RE +void +Perl_pregfree(pTHX_ REGEXP *r) +{ + SvREFCNT_dec(r); +} + +void +Perl_pregfree2(pTHX_ REGEXP *rx) +{ + struct regexp *const r = ReANY(rx); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_PREGFREE2; + + if (r->mother_re) { + ReREFCNT_dec(r->mother_re); + } else { + CALLREGFREE_PVT(rx); /* free the private data */ + SvREFCNT_dec(RXp_PAREN_NAMES(r)); + Safefree(r->xpv_len_u.xpvlenu_pv); + } + if (r->substrs) { + SvREFCNT_dec(r->anchored_substr); + SvREFCNT_dec(r->anchored_utf8); + SvREFCNT_dec(r->float_substr); + SvREFCNT_dec(r->float_utf8); + Safefree(r->substrs); + } + RX_MATCH_COPY_FREE(rx); +#ifdef PERL_ANY_COW + SvREFCNT_dec(r->saved_copy); +#endif + Safefree(r->offs); + SvREFCNT_dec(r->qr_anoncv); + rx->sv_u.svu_rx = 0; +} + +/* reg_temp_copy() + + This is a hacky workaround to the structural issue of match results + being stored in the regexp structure which is in turn stored in + PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern + could be PL_curpm in multiple contexts, and could require multiple + result sets being associated with the pattern simultaneously, such + as when doing a recursive match with (??{$qr}) + + The solution is to make a lightweight copy of the regexp structure + when a qr// is returned from the code executed by (??{$qr}) this + lightweight copy doesn't actually own any of its data except for + the starp/end and the actual regexp structure itself. + +*/ + + +REGEXP * +Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) +{ + struct regexp *ret; + struct regexp *const r = ReANY(rx); + const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV; + + PERL_ARGS_ASSERT_REG_TEMP_COPY; + + if (!ret_x) + ret_x = (REGEXP*) newSV_type(SVt_REGEXP); + else { + SvOK_off((SV *)ret_x); + if (islv) { + /* For PVLVs, SvANY points to the xpvlv body while sv_u points + to the regexp. (For SVt_REGEXPs, sv_upgrade has already + made both spots point to the same regexp body.) */ + REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP); + assert(!SvPVX(ret_x)); + ret_x->sv_u.svu_rx = temp->sv_any; + temp->sv_any = NULL; + SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL; + SvREFCNT_dec_NN(temp); + /* SvCUR still resides in the xpvlv struct, so the regexp copy- + ing below will not set it. */ + SvCUR_set(ret_x, SvCUR(rx)); + } + } + /* This ensures that SvTHINKFIRST(sv) is true, and hence that + sv_force_normal(sv) is called. */ + SvFAKE_on(ret_x); + ret = ReANY(ret_x); + + SvFLAGS(ret_x) |= SvUTF8(rx); + /* We share the same string buffer as the original regexp, on which we + hold a reference count, incremented when mother_re is set below. + The string pointer is copied here, being part of the regexp struct. + */ + memcpy(&(ret->xpv_cur), &(r->xpv_cur), + sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); + if (r->offs) { + const I32 npar = r->nparens+1; + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + } + if (r->substrs) { + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + SvREFCNT_inc_void(ret->anchored_substr); + SvREFCNT_inc_void(ret->anchored_utf8); + SvREFCNT_inc_void(ret->float_substr); + SvREFCNT_inc_void(ret->float_utf8); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + } + RX_MATCH_COPIED_off(ret_x); +#ifdef PERL_ANY_COW + ret->saved_copy = NULL; +#endif + ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); + SvREFCNT_inc_void(ret->qr_anoncv); + + return ret_x; +} +#endif + +/* regfree_internal() + + Free the private data in a regexp. This is overloadable by + extensions. Perl takes care of the regexp structure in pregfree(), + this covers the *pprivate pointer which technically perl doesn't + know about, however of course we have to handle the + regexp_internal structure when no extension is in use. + + Note this is called before freeing anything in the regexp + structure. + */ + +void +Perl_regfree_internal(pTHX_ REGEXP * const rx) +{ + struct regexp *const r = ReANY(rx); + RXi_GET_DECL(r,ri); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGFREE_INTERNAL; + + DEBUG_COMPILE_r({ + if (!PL_colorset) + reginitcolors(); + { + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RX_UTF8(rx), + dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); + PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", + PL_colors[4],PL_colors[5],s); + } + }); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) + Safefree(ri->u.offsets); /* 20010421 MJD */ +#endif + if (ri->code_blocks) { + int n; + for (n = 0; n < ri->num_code_blocks; n++) + SvREFCNT_dec(ri->code_blocks[n].src_regex); + Safefree(ri->code_blocks); + } + + if (ri->data) { + int n = ri->data->count; + + while (--n >= 0) { + /* If you add a ->what type here, update the comment in regcomp.h */ + switch (ri->data->what[n]) { + case 'a': + case 'r': + case 's': + case 'S': + case 'u': + SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); + break; + case 'f': + Safefree(ri->data->data[n]); + break; + case 'l': + case 'L': + break; + case 'T': + { /* Aho Corasick add-on structure for a trie node. + Used in stclass optimization only */ + U32 refcount; + reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; +#ifdef USE_ITHREADS + dVAR; +#endif + OP_REFCNT_LOCK; + refcount = --aho->refcount; + OP_REFCNT_UNLOCK; + if ( !refcount ) { + PerlMemShared_free(aho->states); + PerlMemShared_free(aho->fail); + /* do this last!!!! */ + PerlMemShared_free(ri->data->data[n]); + /* we should only ever get called once, so + * assert as much, and also guard the free + * which /might/ happen twice. At the least + * it will make code anlyzers happy and it + * doesn't cost much. - Yves */ + assert(ri->regstclass); + if (ri->regstclass) { + PerlMemShared_free(ri->regstclass); + ri->regstclass = 0; + } + } + } + break; + case 't': + { + /* trie structure. */ + U32 refcount; + reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; +#ifdef USE_ITHREADS + dVAR; +#endif + OP_REFCNT_LOCK; + refcount = --trie->refcount; + OP_REFCNT_UNLOCK; + if ( !refcount ) { + PerlMemShared_free(trie->charmap); + PerlMemShared_free(trie->states); + PerlMemShared_free(trie->trans); + if (trie->bitmap) + PerlMemShared_free(trie->bitmap); + if (trie->jump) + PerlMemShared_free(trie->jump); + PerlMemShared_free(trie->wordinfo); + /* do this last!!!! */ + PerlMemShared_free(ri->data->data[n]); + } + } + break; + default: + Perl_croak(aTHX_ "panic: regfree data code '%c'", + ri->data->what[n]); + } + } + Safefree(ri->data->what); + Safefree(ri->data); + } + + Safefree(ri); +} + +#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t)) +#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t)) +#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) + +/* + re_dup - duplicate a regexp. + + This routine is expected to clone a given regexp structure. It is only + compiled under USE_ITHREADS. + + After all of the core data stored in struct regexp is duplicated + the regexp_engine.dupe method is used to copy any private data + stored in the *pprivate pointer. This allows extensions to handle + any duplication it needs to do. + + See pregfree() and regfree_internal() if you change anything here. +*/ +#if defined(USE_ITHREADS) +#ifndef PERL_IN_XSUB_RE +void +Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) +{ + dVAR; + I32 npar; + const struct regexp *r = ReANY(sstr); + struct regexp *ret = ReANY(dstr); + + PERL_ARGS_ASSERT_RE_DUP_GUTS; + + npar = r->nparens+1; + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + + if (ret->substrs) { + /* Do it this way to avoid reading from *r after the StructCopy(). + That way, if any of the sv_dup_inc()s dislodge *r from the L1 + cache, it doesn't matter. */ + const bool anchored = r->check_substr + ? r->check_substr == r->anchored_substr + : r->check_utf8 == r->anchored_utf8; + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); + ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); + ret->float_substr = sv_dup_inc(ret->float_substr, param); + ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + + if (ret->check_substr) { + if (anchored) { + assert(r->check_utf8 == r->anchored_utf8); + ret->check_substr = ret->anchored_substr; + ret->check_utf8 = ret->anchored_utf8; + } else { + assert(r->check_substr == r->float_substr); + assert(r->check_utf8 == r->float_utf8); + ret->check_substr = ret->float_substr; + ret->check_utf8 = ret->float_utf8; + } + } else if (ret->check_utf8) { + if (anchored) { + ret->check_utf8 = ret->anchored_utf8; + } else { + ret->check_utf8 = ret->float_utf8; + } + } + } + + RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); + ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); + + if (ret->pprivate) + RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); + + if (RX_MATCH_COPIED(dstr)) + ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); + else + ret->subbeg = NULL; +#ifdef PERL_ANY_COW + ret->saved_copy = NULL; +#endif + + /* Whether mother_re be set or no, we need to copy the string. We + cannot refrain from copying it when the storage points directly to + our mother regexp, because that's + 1: a buffer in a different thread + 2: something we no longer hold a reference on + so we need to copy it locally. */ + RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); + ret->mother_re = NULL; +} +#endif /* PERL_IN_XSUB_RE */ + +/* + regdupe_internal() + + This is the internal complement to regdupe() which is used to copy + the structure pointed to by the *pprivate pointer in the regexp. + This is the core version of the extension overridable cloning hook. + The regexp structure being duplicated will be copied by perl prior + to this and will be provided as the regexp *r argument, however + with the /old/ structures pprivate pointer value. Thus this routine + may override any copying normally done by perl. + + It returns a pointer to the new regexp_internal structure. +*/ + +void * +Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) +{ + dVAR; + struct regexp *const r = ReANY(rx); + regexp_internal *reti; + int len; + RXi_GET_DECL(r,ri); + + PERL_ARGS_ASSERT_REGDUPE_INTERNAL; + + len = ProgLen(ri); + + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), + char, regexp_internal); + Copy(ri->program, reti->program, len+1, regnode); + + reti->num_code_blocks = ri->num_code_blocks; + if (ri->code_blocks) { + int n; + Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block, + struct reg_code_block); + Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks, + struct reg_code_block); + for (n = 0; n < ri->num_code_blocks; n++) + reti->code_blocks[n].src_regex = (REGEXP*) + sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param); + } + else + reti->code_blocks = NULL; + + reti->regstclass = NULL; + + if (ri->data) { + struct reg_data *d; + const int count = ri->data->count; + int i; + + Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), + char, struct reg_data); + Newx(d->what, count, U8); + + d->count = count; + for (i = 0; i < count; i++) { + d->what[i] = ri->data->what[i]; + switch (d->what[i]) { + /* see also regcomp.h and regfree_internal() */ + case 'a': /* actually an AV, but the dup function is identical. */ + case 'r': + case 's': + case 'S': + case 'u': /* actually an HV, but the dup function is identical. */ + d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); + break; + case 'f': + /* This is cheating. */ + Newx(d->data[i], 1, regnode_ssc); + StructCopy(ri->data->data[i], d->data[i], regnode_ssc); + reti->regstclass = (regnode*)d->data[i]; + break; + case 'T': + /* Trie stclasses are readonly and can thus be shared + * without duplication. We free the stclass in pregfree + * when the corresponding reg_ac_data struct is freed. + */ + reti->regstclass= ri->regstclass; + /* FALLTHROUGH */ + case 't': + OP_REFCNT_LOCK; + ((reg_trie_data*)ri->data->data[i])->refcount++; + OP_REFCNT_UNLOCK; + /* FALLTHROUGH */ + case 'l': + case 'L': + d->data[i] = ri->data->data[i]; + break; + default: + Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", + ri->data->what[i]); + } + } + + reti->data = d; + } + else + reti->data = NULL; + + reti->name_list_idx = ri->name_list_idx; + +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) { + Newx(reti->u.offsets, 2*len+1, U32); + Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); + } +#else + SetProgLen(reti,len); +#endif + + return (void*)reti; +} + +#endif /* USE_ITHREADS */ + +#ifndef PERL_IN_XSUB_RE + +/* + - regnext - dig the "next" pointer out of a node + */ +regnode * +Perl_regnext(pTHX_ regnode *p) +{ + I32 offset; + + if (!p) + return(NULL); + + if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(p), (int)REGNODE_MAX); + } + + offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); + if (offset == 0) + return(NULL); + + return(p+offset); +} +#endif + +STATIC void +S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) +{ + va_list args; + STRLEN l1 = strlen(pat1); + STRLEN l2 = strlen(pat2); + char buf[512]; + SV *msv; + const char *message; + + PERL_ARGS_ASSERT_RE_CROAK2; + + if (l1 > 510) + l1 = 510; + if (l1 + l2 > 510) + l2 = 510 - l1; + Copy(pat1, buf, l1 , char); + Copy(pat2, buf + l1, l2 , char); + buf[l1 + l2] = '\n'; + buf[l1 + l2 + 1] = '\0'; + va_start(args, pat2); + msv = vmess(buf, &args); + va_end(args); + message = SvPV_const(msv,l1); + if (l1 > 512) + l1 = 512; + Copy(message, buf, l1 , char); + /* l1-1 to avoid \n */ + Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); +} + +/* XXX Here's a total kludge. But we need to re-enter for swash routines. */ + +#ifndef PERL_IN_XSUB_RE +void +Perl_save_re_context(pTHX) +{ + /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ + if (PL_curpm) { + const REGEXP * const rx = PM_GETRE(PL_curpm); + if (rx) { + U32 i; + for (i = 1; i <= RX_NPARENS(rx); i++) { + char digits[TYPE_CHARS(long)]; + const STRLEN len = my_snprintf(digits, sizeof(digits), + "%lu", (long)i); + GV *const *const gvp + = (GV**)hv_fetch(PL_defstash, digits, len, 0); + + if (gvp) { + GV * const gv = *gvp; + if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) + save_scalar(gv); + } + } + } + } +} +#endif + +#ifdef DEBUGGING + +STATIC void +S_put_byte(pTHX_ SV *sv, int c) +{ + PERL_ARGS_ASSERT_PUT_BYTE; + + if (!isPRINT(c)) { + switch (c) { + case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; + case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; + case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; + case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; + case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; + + default: + Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + break; + } + } + else { + const char string = c; + if (c == '-' || c == ']' || c == '\\' || c == '^') + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } +} + +STATIC void +S_put_range(pTHX_ SV *sv, UV start, UV end) +{ + + /* Appends to 'sv' a displayable version of the range of code points from + * 'start' to 'end'. It assumes that only ASCII printables are displayable + * as-is (though some of these will be escaped by put_byte()). For the + * time being, this subroutine only works for latin1 (< 256) code points */ + + assert(start <= end); + + PERL_ARGS_ASSERT_PUT_RANGE; + + while (start <= end) { + if (end - start < 3) { /* Individual chars in short ranges */ + for (; start <= end; start++) { + put_byte(sv, start); + } + break; + } + + /* For small ranges that include printable ASCII characters, it's more + * legible to print those characters rather than hex values. For + * larger ranges that include more than printables, it's probably + * clearer to just give the start and end points of the range in hex, + * and that's all we can do if there aren't any printables within the + * range + * + * On ASCII platforms the range of printables is contiguous. If the + * entire range is printable, we print each character as such. If the + * range is partially printable and partially not, it's less likely + * that the individual printables are meaningful, especially if all or + * almost all of them are in the range. But we err on the side of the + * individual printables being meaningful by using the hex only if the + * range contains all but 2 of the printables. + * + * On EBCDIC platforms, the printables are scattered around so that the + * maximum range length containing only them is about 10. Anything + * longer we treat as hex; otherwise we examine the range character by + * character to see */ +#ifdef EBCDIC + if (start < 256 && (((end < 255) ? end : 255) - start <= 10)) +#else + if ((isPRINT_A(start) && isPRINT_A(end)) + || (end >= 0x7F && (isPRINT_A(start) && start > 0x21)) + || ((end < 0x7D && isPRINT_A(end)) && start < 0x20)) +#endif + { + /* If the range beginning isn't an ASCII printable, we find the + * last such in the range, then split the output, so all the + * non-printables are in one subrange; then process the remaining + * portion as usual. If the entire range isn't printables, we + * don't split, but drop down to print as hex */ + if (! isPRINT_A(start)) { + UV temp_end = start + 1; + while (temp_end <= end && ! isPRINT_A(temp_end)) { + temp_end++; + } + if (temp_end <= end) { + put_range(sv, start, temp_end - 1); + start = temp_end; + continue; + } + } + + /* If the range beginning is a digit, output a subrange of just the + * digits, then process the remaining portion as usual */ + if (isDIGIT_A(start)) { + put_byte(sv, start); + sv_catpvs(sv, "-"); + while (start <= end && isDIGIT_A(start)) start++; + put_byte(sv, start - 1); + continue; + } + + /* Similarly for alphabetics. Because in both ASCII and EBCDIC, + * the code points for upper and lower A-Z and a-z aren't + * intermixed, the resulting subrange will consist solely of either + * upper- or lower- alphabetics */ + if (isALPHA_A(start)) { + put_byte(sv, start); + sv_catpvs(sv, "-"); + while (start <= end && isALPHA_A(start)) start++; + put_byte(sv, start - 1); + continue; + } + + /* We output any remaining printables as individual characters */ + if (isPUNCT_A(start) || isSPACE_A(start)) { + while (start <= end && (isPUNCT_A(start) || isSPACE_A(start))) { + put_byte(sv, start); + start++; + } + continue; + } + } + + /* Here is a control or non-ascii. Output the range or subrange as + * hex. */ + Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", + start, + (end < 256) ? end : 255); + break; + } +} + +STATIC bool +S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually + * output anything */ + + int i; + bool has_output_anything = FALSE; + + PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; + + for (i = 0; i < 256; i++) { + if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) { + + /* The character at index i should be output. Find the next + * character that should NOT be output */ + int j; + for (j = i + 1; j <= 256; j++) { + if (! BITMAP_TEST((U8 *) bitmap, j)) { + break; + } + } + + /* Everything between them is a single range that should be output + * */ + put_range(sv, i, j - 1); + has_output_anything = TRUE; + i = j; + } + } + + return has_output_anything; +} + +#define CLEAR_OPTSTART \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ + " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ + } STMT_END + +#define DUMPUNTIL(b,e) \ + CLEAR_OPTSTART; \ + node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); + +STATIC const regnode * +S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, + const regnode *last, const regnode *plast, + SV* sv, I32 indent, U32 depth) +{ + dVAR; + U8 op = PSEUDO; /* Arbitrary non-END op. */ + const regnode *next; + const regnode *optstart= NULL; + + RXi_GET_DECL(r,ri); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMPUNTIL; + +#ifdef DEBUG_DUMPUNTIL + PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, + last ? last-start : 0,plast ? plast-start : 0); +#endif + + if (plast && plast < last) + last= plast; + + while (PL_regkind[op] != END && (!last || node < last)) { + assert(node); + /* While that wasn't END last time... */ + NODE_ALIGN(node); + op = OP(node); + if (op == CLOSE || op == WHILEM) + indent--; + next = regnext((regnode *)node); + + /* Where, what. */ + if (OP(node) == OPTIMIZED) { + if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) + optstart = node; + else + goto after_print; + } else + CLEAR_OPTSTART; + + regprop(r, sv, node, NULL); + PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), + (int)(2*indent + 1), "", SvPVX_const(sv)); + + if (OP(node) != OPTIMIZED) { + if (next == NULL) /* Next ptr. */ + PerlIO_printf(Perl_debug_log, " (0)"); + else if (PL_regkind[(U8)op] == BRANCH + && PL_regkind[OP(next)] != BRANCH ) + PerlIO_printf(Perl_debug_log, " (FAIL)"); + else + PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); + (void)PerlIO_putc(Perl_debug_log, '\n'); + } + + after_print: + if (PL_regkind[(U8)op] == BRANCHJ) { + assert(next); + { + const regnode *nnode = (OP(next) == LONGJMP + ? regnext((regnode *)next) + : next); + if (last && nnode > last) + nnode = last; + DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); + } + } + else if (PL_regkind[(U8)op] == BRANCH) { + assert(next); + DUMPUNTIL(NEXTOPER(node), next); + } + else if ( PL_regkind[(U8)op] == TRIE ) { + const regnode *this_trie = node; + const char op = OP(node); + const U32 n = ARG(node); + const reg_ac_data * const ac = op>=AHOCORASICK ? + (reg_ac_data *)ri->data->data[n] : + NULL; + const reg_trie_data * const trie = + (reg_trie_data*)ri->data->data[optrie]; +#ifdef DEBUGGING + AV *const trie_words + = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); +#endif + const regnode *nextbranch= NULL; + I32 word_idx; + sv_setpvs(sv, ""); + for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { + SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); + + PerlIO_printf(Perl_debug_log, "%*s%s ", + (int)(2*(indent+3)), "", + elem_ptr + ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), + SvCUR(*elem_ptr), 60, + PL_colors[0], PL_colors[1], + (SvUTF8(*elem_ptr) + ? PERL_PV_ESCAPE_UNI + : 0) + | PERL_PV_PRETTY_ELLIPSES + | PERL_PV_PRETTY_LTGT + ) + : "???" + ); + if (trie->jump) { + U16 dist= trie->jump[word_idx+1]; + PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", + (UV)((dist ? this_trie + dist : next) - start)); + if (dist) { + if (!nextbranch) + nextbranch= this_trie + trie->jump[0]; + DUMPUNTIL(this_trie + dist, nextbranch); + } + if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) + nextbranch= regnext((regnode *)nextbranch); + } else { + PerlIO_printf(Perl_debug_log, "\n"); + } + } + if (last && next > last) + node= last; + else + node= next; + } + else if ( op == CURLY ) { /* "next" might be very big: optimizer */ + DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, + NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); + } + else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { + assert(next); + DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); + } + else if ( op == PLUS || op == STAR) { + DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); + } + else if (PL_regkind[(U8)op] == ANYOF) { + /* arglen 1 + class block */ + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); + node = NEXTOPER(node); + } + else if (PL_regkind[(U8)op] == EXACT) { + /* Literal string, where present. */ + node += NODE_SZ_STR(node) - 1; + node = NEXTOPER(node); + } + else { + node = NEXTOPER(node); + node += regarglen[(U8)op]; + } + if (op == CURLYX || op == OPEN) + indent++; + } + CLEAR_OPTSTART; +#ifdef DEBUG_DUMPUNTIL + PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); +#endif + return node; +} + +#endif /* DEBUGGING */ + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * ex: set ts=8 sts=4 sw=4 et: + */ diff --git a/src/5021002/regexec.c b/src/5021002/regexec.c new file mode 100644 index 0000000..f158aaa --- /dev/null +++ b/src/5021002/regexec.c @@ -0,0 +1,8176 @@ +/* regexec.c + */ + +/* + * One Ring to rule them all, One Ring to find them + & + * [p.v of _The Lord of the Rings_, opening poem] + * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] + * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"] + */ + +/* This file contains functions for executing a regular expression. See + * also regcomp.c which funnily enough, contains functions for compiling + * a regular expression. + * + * This file is also copied at build time to ext/re/re_exec.c, where + * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. + * This causes the main functions to be compiled under new names and with + * debugging support added, which makes "use re 'debug'" work. + */ + +/* NOTE: this is derived from Henry Spencer's regexp code, and should not + * confused with the original package (see point 3 below). Thanks, Henry! + */ + +/* Additional note: this code is very heavily munged from Henry's version + * in places. In some spots I've traded clarity for efficiency, so don't + * blame Henry for some of the lack of readability. + */ + +/* The names of the functions have been changed from regcomp and + * regexec to pregcomp and pregexec in order to avoid conflicts + * with the POSIX routines of the same names. +*/ + +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +/* + * pregcomp and pregexec -- regsub and regerror are not used in perl + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. + * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * + **** Alterations to Henry's code are... + **** + **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + **** by Larry Wall and others + **** + **** You may distribute under the terms of either the GNU General Public + **** License or the Artistic License, as specified in the README file. + * + * Beware that some of this code is subtly aware of the way operator + * precedence is structured in regular expressions. Serious changes in + * regular-expression syntax might require a total rethink. + */ +#include "EXTERN.h" +#define PERL_IN_REGEXEC_C +#include "perl.h" +#include "re_defs.h" + +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +#else +# include "regcomp.h" +#endif + +#include "inline_invlist.c" +#include "unicode_constants.h" + +#ifdef DEBUGGING +/* At least one required character in the target string is expressible only in + * UTF-8. */ +static const char* const non_utf8_target_but_utf8_required + = "Can't match, because target string needs to be in UTF-8\n"; +#endif + +#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ + goto target; \ +} STMT_END + +#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) + +#ifndef STATIC +#define STATIC static +#endif + +/* Valid only for non-utf8 strings: avoids the reginclass + * call if there are no complications: i.e., if everything matchable is + * straight forward in the bitmap */ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \ + : ANYOF_BITMAP_TEST(p,*(c))) + +/* + * Forwards. + */ + +#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) +#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b) + +#define HOPc(pos,off) \ + (char *)(reginfo->is_utf8_target \ + ? reghop3((U8*)pos, off, \ + (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ + : (U8*)(pos + off)) + +#define HOPBACKc(pos, off) \ + (char*)(reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \ + : (pos - off >= reginfo->strbeg) \ + ? (U8*)pos - off \ + : NULL) + +#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) +#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) + +/* lim must be +ve. Returns NULL on overshoot */ +#define HOPMAYBE3(pos,off,lim) \ + (reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ + : ((U8*)pos + off <= lim) \ + ? (U8*)pos + off \ + : NULL) + +/* like HOP3, but limits the result to <= lim even for the non-utf8 case. + * off must be >=0; args should be vars rather than expressions */ +#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ + ? reghop3((U8*)(pos), off, (U8*)(lim)) \ + : (U8*)((pos + off) > lim ? lim : (pos + off))) + +#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ + ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ + : (U8*)(pos + off)) +#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) + +#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ +#define NEXTCHR_IS_EOS (nextchr < 0) + +#define SET_nextchr \ + nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) + +#define SET_locinput(p) \ + locinput = (p); \ + SET_nextchr + + +#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \ + if (!swash_ptr) { \ + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \ + swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \ + 1, 0, invlist, &flags); \ + assert(swash_ptr); \ + } \ + } STMT_END + +/* If in debug mode, we test that a known character properly matches */ +#ifdef DEBUGGING +# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ + property_name, \ + invlist, \ + utf8_char_in_property) \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \ + assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); +#else +# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ + property_name, \ + invlist, \ + utf8_char_in_property) \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) +#endif + +#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ + PL_utf8_swash_ptrs[_CC_WORDCHAR], \ + "", \ + PL_XPosix_ptrs[_CC_WORDCHAR], \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); + +#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ + STMT_START { \ + LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ + "_X_regular_begin", \ + NULL, \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ + LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ + "_X_extend", \ + NULL, \ + COMBINING_GRAVE_ACCENT_UTF8); \ + } STMT_END + +#define PLACEHOLDER /* Something for the preprocessor to grab onto */ +/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ + +/* for use after a quantifier and before an EXACT-like node -- japhy */ +/* it would be nice to rework regcomp.sym to generate this stuff. sigh + * + * NOTE that *nothing* that affects backtracking should be in here, specifically + * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a + * node that is in between two EXACT like nodes when ascertaining what the required + * "follow" character is. This should probably be moved to regex compile time + * although it may be done at run time beause of the REF possibility - more + * investigation required. -- demerphq +*/ +#define JUMPABLE(rn) ( \ + OP(rn) == OPEN || \ + (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ + OP(rn) == EVAL || \ + OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ + OP(rn) == PLUS || OP(rn) == MINMOD || \ + OP(rn) == KEEPS || \ + (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ +) +#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) + +#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF ) + +#if 0 +/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so + we don't need this definition. */ +#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) + +#else +/* ... so we use this as its faster. */ +#define IS_TEXT(rn) ( OP(rn)==EXACT ) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) +#define IS_TEXTF(rn) ( OP(rn)==EXACTF ) +#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) + +#endif + +/* + Search for mandatory following text node; for lookahead, the text must + follow but for lookbehind (rn->flags != 0) we skip to the next step. +*/ +#define FIND_NEXT_IMPT(rn) STMT_START { \ + while (JUMPABLE(rn)) { \ + const OPCODE type = OP(rn); \ + if (type == SUSPEND || PL_regkind[type] == CURLY) \ + rn = NEXTOPER(NEXTOPER(rn)); \ + else if (type == PLUS) \ + rn = NEXTOPER(rn); \ + else if (type == IFMATCH) \ + rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ + else rn += NEXT_OFF(rn); \ + } \ +} STMT_END + +/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode. + * These are for the pre-composed Hangul syllables, which are all in a + * contiguous block and arranged there in such a way so as to facilitate + * alorithmic determination of their characteristics. As such, they don't need + * a swash, but can be determined by simple arithmetic. Almost all are + * GCB=LVT, but every 28th one is a GCB=LV */ +#define SBASE 0xAC00 /* Start of block */ +#define SCount 11172 /* Length of block */ +#define TCount 28 + +#define SLAB_FIRST(s) (&(s)->states[0]) +#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) + +static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); +static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); +static regmatch_state * S_push_slab(pTHX); + +#define REGCP_PAREN_ELEMS 3 +#define REGCP_OTHER_ELEMS 3 +#define REGCP_FRAME_ELEMS 1 +/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and + * are needed for the regexp context stack bookkeeping. */ + +STATIC CHECKPOINT +S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) +{ + const int retval = PL_savestack_ix; + const int paren_elems_to_push = + (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS; + const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; + const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; + I32 p; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCPPUSH; + + if (paren_elems_to_push < 0) + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u", + (int)paren_elems_to_push, (int)maxopenparen, + (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS); + + if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) + Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf + " out of range (%lu-%ld)", + total_elems, + (unsigned long)maxopenparen, + (long)parenfloor); + + SSGROW(total_elems + REGCP_FRAME_ELEMS); + + DEBUG_BUFFERS_r( + if ((int)maxopenparen > (int)parenfloor) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { +/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ + SSPUSHIV(rex->offs[p].end); + SSPUSHIV(rex->offs[p].start); + SSPUSHINT(rex->offs[p].start_tmp); + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", + (UV)p, + (IV)rex->offs[p].start, + (IV)rex->offs[p].start_tmp, + (IV)rex->offs[p].end + )); + } +/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ + SSPUSHINT(maxopenparen); + SSPUSHINT(rex->lastparen); + SSPUSHINT(rex->lastcloseparen); + SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ + + return retval; +} + +/* These are needed since we do not localize EVAL nodes: */ +#define REGCP_SET(cp) \ + DEBUG_STATE_r( \ + PerlIO_printf(Perl_debug_log, \ + " Setting an EVAL scope, savestack=%"IVdf"\n", \ + (IV)PL_savestack_ix)); \ + cp = PL_savestack_ix + +#define REGCP_UNWIND(cp) \ + DEBUG_STATE_r( \ + if (cp != PL_savestack_ix) \ + PerlIO_printf(Perl_debug_log, \ + " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \ + (IV)(cp), (IV)PL_savestack_ix)); \ + regcpblow(cp) + +#define UNWIND_PAREN(lp, lcp) \ + for (n = rex->lastparen; n > lp; n--) \ + rex->offs[n].end = -1; \ + rex->lastparen = n; \ + rex->lastcloseparen = lcp; + + +STATIC void +S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) +{ + UV i; + U32 paren; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCPPOP; + + /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ + i = SSPOPUV; + assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ + i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ + rex->lastcloseparen = SSPOPINT; + rex->lastparen = SSPOPINT; + *maxopenparen_p = SSPOPINT; + + i -= REGCP_OTHER_ELEMS; + /* Now restore the parentheses context. */ + DEBUG_BUFFERS_r( + if (i || rex->lastparen + 1 <= rex->nparens) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + paren = *maxopenparen_p; + for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { + SSize_t tmps; + rex->offs[paren].start_tmp = SSPOPINT; + rex->offs[paren].start = SSPOPIV; + tmps = SSPOPIV; + if (paren <= rex->lastparen) + rex->offs[paren].end = tmps; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n", + (UV)paren, + (IV)rex->offs[paren].start, + (IV)rex->offs[paren].start_tmp, + (IV)rex->offs[paren].end, + (paren > rex->lastparen ? "(skipped)" : "")); + ); + paren--; + } +#if 1 + /* It would seem that the similar code in regtry() + * already takes care of this, and in fact it is in + * a better location to since this code can #if 0-ed out + * but the code in regtry() is needed or otherwise tests + * requiring null fields (pat.t#187 and split.t#{13,14} + * (as of patchlevel 7877) will fail. Then again, + * this code seems to be necessary or otherwise + * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ + * --jhi updated by dapm */ + for (i = rex->lastparen + 1; i <= rex->nparens; i++) { + if (i > *maxopenparen_p) + rex->offs[i].start = -1; + rex->offs[i].end = -1; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %s ..-1 undeffing\n", + (UV)i, + (i > *maxopenparen_p) ? "-1" : " " + )); + } +#endif +} + +/* restore the parens and associated vars at savestack position ix, + * but without popping the stack */ + +STATIC void +S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p) +{ + I32 tmpix = PL_savestack_ix; + PL_savestack_ix = ix; + regcppop(rex, maxopenparen_p); + PL_savestack_ix = tmpix; +} + +#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ + +STATIC bool +S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) +{ + /* Returns a boolean as to whether or not 'character' is a member of the + * Posix character class given by 'classnum' that should be equivalent to a + * value in the typedef '_char_class_number'. + * + * Ideally this could be replaced by a just an array of function pointers + * to the C library functions that implement the macros this calls. + * However, to compile, the precise function signatures are required, and + * these may vary from platform to to platform. To avoid having to figure + * out what those all are on each platform, I (khw) am using this method, + * which adds an extra layer of function call overhead (unless the C + * optimizer strips it away). But we don't particularly care about + * performance with locales anyway. */ + + switch ((_char_class_number) classnum) { + case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character); + case _CC_ENUM_ALPHA: return isALPHA_LC(character); + case _CC_ENUM_ASCII: return isASCII_LC(character); + case _CC_ENUM_BLANK: return isBLANK_LC(character); + case _CC_ENUM_CASED: return isLOWER_LC(character) + || isUPPER_LC(character); + case _CC_ENUM_CNTRL: return isCNTRL_LC(character); + case _CC_ENUM_DIGIT: return isDIGIT_LC(character); + case _CC_ENUM_GRAPH: return isGRAPH_LC(character); + case _CC_ENUM_LOWER: return isLOWER_LC(character); + case _CC_ENUM_PRINT: return isPRINT_LC(character); + case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character); + case _CC_ENUM_PUNCT: return isPUNCT_LC(character); + case _CC_ENUM_SPACE: return isSPACE_LC(character); + case _CC_ENUM_UPPER: return isUPPER_LC(character); + case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character); + case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character); + default: /* VERTSPACE should never occur in locales */ + Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum); + } + + assert(0); /* NOTREACHED */ + return FALSE; +} + +STATIC bool +S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) +{ + /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded + * 'character' is a member of the Posix character class given by 'classnum' + * that should be equivalent to a value in the typedef + * '_char_class_number'. + * + * This just calls isFOO_lc on the code point for the character if it is in + * the range 0-255. Outside that range, all characters avoid Unicode + * rules, ignoring any locale. So use the Unicode function if this class + * requires a swash, and use the Unicode macro otherwise. */ + + PERL_ARGS_ASSERT_ISFOO_UTF8_LC; + + if (UTF8_IS_INVARIANT(*character)) { + return isFOO_lc(classnum, *character); + } + else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { + return isFOO_lc(classnum, + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); + } + + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Initialize the swash unless done already */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + + return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) + character, + TRUE /* is UTF */ )); + } + + switch ((_char_class_number) classnum) { + case _CC_ENUM_SPACE: + case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character); + + case _CC_ENUM_BLANK: return is_HORIZWS_high(character); + case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character); + case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character); + default: return 0; /* Things like CNTRL are always + below 256 */ + } + + assert(0); /* NOTREACHED */ + return FALSE; +} + +/* + * pregexec and friends + */ + +#ifndef PERL_IN_XSUB_RE +/* + - pregexec - match a regexp against a string + */ +I32 +Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, + char *strbeg, SSize_t minend, SV *screamer, U32 nosave) +/* stringarg: the point in the string at which to begin matching */ +/* strend: pointer to null at end of string */ +/* strbeg: real beginning of string */ +/* minend: end of match must be >= minend bytes after stringarg. */ +/* screamer: SV being matched: only used for utf8 flag, pos() etc; string + * itself is accessed via the pointers above */ +/* nosave: For optimizations. */ +{ + PERL_ARGS_ASSERT_PREGEXEC; + + return + regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, + nosave ? 0 : REXEC_COPY_STR); +} +#endif + + + +/* re_intuit_start(): + * + * Based on some optimiser hints, try to find the earliest position in the + * string where the regex could match. + * + * rx: the regex to match against + * sv: the SV being matched: only used for utf8 flag; the string + * itself is accessed via the pointers below. Note that on + * something like an overloaded SV, SvPOK(sv) may be false + * and the string pointers may point to something unrelated to + * the SV itself. + * strbeg: real beginning of string + * strpos: the point in the string at which to begin matching + * strend: pointer to the byte following the last char of the string + * flags currently unused; set to 0 + * data: currently unused; set to NULL + * + * The basic idea of re_intuit_start() is to use some known information + * about the pattern, namely: + * + * a) the longest known anchored substring (i.e. one that's at a + * constant offset from the beginning of the pattern; but not + * necessarily at a fixed offset from the beginning of the + * string); + * b) the longest floating substring (i.e. one that's not at a constant + * offset from the beginning of the pattern); + * c) Whether the pattern is anchored to the string; either + * an absolute anchor: /^../, or anchored to \n: /^.../m, + * or anchored to pos(): /\G/; + * d) A start class: a real or synthetic character class which + * represents which characters are legal at the start of the pattern; + * + * to either quickly reject the match, or to find the earliest position + * within the string at which the pattern might match, thus avoiding + * running the full NFA engine at those earlier locations, only to + * eventually fail and retry further along. + * + * Returns NULL if the pattern can't match, or returns the address within + * the string which is the earliest place the match could occur. + * + * The longest of the anchored and floating substrings is called 'check' + * and is checked first. The other is called 'other' and is checked + * second. The 'other' substring may not be present. For example, + * + * /(abc|xyz)ABC\d{0,3}DEFG/ + * + * will have + * + * check substr (float) = "DEFG", offset 6..9 chars + * other substr (anchored) = "ABC", offset 3..3 chars + * stclass = [ax] + * + * Be aware that during the course of this function, sometimes 'anchored' + * refers to a substring being anchored relative to the start of the + * pattern, and sometimes to the pattern itself being anchored relative to + * the string. For example: + * + * /\dabc/: "abc" is anchored to the pattern; + * /^\dabc/: "abc" is anchored to the pattern and the string; + * /\d+abc/: "abc" is anchored to neither the pattern nor the string; + * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, + * but the pattern is anchored to the string. + */ + +char * +Perl_re_intuit_start(pTHX_ + REGEXP * const rx, + SV *sv, + const char * const strbeg, + char *strpos, + char *strend, + const U32 flags, + re_scream_pos_data *data) +{ + struct regexp *const prog = ReANY(rx); + SSize_t start_shift = prog->check_offset_min; + /* Should be nonnegative! */ + SSize_t end_shift = 0; + /* current lowest pos in string where the regex can start matching */ + char *rx_origin = strpos; + SV *check; + const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ + U8 other_ix = 1 - prog->substrs->check_ix; + bool ml_anch = 0; + char *other_last = strpos;/* latest pos 'other' substr already checked to */ + char *check_at = NULL; /* check substr found at this pos */ + const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; + RXi_GET_DECL(prog,progi); + regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ + regmatch_info *const reginfo = ®info_buf; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_START; + PERL_UNUSED_ARG(flags); + PERL_UNUSED_ARG(data); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: trying to determine minimum start position...\n")); + + /* for now, assume that all substr offsets are positive. If at some point + * in the future someone wants to do clever things with look-behind and + * -ve offsets, they'll need to fix up any code in this function + * which uses these offsets. See the thread beginning + * <20140113145929.GF27210@iabyn.com> + */ + assert(prog->substrs->data[0].min_offset >= 0); + assert(prog->substrs->data[0].max_offset >= 0); + assert(prog->substrs->data[1].min_offset >= 0); + assert(prog->substrs->data[1].max_offset >= 0); + assert(prog->substrs->data[2].min_offset >= 0); + assert(prog->substrs->data[2].max_offset >= 0); + + /* for now, assume that if both present, that the floating substring + * doesn't start before the anchored substring. + * If you break this assumption (e.g. doing better optimisations + * with lookahead/behind), then you'll need to audit the code in this + * function carefully first + */ + assert( + ! ( (prog->anchored_utf8 || prog->anchored_substr) + && (prog->float_utf8 || prog->float_substr)) + || (prog->float_min_offset >= prog->anchored_offset)); + + /* byte rather than char calculation for efficiency. It fails + * to quickly reject some cases that can't match, but will reject + * them later after doing full char arithmetic */ + if (prog->minlen > strend - strpos) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too short...\n")); + goto fail; + } + + reginfo->is_utf8_target = cBOOL(utf8_target); + reginfo->info_aux = NULL; + reginfo->strbeg = strbeg; + reginfo->strend = strend; + reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); + reginfo->intuit = 1; + /* not actually used within intuit, but zero for safety anyway */ + reginfo->poscache_maxiter = 0; + + if (utf8_target) { + if (!prog->check_utf8 && prog->check_substr) + to_utf8_substr(prog); + check = prog->check_utf8; + } else { + if (!prog->check_substr && prog->check_utf8) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); + } + } + check = prog->check_substr; + } + + /* dump the various substring data */ + DEBUG_OPTIMISE_MORE_r({ + int i; + for (i=0; i<=2; i++) { + SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr + : prog->substrs->data[i].substr); + if (!sv) + continue; + + PerlIO_printf(Perl_debug_log, + " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf + " useful=%"IVdf" utf8=%d [%s]\n", + i, + (IV)prog->substrs->data[i].min_offset, + (IV)prog->substrs->data[i].max_offset, + (IV)prog->substrs->data[i].end_shift, + BmUSEFUL(sv), + utf8_target ? 1 : 0, + SvPEEK(sv)); + } + }); + + if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ + + /* ml_anch: check after \n? + * + * A note about IMPLICIT: on an un-anchored pattern beginning + * with /.*.../, these flags will have been added by the + * compiler: + * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL + * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL + */ + ml_anch = (prog->intflags & PREGf_ANCH_MBOL) + && !(prog->intflags & PREGf_IMPLICIT); + + if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { + /* we are only allowed to match at BOS or \G */ + + /* trivially reject if there's a BOS anchor and we're not at BOS. + * + * Note that we don't try to do a similar quick reject for + * \G, since generally the caller will have calculated strpos + * based on pos() and gofs, so the string is already correctly + * anchored by definition; and handling the exceptions would + * be too fiddly (e.g. REXEC_IGNOREPOS). + */ + if ( strpos != strbeg + && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Not at start...\n")); + goto fail; + } + + /* in the presence of an anchor, the anchored (relative to the + * start of the regex) substr must also be anchored relative + * to strpos. So quickly reject if substr isn't found there. + * This works for \G too, because the caller will already have + * subtracted gofs from pos, and gofs is the offset from the + * \G to the start of the regex. For example, in /.abc\Gdef/, + * where substr="abcdef", pos()=3, gofs=4, offset_min=1: + * caller will have set strpos=pos()-4; we look for the substr + * at position pos()-4+1, which lines up with the "a" */ + + if (prog->check_offset_min == prog->check_offset_max + && !(prog->intflags & PREGf_CANY_SEEN)) + { + /* Substring at constant offset from beg-of-str... */ + SSize_t slen = SvCUR(check); + char *s = HOP3c(strpos, prog->check_offset_min, strend); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Looking for check substr at fixed offset %"IVdf"...\n", + (IV)prog->check_offset_min)); + + if (SvTAIL(check)) { + /* In this case, the regex is anchored at the end too. + * Unless it's a multiline match, the lengths must match + * exactly, give or take a \n. NB: slen >= 1 since + * the last char of check is \n */ + if (!multiline + && ( strend - s > slen + || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n'))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too long...\n")); + goto fail_finish; + } + /* Now should match s[0..slen-2] */ + slen--; + } + if (slen && (*SvPVX_const(check) != *s + || (slen > 1 && memNE(SvPVX_const(check), s, slen)))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String not equal...\n")); + goto fail_finish; + } + + check_at = s; + goto success_at_start; + } + } + } + + end_shift = prog->check_end_shift; + +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ + if (end_shift < 0) + Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", + (IV)end_shift, RX_PRECOMP(prog)); +#endif + + restart: + + /* This is the (re)entry point of the main loop in this function. + * The goal of this loop is to: + * 1) find the "check" substring in the region rx_origin..strend + * (adjusted by start_shift / end_shift). If not found, reject + * immediately. + * 2) If it exists, look for the "other" substr too if defined; for + * example, if the check substr maps to the anchored substr, then + * check the floating substr, and vice-versa. If not found, go + * back to (1) with rx_origin suitably incremented. + * 3) If we find an rx_origin position that doesn't contradict + * either of the substrings, then check the possible additional + * constraints on rx_origin of /^.../m or a known start class. + * If these fail, then depending on which constraints fail, jump + * back to here, or to various other re-entry points further along + * that skip some of the first steps. + * 4) If we pass all those tests, update the BmUSEFUL() count on the + * substring. If the start position was determined to be at the + * beginning of the string - so, not rejected, but not optimised, + * since we have to run regmatch from position 0 - decrement the + * BmUSEFUL() count. Otherwise increment it. + */ + + + /* first, look for the 'check' substring */ + + { + U8* start_point; + U8* end_point; + + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, + " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf + " Start shift: %"IVdf" End shift %"IVdf + " Real end Shift: %"IVdf"\n", + (IV)(rx_origin - strpos), + (IV)prog->check_offset_min, + (IV)start_shift, + (IV)end_shift, + (IV)prog->check_end_shift); + }); + + if (prog->intflags & PREGf_CANY_SEEN) { + start_point= (U8*)(rx_origin + start_shift); + end_point= (U8*)(strend - end_shift); + if (start_point > end_point) + goto fail_finish; + } else { + end_point = HOP3(strend, -end_shift, strbeg); + start_point = HOPMAYBE3(rx_origin, start_shift, end_point); + if (!start_point) + goto fail_finish; + } + + + /* If the regex is absolutely anchored to either the start of the + * string (BOL,SBOL) or to pos() (ANCH_GPOS), then + * check_offset_max represents an upper bound on the string where + * the substr could start. For the ANCH_GPOS case, we assume that + * the caller of intuit will have already set strpos to + * pos()-gofs, so in this case strpos + offset_max will still be + * an upper bound on the substr. + */ + if (!ml_anch + && prog->intflags & PREGf_ANCH + && prog->check_offset_max != SSize_t_MAX) + { + SSize_t len = SvCUR(check) - !!SvTAIL(check); + const char * const anchor = + (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); + + /* do a bytes rather than chars comparison. It's conservative; + * so it skips doing the HOP if the result can't possibly end + * up earlier than the old value of end_point. + */ + if ((char*)end_point - anchor > prog->check_offset_max) { + end_point = HOP3lim((U8*)anchor, + prog->check_offset_max, + end_point -len) + + len; + } + } + + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n", + (int)(end_point - start_point), + (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), + start_point); + }); + + check_at = fbm_instr( start_point, end_point, + check, multiline ? FBMrf_MULTILINE : 0); + + /* Update the count-of-usability, remove useless subpatterns, + unshift s. */ + + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(check), RE_SV_DUMPLEN(check), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s", + (check_at ? "Found" : "Did not find"), + (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) + ? "anchored" : "floating"), + quoted, + RE_SV_TAIL(check), + (check_at ? " at offset " : "...\n") ); + }); + + if (!check_at) + goto fail_finish; + /* Finish the diagnostic message */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) ); + + /* set rx_origin to the minimum position where the regex could start + * matching, given the constraint of the just-matched check substring. + * But don't set it lower than previously. + */ + + if (check_at - rx_origin > prog->check_offset_max) + rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); + } + + + /* now look for the 'other' substring if defined */ + + if (utf8_target ? prog->substrs->data[other_ix].utf8_substr + : prog->substrs->data[other_ix].substr) + { + /* Take into account the "other" substring. */ + char *last, *last1; + char *s; + SV* must; + struct reg_substr_datum *other; + + do_other_substr: + other = &prog->substrs->data[other_ix]; + + /* if "other" is anchored: + * we've previously found a floating substr starting at check_at. + * This means that the regex origin must lie somewhere + * between min (rx_origin): HOP3(check_at, -check_offset_max) + * and max: HOP3(check_at, -check_offset_min) + * (except that min will be >= strpos) + * So the fixed substr must lie somewhere between + * HOP3(min, anchored_offset) + * HOP3(max, anchored_offset) + SvCUR(substr) + */ + + /* if "other" is floating + * Calculate last1, the absolute latest point where the + * floating substr could start in the string, ignoring any + * constraints from the earlier fixed match. It is calculated + * as follows: + * + * strend - prog->minlen (in chars) is the absolute latest + * position within the string where the origin of the regex + * could appear. The latest start point for the floating + * substr is float_min_offset(*) on from the start of the + * regex. last1 simply combines thee two offsets. + * + * (*) You might think the latest start point should be + * float_max_offset from the regex origin, and technically + * you'd be correct. However, consider + * /a\d{2,4}bcd\w/ + * Here, float min, max are 3,5 and minlen is 7. + * This can match either + * /a\d\dbcd\w/ + * /a\d\d\dbcd\w/ + * /a\d\d\d\dbcd\w/ + * In the first case, the regex matches minlen chars; in the + * second, minlen+1, in the third, minlen+2. + * In the first case, the floating offset is 3 (which equals + * float_min), in the second, 4, and in the third, 5 (which + * equals float_max). In all cases, the floating string bcd + * can never start more than 4 chars from the end of the + * string, which equals minlen - float_min. As the substring + * starts to match more than float_min from the start of the + * regex, it makes the regex match more than minlen chars, + * and the two cancel each other out. So we can always use + * float_min - minlen, rather than float_max - minlen for the + * latest position in the string. + * + * Note that -minlen + float_min_offset is equivalent (AFAIKT) + * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift + */ + + assert(prog->minlen >= other->min_offset); + last1 = HOP3c(strend, + other->min_offset - prog->minlen, strbeg); + + if (other_ix) {/* i.e. if (other-is-float) */ + /* last is the latest point where the floating substr could + * start, *given* any constraints from the earlier fixed + * match. This constraint is that the floating string starts + * <= float_max_offset chars from the regex origin (rx_origin). + * If this value is less than last1, use it instead. + */ + assert(rx_origin <= last1); + last = + /* this condition handles the offset==infinity case, and + * is a short-cut otherwise. Although it's comparing a + * byte offset to a char length, it does so in a safe way, + * since 1 char always occupies 1 or more bytes, + * so if a string range is (last1 - rx_origin) bytes, + * it will be less than or equal to (last1 - rx_origin) + * chars; meaning it errs towards doing the accurate HOP3 + * rather than just using last1 as a short-cut */ + (last1 - rx_origin) < other->max_offset + ? last1 + : (char*)HOP3lim(rx_origin, other->max_offset, last1); + } + else { + assert(strpos + start_shift <= check_at); + last = HOP4c(check_at, other->min_offset - start_shift, + strbeg, strend); + } + + s = HOP3c(rx_origin, other->min_offset, strend); + if (s < other_last) /* These positions already checked */ + s = other_last; + + must = utf8_target ? other->utf8_substr : other->substr; + assert(SvPOK(must)); + s = fbm_instr( + (unsigned char*)s, + (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0), + must, + multiline ? FBMrf_MULTILINE : 0 + ); + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s", + s ? "Found" : "Contradicts", + other_ix ? "floating" : "anchored", + quoted, RE_SV_TAIL(must)); + }); + + + if (!s) { + /* last1 is latest possible substr location. If we didn't + * find it before there, we never will */ + if (last >= last1) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + + /* try to find the check substr again at a later + * position. Maybe next time we'll find the "other" substr + * in range too */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", trying %s at offset %ld...\n", + (other_ix ? "floating" : "anchored"), + (long)(HOP3c(check_at, 1, strend) - strpos))); + + other_last = HOP3c(last, 1, strend) /* highest failure */; + rx_origin = + other_ix /* i.e. if other-is-float */ + ? HOP3c(rx_origin, 1, strend) + : HOP4c(last, 1 - other->min_offset, strbeg, strend); + goto restart; + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + + if (other_ix) { /* if (other-is-float) */ + /* other_last is set to s, not s+1, since its possible for + * a floating substr to fail first time, then succeed + * second time at the same floating position; e.g.: + * "-AB--AABZ" =~ /\wAB\d*Z/ + * The first time round, anchored and float match at + * "-(AB)--AAB(Z)" then fail on the initial \w character + * class. Second time round, they match at "-AB--A(AB)(Z)". + */ + other_last = s; + } + else { + rx_origin = HOP3c(s, -other->min_offset, strbeg); + other_last = HOP3c(s, 1, strend); + } + } + } + else { + DEBUG_OPTIMISE_MORE_r( + PerlIO_printf(Perl_debug_log, + " Check-only match: offset min:%"IVdf" max:%"IVdf + " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf + " strend-strpos:%"IVdf"\n", + (IV)prog->check_offset_min, + (IV)prog->check_offset_max, + (IV)(check_at-strpos), + (IV)(rx_origin-strpos), + (IV)(rx_origin-check_at), + (IV)(strend-strpos) + ) + ); + } + + postprocess_substr_matches: + + /* handle the extra constraint of /^.../m if present */ + + if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { + char *s; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for /^/m anchor")); + + /* we have failed the constraint of a \n before rx_origin. + * Find the next \n, if any, even if it's beyond the current + * anchored and/or floating substrings. Whether we should be + * scanning ahead for the next \n or the next substr is debatable. + * On the one hand you'd expect rare substrings to appear less + * often than \n's. On the other hand, searching for \n means + * we're effectively flipping been check_substr and "\n" on each + * iteration as the current "rarest" string candidate, which + * means for example that we'll quickly reject the whole string if + * hasn't got a \n, rather than trying every substr position + * first + */ + + s = HOP3c(strend, - prog->minlen, strpos); + if (s <= rx_origin || + ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Did not find /%s^%s/m...\n", + PL_colors[0], PL_colors[1])); + goto fail_finish; + } + + /* earliest possible origin is 1 char after the \n. + * (since *rx_origin == '\n', it's safe to ++ here rather than + * HOP(rx_origin, 1)) */ + rx_origin++; + + if (prog->substrs->check_ix == 0 /* check is anchored */ + || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) + { + /* Position contradicts check-string; either because + * check was anchored (and thus has no wiggle room), + * or check was float and rx_origin is above the float range */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + goto restart; + } + + /* if we get here, the check substr must have been float, + * is in range, and we may or may not have had an anchored + * "other" substr which still contradicts */ + assert(prog->substrs->check_ix); /* check is float */ + + if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { + /* whoops, the anchored "other" substr exists, so we still + * contradict. On the other hand, the float "check" substr + * didn't contradict, so just retry the anchored "other" + * substr */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos), + (long)(rx_origin - strpos + prog->anchored_offset))); + goto do_other_substr; + } + + /* success: we don't contradict the found floating substring + * (and there's no anchored substr). */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " (multiline anchor test skipped)\n")); + } + + success_at_start: + + + /* if we have a starting character class, then test that extra constraint. + * (trie stclasses are too expensive to use here, we are better off to + * leave it to regmatch itself) */ + + if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { + const U8* const str = (U8*)STRING(progi->regstclass); + + /* XXX this value could be pre-computed */ + const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT + ? (reginfo->is_utf8_pat + ? utf8_distance(str + STR_LEN(progi->regstclass), str) + : STR_LEN(progi->regstclass)) + : 1); + char * endpos; + char *s; + /* latest pos that a matching float substr constrains rx start to */ + char *rx_max_float = NULL; + + /* if the current rx_origin is anchored, either by satisfying an + * anchored substring constraint, or a /^.../m constraint, then we + * can reject the current origin if the start class isn't found + * at the current position. If we have a float-only match, then + * rx_origin is constrained to a range; so look for the start class + * in that range. if neither, then look for the start class in the + * whole rest of the string */ + + /* XXX DAPM it's not clear what the minlen test is for, and why + * it's not used in the floating case. Nothing in the test suite + * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. + * Here are some old comments, which may or may not be correct: + * + * minlen == 0 is possible if regstclass is \b or \B, + * and the fixed substr is ''$. + * Since minlen is already taken into account, rx_origin+1 is + * before strend; accidentally, minlen >= 1 guaranties no false + * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : + * 0) below assumes that regstclass does not come from lookahead... + * If regstclass takes bytelength more than 1: If charlength==1, OK. + * This leaves EXACTF-ish only, which are dealt with in + * find_byclass(). + */ + + if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) + endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend); + else if (prog->float_substr || prog->float_utf8) { + rx_max_float = HOP3c(check_at, -start_shift, strbeg); + endpos= HOP3c(rx_max_float, cl_l, strend); + } + else + endpos= strend; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for class: start_shift: %"IVdf" check_at: %"IVdf + " rx_origin: %"IVdf" endpos: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), + (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); + + s = find_byclass(prog, progi->regstclass, rx_origin, endpos, + reginfo); + if (!s) { + if (endpos == strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); + goto fail; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " This position contradicts STCLASS...\n") ); + if ((prog->intflags & PREGf_ANCH) && !ml_anch + && !(prog->intflags & PREGf_IMPLICIT)) + goto fail; + + /* Contradict one of substrings */ + if (prog->anchored_substr || prog->anchored_utf8) { + if (prog->substrs->check_ix == 1) { /* check is float */ + /* Have both, check_string is floating */ + assert(rx_origin + start_shift <= check_at); + if (rx_origin + start_shift != check_at) { + /* not at latest position float substr could match: + * Recheck anchored substring, but not floating. + * The condition above is in bytes rather than + * chars for efficiency. It's conservative, in + * that it errs on the side of doing 'goto + * do_other_substr', where a more accurate + * char-based calculation will be done */ + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for anchored substr starting at offset %ld...\n", + (long)(other_last - strpos)) ); + goto do_other_substr; + } + } + } + else { + /* float-only */ + + if (ml_anch) { + /* In the presence of ml_anch, we might be able to + * find another \n without breaking the current float + * constraint. */ + + /* strictly speaking this should be HOP3c(..., 1, ...), + * but since we goto a block of code that's going to + * search for the next \n if any, its safe here */ + rx_origin++; + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for /%s^%s/m starting at offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos)) ); + goto postprocess_substr_matches; + } + + /* strictly speaking this can never be true; but might + * be if we ever allow intuit without substrings */ + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + goto fail; + + rx_origin = rx_max_float; + } + + /* at this point, any matching substrings have been + * contradicted. Start again... */ + + rx_origin = HOP3c(rx_origin, 1, strend); + + /* uses bytes rather than char calculations for efficiency. + * It's conservative: it errs on the side of doing 'goto restart', + * where there is code that does a proper char-based test */ + if (rx_origin + start_shift + end_shift > strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); + goto fail; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for %s substr starting at offset %ld...\n", + (prog->substrs->check_ix ? "floating" : "anchored"), + (long)(rx_origin + start_shift - strpos)) ); + goto restart; + } + + /* Success !!! */ + + if (rx_origin != s) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " By STCLASS: moving %ld --> %ld\n", + (long)(rx_origin - strpos), (long)(s - strpos)) + ); + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Does not contradict STCLASS...\n"); + ); + } + } + + /* Decide whether using the substrings helped */ + + if (rx_origin != strpos) { + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n")); + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + } + else { + /* The found rx_origin position does not prohibit matching at + * strpos, so calling intuit didn't gain us anything. Decrement + * the BmUSEFUL() count on the check substring, and if we reach + * zero, free it. */ + if (!(prog->intflags & PREGf_NAUGHTY) + && (utf8_target ? ( + prog->check_utf8 /* Could be deleted already */ + && --BmUSEFUL(prog->check_utf8) < 0 + && (prog->check_utf8 == prog->float_utf8) + ) : ( + prog->check_substr /* Could be deleted already */ + && --BmUSEFUL(prog->check_substr) < 0 + && (prog->check_substr == prog->float_substr) + ))) + { + /* If flags & SOMETHING - do not do it many times on the same match */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n")); + /* XXX Does the destruction order has to change with utf8_target? */ + SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); + SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); + prog->check_substr = prog->check_utf8 = NULL; /* disable */ + prog->float_substr = prog->float_utf8 = NULL; /* clear */ + check = NULL; /* abort */ + /* XXXX This is a remnant of the old implementation. It + looks wasteful, since now INTUIT can use many + other heuristics. */ + prog->extflags &= ~RXf_USE_INTUIT; + } + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) ); + + return rx_origin; + + fail_finish: /* Substring not found */ + if (prog->check_substr || prog->check_utf8) /* could be removed already */ + BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ + fail: + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", + PL_colors[4], PL_colors[5])); + return NULL; +} + + +#define DECL_TRIE_TYPE(scan) \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ + trie_type = ((scan->flags == EXACT) \ + ? (utf8_target ? trie_utf8 : trie_plain) \ + : (scan->flags == EXACTFA) \ + ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + +#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ +STMT_START { \ + STRLEN skiplen; \ + U8 flags = FOLD_FLAGS_FULL; \ + switch (trie_type) { \ + case trie_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALLTHROUGH */ \ + case trie_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ + len = UTF8SKIP(uc); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_latin_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALLTHROUGH */ \ + case trie_latin_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_utf8: \ + uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + break; \ + case trie_plain: \ + uvc = (UV)*uc; \ + len = 1; \ + } \ + if (uvc < 256) { \ + charid = trie->charmap[ uvc ]; \ + } \ + else { \ + charid = 0; \ + if (widecharmap) { \ + SV** const svpp = hv_fetch(widecharmap, \ + (char*)&uvc, sizeof(UV), 0); \ + if (svpp) \ + charid = (U16)SvIV(*svpp); \ + } \ + } \ +} STMT_END + +#define DUMP_EXEC_POS(li,s,doutf8) \ + dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ + startpos, doutf8) + +#define REXEC_FBC_EXACTISH_SCAN(COND) \ +STMT_START { \ + while (s <= e) { \ + if ( (COND) \ + && (ln == 1 || folder(s, pat_string, ln)) \ + && (reginfo->intuit || regtry(reginfo, &s)) )\ + goto got_it; \ + s++; \ + } \ +} STMT_END + +#define REXEC_FBC_UTF8_SCAN(CODE) \ +STMT_START { \ + while (s < strend) { \ + CODE \ + s += UTF8SKIP(s); \ + } \ +} STMT_END + +#define REXEC_FBC_SCAN(CODE) \ +STMT_START { \ + while (s < strend) { \ + CODE \ + s++; \ + } \ +} STMT_END + +#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \ +REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \ + if (COND) { \ + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \ + goto got_it; \ + else \ + tmp = doevery; \ + } \ + else \ + tmp = 1; \ +) + +#define REXEC_FBC_CLASS_SCAN(COND) \ +REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ + if (COND) { \ + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \ + goto got_it; \ + else \ + tmp = doevery; \ + } \ + else \ + tmp = 1; \ +) + +#define REXEC_FBC_CSCAN(CONDUTF8,COND) \ + if (utf8_target) { \ + REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \ + } \ + else { \ + REXEC_FBC_CLASS_SCAN(COND); \ + } + +/* The three macros below are slightly different versions of the same logic. + * + * The first is for /a and /aa when the target string is UTF-8. This can only + * match ascii, but it must advance based on UTF-8. The other two handle the + * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking + * for the boundary (or non-boundary) between a word and non-word character. + * The utf8 and non-utf8 cases have the same logic, but the details must be + * different. Find the "wordness" of the character just prior to this one, and + * compare it with the wordness of this one. If they differ, we have a + * boundary. At the beginning of the string, pretend that the previous + * character was a new-line. + * + * All these macros uncleanly have side-effects with each other and outside + * variables. So far it's been too much trouble to clean-up + * + * TEST_NON_UTF8 is the macro or function to call to test if its byte input is + * a word character or not. + * IF_SUCCESS is code to do if it finds that we are at a boundary between + * word/non-word + * IF_FAIL is code to do if we aren't at a boundary between word/non-word + * + * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we + * are looking for a boundary or for a non-boundary. If we are looking for a + * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and + * see if this tentative match actually works, and if so, to quit the loop + * here. And vice-versa if we are looking for a non-boundary. + * + * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and + * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of + * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be + * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal + * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that + * complement. But in that branch we complement tmp, meaning that at the + * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s), + * which means at the top of the loop in the next iteration, it is + * TEST_NON_UTF8(s-1) */ +#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + tmp = !tmp; \ + IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + +/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and + * TEST_UTF8 is a macro that for the same input code points returns identically + * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */ +#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \ + if (s == reginfo->strbeg) { \ + tmp = '\n'; \ + } \ + else { /* Back-up to the start of the previous character */ \ + U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ + tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ + 0, UTF8_ALLOW_DEFAULT); \ + } \ + tmp = TEST_UV(tmp); \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ + REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ + if (tmp == ! (TEST_UTF8((U8 *) s))) { \ + tmp = !tmp; \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); + +/* Like the above two macros. UTF8_CODE is the complete code for handling + * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc + * macros below */ +#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + if (utf8_target) { \ + UTF8_CODE \ + } \ + else { /* Not utf8 */ \ + tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_SCAN( /* advances s while s < strend */ \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + IF_SUCCESS; \ + tmp = !tmp; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + } \ + /* Here, things have been set up by the previous code so that tmp is the \ + * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \ + * utf8ness of the target). We also have to check if this matches against \ + * the EOS, which we treat as a \n (which is the same value in both UTF-8 \ + * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \ + * string */ \ + if (tmp == ! TEST_NON_UTF8('\n')) { \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } + +/* This is the macro to use when we want to see if something that looks like it + * could match, actually does, and if so exits the loop */ +#define REXEC_FBC_TRYIT \ + if ((reginfo->intuit || regtry(reginfo, &s))) \ + goto got_it + +/* The only difference between the BOUND and NBOUND cases is that + * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in + * NBOUND. This is accomplished by passing it as either the if or else clause, + * with the other one being empty (PLACEHOLDER is defined as empty). + * + * The TEST_FOO parameters are for operating on different forms of input, but + * all should be ones that return identically for the same underlying code + * points */ +#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ + TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ + TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ + TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + +#define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ + TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + + +/* We know what class REx starts with. Try to find this position... */ +/* if reginfo->intuit, its a dryrun */ +/* annoyingly all the vars in this routine have different names from their counterparts + in regmatch. /grrr */ +STATIC char * +S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, + const char *strend, regmatch_info *reginfo) +{ + dVAR; + const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; + char *pat_string; /* The pattern's exactish string */ + char *pat_end; /* ptr to end char of pat_string */ + re_fold_t folder; /* Function for computing non-utf8 folds */ + const U8 *fold_array; /* array for folding ords < 256 */ + STRLEN ln; + STRLEN lnc; + U8 c1; + U8 c2; + char *e; + I32 tmp = 1; /* Scratch variable? */ + const bool utf8_target = reginfo->is_utf8_target; + UV utf8_fold_flags = 0; + const bool is_utf8_pat = reginfo->is_utf8_pat; + bool to_complement = FALSE; /* Invert the result? Taking the xor of this + with a result inverts that result, as 0^1 = + 1 and 1^1 = 0 */ + _char_class_number classnum; + + RXi_GET_DECL(prog,progi); + + PERL_ARGS_ASSERT_FIND_BYCLASS; + + /* We know what class it must start with. */ + switch (OP(c)) { + case ANYOF: + if (utf8_target) { + REXEC_FBC_UTF8_CLASS_SCAN( + reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); + } + else { + REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); + } + break; + case CANY: + REXEC_FBC_SCAN( + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) + goto got_it; + else + tmp = doevery; + ); + break; + + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + if (is_utf8_pat || utf8_target) { + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf_utf8; + } + fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */ + folder = foldEQ_latin1; /* /a, except the sharp s one which */ + goto do_exactf_non_utf8; /* isn't dealt with by these */ + + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + if (utf8_target) { + utf8_fold_flags = 0; + goto do_exactf_utf8; + } + fold_array = PL_fold; + folder = foldEQ; + goto do_exactf_non_utf8; + + case EXACTFL: + if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_exactf_utf8; + } + fold_array = PL_fold_locale; + folder = foldEQ_locale; + goto do_exactf_non_utf8; + + case EXACTFU_SS: + if (is_utf8_pat) { + utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; + } + goto do_exactf_utf8; + + case EXACTFU: + if (is_utf8_pat || utf8_target) { + utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; + goto do_exactf_utf8; + } + + /* Any 'ss' in the pattern should have been replaced by regcomp, + * so we don't have to worry here about this single special case + * in the Latin1 range */ + fold_array = PL_fold_latin1; + folder = foldEQ_latin1; + + /* FALLTHROUGH */ + + do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there + are no glitches with fold-length differences + between the target string and pattern */ + + /* The idea in the non-utf8 EXACTF* cases is to first find the + * first character of the EXACTF* node and then, if necessary, + * case-insensitively compare the full text of the node. c1 is the + * first character. c2 is its fold. This logic will not work for + * Unicode semantics and the german sharp ss, which hence should + * not be compiled into a node that gets here. */ + pat_string = STRING(c); + ln = STR_LEN(c); /* length to match in octets/bytes */ + + /* We know that we have to match at least 'ln' bytes (which is the + * same as characters, since not utf8). If we have to match 3 + * characters, and there are only 2 availabe, we know without + * trying that it will fail; so don't start a match past the + * required minimum number from the far end */ + e = HOP3c(strend, -((SSize_t)ln), s); + + if (reginfo->intuit && e < s) { + e = s; /* Due to minlen logic of intuit() */ + } + + c1 = *pat_string; + c2 = fold_array[c1]; + if (c1 == c2) { /* If char and fold are the same */ + REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1); + } + else { + REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2); + } + break; + + do_exactf_utf8: + { + unsigned expansion; + + /* If one of the operands is in utf8, we can't use the simpler folding + * above, due to the fact that many different characters can have the + * same fold, or portion of a fold, or different- length fold */ + pat_string = STRING(c); + ln = STR_LEN(c); /* length to match in octets/bytes */ + pat_end = pat_string + ln; + lnc = is_utf8_pat /* length to match in characters */ + ? utf8_length((U8 *) pat_string, (U8 *) pat_end) + : ln; + + /* We have 'lnc' characters to match in the pattern, but because of + * multi-character folding, each character in the target can match + * up to 3 characters (Unicode guarantees it will never exceed + * this) if it is utf8-encoded; and up to 2 if not (based on the + * fact that the Latin 1 folds are already determined, and the + * only multi-char fold in that range is the sharp-s folding to + * 'ss'. Thus, a pattern character can match as little as 1/3 of a + * string character. Adjust lnc accordingly, rounding up, so that + * if we need to match at least 4+1/3 chars, that really is 5. */ + expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; + lnc = (lnc + expansion - 1) / expansion; + + /* As in the non-UTF8 case, if we have to match 3 characters, and + * only 2 are left, it's guaranteed to fail, so don't start a + * match that would require us to go beyond the end of the string + */ + e = HOP3c(strend, -((SSize_t)lnc), s); + + if (reginfo->intuit && e < s) { + e = s; /* Due to minlen logic of intuit() */ + } + + /* XXX Note that we could recalculate e to stop the loop earlier, + * as the worst case expansion above will rarely be met, and as we + * go along we would usually find that e moves further to the left. + * This would happen only after we reached the point in the loop + * where if there were no expansion we should fail. Unclear if + * worth the expense */ + + while (s <= e) { + char *my_strend= (char *)strend; + if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, + pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags) + && (reginfo->intuit || regtry(reginfo, &s)) ) + { + goto got_it; + } + s += (utf8_target) ? UTF8SKIP(s) : 1; + } + break; + } + + case BOUNDL: + FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); + break; + case NBOUNDL: + FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); + break; + case BOUND: + FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case BOUNDA: + FBC_BOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A); + break; + case NBOUND: + FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case NBOUNDA: + FBC_NBOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A); + break; + case BOUNDU: + FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case NBOUNDU: + FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case LNBREAK: + REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend), + is_LNBREAK_latin1_safe(s, strend) + ); + break; + + /* The argument to all the POSIX node types is the class number to pass to + * _generic_isCC() to build a mask for searching in PL_charclass[] */ + + case NPOSIXL: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: + REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), + to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); + break; + + case NPOSIXD: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXD: + if (utf8_target) { + goto posix_utf8; + } + goto posixa; + + case NPOSIXA: + if (utf8_target) { + /* The complement of something that matches only ASCII matches all + * non-ASCII, plus everything in ASCII that isn't in the class. */ + REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s) + || ! _generic_isCC_A(*s, FLAGS(c))); + break; + } + + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXA: + posixa: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character. */ + REXEC_FBC_CLASS_SCAN( + to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c)))); + break; + + case NPOSIXU: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: + if (! utf8_target) { + REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s, + FLAGS(c)))); + } + else { + + posix_utf8: + classnum = (_char_class_number) FLAGS(c); + if (classnum < _FIRST_NON_SWASH_CC) { + while (s < strend) { + + /* We avoid loading in the swash as long as possible, but + * should we have to, we jump to a separate loop. This + * extra 'if' statement is what keeps this code from being + * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */ + if (UTF8_IS_ABOVE_LATIN1(*s)) { + goto found_above_latin1; + } + if ((UTF8_IS_INVARIANT(*s) + && to_complement ^ cBOOL(_generic_isCC((U8) *s, + classnum))) + || (UTF8_IS_DOWNGRADEABLE_START(*s) + && to_complement ^ cBOOL( + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), + classnum)))) + { + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) + goto got_it; + else { + tmp = doevery; + } + } + else { + tmp = 1; + } + s += UTF8SKIP(s); + } + } + else switch (classnum) { /* These classes are implemented as + macros */ + case _CC_ENUM_SPACE: /* XXX would require separate code if we + revert the change of \v matching this */ + /* FALLTHROUGH */ + + case _CC_ENUM_PSXSPC: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isSPACE_utf8(s))); + break; + + case _CC_ENUM_BLANK: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isBLANK_utf8(s))); + break; + + case _CC_ENUM_XDIGIT: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isXDIGIT_utf8(s))); + break; + + case _CC_ENUM_VERTSPACE: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isVERTWS_utf8(s))); + break; + + case _CC_ENUM_CNTRL: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isCNTRL_utf8(s))); + break; + + default: + Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum); + assert(0); /* NOTREACHED */ + } + } + break; + + found_above_latin1: /* Here we have to load a swash to get the result + for the current code point */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + + /* This is a copy of the loop above for swash classes, though using the + * FBC macro instead of being expanded out. Since we've loaded the + * swash, we don't have to check for that each time through the loop */ + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(_generic_utf8( + classnum, + s, + swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) s, TRUE)))); + break; + + case AHOCORASICKC: + case AHOCORASICK: + { + DECL_TRIE_TYPE(c); + /* what trie are we using right now */ + reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ]; + reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ]; + HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); + + const char *last_start = strend - trie->minlen; +#ifdef DEBUGGING + const char *real_start = s; +#endif + STRLEN maxlen = trie->maxlen; + SV *sv_points; + U8 **points; /* map of where we were in the input string + when reading a given char. For ASCII this + is unnecessary overhead as the relationship + is always 1:1, but for Unicode, especially + case folded Unicode this is not true. */ + U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + U8 *bitmap=NULL; + + + GET_RE_DEBUG_FLAGS_DECL; + + /* We can't just allocate points here. We need to wrap it in + * an SV so it gets freed properly if there is a croak while + * running the match */ + ENTER; + SAVETMPS; + sv_points=newSV(maxlen * sizeof(U8 *)); + SvCUR_set(sv_points, + maxlen * sizeof(U8 *)); + SvPOK_on(sv_points); + sv_2mortal(sv_points); + points=(U8**)SvPV_nolen(sv_points ); + if ( trie_type != trie_utf8_fold + && (trie->bitmap || OP(c)==AHOCORASICKC) ) + { + if (trie->bitmap) + bitmap=(U8*)trie->bitmap; + else + bitmap=(U8*)ANYOF_BITMAP(c); + } + /* this is the Aho-Corasick algorithm modified a touch + to include special handling for long "unknown char" sequences. + The basic idea being that we use AC as long as we are dealing + with a possible matching char, when we encounter an unknown char + (and we have not encountered an accepting state) we scan forward + until we find a legal starting char. + AC matching is basically that of trie matching, except that when + we encounter a failing transition, we fall back to the current + states "fail state", and try the current char again, a process + we repeat until we reach the root state, state 1, or a legal + transition. If we fail on the root state then we can either + terminate if we have reached an accepting state previously, or + restart the entire process from the beginning if we have not. + + */ + while (s <= last_start) { + const U32 uniflags = UTF8_ALLOW_DEFAULT; + U8 *uc = (U8*)s; + U16 charid = 0; + U32 base = 1; + U32 state = 1; + UV uvc = 0; + STRLEN len = 0; + STRLEN foldlen = 0; + U8 *uscan = (U8*)NULL; + U8 *leftmost = NULL; +#ifdef DEBUGGING + U32 accepted_word= 0; +#endif + U32 pointpos = 0; + + while ( state && uc <= (U8*)strend ) { + int failed=0; + U32 word = aho->states[ state ].wordnum; + + if( state==1 ) { + if ( bitmap ) { + DEBUG_TRIE_EXECUTE_r( + if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + dump_exec_pos( (char *)uc, c, strend, real_start, + (char *)uc, utf8_target ); + PerlIO_printf( Perl_debug_log, + " Scanning for legal start char...\n"); + } + ); + if (utf8_target) { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc += UTF8SKIP(uc); + } + } else { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc++; + } + } + s= (char *)uc; + } + if (uc >(U8*)last_start) break; + } + + if ( word ) { + U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ]; + if (!leftmost || lpos < leftmost) { + DEBUG_r(accepted_word=word); + leftmost= lpos; + } + if (base==0) break; + + } + points[pointpos++ % maxlen]= uc; + if (foldlen || uc < (U8*)strend) { + REXEC_TRIE_READ_CHAR(trie_type, trie, + widecharmap, uc, + uscan, len, uvc, charid, foldlen, + foldbuf, uniflags); + DEBUG_TRIE_EXECUTE_r({ + dump_exec_pos( (char *)uc, c, strend, + real_start, s, utf8_target); + PerlIO_printf(Perl_debug_log, + " Charid:%3u CP:%4"UVxf" ", + charid, uvc); + }); + } + else { + len = 0; + charid = 0; + } + + + do { +#ifdef DEBUGGING + word = aho->states[ state ].wordnum; +#endif + base = aho->states[ state ].trans.base; + + DEBUG_TRIE_EXECUTE_r({ + if (failed) + dump_exec_pos( (char *)uc, c, strend, real_start, + s, utf8_target ); + PerlIO_printf( Perl_debug_log, + "%sState: %4"UVxf", word=%"UVxf, + failed ? " Fail transition to " : "", + (UV)state, (UV)word); + }); + if ( base ) { + U32 tmp; + I32 offset; + if (charid && + ( ((offset = base + charid + - 1 - trie->uniquecharcount)) >= 0) + && ((U32)offset < trie->lasttrans) + && trie->trans[offset].check == state + && (tmp=trie->trans[offset].next)) + { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - legal\n")); + state = tmp; + break; + } + else { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - fail\n")); + failed = 1; + state = aho->fail[state]; + } + } + else { + /* we must be accepting here */ + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - accepting\n")); + failed = 1; + break; + } + } while(state); + uc += len; + if (failed) { + if (leftmost) + break; + if (!state) state = 1; + } + } + if ( aho->states[ state ].wordnum ) { + U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ]; + if (!leftmost || lpos < leftmost) { + DEBUG_r(accepted_word=aho->states[ state ].wordnum); + leftmost = lpos; + } + } + if (leftmost) { + s = (char*)leftmost; + DEBUG_TRIE_EXECUTE_r({ + PerlIO_printf( + Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n", + (UV)accepted_word, (IV)(s - real_start) + ); + }); + if (reginfo->intuit || regtry(reginfo, &s)) { + FREETMPS; + LEAVE; + goto got_it; + } + s = HOPc(s,1); + DEBUG_TRIE_EXECUTE_r({ + PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n"); + }); + } else { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log,"No match.\n")); + break; + } + } + FREETMPS; + LEAVE; + } + break; + default: + Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); + } + return 0; + got_it: + return s; +} + +/* set RX_SAVED_COPY, RX_SUBBEG etc. + * flags have same meanings as with regexec_flags() */ + +static void +S_reg_set_capture_string(pTHX_ REGEXP * const rx, + char *strbeg, + char *strend, + SV *sv, + U32 flags, + bool utf8_target) +{ + struct regexp *const prog = ReANY(rx); + + if (flags & REXEC_COPY_STR) { +#ifdef PERL_ANY_COW + if (SvCANCOW(sv)) { + if (DEBUG_C_TEST) { + PerlIO_printf(Perl_debug_log, + "Copy on write: regexp capture, type %d\n", + (int) SvTYPE(sv)); + } + /* Create a new COW SV to share the match string and store + * in saved_copy, unless the current COW SV in saved_copy + * is valid and suitable for our purpose */ + if (( prog->saved_copy + && SvIsCOW(prog->saved_copy) + && SvPOKp(prog->saved_copy) + && SvIsCOW(sv) + && SvPOKp(sv) + && SvPVX(sv) == SvPVX(prog->saved_copy))) + { + /* just reuse saved_copy SV */ + if (RXp_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RXp_MATCH_COPIED_off(prog); + } + } + else { + /* create new COW SV to share string */ + RX_MATCH_COPY_FREE(rx); + prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); + } + prog->subbeg = (char *)SvPVX_const(prog->saved_copy); + assert (SvPOKp(prog->saved_copy)); + prog->sublen = strend - strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + } else +#endif + { + SSize_t min = 0; + SSize_t max = strend - strbeg; + SSize_t sublen; + + if ( (flags & REXEC_COPY_SKIP_POST) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_RIGHT) + ) { /* don't copy $' part of string */ + U32 n = 0; + max = -1; + /* calculate the right-most part of the string covered + * by a capture. Due to look-ahead, this may be to + * the right of $&, so we have to scan all captures */ + while (n <= prog->lastparen) { + if (prog->offs[n].end > max) + max = prog->offs[n].end; + n++; + } + if (max == -1) + max = (PL_sawampersand & SAWAMPERSAND_LEFT) + ? prog->offs[0].start + : 0; + assert(max >= 0 && max <= strend - strbeg); + } + + if ( (flags & REXEC_COPY_SKIP_PRE) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_LEFT) + ) { /* don't copy $` part of string */ + U32 n = 0; + min = max; + /* calculate the left-most part of the string covered + * by a capture. Due to look-behind, this may be to + * the left of $&, so we have to scan all captures */ + while (min && n <= prog->lastparen) { + if ( prog->offs[n].start != -1 + && prog->offs[n].start < min) + { + min = prog->offs[n].start; + } + n++; + } + if ((PL_sawampersand & SAWAMPERSAND_RIGHT) + && min > prog->offs[0].end + ) + min = prog->offs[0].end; + + } + + assert(min >= 0 && min <= max && min <= strend - strbeg); + sublen = max - min; + + if (RX_MATCH_COPIED(rx)) { + if (sublen > prog->sublen) + prog->subbeg = + (char*)saferealloc(prog->subbeg, sublen+1); + } + else + prog->subbeg = (char*)safemalloc(sublen+1); + Copy(strbeg + min, prog->subbeg, sublen, char); + prog->subbeg[sublen] = '\0'; + prog->suboffset = min; + prog->sublen = sublen; + RX_MATCH_COPIED_on(rx); + } + prog->subcoffset = prog->suboffset; + if (prog->suboffset && utf8_target) { + /* Convert byte offset to chars. + * XXX ideally should only compute this if @-/@+ + * has been seen, a la PL_sawampersand ??? */ + + /* If there's a direct correspondence between the + * string which we're matching and the original SV, + * then we can use the utf8 len cache associated with + * the SV. In particular, it means that under //g, + * sv_pos_b2u() will use the previously cached + * position to speed up working out the new length of + * subcoffset, rather than counting from the start of + * the string each time. This stops + * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; + * from going quadratic */ + if (SvPOKp(sv) && SvPVX(sv) == strbeg) + prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, + SV_GMAGIC|SV_CONST_RETURN); + else + prog->subcoffset = utf8_length((U8*)strbeg, + (U8*)(strbeg+prog->suboffset)); + } + } + else { + RX_MATCH_COPY_FREE(rx); + prog->subbeg = strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + prog->sublen = strend - strbeg; + } +} + + + + +/* + - regexec_flags - match a regexp against a string + */ +I32 +Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, + char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) +/* stringarg: the point in the string at which to begin matching */ +/* strend: pointer to null at end of string */ +/* strbeg: real beginning of string */ +/* minend: end of match must be >= minend bytes after stringarg. */ +/* sv: SV being matched: only used for utf8 flag, pos() etc; string + * itself is accessed via the pointers above */ +/* data: May be used for some additional optimizations. + Currently unused. */ +/* flags: For optimizations. See REXEC_* in regexp.h */ + +{ + struct regexp *const prog = ReANY(rx); + char *s; + regnode *c; + char *startpos; + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ + const bool utf8_target = cBOOL(DO_UTF8(sv)); + I32 multiline; + RXi_GET_DECL(prog,progi); + regmatch_info reginfo_buf; /* create some info to pass to regtry etc */ + regmatch_info *const reginfo = ®info_buf; + regexp_paren_pair *swap = NULL; + I32 oldsave; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGEXEC_FLAGS; + PERL_UNUSED_ARG(data); + + /* Be paranoid... */ + if (prog == NULL || stringarg == NULL) { + Perl_croak(aTHX_ "NULL regexp parameter"); + } + + DEBUG_EXECUTE_r( + debug_start_match(rx, utf8_target, stringarg, strend, + "Matching"); + ); + + startpos = stringarg; + + if (prog->intflags & PREGf_GPOS_SEEN) { + MAGIC *mg; + + /* set reginfo->ganch, the position where \G can match */ + + reginfo->ganch = + (flags & REXEC_IGNOREPOS) + ? stringarg /* use start pos rather than pos() */ + : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) + /* Defined pos(): */ + ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) + : strbeg; /* pos() not defined; use start of string */ + + DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, + "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg))); + + /* in the presence of \G, we may need to start looking earlier in + * the string than the suggested start point of stringarg: + * if prog->gofs is set, then that's a known, fixed minimum + * offset, such as + * /..\G/: gofs = 2 + * /ab|c\G/: gofs = 1 + * or if the minimum offset isn't known, then we have to go back + * to the start of the string, e.g. /w+\G/ + */ + + if (prog->intflags & PREGf_ANCH_GPOS) { + startpos = reginfo->ganch - prog->gofs; + if (startpos < + ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) + { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "fail: ganch-gofs before earliest possible start\n")); + return 0; + } + } + else if (prog->gofs) { + if (startpos - prog->gofs < strbeg) + startpos = strbeg; + else + startpos -= prog->gofs; + } + else if (prog->intflags & PREGf_GPOS_FLOAT) + startpos = strbeg; + } + + minlen = prog->minlen; + if ((startpos + minlen) > strend || startpos < strbeg) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "Regex match can't succeed, so not even tried\n")); + return 0; + } + + /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), + * which will call destuctors to reset PL_regmatch_state, free higher + * PL_regmatch_slabs, and clean up regmatch_info_aux and + * regmatch_info_aux_eval */ + + oldsave = PL_savestack_ix; + + s = startpos; + + if ((prog->extflags & RXf_USE_INTUIT) + && !(flags & REXEC_CHECKED)) + { + s = re_intuit_start(rx, sv, strbeg, startpos, strend, + flags, NULL); + if (!s) + return 0; + + if (prog->extflags & RXf_CHECK_ALL) { + /* we can match based purely on the result of INTUIT. + * Set up captures etc just for $& and $-[0] + * (an intuit-only match wont have $1,$2,..) */ + assert(!prog->nparens); + + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (s < stringarg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + + /* match via INTUIT shouldn't have any captures. + * Let @-, @+, $^N know */ + prog->lastparen = prog->lastcloseparen = 0; + RX_MATCH_UTF8_set(rx, utf8_target); + prog->offs[0].start = s - strbeg; + prog->offs[0].end = utf8_target + ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + : s - strbeg + prog->minlenret; + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, strend, + sv, flags, utf8_target); + + return 1; + } + } + + multiline = prog->extflags & RXf_PMf_MULTILINE; + + if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "String too short [regexec_flags]...\n")); + goto phooey; + } + + /* Check validity of program. */ + if (UCHARAT(progi->program) != REG_MAGIC) { + Perl_croak(aTHX_ "corrupted regexp program"); + } + + RX_MATCH_TAINTED_off(rx); + + reginfo->prog = rx; /* Yes, sorry that this is confusing. */ + reginfo->intuit = 0; + reginfo->is_utf8_target = cBOOL(utf8_target); + reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); + reginfo->warned = FALSE; + reginfo->strbeg = strbeg; + reginfo->sv = sv; + reginfo->poscache_maxiter = 0; /* not yet started a countdown */ + reginfo->strend = strend; + /* see how far we have to get to not match where we matched before */ + reginfo->till = stringarg + minend; + + if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { + /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after + S_cleanup_regmatch_info_aux has executed (registered by + SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies + magic belonging to this SV. + Not newSVsv, either, as it does not COW. + */ + assert(!IS_PADGV(sv)); + reginfo->sv = newSV(0); + SvSetSV_nosteal(reginfo->sv, sv); + SAVEFREESV(reginfo->sv); + } + + /* reserve next 2 or 3 slots in PL_regmatch_state: + * slot N+0: may currently be in use: skip it + * slot N+1: use for regmatch_info_aux struct + * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s + * slot N+3: ready for use by regmatch() + */ + + { + regmatch_state *old_regmatch_state; + regmatch_slab *old_regmatch_slab; + int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1; + + /* on first ever match, allocate first slab */ + if (!PL_regmatch_slab) { + Newx(PL_regmatch_slab, 1, regmatch_slab); + PL_regmatch_slab->prev = NULL; + PL_regmatch_slab->next = NULL; + PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); + } + + old_regmatch_state = PL_regmatch_state; + old_regmatch_slab = PL_regmatch_slab; + + for (i=0; i <= max; i++) { + if (i == 1) + reginfo->info_aux = &(PL_regmatch_state->u.info_aux); + else if (i ==2) + reginfo->info_aux_eval = + reginfo->info_aux->info_aux_eval = + &(PL_regmatch_state->u.info_aux_eval); + + if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) + PL_regmatch_state = S_push_slab(aTHX); + } + + /* note initial PL_regmatch_state position; at end of match we'll + * pop back to there and free any higher slabs */ + + reginfo->info_aux->old_regmatch_state = old_regmatch_state; + reginfo->info_aux->old_regmatch_slab = old_regmatch_slab; + reginfo->info_aux->poscache = NULL; + + SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux); + + if ((prog->extflags & RXf_EVAL_SEEN)) + S_setup_eval_state(aTHX_ reginfo); + else + reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL; + } + + /* If there is a "must appear" string, look for it. */ + + if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { + /* We have to be careful. If the previous successful match + was from this regex we don't want a subsequent partially + successful match to clobber the old results. + So when we detect this possibility we add a swap buffer + to the re, and switch the buffer each match. If we fail, + we switch it back; otherwise we leave it swapped. + */ + swap = prog->offs; + /* do we need a save destructor here for eval dies? */ + Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap), + PTR2UV(prog->offs) + )); + } + + /* Simplest case: anchored match need be tried only once. */ + /* [unless only anchor is BOL and multiline is set] */ + if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { + if (s == startpos && regtry(reginfo, &s)) + goto got_it; + else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */ + { + char *end; + + if (minlen) + dontbother = minlen - 1; + end = HOP3c(strend, -dontbother, strbeg) - 1; + /* for multiline we only have to try after newlines */ + if (prog->check_substr || prog->check_utf8) { + /* because of the goto we can not easily reuse the macros for bifurcating the + unicode/non-unicode match modes here like we do elsewhere - demerphq */ + if (utf8_target) { + if (s == startpos) + goto after_try_utf8; + while (1) { + if (regtry(reginfo, &s)) { + goto got_it; + } + after_try_utf8: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, strbeg, + s + UTF8SKIP(s), strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s += UTF8SKIP(s); + } + } + } /* end search for check string in unicode */ + else { + if (s == startpos) { + goto after_try_latin; + } + while (1) { + if (regtry(reginfo, &s)) { + goto got_it; + } + after_try_latin: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, strbeg, + s + 1, strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s++; + } + } + } /* end search for check string in latin*/ + } /* end search for check string */ + else { /* search for newline */ + if (s > startpos) { + /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/ + s--; + } + /* We can use a more efficient search as newlines are the same in unicode as they are in latin */ + while (s <= end) { /* note it could be possible to match at the end of the string */ + if (*s++ == '\n') { /* don't need PL_utf8skip here */ + if (regtry(reginfo, &s)) + goto got_it; + } + } + } /* end search for newline */ + } /* end anchored/multiline check string search */ + goto phooey; + } else if (prog->intflags & PREGf_ANCH_GPOS) + { + /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ + assert(prog->intflags & PREGf_GPOS_SEEN); + /* For anchored \G, the only position it can match from is + * (ganch-gofs); we already set startpos to this above; if intuit + * moved us on from there, we can't possibly succeed */ + assert(startpos == reginfo->ganch - prog->gofs); + if (s == startpos && regtry(reginfo, &s)) + goto got_it; + goto phooey; + } + + /* Messy cases: unanchored match. */ + if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { + /* we have /x+whatever/ */ + /* it must be a one character string (XXXX Except is_utf8_pat?) */ + char ch; +#ifdef DEBUGGING + int did_match = 0; +#endif + if (utf8_target) { + if (! prog->anchored_utf8) { + to_utf8_substr(prog); + } + ch = SvPVX_const(prog->anchored_utf8)[0]; + REXEC_FBC_SCAN( + if (*s == ch) { + DEBUG_EXECUTE_r( did_match = 1 ); + if (regtry(reginfo, &s)) goto got_it; + s += UTF8SKIP(s); + while (s < strend && *s == ch) + s += UTF8SKIP(s); + } + ); + + } + else { + if (! prog->anchored_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + ch = SvPVX_const(prog->anchored_substr)[0]; + REXEC_FBC_SCAN( + if (*s == ch) { + DEBUG_EXECUTE_r( did_match = 1 ); + if (regtry(reginfo, &s)) goto got_it; + s++; + while (s < strend && *s == ch) + s++; + } + ); + } + DEBUG_EXECUTE_r(if (!did_match) + PerlIO_printf(Perl_debug_log, + "Did not find anchored character...\n") + ); + } + else if (prog->anchored_substr != NULL + || prog->anchored_utf8 != NULL + || ((prog->float_substr != NULL || prog->float_utf8 != NULL) + && prog->float_max_offset < strend - s)) { + SV *must; + SSize_t back_max; + SSize_t back_min; + char *last; + char *last1; /* Last position checked before */ +#ifdef DEBUGGING + int did_match = 0; +#endif + if (prog->anchored_substr || prog->anchored_utf8) { + if (utf8_target) { + if (! prog->anchored_utf8) { + to_utf8_substr(prog); + } + must = prog->anchored_utf8; + } + else { + if (! prog->anchored_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + must = prog->anchored_substr; + } + back_max = back_min = prog->anchored_offset; + } else { + if (utf8_target) { + if (! prog->float_utf8) { + to_utf8_substr(prog); + } + must = prog->float_utf8; + } + else { + if (! prog->float_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + must = prog->float_substr; + } + back_max = prog->float_max_offset; + back_min = prog->float_min_offset; + } + + if (back_min<0) { + last = strend; + } else { + last = HOP3c(strend, /* Cannot start after this */ + -(SSize_t)(CHR_SVLEN(must) + - (SvTAIL(must) != 0) + back_min), strbeg); + } + if (s > reginfo->strbeg) + last1 = HOPc(s, -1); + else + last1 = s - 1; /* bogus */ + + /* XXXX check_substr already used to find "s", can optimize if + check_substr==must. */ + dontbother = 0; + strend = HOPc(strend, -dontbother); + while ( (s <= last) && + (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), + (unsigned char*)strend, must, + multiline ? FBMrf_MULTILINE : 0)) ) { + DEBUG_EXECUTE_r( did_match = 1 ); + if (HOPc(s, -back_max) > last1) { + last1 = HOPc(s, -back_min); + s = HOPc(s, -back_max); + } + else { + char * const t = (last1 >= reginfo->strbeg) + ? HOPc(last1, 1) : last1 + 1; + + last1 = HOPc(s, -back_min); + s = t; + } + if (utf8_target) { + while (s <= last1) { + if (regtry(reginfo, &s)) + goto got_it; + if (s >= last1) { + s++; /* to break out of outer loop */ + break; + } + s += UTF8SKIP(s); + } + } + else { + while (s <= last1) { + if (regtry(reginfo, &s)) + goto got_it; + s++; + } + } + } + DEBUG_EXECUTE_r(if (!did_match) { + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n", + ((must == prog->anchored_substr || must == prog->anchored_utf8) + ? "anchored" : "floating"), + quoted, RE_SV_TAIL(must)); + }); + goto phooey; + } + else if ( (c = progi->regstclass) ) { + if (minlen) { + const OPCODE op = OP(progi->regstclass); + /* don't bother with what can't match */ + if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE) + strend = HOPc(strend, -(minlen - 1)); + } + DEBUG_EXECUTE_r({ + SV * const prop = sv_newmortal(); + regprop(prog, prop, c, reginfo); + { + RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), + s,strend-s,60); + PerlIO_printf(Perl_debug_log, + "Matching stclass %.*s against %s (%d bytes)\n", + (int)SvCUR(prop), SvPVX_const(prop), + quoted, (int)(strend - s)); + } + }); + if (find_byclass(prog, c, s, strend, reginfo)) + goto got_it; + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n")); + } + else { + dontbother = 0; + if (prog->float_substr != NULL || prog->float_utf8 != NULL) { + /* Trim the end. */ + char *last= NULL; + SV* float_real; + STRLEN len; + const char *little; + + if (utf8_target) { + if (! prog->float_utf8) { + to_utf8_substr(prog); + } + float_real = prog->float_utf8; + } + else { + if (! prog->float_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + float_real = prog->float_substr; + } + + little = SvPV_const(float_real, len); + if (SvTAIL(float_real)) { + /* This means that float_real contains an artificial \n on + * the end due to the presence of something like this: + * /foo$/ where we can match both "foo" and "foo\n" at the + * end of the string. So we have to compare the end of the + * string first against the float_real without the \n and + * then against the full float_real with the string. We + * have to watch out for cases where the string might be + * smaller than the float_real or the float_real without + * the \n. */ + char *checkpos= strend - len; + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%sChecking for float_real.%s\n", + PL_colors[4], PL_colors[5])); + if (checkpos + 1 < strbeg) { + /* can't match, even if we remove the trailing \n + * string is too short to match */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString shorter than required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } else if (memEQ(checkpos + 1, little, len - 1)) { + /* can match, the end of the string matches without the + * "\n" */ + last = checkpos + 1; + } else if (checkpos < strbeg) { + /* cant match, string is too short when the "\n" is + * included */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString does not contain required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } else if (!multiline) { + /* non multiline match, so compare with the "\n" at the + * end of the string */ + if (memEQ(checkpos, little, len)) { + last= checkpos; + } else { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString does not contain required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } + } else { + /* multiline match, so we have to search for a place + * where the full string is located */ + goto find_last; + } + } else { + find_last: + if (len) + last = rninstr(s, strend, little, little + len); + else + last = strend; /* matching "$" */ + } + if (!last) { + /* at one point this block contained a comment which was + * probably incorrect, which said that this was a "should not + * happen" case. Even if it was true when it was written I am + * pretty sure it is not anymore, so I have removed the comment + * and replaced it with this one. Yves */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "String does not contain required substring, cannot match.\n" + )); + goto phooey; + } + dontbother = strend - last + prog->float_min_offset; + } + if (minlen && (dontbother < minlen)) + dontbother = minlen - 1; + strend -= dontbother; /* this one's always in bytes! */ + /* We don't know much -- general case. */ + if (utf8_target) { + for (;;) { + if (regtry(reginfo, &s)) + goto got_it; + if (s >= strend) + break; + s += UTF8SKIP(s); + }; + } + else { + do { + if (regtry(reginfo, &s)) + goto got_it; + } while (s++ < strend); + } + } + + /* Failure. */ + goto phooey; + +got_it: + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (prog->offs[0].start < stringarg - strbeg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + + DEBUG_BUFFERS_r( + if (swap) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap) + ); + ); + Safefree(swap); + + /* clean up; this will trigger destructors that will free all slabs + * above the current one, and cleanup the regmatch_info_aux + * and regmatch_info_aux_eval sructs */ + + LEAVE_SCOPE(oldsave); + + if (RXp_PAREN_NAMES(prog)) + (void)hv_iterinit(RXp_PAREN_NAMES(prog)); + + RX_MATCH_UTF8_set(rx, utf8_target); + + /* make sure $`, $&, $', and $digit will work later */ + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, reginfo->strend, + sv, flags, utf8_target); + + return 1; + +phooey: + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", + PL_colors[4], PL_colors[5])); + + /* clean up; this will trigger destructors that will free all slabs + * above the current one, and cleanup the regmatch_info_aux + * and regmatch_info_aux_eval sructs */ + + LEAVE_SCOPE(oldsave); + + if (swap) { + /* we failed :-( roll it back */ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(prog->offs), + PTR2UV(swap) + )); + Safefree(prog->offs); + prog->offs = swap; + } + return 0; +} + + +/* Set which rex is pointed to by PL_reg_curpm, handling ref counting. + * Do inc before dec, in case old and new rex are the same */ +#define SET_reg_curpm(Re2) \ + if (reginfo->info_aux_eval) { \ + (void)ReREFCNT_inc(Re2); \ + ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ + PM_SETRE((PL_reg_curpm), (Re2)); \ + } + + +/* + - regtry - try match at specific point + */ +STATIC I32 /* 0 failure, 1 success */ +S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) +{ + CHECKPOINT lastcp; + REGEXP *const rx = reginfo->prog; + regexp *const prog = ReANY(rx); + SSize_t result; + RXi_GET_DECL(prog,progi); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTRY; + + reginfo->cutpoint=NULL; + + prog->offs[0].start = *startposp - reginfo->strbeg; + prog->lastparen = 0; + prog->lastcloseparen = 0; + + /* XXXX What this code is doing here?!!! There should be no need + to do this again and again, prog->lastparen should take care of + this! --ilya*/ + + /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. + * Actually, the code in regcppop() (which Ilya may be meaning by + * prog->lastparen), is not needed at all by the test suite + * (op/regexp, op/pat, op/split), but that code is needed otherwise + * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ + * Meanwhile, this code *is* needed for the + * above-mentioned test suite tests to succeed. The common theme + * on those tests seems to be returning null fields from matches. + * --jhi updated by dapm */ +#if 1 + if (prog->nparens) { + regexp_paren_pair *pp = prog->offs; + I32 i; + for (i = prog->nparens; i > (I32)prog->lastparen; i--) { + ++pp; + pp->start = -1; + pp->end = -1; + } + } +#endif + REGCP_SET(lastcp); + result = regmatch(reginfo, *startposp, progi->program + 1); + if (result != -1) { + prog->offs[0].end = result; + return 1; + } + if (reginfo->cutpoint) + *startposp= reginfo->cutpoint; + REGCP_UNWIND(lastcp); + return 0; +} + + +#define sayYES goto yes +#define sayNO goto no +#define sayNO_SILENT goto no_silent + +/* we dont use STMT_START/END here because it leads to + "unreachable code" warnings, which are bogus, but distracting. */ +#define CACHEsayNO \ + if (ST.cache_mask) \ + reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \ + sayNO + +/* this is used to determine how far from the left messages like + 'failed...' are printed. It should be set such that messages + are inline with the regop output that created them. +*/ +#define REPORT_CODE_OFF 32 + + +#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */ +#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */ +#define CHRTEST_NOT_A_CP_1 -999 +#define CHRTEST_NOT_A_CP_2 -998 + +/* grab a new slab and return the first slot in it */ + +STATIC regmatch_state * +S_push_slab(pTHX) +{ +#if PERL_VERSION < 9 && !defined(PERL_CORE) + dMY_CXT; +#endif + regmatch_slab *s = PL_regmatch_slab->next; + if (!s) { + Newx(s, 1, regmatch_slab); + s->prev = PL_regmatch_slab; + s->next = NULL; + PL_regmatch_slab->next = s; + } + PL_regmatch_slab = s; + return SLAB_FIRST(s); +} + + +/* push a new state then goto it */ + +#define PUSH_STATE_GOTO(state, node, input) \ + pushinput = input; \ + scan = node; \ + st->resume_state = state; \ + goto push_state; + +/* push a new state with success backtracking, then goto it */ + +#define PUSH_YES_STATE_GOTO(state, node, input) \ + pushinput = input; \ + scan = node; \ + st->resume_state = state; \ + goto push_yes_state; + + + + +/* + +regmatch() - main matching routine + +This is basically one big switch statement in a loop. We execute an op, +set 'next' to point the next op, and continue. If we come to a point which +we may need to backtrack to on failure such as (A|B|C), we push a +backtrack state onto the backtrack stack. On failure, we pop the top +state, and re-enter the loop at the state indicated. If there are no more +states to pop, we return failure. + +Sometimes we also need to backtrack on success; for example /A+/, where +after successfully matching one A, we need to go back and try to +match another one; similarly for lookahead assertions: if the assertion +completes successfully, we backtrack to the state just before the assertion +and then carry on. In these cases, the pushed state is marked as +'backtrack on success too'. This marking is in fact done by a chain of +pointers, each pointing to the previous 'yes' state. On success, we pop to +the nearest yes state, discarding any intermediate failure-only states. +Sometimes a yes state is pushed just to force some cleanup code to be +called at the end of a successful match or submatch; e.g. (??{$re}) uses +it to free the inner regex. + +Note that failure backtracking rewinds the cursor position, while +success backtracking leaves it alone. + +A pattern is complete when the END op is executed, while a subpattern +such as (?=foo) is complete when the SUCCESS op is executed. Both of these +ops trigger the "pop to last yes state if any, otherwise return true" +behaviour. + +A common convention in this function is to use A and B to refer to the two +subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is +the subpattern to be matched possibly multiple times, while B is the entire +rest of the pattern. Variable and state names reflect this convention. + +The states in the main switch are the union of ops and failure/success of +substates associated with with that op. For example, IFMATCH is the op +that does lookahead assertions /(?=A)B/ and so the IFMATCH state means +'execute IFMATCH'; while IFMATCH_A is a state saying that we have just +successfully matched A and IFMATCH_A_fail is a state saying that we have +just failed to match A. Resume states always come in pairs. The backtrack +state we push is marked as 'IFMATCH_A', but when that is popped, we resume +at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking +on success or failure. + +The struct that holds a backtracking state is actually a big union, with +one variant for each major type of op. The variable st points to the +top-most backtrack struct. To make the code clearer, within each +block of code we #define ST to alias the relevant union. + +Here's a concrete example of a (vastly oversimplified) IFMATCH +implementation: + + switch (state) { + .... + +#define ST st->u.ifmatch + + case IFMATCH: // we are executing the IFMATCH op, (?=A)B + ST.foo = ...; // some state we wish to save + ... + // push a yes backtrack state with a resume value of + // IFMATCH_A/IFMATCH_A_fail, then continue execution at the + // first node of A: + PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput); + // NOTREACHED + + case IFMATCH_A: // we have successfully executed A; now continue with B + next = B; + bar = ST.foo; // do something with the preserved value + break; + + case IFMATCH_A_fail: // A failed, so the assertion failed + ...; // do some housekeeping, then ... + sayNO; // propagate the failure + +#undef ST + + ... + } + +For any old-timers reading this who are familiar with the old recursive +approach, the code above is equivalent to: + + case IFMATCH: // we are executing the IFMATCH op, (?=A)B + { + int foo = ... + ... + if (regmatch(A)) { + next = B; + bar = foo; + break; + } + ...; // do some housekeeping, then ... + sayNO; // propagate the failure + } + +The topmost backtrack state, pointed to by st, is usually free. If you +want to claim it, populate any ST.foo fields in it with values you wish to +save, then do one of + + PUSH_STATE_GOTO(resume_state, node, newinput); + PUSH_YES_STATE_GOTO(resume_state, node, newinput); + +which sets that backtrack state's resume value to 'resume_state', pushes a +new free entry to the top of the backtrack stack, then goes to 'node'. +On backtracking, the free slot is popped, and the saved state becomes the +new free state. An ST.foo field in this new top state can be temporarily +accessed to retrieve values, but once the main loop is re-entered, it +becomes available for reuse. + +Note that the depth of the backtrack stack constantly increases during the +left-to-right execution of the pattern, rather than going up and down with +the pattern nesting. For example the stack is at its maximum at Z at the +end of the pattern, rather than at X in the following: + + /(((X)+)+)+....(Y)+....Z/ + +The only exceptions to this are lookahead/behind assertions and the cut, +(?>A), which pop all the backtrack states associated with A before +continuing. + +Backtrack state structs are allocated in slabs of about 4K in size. +PL_regmatch_state and st always point to the currently active state, +and PL_regmatch_slab points to the slab currently containing +PL_regmatch_state. The first time regmatch() is called, the first slab is +allocated, and is never freed until interpreter destruction. When the slab +is full, a new one is allocated and chained to the end. At exit from +regmatch(), slabs allocated since entry are freed. + +*/ + + +#define DEBUG_STATE_pp(pp) \ + DEBUG_STATE_r({ \ + DUMP_EXEC_POS(locinput, scan, utf8_target); \ + PerlIO_printf(Perl_debug_log, \ + " %*s"pp" %s%s%s%s%s\n", \ + depth*2, "", \ + PL_reg_name[st->resume_state], \ + ((st==yes_state||st==mark_state) ? "[" : ""), \ + ((st==yes_state) ? "Y" : ""), \ + ((st==mark_state) ? "M" : ""), \ + ((st==yes_state||st==mark_state) ? "]" : "") \ + ); \ + }); + + +#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1) + +#ifdef DEBUGGING + +STATIC void +S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, + const char *start, const char *end, const char *blurb) +{ + const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; + + PERL_ARGS_ASSERT_DEBUG_START_MATCH; + + if (!PL_colorset) + reginitcolors(); + { + RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), + RX_PRECOMP_const(prog), RX_PRELEN(prog), 60); + + RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), + start, end - start, 60); + + PerlIO_printf(Perl_debug_log, + "%s%s REx%s %s against %s\n", + PL_colors[4], blurb, PL_colors[5], s0, s1); + + if (utf8_target||utf8_pat) + PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n", + utf8_pat ? "pattern" : "", + utf8_pat && utf8_target ? " and " : "", + utf8_target ? "string" : "" + ); + } +} + +STATIC void +S_dump_exec_pos(pTHX_ const char *locinput, + const regnode *scan, + const char *loc_regeol, + const char *loc_bostr, + const char *loc_reg_starttry, + const bool utf8_target) +{ + const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; + const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ + int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); + /* The part of the string before starttry has one color + (pref0_len chars), between starttry and current + position another one (pref_len - pref0_len chars), + after the current position the third one. + We assume that pref0_len <= pref_len, otherwise we + decrease pref0_len. */ + int pref_len = (locinput - loc_bostr) > (5 + taill) - l + ? (5 + taill) - l : locinput - loc_bostr; + int pref0_len; + + PERL_ARGS_ASSERT_DUMP_EXEC_POS; + + while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) + pref_len++; + pref0_len = pref_len - (locinput - loc_reg_starttry); + if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) + l = ( loc_regeol - locinput > (5 + taill) - pref_len + ? (5 + taill) - pref_len : loc_regeol - locinput); + while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) + l--; + if (pref0_len < 0) + pref0_len = 0; + if (pref0_len > pref_len) + pref0_len = pref_len; + { + const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0; + + RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), + (locinput - pref_len),pref0_len, 60, 4, 5); + + RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), + (locinput - pref_len + pref0_len), + pref_len - pref0_len, 60, 2, 3); + + RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), + locinput, loc_regeol - locinput, 10, 0, 1); + + const STRLEN tlen=len0+len1+len2; + PerlIO_printf(Perl_debug_log, + "%4"IVdf" <%.*s%.*s%s%.*s>%*s|", + (IV)(locinput - loc_bostr), + len0, s0, + len1, s1, + (docolor ? "" : "> <"), + len2, s2, + (int)(tlen > 19 ? 0 : 19 - tlen), + ""); + } +} + +#endif + +/* reg_check_named_buff_matched() + * Checks to see if a named buffer has matched. The data array of + * buffer numbers corresponding to the buffer is expected to reside + * in the regexp->data->data array in the slot stored in the ARG() of + * node involved. Note that this routine doesn't actually care about the + * name, that information is not preserved from compilation to execution. + * Returns the index of the leftmost defined buffer with the given name + * or 0 if non of the buffers matched. + */ +STATIC I32 +S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan) +{ + I32 n; + RXi_GET_DECL(rex,rexi); + SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + I32 *nums=(I32*)SvPVX(sv_dat); + + PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; + + for ( n=0; nlastparen >= nums[n] && + rex->offs[nums[n]].end != -1) + { + return nums[n]; + } + } + return 0; +} + + +static bool +S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, + U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo) +{ + /* This function determines if there are one or two characters that match + * the first character of the passed-in EXACTish node , and if + * so, returns them in the passed-in pointers. + * + * If it determines that no possible character in the target string can + * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if + * the first character in requires UTF-8 to represent, and the + * target string isn't in UTF-8.) + * + * If there are more than two characters that could match the beginning of + * , or if more context is required to determine a match or not, + * it sets both * and * to CHRTEST_VOID. + * + * The motiviation behind this function is to allow the caller to set up + * tight loops for matching. If is of type EXACT, there is + * only one possible character that can match its first character, and so + * the situation is quite simple. But things get much more complicated if + * folding is involved. It may be that the first character of an EXACTFish + * node doesn't participate in any possible fold, e.g., punctuation, so it + * can be matched only by itself. The vast majority of characters that are + * in folds match just two things, their lower and upper-case equivalents. + * But not all are like that; some have multiple possible matches, or match + * sequences of more than one character. This function sorts all that out. + * + * Consider the patterns A*B or A*?B where A and B are arbitrary. In a + * loop of trying to match A*, we know we can't exit where the thing + * following it isn't a B. And something can't be a B unless it is the + * beginning of B. By putting a quick test for that beginning in a tight + * loop, we can rule out things that can't possibly be B without having to + * break out of the loop, thus avoiding work. Similarly, if A is a single + * character, we can make a tight loop matching A*, using the outputs of + * this function. + * + * If the target string to match isn't in UTF-8, and there aren't + * complications which require CHRTEST_VOID, * and * are set to + * the one or two possible octets (which are characters in this situation) + * that can match. In all cases, if there is only one character that can + * match, * and * will be identical. + * + * If the target string is in UTF-8, the buffers pointed to by + * and will contain the one or two UTF-8 sequences of bytes that + * can match the beginning of . They should be declared with at + * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is + * undefined what these contain.) If one or both of the buffers are + * invariant under UTF-8, *, and * will also be set to the + * corresponding invariant. If variant, the corresponding * and/or + * * will be set to a negative number(s) that shouldn't match any code + * point (unless inappropriately coerced to unsigned). * will equal + * * if and only if and are the same. */ + + const bool utf8_target = reginfo->is_utf8_target; + + UV c1 = CHRTEST_NOT_A_CP_1; + UV c2 = CHRTEST_NOT_A_CP_2; + bool use_chrtest_void = FALSE; + const bool is_utf8_pat = reginfo->is_utf8_pat; + + /* Used when we have both utf8 input and utf8 output, to avoid converting + * to/from code points */ + bool utf8_has_been_setup = FALSE; + + dVAR; + + U8 *pat = (U8*)STRING(text_node); + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + + if (OP(text_node) == EXACT) { + + /* In an exact node, only one thing can be matched, that first + * character. If both the pat and the target are UTF-8, we can just + * copy the input to the output, avoiding finding the code point of + * that character */ + if (!is_utf8_pat) { + c2 = c1 = *pat; + } + else if (utf8_target) { + Copy(pat, c1_utf8, UTF8SKIP(pat), U8); + Copy(pat, c2_utf8, UTF8SKIP(pat), U8); + utf8_has_been_setup = TRUE; + } + else { + c2 = c1 = valid_utf8_to_uvchr(pat, NULL); + } + } + else { /* an EXACTFish node */ + U8 *pat_end = pat + STR_LEN(text_node); + + /* An EXACTFL node has at least some characters unfolded, because what + * they match is not known until now. So, now is the time to fold + * the first few of them, as many as are needed to determine 'c1' and + * 'c2' later in the routine. If the pattern isn't UTF-8, we only need + * to fold if in a UTF-8 locale, and then only the Sharp S; everything + * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we + * need to fold as many characters as a single character can fold to, + * so that later we can check if the first ones are such a multi-char + * fold. But, in such a pattern only locale-problematic characters + * aren't folded, so we can skip this completely if the first character + * in the node isn't one of the tricky ones */ + if (OP(text_node) == EXACTFL) { + + if (! is_utf8_pat) { + if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) + { + folded[0] = folded[1] = 's'; + pat = folded; + pat_end = folded + 2; + } + } + else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { + U8 *s = pat; + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD_LC(*s); + s++; + } + else { + STRLEN len; + _to_utf8_fold_flags(s, + d, + &len, + FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); + d += len; + s += UTF8SKIP(s); + } + } + + pat = folded; + pat_end = d; + } + } + + if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) + || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) + { + /* Multi-character folds require more context to sort out. Also + * PL_utf8_foldclosures used below doesn't handle them, so have to + * be handled outside this routine */ + use_chrtest_void = TRUE; + } + else { /* an EXACTFish node which doesn't begin with a multi-char fold */ + c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; + if (c1 > 255) { + /* Load the folds hash, if not already done */ + SV** listp; + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + + /* The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((! (listp = hv_fetch(PL_utf8_foldclosures, + (char *) pat, + UTF8SKIP(pat), + FALSE)))) + { + /* Not found in the hash, therefore there are no folds + * containing it, so there is only a single character that + * could match */ + c2 = c1; + } + else { /* Does participate in folds */ + AV* list = (AV*) *listp; + if (av_tindex(list) != 1) { + + /* If there aren't exactly two folds to this, it is + * outside the scope of this function */ + use_chrtest_void = TRUE; + } + else { /* There are two. Get them */ + SV** c_p = av_fetch(list, 0, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c1 = SvUV(*c_p); + + c_p = av_fetch(list, 1, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c2 = SvUV(*c_p); + + /* Folds that cross the 255/256 boundary are forbidden + * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and + * one is ASCIII. Since the pattern character is above + * 255, and its only other match is below 256, the only + * legal match will be to itself. We have thrown away + * the original, so have to compute which is the one + * above 255. */ + if ((c1 < 256) != (c2 < 256)) { + if ((OP(text_node) == EXACTFL + && ! IN_UTF8_CTYPE_LOCALE) + || ((OP(text_node) == EXACTFA + || OP(text_node) == EXACTFA_NO_TRIE) + && (isASCII(c1) || isASCII(c2)))) + { + if (c1 < 256) { + c1 = c2; + } + else { + c2 = c1; + } + } + } + } + } + } + else /* Here, c1 is <= 255 */ + if (utf8_target + && HAS_NONLATIN1_FOLD_CLOSURE(c1) + && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) + && ((OP(text_node) != EXACTFA + && OP(text_node) != EXACTFA_NO_TRIE) + || ! isASCII(c1))) + { + /* Here, there could be something above Latin1 in the target + * which folds to this character in the pattern. All such + * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more + * than two characters involved in their folds, so are outside + * the scope of this function */ + if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + } + else { + use_chrtest_void = TRUE; + } + } + else { /* Here nothing above Latin1 can fold to the pattern + character */ + switch (OP(text_node)) { + + case EXACTFL: /* /l rules */ + c2 = PL_fold_locale[c1]; + break; + + case EXACTF: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + if (! utf8_target) { /* /d rules */ + c2 = PL_fold[c1]; + break; + } + /* FALLTHROUGH */ + /* /u rules for all these. This happens to work for + * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA_NO_TRIE: /* This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: + c2 = PL_fold_latin1[c1]; + break; + + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); + assert(0); /* NOTREACHED */ + } + } + } + } + + /* Here have figured things out. Set up the returns */ + if (use_chrtest_void) { + *c2p = *c1p = CHRTEST_VOID; + } + else if (utf8_target) { + if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */ + uvchr_to_utf8(c1_utf8, c1); + uvchr_to_utf8(c2_utf8, c2); + } + + /* Invariants are stored in both the utf8 and byte outputs; Use + * negative numbers otherwise for the byte ones. Make sure that the + * byte ones are the same iff the utf8 ones are the same */ + *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1; + *c2p = (UTF8_IS_INVARIANT(*c2_utf8)) + ? *c2_utf8 + : (c1 == c2) + ? CHRTEST_NOT_A_CP_1 + : CHRTEST_NOT_A_CP_2; + } + else if (c1 > 255) { + if (c2 > 255) { /* both possibilities are above what a non-utf8 string + can represent */ + return FALSE; + } + + *c1p = *c2p = c2; /* c2 is the only representable value */ + } + else { /* c1 is representable; see about c2 */ + *c1p = c1; + *c2p = (c2 < 256) ? c2 : c1; + } + + return TRUE; +} + +/* returns -1 on failure, $+[0] on success */ +STATIC SSize_t +S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) +{ +#if PERL_VERSION < 9 && !defined(PERL_CORE) + dMY_CXT; +#endif + dVAR; + const bool utf8_target = reginfo->is_utf8_target; + const U32 uniflags = UTF8_ALLOW_DEFAULT; + REGEXP *rex_sv = reginfo->prog; + regexp *rex = ReANY(rex_sv); + RXi_GET_DECL(rex,rexi); + /* the current state. This is a cached copy of PL_regmatch_state */ + regmatch_state *st; + /* cache heavy used fields of st in registers */ + regnode *scan; + regnode *next; + U32 n = 0; /* general value; init to avoid compiler warning */ + SSize_t ln = 0; /* len or last; init to avoid compiler warning */ + char *locinput = startpos; + char *pushinput; /* where to continue after a PUSH */ + I32 nextchr; /* is always set to UCHARAT(locinput) */ + + bool result = 0; /* return value of S_regmatch */ + int depth = 0; /* depth of backtrack stack */ + U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ + const U32 max_nochange_depth = + (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ? + 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; + regmatch_state *yes_state = NULL; /* state to pop to on success of + subpattern */ + /* mark_state piggy backs on the yes_state logic so that when we unwind + the stack on success we can update the mark_state as we go */ + regmatch_state *mark_state = NULL; /* last mark state we have seen */ + regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ + struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ + U32 state_num; + bool no_final = 0; /* prevent failure from backtracking? */ + bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ + char *startpoint = locinput; + SV *popmark = NULL; /* are we looking for a mark? */ + SV *sv_commit = NULL; /* last mark name seen in failure */ + SV *sv_yes_mark = NULL; /* last mark name we have seen + during a successful match */ + U32 lastopen = 0; /* last open we saw */ + bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; + SV* const oreplsv = GvSVn(PL_replgv); + /* these three flags are set by various ops to signal information to + * the very next op. They have a useful lifetime of exactly one loop + * iteration, and are not preserved or restored by state pushes/pops + */ + bool sw = 0; /* the condition value in (?(cond)a|b) */ + bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ + int logical = 0; /* the following EVAL is: + 0: (?{...}) + 1: (?(?{...})X|Y) + 2: (??{...}) + or the following IFMATCH/UNLESSM is: + false: plain (?=foo) + true: used as a condition: (?(?=foo)) + */ + PAD* last_pad = NULL; + dMULTICALL; + I32 gimme = G_SCALAR; + CV *caller_cv = NULL; /* who called us */ + CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ + CHECKPOINT runops_cp; /* savestack position before executing EVAL */ + U32 maxopenparen = 0; /* max '(' index seen so far */ + int to_complement; /* Invert the result? */ + _char_class_number classnum; + bool is_utf8_pat = reginfo->is_utf8_pat; + +#ifdef DEBUGGING + GET_RE_DEBUG_FLAGS_DECL; +#endif + + /* protect against undef(*^R) */ + SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); + + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ + multicall_oldcatch = 0; + multicall_cv = NULL; + cx = NULL; + PERL_UNUSED_VAR(multicall_cop); + PERL_UNUSED_VAR(newsp); + + + PERL_ARGS_ASSERT_REGMATCH; + + DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ + PerlIO_printf(Perl_debug_log,"regmatch start\n"); + })); + + st = PL_regmatch_state; + + /* Note that nextchr is a byte even in UTF */ + SET_nextchr; + scan = prog; + while (scan != NULL) { + + DEBUG_EXECUTE_r( { + SV * const prop = sv_newmortal(); + regnode *rnext=regnext(scan); + DUMP_EXEC_POS( locinput, scan, utf8_target ); + regprop(rex, prop, scan, reginfo); + + PerlIO_printf(Perl_debug_log, + "%3"IVdf":%*s%s(%"IVdf")\n", + (IV)(scan - rexi->program), depth*2, "", + SvPVX_const(prop), + (PL_regkind[OP(scan)] == END || !rnext) ? + 0 : (IV)(rnext - rexi->program)); + }); + + next = scan + NEXT_OFF(scan); + if (next == scan) + next = NULL; + state_num = OP(scan); + + REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st); + reenter_switch: + to_complement = 0; + + SET_nextchr; + assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); + + switch (state_num) { + case BOL: /* /^../ */ + case SBOL: /* /^../s */ + if (locinput == reginfo->strbeg) + break; + sayNO; + + case MBOL: /* /^../m */ + if (locinput == reginfo->strbeg || + (!NEXTCHR_IS_EOS && locinput[-1] == '\n')) + { + break; + } + sayNO; + + case GPOS: /* \G */ + if (locinput == reginfo->ganch) + break; + sayNO; + + case KEEPS: /* \K */ + /* update the startpoint */ + st->u.keeper.val = rex->offs[0].start; + rex->offs[0].start = locinput - reginfo->strbeg; + PUSH_STATE_GOTO(KEEPS_next, next, locinput); + assert(0); /*NOTREACHED*/ + case KEEPS_next_fail: + /* rollback the start point change */ + rex->offs[0].start = st->u.keeper.val; + sayNO_SILENT; + assert(0); /*NOTREACHED*/ + + case MEOL: /* /..$/m */ + if (!NEXTCHR_IS_EOS && nextchr != '\n') + sayNO; + break; + + case EOL: /* /..$/ */ + /* FALLTHROUGH */ + case SEOL: /* /..$/s */ + if (!NEXTCHR_IS_EOS && nextchr != '\n') + sayNO; + if (reginfo->strend - locinput > 1) + sayNO; + break; + + case EOS: /* \z */ + if (!NEXTCHR_IS_EOS) + sayNO; + break; + + case SANY: /* /./s */ + if (NEXTCHR_IS_EOS) + sayNO; + goto increment_locinput; + + case CANY: /* \C */ + if (NEXTCHR_IS_EOS) + sayNO; + locinput++; + break; + + case REG_ANY: /* /./ */ + if ((NEXTCHR_IS_EOS) || nextchr == '\n') + sayNO; + goto increment_locinput; + + +#undef ST +#define ST st->u.trie + case TRIEC: /* (ab|cd) with known charclass */ + /* In this case the charclass data is available inline so + we can fail fast without a lot of extra overhead. + */ + if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + assert(0); /* NOTREACHED */ + } + /* FALLTHROUGH */ + case TRIE: /* (ab|cd) */ + /* the basic plan of execution of the trie is: + * At the beginning, run though all the states, and + * find the longest-matching word. Also remember the position + * of the shortest matching word. For example, this pattern: + * 1 2 3 4 5 + * ab|a|x|abcd|abc + * when matched against the string "abcde", will generate + * accept states for all words except 3, with the longest + * matching word being 4, and the shortest being 2 (with + * the position being after char 1 of the string). + * + * Then for each matching word, in word order (i.e. 1,2,4,5), + * we run the remainder of the pattern; on each try setting + * the current position to the character following the word, + * returning to try the next word on failure. + * + * We avoid having to build a list of words at runtime by + * using a compile-time structure, wordinfo[].prev, which + * gives, for each word, the previous accepting word (if any). + * In the case above it would contain the mappings 1->2, 2->0, + * 3->0, 4->5, 5->1. We can use this table to generate, from + * the longest word (4 above), a list of all words, by + * following the list of prev pointers; this gives us the + * unordered list 4,5,1,2. Then given the current word we have + * just tried, we can go through the list and find the + * next-biggest word to try (so if we just failed on word 2, + * the next in the list is 4). + * + * Since at runtime we don't record the matching position in + * the string for each word, we have to work that out for + * each word we're about to process. The wordinfo table holds + * the character length of each word; given that we recorded + * at the start: the position of the shortest word and its + * length in chars, we just need to move the pointer the + * difference between the two char lengths. Depending on + * Unicode status and folding, that's cheap or expensive. + * + * This algorithm is optimised for the case where are only a + * small number of accept states, i.e. 0,1, or maybe 2. + * With lots of accepts states, and having to try all of them, + * it becomes quadratic on number of accept states to find all + * the next words. + */ + + { + /* what type of TRIE am I? (utf8 makes this contextual) */ + DECL_TRIE_TYPE(scan); + + /* what trie are we using right now */ + reg_trie_data * const trie + = (reg_trie_data*)rexi->data->data[ ARG( scan ) ]; + HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); + U32 state = trie->startstate; + + if ( trie->bitmap + && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr))) + { + if (trie->states[ state ].wordnum) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %smatched empty string...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + if (!trie->jump) + break; + } else { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + } + } + + { + U8 *uc = ( U8* )locinput; + + STRLEN len = 0; + STRLEN foldlen = 0; + U8 *uscan = (U8*)NULL; + U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + U32 charcount = 0; /* how many input chars we have matched */ + U32 accepted = 0; /* have we seen any accepting states? */ + + ST.jump = trie->jump; + ST.me = scan; + ST.firstpos = NULL; + ST.longfold = FALSE; /* char longer if folded => it's harder */ + ST.nextword = 0; + + /* fully traverse the TRIE; note the position of the + shortest accept state and the wordnum of the longest + accept state */ + + while ( state && uc <= (U8*)(reginfo->strend) ) { + U32 base = trie->states[ state ].trans.base; + UV uvc = 0; + U16 charid = 0; + U16 wordnum; + wordnum = trie->states[ state ].wordnum; + + if (wordnum) { /* it's an accept state */ + if (!accepted) { + accepted = 1; + /* record first match position */ + if (ST.longfold) { + ST.firstpos = (U8*)locinput; + ST.firstchars = 0; + } + else { + ST.firstpos = uc; + ST.firstchars = charcount; + } + } + if (!ST.nextword || wordnum < ST.nextword) + ST.nextword = wordnum; + ST.topword = wordnum; + } + + DEBUG_TRIE_EXECUTE_r({ + DUMP_EXEC_POS( (char *)uc, scan, utf8_target ); + PerlIO_printf( Perl_debug_log, + "%*s %sState: %4"UVxf" Accepted: %c ", + 2+depth * 2, "", PL_colors[4], + (UV)state, (accepted ? 'Y' : 'N')); + }); + + /* read a char and goto next state */ + if ( base && (foldlen || uc < (U8*)(reginfo->strend))) { + I32 offset; + REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, + uscan, len, uvc, charid, foldlen, + foldbuf, uniflags); + charcount++; + if (foldlen>0) + ST.longfold = TRUE; + if (charid && + ( ((offset = + base + charid - 1 - trie->uniquecharcount)) >= 0) + + && ((U32)offset < trie->lasttrans) + && trie->trans[offset].check == state) + { + state = trie->trans[offset].next; + } + else { + state = 0; + } + uc += len; + + } + else { + state = 0; + } + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log, + "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n", + charid, uvc, (UV)state, PL_colors[5] ); + ); + } + if (!accepted) + sayNO; + + /* calculate total number of accept states */ + { + U16 w = ST.topword; + accepted = 0; + while (w) { + w = trie->wordinfo[w].prev; + accepted++; + } + ST.accepted = accepted; + } + + DEBUG_EXECUTE_r( + PerlIO_printf( Perl_debug_log, + "%*s %sgot %"IVdf" possible matches%s\n", + REPORT_CODE_OFF + depth * 2, "", + PL_colors[4], (IV)ST.accepted, PL_colors[5] ); + ); + goto trie_first_try; /* jump into the fail handler */ + }} + assert(0); /* NOTREACHED */ + + case TRIE_next_fail: /* we failed - try next alternative */ + { + U8 *uc; + if ( ST.jump) { + REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + if (!--ST.accepted) { + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sTRIE failed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + PL_colors[5] ); + }); + sayNO_SILENT; + } + { + /* Find next-highest word to process. Note that this code + * is O(N^2) per trie run (O(N) per branch), so keep tight */ + U16 min = 0; + U16 word; + U16 const nextword = ST.nextword; + reg_trie_wordinfo * const wordinfo + = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo; + for (word=ST.topword; word; word=wordinfo[word].prev) { + if (word > nextword && (!min || word < min)) + min = word; + } + ST.nextword = min; + } + + trie_first_try: + if (do_cutgroup) { + do_cutgroup = 0; + no_final = 0; + } + + if ( ST.jump) { + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + REGCP_SET(ST.cp); + } + + /* find start char of end of current word */ + { + U32 chars; /* how many chars to skip */ + reg_trie_data * const trie + = (reg_trie_data*)rexi->data->data[ARG(ST.me)]; + + assert((trie->wordinfo[ST.nextword].len - trie->prefixlen) + >= ST.firstchars); + chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen) + - ST.firstchars; + uc = ST.firstpos; + + if (ST.longfold) { + /* the hard option - fold each char in turn and find + * its folded length (which may be different */ + U8 foldbuf[UTF8_MAXBYTES_CASE + 1]; + STRLEN foldlen; + STRLEN len; + UV uvc; + U8 *uscan; + + while (chars) { + if (utf8_target) { + uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, + uniflags); + uc += len; + } + else { + uvc = *uc; + uc++; + } + uvc = to_uni_fold(uvc, foldbuf, &foldlen); + uscan = foldbuf; + while (foldlen) { + if (!--chars) + break; + uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, + uniflags); + uscan += len; + foldlen -= len; + } + } + } + else { + if (utf8_target) + while (chars--) + uc += UTF8SKIP(uc); + else + uc += chars; + } + } + + scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) + ? ST.jump[ST.nextword] + : NEXT_OFF(ST.me)); + + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sTRIE matched word #%d, continuing%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + ST.nextword, + PL_colors[5] + ); + }); + + if (ST.accepted > 1 || has_cutgroup) { + PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc); + assert(0); /* NOTREACHED */ + } + /* only one choice left - just continue */ + DEBUG_EXECUTE_r({ + AV *const trie_words + = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]); + SV ** const tmp = av_fetch( trie_words, + ST.nextword-1, 0 ); + SV *sv= tmp ? sv_newmortal() : NULL; + + PerlIO_printf( Perl_debug_log, + "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], + ST.nextword, + tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII + ) + : "not compiled under -Dr", + PL_colors[5] ); + }); + + locinput = (char*)uc; + continue; /* execute rest of RE */ + assert(0); /* NOTREACHED */ + } +#undef ST + + case EXACT: { /* /abc/ */ + char *s = STRING(scan); + ln = STR_LEN(scan); + if (utf8_target != is_utf8_pat) { + /* The target and the pattern have differing utf8ness. */ + char *l = locinput; + const char * const e = s + ln; + + if (utf8_target) { + /* The target is utf8, the pattern is not utf8. + * Above-Latin1 code points can't match the pattern; + * invariants match exactly, and the other Latin1 ones need + * to be downgraded to a single byte in order to do the + * comparison. (If we could be confident that the target + * is not malformed, this could be refactored to have fewer + * tests by just assuming that if the first bytes match, it + * is an invariant, but there are tests in the test suite + * dealing with (??{...}) which violate this) */ + while (s < e) { + if (l >= reginfo->strend + || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) + { + sayNO; + } + if (UTF8_IS_INVARIANT(*(U8*)l)) { + if (*l != *s) { + sayNO; + } + l++; + } + else { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { + sayNO; + } + l += 2; + } + s++; + } + } + else { + /* The target is not utf8, the pattern is utf8. */ + while (s < e) { + if (l >= reginfo->strend + || UTF8_IS_ABOVE_LATIN1(* (U8*) s)) + { + sayNO; + } + if (UTF8_IS_INVARIANT(*(U8*)s)) { + if (*s != *l) { + sayNO; + } + s++; + } + else { + if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { + sayNO; + } + s += 2; + } + l++; + } + } + locinput = l; + } + else { + /* The target and the pattern have the same utf8ness. */ + /* Inline the first character, for speed. */ + if (reginfo->strend - locinput < ln + || UCHARAT(s) != nextchr + || (ln > 1 && memNE(s, locinput, ln))) + { + sayNO; + } + locinput += ln; + } + break; + } + + case EXACTFL: { /* /abc/il */ + re_fold_t folder; + const U8 * fold_array; + const char * s; + U32 fold_utf8_flags; + + folder = foldEQ_locale; + fold_array = PL_fold_locale; + fold_utf8_flags = FOLDEQ_LOCALE; + goto do_exactf; + + case EXACTFU_SS: /* /\x{df}/iu */ + case EXACTFU: /* /abc/iu */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; + goto do_exactf; + + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: /* /abc/iaa */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf; + + case EXACTF: /* /abc/i This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + folder = foldEQ; + fold_array = PL_fold; + fold_utf8_flags = 0; + + do_exactf: + s = STRING(scan); + ln = STR_LEN(scan); + + if (utf8_target + || is_utf8_pat + || state_num == EXACTFU_SS + || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) + { + /* Either target or the pattern are utf8, or has the issue where + * the fold lengths may differ. */ + const char * const l = locinput; + char *e = reginfo->strend; + + if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat, + l, &e, 0, utf8_target, fold_utf8_flags)) + { + sayNO; + } + locinput = e; + break; + } + + /* Neither the target nor the pattern are utf8 */ + if (UCHARAT(s) != nextchr + && !NEXTCHR_IS_EOS + && UCHARAT(s) != fold_array[nextchr]) + { + sayNO; + } + if (reginfo->strend - locinput < ln) + sayNO; + if (ln > 1 && ! folder(s, locinput, ln)) + sayNO; + locinput += ln; + break; + } + + /* XXX Could improve efficiency by separating these all out using a + * macro or in-line function. At that point regcomp.c would no longer + * have to set the FLAGS fields of these */ + case BOUNDL: /* /\b/l */ + case NBOUNDL: /* /\B/l */ + case BOUND: /* /\b/ */ + case BOUNDU: /* /\b/u */ + case BOUNDA: /* /\b/a */ + case NBOUND: /* /\B/ */ + case NBOUNDU: /* /\B/u */ + case NBOUNDA: /* /\B/a */ + /* was last char in word? */ + if (utf8_target + && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET + && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET) + { + if (locinput == reginfo->strbeg) + ln = '\n'; + else { + const U8 * const r = + reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); + + ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, + 0, uniflags); + } + if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { + ln = isWORDCHAR_uni(ln); + if (NEXTCHR_IS_EOS) + n = 0; + else { + LOAD_UTF8_CHARCLASS_ALNUM(); + n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput, + utf8_target); + } + } + else { + ln = isWORDCHAR_LC_uvchr(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); + } + } + else { + + /* Here the string isn't utf8, or is utf8 and only ascii + * characters are to match \w. In the latter case looking at + * the byte just prior to the current one may be just the final + * byte of a multi-byte character. This is ok. There are two + * cases: + * 1) it is a single byte character, and then the test is doing + * just what it's supposed to. + * 2) it is a multi-byte character, in which case the final + * byte is never mistakable for ASCII, and so the test + * will say it is not a word character, which is the + * correct answer. */ + ln = (locinput != reginfo->strbeg) ? + UCHARAT(locinput - 1) : '\n'; + switch (FLAGS(scan)) { + case REGEX_UNICODE_CHARSET: + ln = isWORDCHAR_L1(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr); + break; + case REGEX_LOCALE_CHARSET: + ln = isWORDCHAR_LC(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr); + break; + case REGEX_DEPENDS_CHARSET: + ln = isWORDCHAR(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr); + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + ln = isWORDCHAR_A(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr); + break; + default: + Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan)); + } + } + /* Note requires that all BOUNDs be lower than all NBOUNDs in + * regcomp.sym */ + if (((!ln) == (!n)) == (OP(scan) < NBOUND)) + sayNO; + break; + + case ANYOF: /* /[abc]/ */ + if (NEXTCHR_IS_EOS) + sayNO; + if (utf8_target) { + if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + utf8_target)) + sayNO; + locinput += UTF8SKIP(locinput); + } + else { + if (!REGINCLASS(rex, scan, (U8*)locinput)) + sayNO; + locinput++; + } + break; + + /* The argument (FLAGS) to all the POSIX node types is the class number + * */ + + case NPOSIXL: /* \W or [:^punct:] etc. under /l */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: /* \w or [:punct:] etc. under /l */ + if (NEXTCHR_IS_EOS) + sayNO; + + /* Use isFOO_lc() for characters within Latin1. (Note that + * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else + * wouldn't be invariant) */ + if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { + if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) { + sayNO; + } + } + else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { + if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, + *(locinput + 1)))))) + { + sayNO; + } + } + else { /* Here, must be an above Latin-1 code point */ + goto utf8_posix_not_eos; + } + + /* Here, must be utf8 */ + locinput += UTF8SKIP(locinput); + break; + + case NPOSIXD: /* \W or [:^punct:] etc. under /d */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXD: /* \w or [:punct:] etc. under /d */ + if (utf8_target) { + goto utf8_posix; + } + goto posixa; + + case NPOSIXA: /* \W or [:^punct:] etc. under /a */ + + if (NEXTCHR_IS_EOS) { + sayNO; + } + + /* All UTF-8 variants match */ + if (! UTF8_IS_INVARIANT(nextchr)) { + goto increment_locinput; + } + + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXA: /* \w or [:punct:] etc. under /a */ + + posixa: + /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in + * UTF-8, and also from NPOSIXA even in UTF-8 when the current + * character is a single byte */ + + if (NEXTCHR_IS_EOS + || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr, + FLAGS(scan))))) + { + sayNO; + } + + /* Here we are either not in utf8, or we matched a utf8-invariant, + * so the next char is the next byte */ + locinput++; + break; + + case NPOSIXU: /* \W or [:^punct:] etc. under /u */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: /* \w or [:punct:] etc. under /u */ + utf8_posix: + if (NEXTCHR_IS_EOS) { + sayNO; + } + utf8_posix_not_eos: + + /* Use _generic_isCC() for characters within Latin1. (Note that + * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else + * wouldn't be invariant) */ + if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { + if (! (to_complement ^ cBOOL(_generic_isCC(nextchr, + FLAGS(scan))))) + { + sayNO; + } + locinput++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { + if (! (to_complement + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, + *(locinput + 1)), + FLAGS(scan))))) + { + sayNO; + } + locinput += 2; + } + else { /* Handle above Latin-1 code points */ + classnum = (_char_class_number) FLAGS(scan); + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Here, uses a swash to find such code points. Load if if + * not done already */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] + = _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + if (! (to_complement + ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) locinput, TRUE)))) + { + sayNO; + } + } + else { /* Here, uses macros to find above Latin-1 code points */ + switch (classnum) { + case _CC_ENUM_SPACE: /* XXX would require separate + code if we revert the change + of \v matching this */ + case _CC_ENUM_PSXSPC: + if (! (to_complement + ^ cBOOL(is_XPERLSPACE_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_BLANK: + if (! (to_complement + ^ cBOOL(is_HORIZWS_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_XDIGIT: + if (! (to_complement + ^ cBOOL(is_XDIGIT_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_VERTSPACE: + if (! (to_complement + ^ cBOOL(is_VERTWS_high(locinput)))) + { + sayNO; + } + break; + default: /* The rest, e.g. [:cntrl:], can't match + above Latin1 */ + if (! to_complement) { + sayNO; + } + break; + } + } + locinput += UTF8SKIP(locinput); + } + break; + + case CLUMP: /* Match \X: logical Unicode character. This is defined as + a Unicode extended Grapheme Cluster */ + /* From http://www.unicode.org/reports/tr29 (5.2 version). An + extended Grapheme Cluster is: + + CR LF + | Prepend* Begin Extend* + | . + + Begin is: ( Special_Begin | ! Control ) + Special_Begin is: ( Regional-Indicator+ | Hangul-syllable ) + Extend is: ( Grapheme_Extend | Spacing_Mark ) + Control is: [ GCB_Control | CR | LF ] + Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) )) + + If we create a 'Regular_Begin' = Begin - Special_Begin, then + we can rewrite + + Begin is ( Regular_Begin + Special Begin ) + + It turns out that 98.4% of all Unicode code points match + Regular_Begin. Doing it this way eliminates a table match in + the previous implementation for almost all Unicode code points. + + There is a subtlety with Prepend* which showed up in testing. + Note that the Begin, and only the Begin is required in: + | Prepend* Begin Extend* + Also, Begin contains '! Control'. A Prepend must be a + '! Control', which means it must also be a Begin. What it + comes down to is that if we match Prepend* and then find no + suitable Begin afterwards, that if we backtrack the last + Prepend, that one will be a suitable Begin. + */ + + if (NEXTCHR_IS_EOS) + sayNO; + if (! utf8_target) { + + /* Match either CR LF or '.', as all the other possibilities + * require utf8 */ + locinput++; /* Match the . or CR */ + if (nextchr == '\r' /* And if it was CR, and the next is LF, + match the LF */ + && locinput < reginfo->strend + && UCHARAT(locinput) == '\n') + { + locinput++; + } + } + else { + + /* Utf8: See if is ( CR LF ); already know that locinput < + * reginfo->strend, so locinput+1 is in bounds */ + if ( nextchr == '\r' && locinput+1 < reginfo->strend + && UCHARAT(locinput + 1) == '\n') + { + locinput += 2; + } + else { + STRLEN len; + + /* In case have to backtrack to beginning, then match '.' */ + char *starting = locinput; + + /* In case have to backtrack the last prepend */ + char *previous_prepend = NULL; + + LOAD_UTF8_CHARCLASS_GCB(); + + /* Match (prepend)* */ + while (locinput < reginfo->strend + && (len = is_GCB_Prepend_utf8(locinput))) + { + previous_prepend = locinput; + locinput += len; + } + + /* As noted above, if we matched a prepend character, but + * the next thing won't match, back off the last prepend we + * matched, as it is guaranteed to match the begin */ + if (previous_prepend + && (locinput >= reginfo->strend + || (! swash_fetch(PL_utf8_X_regular_begin, + (U8*)locinput, utf8_target) + && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput))) + ) + { + locinput = previous_prepend; + } + + /* Note that here we know reginfo->strend > locinput, as we + * tested that upon input to this switch case, and if we + * moved locinput forward, we tested the result just above + * and it either passed, or we backed off so that it will + * now pass */ + if (swash_fetch(PL_utf8_X_regular_begin, + (U8*)locinput, utf8_target)) { + locinput += UTF8SKIP(locinput); + } + else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) { + + /* Here did not match the required 'Begin' in the + * second term. So just match the very first + * character, the '.' of the final term of the regex */ + locinput = starting + UTF8SKIP(starting); + goto exit_utf8; + } else { + + /* Here is a special begin. It can be composed of + * several individual characters. One possibility is + * RI+ */ + if ((len = is_GCB_RI_utf8(locinput))) { + locinput += len; + while (locinput < reginfo->strend + && (len = is_GCB_RI_utf8(locinput))) + { + locinput += len; + } + } else if ((len = is_GCB_T_utf8(locinput))) { + /* Another possibility is T+ */ + locinput += len; + while (locinput < reginfo->strend + && (len = is_GCB_T_utf8(locinput))) + { + locinput += len; + } + } else { + + /* Here, neither RI+ nor T+; must be some other + * Hangul. That means it is one of the others: L, + * LV, LVT or V, and matches: + * L* (L | LVT T* | V * V* T* | LV V* T*) */ + + /* Match L* */ + while (locinput < reginfo->strend + && (len = is_GCB_L_utf8(locinput))) + { + locinput += len; + } + + /* Here, have exhausted L*. If the next character + * is not an LV, LVT nor V, it means we had to have + * at least one L, so matches L+ in the original + * equation, we have a complete hangul syllable. + * Are done. */ + + if (locinput < reginfo->strend + && is_GCB_LV_LVT_V_utf8(locinput)) + { + /* Otherwise keep going. Must be LV, LVT or V. + * See if LVT, by first ruling out V, then LV */ + if (! is_GCB_V_utf8(locinput) + /* All but every TCount one is LV */ + && (valid_utf8_to_uvchr((U8 *) locinput, + NULL) + - SBASE) + % TCount != 0) + { + locinput += UTF8SKIP(locinput); + } else { + + /* Must be V or LV. Take it, then match + * V* */ + locinput += UTF8SKIP(locinput); + while (locinput < reginfo->strend + && (len = is_GCB_V_utf8(locinput))) + { + locinput += len; + } + } + + /* And any of LV, LVT, or V can be followed + * by T* */ + while (locinput < reginfo->strend + && (len = is_GCB_T_utf8(locinput))) + { + locinput += len; + } + } + } + } + + /* Match any extender */ + while (locinput < reginfo->strend + && swash_fetch(PL_utf8_X_extend, + (U8*)locinput, utf8_target)) + { + locinput += UTF8SKIP(locinput); + } + } + exit_utf8: + if (locinput > reginfo->strend) sayNO; + } + break; + + case NREFFL: /* /\g{name}/il */ + { /* The capture buffer cases. The ones beginning with N for the + named buffers just convert to the equivalent numbered and + pretend they were called as the corresponding numbered buffer + op. */ + /* don't initialize these in the declaration, it makes C++ + unhappy */ + const char *s; + char type; + re_fold_t folder; + const U8 *fold_array; + UV utf8_fold_flags; + + folder = foldEQ_locale; + fold_array = PL_fold_locale; + type = REFFL; + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_nref; + + case NREFFA: /* /\g{name}/iaa */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + type = REFFA; + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_nref; + + case NREFFU: /* /\g{name}/iu */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + type = REFFU; + utf8_fold_flags = 0; + goto do_nref; + + case NREFF: /* /\g{name}/i */ + folder = foldEQ; + fold_array = PL_fold; + type = REFF; + utf8_fold_flags = 0; + goto do_nref; + + case NREF: /* /\g{name}/ */ + type = REF; + folder = NULL; + fold_array = NULL; + utf8_fold_flags = 0; + do_nref: + + /* For the named back references, find the corresponding buffer + * number */ + n = reg_check_named_buff_matched(rex,scan); + + if ( ! n ) { + sayNO; + } + goto do_nref_ref_common; + + case REFFL: /* /\1/il */ + folder = foldEQ_locale; + fold_array = PL_fold_locale; + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_ref; + + case REFFA: /* /\1/iaa */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_ref; + + case REFFU: /* /\1/iu */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + utf8_fold_flags = 0; + goto do_ref; + + case REFF: /* /\1/i */ + folder = foldEQ; + fold_array = PL_fold; + utf8_fold_flags = 0; + goto do_ref; + + case REF: /* /\1/ */ + folder = NULL; + fold_array = NULL; + utf8_fold_flags = 0; + + do_ref: + type = OP(scan); + n = ARG(scan); /* which paren pair */ + + do_nref_ref_common: + ln = rex->offs[n].start; + reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ + if (rex->lastparen < n || ln == -1) + sayNO; /* Do not match unless seen CLOSEn. */ + if (ln == rex->offs[n].end) + break; + + s = reginfo->strbeg + ln; + if (type != REF /* REF can do byte comparison */ + && (utf8_target || type == REFFU || type == REFFL)) + { + char * limit = reginfo->strend; + + /* This call case insensitively compares the entire buffer + * at s, with the current input starting at locinput, but + * not going off the end given by reginfo->strend, and + * returns in upon success, how much of the + * current input was matched */ + if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target, + locinput, &limit, 0, utf8_target, utf8_fold_flags)) + { + sayNO; + } + locinput = limit; + break; + } + + /* Not utf8: Inline the first character, for speed. */ + if (!NEXTCHR_IS_EOS && + UCHARAT(s) != nextchr && + (type == REF || + UCHARAT(s) != fold_array[nextchr])) + sayNO; + ln = rex->offs[n].end - ln; + if (locinput + ln > reginfo->strend) + sayNO; + if (ln > 1 && (type == REF + ? memNE(s, locinput, ln) + : ! folder(s, locinput, ln))) + sayNO; + locinput += ln; + break; + } + + case NOTHING: /* null op; e.g. the 'nothing' following + * the '*' in m{(a+|b)*}' */ + break; + case TAIL: /* placeholder while compiling (A|B|C) */ + break; + + case BACK: /* ??? doesn't appear to be used ??? */ + break; + +#undef ST +#define ST st->u.eval + { + SV *ret; + REGEXP *re_sv; + regexp *re; + regexp_internal *rei; + regnode *startpoint; + + case GOSTART: /* (?R) */ + case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */ + if (cur_eval && cur_eval->locinput==locinput) { + if (cur_eval->u.eval.close_paren == (U32)ARG(scan)) + Perl_croak(aTHX_ "Infinite recursion in regex"); + if ( ++nochange_depth > max_nochange_depth ) + Perl_croak(aTHX_ + "Pattern subroutine nesting without pos change" + " exceeded limit in regex"); + } else { + nochange_depth = 0; + } + re_sv = rex_sv; + re = rex; + rei = rexi; + if (OP(scan)==GOSUB) { + startpoint = scan + ARG2L(scan); + ST.close_paren = ARG(scan); + } else { + startpoint = rei->program+1; + ST.close_paren = 0; + } + + /* Save all the positions seen so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + + /* and then jump to the code we share with EVAL */ + goto eval_recurse_doit; + + assert(0); /* NOTREACHED */ + + case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ + if (cur_eval && cur_eval->locinput==locinput) { + if ( ++nochange_depth > max_nochange_depth ) + Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); + } else { + nochange_depth = 0; + } + { + /* execute the code in the {...} */ + + dSP; + IV before; + OP * const oop = PL_op; + COP * const ocurcop = PL_curcop; + OP *nop; + CV *newcv; + + /* save *all* paren positions */ + regcppush(rex, 0, maxopenparen); + REGCP_SET(runops_cp); + + if (!caller_cv) + caller_cv = find_runcv(NULL); + + n = ARG(scan); + + if (rexi->data->what[n] == 'r') { /* code from an external qr */ + newcv = (ReANY( + (REGEXP*)(rexi->data->data[n]) + ))->qr_anoncv + ; + nop = (OP*)rexi->data->data[n+1]; + } + else if (rexi->data->what[n] == 'l') { /* literal code */ + newcv = caller_cv; + nop = (OP*)rexi->data->data[n]; + assert(CvDEPTH(newcv)); + } + else { + /* literal with own CV */ + assert(rexi->data->what[n] == 'L'); + newcv = rex->qr_anoncv; + nop = (OP*)rexi->data->data[n]; + } + + /* normally if we're about to execute code from the same + * CV that we used previously, we just use the existing + * CX stack entry. However, its possible that in the + * meantime we may have backtracked, popped from the save + * stack, and undone the SAVECOMPPAD(s) associated with + * PUSH_MULTICALL; in which case PL_comppad no longer + * points to newcv's pad. */ + if (newcv != last_pushed_cv || PL_comppad != last_pad) + { + U8 flags = (CXp_SUB_RE | + ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0)); + if (last_pushed_cv) { + CHANGE_MULTICALL_FLAGS(newcv, flags); + } + else { + PUSH_MULTICALL_FLAGS(newcv, flags); + } + last_pushed_cv = newcv; + } + else { + /* these assignments are just to silence compiler + * warnings */ + multicall_cop = NULL; + newsp = NULL; + } + last_pad = PL_comppad; + + /* the initial nextstate you would normally execute + * at the start of an eval (which would cause error + * messages to come from the eval), may be optimised + * away from the execution path in the regex code blocks; + * so manually set PL_curcop to it initially */ + { + OP *o = cUNOPx(nop)->op_first; + assert(o->op_type == OP_NULL); + if (o->op_targ == OP_SCOPE) { + o = cUNOPo->op_first; + } + else { + assert(o->op_targ == OP_LEAVE); + o = cUNOPo->op_first; + assert(o->op_type == OP_ENTER); + o = OP_SIBLING(o); + } + + if (o->op_type != OP_STUB) { + assert( o->op_type == OP_NEXTSTATE + || o->op_type == OP_DBSTATE + || (o->op_type == OP_NULL + && ( o->op_targ == OP_NEXTSTATE + || o->op_targ == OP_DBSTATE + ) + ) + ); + PL_curcop = (COP*)o; + } + } + nop = nop->op_next; + + DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, + " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) ); + + rex->offs[0].end = locinput - reginfo->strbeg; + if (reginfo->info_aux_eval->pos_magic) + MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, + reginfo->sv, reginfo->strbeg, + locinput - reginfo->strbeg); + + if (sv_yes_mark) { + SV *sv_mrk = get_sv("REGMARK", 1); + sv_setsv(sv_mrk, sv_yes_mark); + } + + /* we don't use MULTICALL here as we want to call the + * first op of the block of interest, rather than the + * first op of the sub */ + before = (IV)(SP-PL_stack_base); + PL_op = nop; + CALLRUNOPS(aTHX); /* Scalar context. */ + SPAGAIN; + if ((IV)(SP-PL_stack_base) == before) + ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ + else { + ret = POPs; + PUTBACK; + } + + /* before restoring everything, evaluate the returned + * value, so that 'uninit' warnings don't use the wrong + * PL_op or pad. Also need to process any magic vars + * (e.g. $1) *before* parentheses are restored */ + + PL_op = NULL; + + re_sv = NULL; + if (logical == 0) /* (?{})/ */ + sv_setsv(save_scalar(PL_replgv), ret); /* $^R */ + else if (logical == 1) { /* /(?(?{...})X|Y)/ */ + sw = cBOOL(SvTRUE(ret)); + logical = 0; + } + else { /* /(??{}) */ + /* if its overloaded, let the regex compiler handle + * it; otherwise extract regex, or stringify */ + if (SvGMAGICAL(ret)) + ret = sv_mortalcopy(ret); + if (!SvAMAGIC(ret)) { + SV *sv = ret; + if (SvROK(sv)) + sv = SvRV(sv); + if (SvTYPE(sv) == SVt_REGEXP) + re_sv = (REGEXP*) sv; + else if (SvSMAGICAL(ret)) { + MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); + if (mg) + re_sv = (REGEXP *) mg->mg_obj; + } + + /* force any undef warnings here */ + if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { + ret = sv_mortalcopy(ret); + (void) SvPV_force_nolen(ret); + } + } + + } + + /* *** Note that at this point we don't restore + * PL_comppad, (or pop the CxSUB) on the assumption it may + * be used again soon. This is safe as long as nothing + * in the regexp code uses the pad ! */ + PL_op = oop; + PL_curcop = ocurcop; + S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); + PL_curpm = PL_reg_curpm; + + if (logical != 2) + break; + } + + /* only /(??{})/ from now on */ + logical = 0; + { + /* extract RE object from returned value; compiling if + * necessary */ + + if (re_sv) { + re_sv = reg_temp_copy(NULL, re_sv); + } + else { + U32 pm_flags = 0; + + if (SvUTF8(ret) && IN_BYTES) { + /* In use 'bytes': make a copy of the octet + * sequence, but without the flag on */ + STRLEN len; + const char *const p = SvPV(ret, len); + ret = newSVpvn_flags(p, len, SVs_TEMP); + } + if (rex->intflags & PREGf_USE_RE_EVAL) + pm_flags |= PMf_USE_RE_EVAL; + + /* if we got here, it should be an engine which + * supports compiling code blocks and stuff */ + assert(rex->engine && rex->engine->op_comp); + assert(!(scan->flags & ~RXf_PMf_COMPILETIME)); + re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, + rex->engine, NULL, NULL, + /* copy /msix etc to inner pattern */ + scan->flags, + pm_flags); + + if (!(SvFLAGS(ret) + & (SVs_TEMP | SVs_GMG | SVf_ROK)) + && (!SvPADTMP(ret) || SvREADONLY(ret))) { + /* This isn't a first class regexp. Instead, it's + caching a regexp onto an existing, Perl visible + scalar. */ + sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); + } + } + SAVEFREESV(re_sv); + re = ReANY(re_sv); + } + RXp_MATCH_COPIED_off(re); + re->subbeg = rex->subbeg; + re->sublen = rex->sublen; + re->suboffset = rex->suboffset; + re->subcoffset = rex->subcoffset; + re->lastparen = 0; + re->lastcloseparen = 0; + rei = RXi_GET(re); + DEBUG_EXECUTE_r( + debug_start_match(re_sv, utf8_target, locinput, + reginfo->strend, "Matching embedded"); + ); + startpoint = rei->program + 1; + ST.close_paren = 0; /* only used for GOSUB */ + /* Save all the seen positions so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + /* and set maxopenparen to 0, since we are starting a "fresh" match */ + maxopenparen = 0; + /* run the pattern returned from (??{...}) */ + + eval_recurse_doit: /* Share code with GOSUB below this line + * At this point we expect the stack context to be + * set up correctly */ + + /* invalidate the S-L poscache. We're now executing a + * different set of WHILEM ops (and their associated + * indexes) against the same string, so the bits in the + * cache are meaningless. Setting maxiter to zero forces + * the cache to be invalidated and zeroed before reuse. + * XXX This is too dramatic a measure. Ideally we should + * save the old cache and restore when running the outer + * pattern again */ + reginfo->poscache_maxiter = 0; + + /* the new regexp might have a different is_utf8_pat than we do */ + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); + + ST.prev_rex = rex_sv; + ST.prev_curlyx = cur_curlyx; + rex_sv = re_sv; + SET_reg_curpm(rex_sv); + rex = re; + rexi = rei; + cur_curlyx = NULL; + ST.B = next; + ST.prev_eval = cur_eval; + cur_eval = st; + /* now continue from first node in postoned RE */ + PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput); + assert(0); /* NOTREACHED */ + } + + case EVAL_AB: /* cleanup after a successful (??{A})B */ + /* note: this is called twice; first after popping B, then A */ + rex_sv = ST.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); + SET_reg_curpm(rex_sv); + rex = ReANY(rex_sv); + rexi = RXi_GET(rex); + { + /* preserve $^R across LEAVE's. See Bug 121070. */ + SV *save_sv= GvSV(PL_replgv); + SvREFCNT_inc(save_sv); + regcpblow(ST.cp); /* LEAVE in disguise */ + sv_setsv(GvSV(PL_replgv), save_sv); + SvREFCNT_dec(save_sv); + } + cur_eval = ST.prev_eval; + cur_curlyx = ST.prev_curlyx; + + /* Invalidate cache. See "invalidate" comment above. */ + reginfo->poscache_maxiter = 0; + if ( nochange_depth ) + nochange_depth--; + sayYES; + + + case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ + /* note: this is called twice; first after popping B, then A */ + rex_sv = ST.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); + SET_reg_curpm(rex_sv); + rex = ReANY(rex_sv); + rexi = RXi_GET(rex); + + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); + cur_eval = ST.prev_eval; + cur_curlyx = ST.prev_curlyx; + /* Invalidate cache. See "invalidate" comment above. */ + reginfo->poscache_maxiter = 0; + if ( nochange_depth ) + nochange_depth--; + sayNO_SILENT; +#undef ST + + case OPEN: /* ( */ + n = ARG(scan); /* which paren pair */ + rex->offs[n].start_tmp = locinput - reginfo->strbeg; + if (n > maxopenparen) + maxopenparen = n; + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n", + PTR2UV(rex), + PTR2UV(rex->offs), + (UV)n, + (IV)rex->offs[n].start_tmp, + (UV)maxopenparen + )); + lastopen = n; + break; + +/* XXX really need to log other places start/end are set too */ +#define CLOSE_CAPTURE \ + rex->offs[n].start = rex->offs[n].start_tmp; \ + rex->offs[n].end = locinput - reginfo->strbeg; \ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \ + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \ + PTR2UV(rex), \ + PTR2UV(rex->offs), \ + (UV)n, \ + (IV)rex->offs[n].start, \ + (IV)rex->offs[n].end \ + )) + + case CLOSE: /* ) */ + n = ARG(scan); /* which paren pair */ + CLOSE_CAPTURE; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; + if (cur_eval && cur_eval->u.eval.close_paren == n) { + goto fake_end; + } + break; + + case ACCEPT: /* (*ACCEPT) */ + if (ARG(scan)){ + regnode *cursor; + for (cursor=scan; + cursor && OP(cursor)!=END; + cursor=regnext(cursor)) + { + if ( OP(cursor)==CLOSE ){ + n = ARG(cursor); + if ( n <= lastopen ) { + CLOSE_CAPTURE; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; + if ( n == ARG(scan) || (cur_eval && + cur_eval->u.eval.close_paren == n)) + break; + } + } + } + } + goto fake_end; + /*NOTREACHED*/ + + case GROUPP: /* (?(1)) */ + n = ARG(scan); /* which paren pair */ + sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); + break; + + case NGROUPP: /* (?()) */ + /* reg_check_named_buff_matched returns 0 for no match */ + sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); + break; + + case INSUBP: /* (?(R)) */ + n = ARG(scan); + sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n)); + break; + + case DEFINEP: /* (?(DEFINE)) */ + sw = 0; + break; + + case IFTHEN: /* (?(cond)A|B) */ + reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ + if (sw) + next = NEXTOPER(NEXTOPER(scan)); + else { + next = scan + ARG(scan); + if (OP(next) == IFTHEN) /* Fake one. */ + next = NEXTOPER(NEXTOPER(next)); + } + break; + + case LOGICAL: /* modifier for EVAL and IFMATCH */ + logical = scan->flags; + break; + +/******************************************************************* + +The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/ +pattern, where A and B are subpatterns. (For simple A, CURLYM or +STAR/PLUS/CURLY/CURLYN are used instead.) + +A*B is compiled as + +On entry to the subpattern, CURLYX is called. This pushes a CURLYX +state, which contains the current count, initialised to -1. It also sets +cur_curlyx to point to this state, with any previous value saved in the +state block. + +CURLYX then jumps straight to the WHILEM op, rather than executing A, +since the pattern may possibly match zero times (i.e. it's a while {} loop +rather than a do {} while loop). + +Each entry to WHILEM represents a successful match of A. The count in the +CURLYX block is incremented, another WHILEM state is pushed, and execution +passes to A or B depending on greediness and the current count. + +For example, if matching against the string a1a2a3b (where the aN are +substrings that match /A/), then the match progresses as follows: (the +pushed states are interspersed with the bits of strings matched so far): + + + + a1 + a1 a2 + a1 a2 a3 + a1 a2 a3 b + +(Contrast this with something like CURLYM, which maintains only a single +backtrack state: + + a1 + a1 a2 + a1 a2 a3 + a1 a2 a3 b +) + +Each WHILEM state block marks a point to backtrack to upon partial failure +of A or B, and also contains some minor state data related to that +iteration. The CURLYX block, pointed to by cur_curlyx, contains the +overall state, such as the count, and pointers to the A and B ops. + +This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx +must always point to the *current* CURLYX block, the rules are: + +When executing CURLYX, save the old cur_curlyx in the CURLYX state block, +and set cur_curlyx to point the new block. + +When popping the CURLYX block after a successful or unsuccessful match, +restore the previous cur_curlyx. + +When WHILEM is about to execute B, save the current cur_curlyx, and set it +to the outer one saved in the CURLYX block. + +When popping the WHILEM block after a successful or unsuccessful B match, +restore the previous cur_curlyx. + +Here's an example for the pattern (AI* BI)*BO +I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: + +cur_ +curlyx backtrack stack +------ --------------- +NULL +CO +CI ai +CO ai bi +NULL ai bi bo + +At this point the pattern succeeds, and we work back down the stack to +clean up, restoring as we go: + +CO ai bi +CI ai +CO +NULL + +*******************************************************************/ + +#define ST st->u.curlyx + + case CURLYX: /* start of /A*B/ (for complex A) */ + { + /* No need to save/restore up to this paren */ + I32 parenfloor = scan->flags; + + assert(next); /* keep Coverity happy */ + if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ + next += ARG(next); + + /* XXXX Probably it is better to teach regpush to support + parenfloor > maxopenparen ... */ + if (parenfloor > (I32)rex->lastparen) + parenfloor = rex->lastparen; /* Pessimization... */ + + ST.prev_curlyx= cur_curlyx; + cur_curlyx = st; + ST.cp = PL_savestack_ix; + + /* these fields contain the state of the current curly. + * they are accessed by subsequent WHILEMs */ + ST.parenfloor = parenfloor; + ST.me = scan; + ST.B = next; + ST.minmod = minmod; + minmod = 0; + ST.count = -1; /* this will be updated by WHILEM */ + ST.lastloc = NULL; /* this will be updated by WHILEM */ + + PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput); + assert(0); /* NOTREACHED */ + } + + case CURLYX_end: /* just finished matching all of A*B */ + cur_curlyx = ST.prev_curlyx; + sayYES; + assert(0); /* NOTREACHED */ + + case CURLYX_end_fail: /* just failed to match all of A*B */ + regcpblow(ST.cp); + cur_curlyx = ST.prev_curlyx; + sayNO; + assert(0); /* NOTREACHED */ + + +#undef ST +#define ST st->u.whilem + + case WHILEM: /* just matched an A in /A*B/ (for complex A) */ + { + /* see the discussion above about CURLYX/WHILEM */ + I32 n; + int min, max; + regnode *A; + + assert(cur_curlyx); /* keep Coverity happy */ + + min = ARG1(cur_curlyx->u.curlyx.me); + max = ARG2(cur_curlyx->u.curlyx.me); + A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS; + n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ + ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; + ST.cache_offset = 0; + ST.cache_mask = 0; + + + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%*s whilem: matched %ld out of %d..%d\n", + REPORT_CODE_OFF+depth*2, "", (long)n, min, max) + ); + + /* First just match a string of min A's. */ + + if (n < min) { + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, + maxopenparen); + cur_curlyx->u.curlyx.lastloc = locinput; + REGCP_SET(ST.lastcp); + + PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput); + assert(0); /* NOTREACHED */ + } + + /* If degenerate A matches "", assume A done. */ + + if (locinput == cur_curlyx->u.curlyx.lastloc) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%*s whilem: empty match detected, trying continuation...\n", + REPORT_CODE_OFF+depth*2, "") + ); + goto do_whilem_B_max; + } + + /* super-linear cache processing. + * + * The idea here is that for certain types of CURLYX/WHILEM - + * principally those whose upper bound is infinity (and + * excluding regexes that have things like \1 and other very + * non-regular expresssiony things), then if a pattern like + * /....A*.../ fails and we backtrack to the WHILEM, then we + * make a note that this particular WHILEM op was at string + * position 47 (say) when the rest of pattern failed. Then, if + * we ever find ourselves back at that WHILEM, and at string + * position 47 again, we can just fail immediately rather than + * running the rest of the pattern again. + * + * This is very handy when patterns start to go + * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up + * with a combinatorial explosion of backtracking. + * + * The cache is implemented as a bit array, with one bit per + * string byte position per WHILEM op (up to 16) - so its + * between 0.25 and 2x the string size. + * + * To avoid allocating a poscache buffer every time, we do an + * initially countdown; only after we have executed a WHILEM + * op (string-length x #WHILEMs) times do we allocate the + * cache. + * + * The top 4 bits of scan->flags byte say how many different + * relevant CURLLYX/WHILEM op pairs there are, while the + * bottom 4-bits is the identifying index number of this + * WHILEM. + */ + + if (scan->flags) { + + if (!reginfo->poscache_maxiter) { + /* start the countdown: Postpone detection until we + * know the match is not *that* much linear. */ + reginfo->poscache_maxiter + = (reginfo->strend - reginfo->strbeg + 1) + * (scan->flags>>4); + /* possible overflow for long strings and many CURLYX's */ + if (reginfo->poscache_maxiter < 0) + reginfo->poscache_maxiter = I32_MAX; + reginfo->poscache_iter = reginfo->poscache_maxiter; + } + + if (reginfo->poscache_iter-- == 0) { + /* initialise cache */ + const SSize_t size = (reginfo->poscache_maxiter + 7)/8; + regmatch_info_aux *const aux = reginfo->info_aux; + if (aux->poscache) { + if ((SSize_t)reginfo->poscache_size < size) { + Renew(aux->poscache, size, char); + reginfo->poscache_size = size; + } + Zero(aux->poscache, size, char); + } + else { + reginfo->poscache_size = size; + Newxz(aux->poscache, size, char); + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%swhilem: Detected a super-linear match, switching on caching%s...\n", + PL_colors[4], PL_colors[5]) + ); + } + + if (reginfo->poscache_iter < 0) { + /* have we already failed at this position? */ + SSize_t offset, mask; + + reginfo->poscache_iter = -1; /* stop eventual underflow */ + offset = (scan->flags & 0xf) - 1 + + (locinput - reginfo->strbeg) + * (scan->flags>>4); + mask = 1 << (offset % 8); + offset /= 8; + if (reginfo->info_aux->poscache[offset] & mask) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%*s whilem: (cache) already tried at this position...\n", + REPORT_CODE_OFF+depth*2, "") + ); + sayNO; /* cache records failure */ + } + ST.cache_offset = offset; + ST.cache_mask = mask; + } + } + + /* Prefer B over A for minimal matching. */ + + if (cur_curlyx->u.curlyx.minmod) { + ST.save_curlyx = cur_curlyx; + cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; + ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor, + maxopenparen); + REGCP_SET(ST.lastcp); + PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, + locinput); + assert(0); /* NOTREACHED */ + } + + /* Prefer A over B for maximal matching. */ + + if (n < max) { /* More greed allowed? */ + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, + maxopenparen); + cur_curlyx->u.curlyx.lastloc = locinput; + REGCP_SET(ST.lastcp); + PUSH_STATE_GOTO(WHILEM_A_max, A, locinput); + assert(0); /* NOTREACHED */ + } + goto do_whilem_B_max; + } + assert(0); /* NOTREACHED */ + + case WHILEM_B_min: /* just matched B in a minimal match */ + case WHILEM_B_max: /* just matched B in a maximal match */ + cur_curlyx = ST.save_curlyx; + sayYES; + assert(0); /* NOTREACHED */ + + case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ + cur_curlyx = ST.save_curlyx; + cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; + cur_curlyx->u.curlyx.count--; + CACHEsayNO; + assert(0); /* NOTREACHED */ + + case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ + /* FALLTHROUGH */ + case WHILEM_A_pre_fail: /* just failed to match even minimal A */ + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); + cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; + cur_curlyx->u.curlyx.count--; + CACHEsayNO; + assert(0); /* NOTREACHED */ + + case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); /* Restore some previous $s? */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "%*s whilem: failed, trying continuation...\n", + REPORT_CODE_OFF+depth*2, "") + ); + do_whilem_B_max: + if (cur_curlyx->u.curlyx.count >= REG_INFTY + && ckWARN(WARN_REGEXP) + && !reginfo->warned) + { + reginfo->warned = TRUE; + Perl_warner(aTHX_ packWARN(WARN_REGEXP), + "Complex regular subexpression recursion limit (%d) " + "exceeded", + REG_INFTY - 1); + } + + /* now try B */ + ST.save_curlyx = cur_curlyx; + cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; + PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, + locinput); + assert(0); /* NOTREACHED */ + + case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ + cur_curlyx = ST.save_curlyx; + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); + + if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) { + /* Maximum greed exceeded */ + if (cur_curlyx->u.curlyx.count >= REG_INFTY + && ckWARN(WARN_REGEXP) + && !reginfo->warned) + { + reginfo->warned = TRUE; + Perl_warner(aTHX_ packWARN(WARN_REGEXP), + "Complex regular subexpression recursion " + "limit (%d) exceeded", + REG_INFTY - 1); + } + cur_curlyx->u.curlyx.count--; + CACHEsayNO; + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "") + ); + /* Try grabbing another A and see if it helps. */ + cur_curlyx->u.curlyx.lastloc = locinput; + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, + maxopenparen); + REGCP_SET(ST.lastcp); + PUSH_STATE_GOTO(WHILEM_A_min, + /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, + locinput); + assert(0); /* NOTREACHED */ + +#undef ST +#define ST st->u.branch + + case BRANCHJ: /* /(...|A|...)/ with long next pointer */ + next = scan + ARG(scan); + if (next == scan) + next = NULL; + scan = NEXTOPER(scan); + /* FALLTHROUGH */ + + case BRANCH: /* /(...|A|...)/ */ + scan = NEXTOPER(scan); /* scan now points to inner node */ + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + ST.next_branch = next; + REGCP_SET(ST.cp); + + /* Now go into the branch */ + if (has_cutgroup) { + PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput); + } else { + PUSH_STATE_GOTO(BRANCH_next, scan, locinput); + } + assert(0); /* NOTREACHED */ + + case CUTGROUP: /* /(*THEN)/ */ + sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL : + MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + PUSH_STATE_GOTO(CUTGROUP_next, next, locinput); + assert(0); /* NOTREACHED */ + + case CUTGROUP_next_fail: + do_cutgroup = 1; + no_final = 1; + if (st->u.mark.mark_name) + sv_commit = st->u.mark.mark_name; + sayNO; + assert(0); /* NOTREACHED */ + + case BRANCH_next: + sayYES; + assert(0); /* NOTREACHED */ + + case BRANCH_next_fail: /* that branch failed; try the next, if any */ + if (do_cutgroup) { + do_cutgroup = 0; + no_final = 0; + } + REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + scan = ST.next_branch; + /* no more branches? */ + if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sBRANCH failed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + PL_colors[5] ); + }); + sayNO_SILENT; + } + continue; /* execute next BRANCH[J] op */ + assert(0); /* NOTREACHED */ + + case MINMOD: /* next op will be non-greedy, e.g. A*? */ + minmod = 1; + break; + +#undef ST +#define ST st->u.curlym + + case CURLYM: /* /A{m,n}B/ where A is fixed-length */ + + /* This is an optimisation of CURLYX that enables us to push + * only a single backtracking state, no matter how many matches + * there are in {m,n}. It relies on the pattern being constant + * length, with no parens to influence future backrefs + */ + + ST.me = scan; + scan = NEXTOPER(scan) + NODE_STEP_REGNODE; + + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + + /* if paren positive, emulate an OPEN/CLOSE around A */ + if (ST.me->flags) { + U32 paren = ST.me->flags; + if (paren > maxopenparen) + maxopenparen = paren; + scan += NEXT_OFF(scan); /* Skip former OPEN. */ + } + ST.A = scan; + ST.B = next; + ST.alen = 0; + ST.count = 0; + ST.minmod = minmod; + minmod = 0; + ST.c1 = CHRTEST_UNINIT; + REGCP_SET(ST.cp); + + if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */ + goto curlym_do_B; + + curlym_do_A: /* execute the A in /A{m,n}B/ */ + PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */ + assert(0); /* NOTREACHED */ + + case CURLYM_A: /* we've just matched an A */ + ST.count++; + /* after first match, determine A's length: u.curlym.alen */ + if (ST.count == 1) { + if (reginfo->is_utf8_target) { + char *s = st->locinput; + while (s < locinput) { + ST.alen++; + s += UTF8SKIP(s); + } + } + else { + ST.alen = locinput - st->locinput; + } + if (ST.alen == 0) + ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me); + } + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n", + (int)(REPORT_CODE_OFF+(depth*2)), "", + (IV) ST.count, (IV)ST.alen) + ); + + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.me->flags) + goto fake_end; + + { + I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)); + if ( max == REG_INFTY || ST.count < max ) + goto curlym_do_A; /* try to match another A */ + } + goto curlym_do_B; /* try to match B */ + + case CURLYM_A_fail: /* just failed to match an A */ + REGCP_UNWIND(ST.cp); + + if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ + || (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.me->flags)) + sayNO; + + curlym_do_B: /* execute the B in /A{m,n}B/ */ + if (ST.c1 == CHRTEST_UNINIT) { + /* calculate c1 and c2 for possible match of 1st char + * following curly */ + ST.c1 = ST.c2 = CHRTEST_VOID; + assert(ST.B); + if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { + regnode *text_node = ST.B; + if (! HAS_TEXT(text_node)) + FIND_NEXT_IMPT(text_node); + /* this used to be + + (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT) + + But the former is redundant in light of the latter. + + if this changes back then the macro for + IS_TEXT and friends need to change. + */ + if (PL_regkind[OP(text_node)] == EXACT) { + if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ + text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, + reginfo)) + { + sayNO; + } + } + } + } + + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM trying tail with matches=%"IVdf"...\n", + (int)(REPORT_CODE_OFF+(depth*2)), + "", (IV)ST.count) + ); + if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) { + if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) { + if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) + && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + { + /* simulate B failing */ + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", + (int)(REPORT_CODE_OFF+(depth*2)),"", + valid_utf8_to_uvchr((U8 *) locinput, NULL), + valid_utf8_to_uvchr(ST.c1_utf8, NULL), + valid_utf8_to_uvchr(ST.c2_utf8, NULL)) + ); + state_num = CURLYM_B_fail; + goto reenter_switch; + } + } + else if (nextchr != ST.c1 && nextchr != ST.c2) { + /* simulate B failing */ + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", + (int)(REPORT_CODE_OFF+(depth*2)),"", + (int) nextchr, ST.c1, ST.c2) + ); + state_num = CURLYM_B_fail; + goto reenter_switch; + } + } + + if (ST.me->flags) { + /* emulate CLOSE: mark current A as captured */ + I32 paren = ST.me->flags; + if (ST.count) { + rex->offs[paren].start + = HOPc(locinput, -ST.alen) - reginfo->strbeg; + rex->offs[paren].end = locinput - reginfo->strbeg; + if ((U32)paren > rex->lastparen) + rex->lastparen = paren; + rex->lastcloseparen = paren; + } + else + rex->offs[paren].end = -1; + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.me->flags) + { + if (ST.count) + goto fake_end; + else + sayNO; + } + } + + PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */ + assert(0); /* NOTREACHED */ + + case CURLYM_B_fail: /* just failed to match a B */ + REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + if (ST.minmod) { + I32 max = ARG2(ST.me); + if (max != REG_INFTY && ST.count == max) + sayNO; + goto curlym_do_A; /* try to match a further A */ + } + /* backtrack one A */ + if (ST.count == ARG1(ST.me) /* min */) + sayNO; + ST.count--; + SET_locinput(HOPc(locinput, -ST.alen)); + goto curlym_do_B; /* try to match B */ + +#undef ST +#define ST st->u.curly + +#define CURLY_SETPAREN(paren, success) \ + if (paren) { \ + if (success) { \ + rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \ + rex->offs[paren].end = locinput - reginfo->strbeg; \ + if (paren > rex->lastparen) \ + rex->lastparen = paren; \ + rex->lastcloseparen = paren; \ + } \ + else { \ + rex->offs[paren].end = -1; \ + rex->lastparen = ST.lastparen; \ + rex->lastcloseparen = ST.lastcloseparen; \ + } \ + } + + case STAR: /* /A*B/ where A is width 1 char */ + ST.paren = 0; + ST.min = 0; + ST.max = REG_INFTY; + scan = NEXTOPER(scan); + goto repeat; + + case PLUS: /* /A+B/ where A is width 1 char */ + ST.paren = 0; + ST.min = 1; + ST.max = REG_INFTY; + scan = NEXTOPER(scan); + goto repeat; + + case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ + ST.paren = scan->flags; /* Which paren to set */ + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + if (ST.paren > maxopenparen) + maxopenparen = ST.paren; + ST.min = ARG1(scan); /* min to match */ + ST.max = ARG2(scan); /* max to match */ + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + ST.min=1; + ST.max=1; + } + scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); + goto repeat; + + case CURLY: /* /A{m,n}B/ where A is width 1 char */ + ST.paren = 0; + ST.min = ARG1(scan); /* min to match */ + ST.max = ARG2(scan); /* max to match */ + scan = NEXTOPER(scan) + NODE_STEP_REGNODE; + repeat: + /* + * Lookahead to avoid useless match attempts + * when we know what character comes next. + * + * Used to only do .*x and .*?x, but now it allows + * for )'s, ('s and (?{ ... })'s to be in the way + * of the quantifier and the EXACT-like node. -- japhy + */ + + assert(ST.min <= ST.max); + if (! HAS_TEXT(next) && ! JUMPABLE(next)) { + ST.c1 = ST.c2 = CHRTEST_VOID; + } + else { + regnode *text_node = next; + + if (! HAS_TEXT(text_node)) + FIND_NEXT_IMPT(text_node); + + if (! HAS_TEXT(text_node)) + ST.c1 = ST.c2 = CHRTEST_VOID; + else { + if ( PL_regkind[OP(text_node)] != EXACT ) { + ST.c1 = ST.c2 = CHRTEST_VOID; + } + else { + + /* Currently we only get here when + + PL_rekind[OP(text_node)] == EXACT + + if this changes back then the macro for IS_TEXT and + friends need to change. */ + if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ + text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, + reginfo)) + { + sayNO; + } + } + } + } + + ST.A = scan; + ST.B = next; + if (minmod) { + char *li = locinput; + minmod = 0; + if (ST.min && + regrepeat(rex, &li, ST.A, reginfo, ST.min, depth) + < ST.min) + sayNO; + SET_locinput(li); + ST.count = ST.min; + REGCP_SET(ST.cp); + if (ST.c1 == CHRTEST_VOID) + goto curly_try_B_min; + + ST.oldloc = locinput; + + /* set ST.maxpos to the furthest point along the + * string that could possibly match */ + if (ST.max == REG_INFTY) { + ST.maxpos = reginfo->strend - 1; + if (utf8_target) + while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) + ST.maxpos--; + } + else if (utf8_target) { + int m = ST.max - ST.min; + for (ST.maxpos = locinput; + m >0 && ST.maxpos < reginfo->strend; m--) + ST.maxpos += UTF8SKIP(ST.maxpos); + } + else { + ST.maxpos = locinput + ST.max - ST.min; + if (ST.maxpos >= reginfo->strend) + ST.maxpos = reginfo->strend - 1; + } + goto curly_try_B_min_known; + + } + else { + /* avoid taking address of locinput, so it can remain + * a register var */ + char *li = locinput; + ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth); + if (ST.count < ST.min) + sayNO; + SET_locinput(li); + if ((ST.count > ST.min) + && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL)) + { + /* A{m,n} must come at the end of the string, there's + * no point in backing off ... */ + ST.min = ST.count; + /* ...except that $ and \Z can match before *and* after + newline at the end. Consider "\n\n" =~ /\n+\Z\n/. + We may back off by one in this case. */ + if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS) + ST.min--; + } + REGCP_SET(ST.cp); + goto curly_try_B_max; + } + assert(0); /* NOTREACHED */ + + + case CURLY_B_min_known_fail: + /* failed to find B in a non-greedy match where c1,c2 valid */ + + REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + /* Couldn't or didn't -- move forward. */ + ST.oldloc = locinput; + if (utf8_target) + locinput += UTF8SKIP(locinput); + else + locinput++; + ST.count++; + curly_try_B_min_known: + /* find the next place where 'B' could work, then call B */ + { + int n; + if (utf8_target) { + n = (ST.oldloc == locinput) ? 0 : 1; + if (ST.c1 == ST.c2) { + /* set n to utf8_distance(oldloc, locinput) */ + while (locinput <= ST.maxpos + && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))) + { + locinput += UTF8SKIP(locinput); + n++; + } + } + else { + /* set n to utf8_distance(oldloc, locinput) */ + while (locinput <= ST.maxpos + && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) + && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + { + locinput += UTF8SKIP(locinput); + n++; + } + } + } + else { /* Not utf8_target */ + if (ST.c1 == ST.c2) { + while (locinput <= ST.maxpos && + UCHARAT(locinput) != ST.c1) + locinput++; + } + else { + while (locinput <= ST.maxpos + && UCHARAT(locinput) != ST.c1 + && UCHARAT(locinput) != ST.c2) + locinput++; + } + n = locinput - ST.oldloc; + } + if (locinput > ST.maxpos) + sayNO; + if (n) { + /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is + * at b; check that everything between oldloc and + * locinput matches */ + char *li = ST.oldloc; + ST.count += n; + if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n) + sayNO; + assert(n == REG_INFTY || locinput == li); + } + CURLY_SETPAREN(ST.paren, ST.count); + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + goto fake_end; + } + PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput); + } + assert(0); /* NOTREACHED */ + + + case CURLY_B_min_fail: + /* failed to find B in a non-greedy match where c1,c2 invalid */ + + REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + /* failed -- move forward one */ + { + char *li = locinput; + if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) { + sayNO; + } + locinput = li; + } + { + ST.count++; + if (ST.count <= ST.max || (ST.max == REG_INFTY && + ST.count > 0)) /* count overflow ? */ + { + curly_try_B_min: + CURLY_SETPAREN(ST.paren, ST.count); + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + goto fake_end; + } + PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput); + } + } + sayNO; + assert(0); /* NOTREACHED */ + + + curly_try_B_max: + /* a successful greedy match: now try to match B */ + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + goto fake_end; + } + { + bool could_match = locinput < reginfo->strend; + + /* If it could work, try it. */ + if (ST.c1 != CHRTEST_VOID && could_match) { + if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) + { + could_match = memEQ(locinput, + ST.c1_utf8, + UTF8SKIP(locinput)) + || memEQ(locinput, + ST.c2_utf8, + UTF8SKIP(locinput)); + } + else { + could_match = UCHARAT(locinput) == ST.c1 + || UCHARAT(locinput) == ST.c2; + } + } + if (ST.c1 == CHRTEST_VOID || could_match) { + CURLY_SETPAREN(ST.paren, ST.count); + PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput); + assert(0); /* NOTREACHED */ + } + } + /* FALLTHROUGH */ + + case CURLY_B_max_fail: + /* failed to find B in a greedy match */ + + REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + /* back up. */ + if (--ST.count < ST.min) + sayNO; + locinput = HOPc(locinput, -1); + goto curly_try_B_max; + +#undef ST + + case END: /* last op of main pattern */ + fake_end: + if (cur_eval) { + /* we've just finished A in /(??{A})B/; now continue with B */ + + st->u.eval.prev_rex = rex_sv; /* inner */ + + /* Save *all* the positions. */ + st->u.eval.cp = regcppush(rex, 0, maxopenparen); + rex_sv = cur_eval->u.eval.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); + SET_reg_curpm(rex_sv); + rex = ReANY(rex_sv); + rexi = RXi_GET(rex); + cur_curlyx = cur_eval->u.eval.prev_curlyx; + + REGCP_SET(st->u.eval.lastcp); + + /* Restore parens of the outer rex without popping the + * savestack */ + S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp, + &maxopenparen); + + st->u.eval.prev_eval = cur_eval; + cur_eval = cur_eval->u.eval.prev_eval; + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n", + REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval));); + if ( nochange_depth ) + nochange_depth--; + + PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B, + locinput); /* match B */ + } + + if (locinput < reginfo->till) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n", + PL_colors[4], + (long)(locinput - startpos), + (long)(reginfo->till - startpos), + PL_colors[5])); + + sayNO_SILENT; /* Cannot match: too short. */ + } + sayYES; /* Success! */ + + case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %ssubpattern success...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])); + sayYES; /* Success! */ + +#undef ST +#define ST st->u.ifmatch + + { + char *newstart; + + case SUSPEND: /* (?>A) */ + ST.wanted = 1; + newstart = locinput; + goto do_ifmatch; + + case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?flags) { + char * const s = HOPBACKc(locinput, scan->flags); + if (!s) { + /* trivial fail */ + if (logical) { + logical = 0; + sw = 1 - cBOOL(ST.wanted); + } + else if (ST.wanted) + sayNO; + next = scan + ARG(scan); + if (next == scan) + next = NULL; + break; + } + newstart = s; + } + else + newstart = locinput; + + do_ifmatch: + ST.me = scan; + ST.logical = logical; + logical = 0; /* XXX: reset state of logical once it has been saved into ST */ + + /* execute body of (?...A) */ + PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart); + assert(0); /* NOTREACHED */ + } + + case IFMATCH_A_fail: /* body of (?...A) failed */ + ST.wanted = !ST.wanted; + /* FALLTHROUGH */ + + case IFMATCH_A: /* body of (?...A) succeeded */ + if (ST.logical) { + sw = cBOOL(ST.wanted); + } + else if (!ST.wanted) + sayNO; + + if (OP(ST.me) != SUSPEND) { + /* restore old position except for (?>...) */ + locinput = st->locinput; + } + scan = ST.me + ARG(ST.me); + if (scan == ST.me) + scan = NULL; + continue; /* execute B */ + +#undef ST + + case LONGJMP: /* alternative with many branches compiles to + * (BRANCHJ; EXACT ...; LONGJMP ) x N */ + next = scan + ARG(scan); + if (next == scan) + next = NULL; + break; + + case COMMIT: /* (*COMMIT) */ + reginfo->cutpoint = reginfo->strend; + /* FALLTHROUGH */ + + case PRUNE: /* (*PRUNE) */ + if (!scan->flags) + sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + PUSH_STATE_GOTO(COMMIT_next, next, locinput); + assert(0); /* NOTREACHED */ + + case COMMIT_next_fail: + no_final = 1; + /* FALLTHROUGH */ + + case OPFAIL: /* (*FAIL) */ + sayNO; + assert(0); /* NOTREACHED */ + +#define ST st->u.mark + case MARKPOINT: /* (*MARK:foo) */ + ST.prev_mark = mark_state; + ST.mark_name = sv_commit = sv_yes_mark + = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + mark_state = st; + ST.mark_loc = locinput; + PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput); + assert(0); /* NOTREACHED */ + + case MARKPOINT_next: + mark_state = ST.prev_mark; + sayYES; + assert(0); /* NOTREACHED */ + + case MARKPOINT_next_fail: + if (popmark && sv_eq(ST.mark_name,popmark)) + { + if (ST.mark_loc > startpoint) + reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); + popmark = NULL; /* we found our mark */ + sv_commit = ST.mark_name; + + DEBUG_EXECUTE_r({ + PerlIO_printf(Perl_debug_log, + "%*s %ssetting cutpoint to mark:%"SVf"...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], SVfARG(sv_commit), PL_colors[5]); + }); + } + mark_state = ST.prev_mark; + sv_yes_mark = mark_state ? + mark_state->u.mark.mark_name : NULL; + sayNO; + assert(0); /* NOTREACHED */ + + case SKIP: /* (*SKIP) */ + if (scan->flags) { + /* (*SKIP) : if we fail we cut here*/ + ST.mark_name = NULL; + ST.mark_loc = locinput; + PUSH_STATE_GOTO(SKIP_next,next, locinput); + } else { + /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, + otherwise do nothing. Meaning we need to scan + */ + regmatch_state *cur = mark_state; + SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + + while (cur) { + if ( sv_eq( cur->u.mark.mark_name, + find ) ) + { + ST.mark_name = find; + PUSH_STATE_GOTO( SKIP_next, next, locinput); + } + cur = cur->u.mark.prev_mark; + } + } + /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ + break; + + case SKIP_next_fail: + if (ST.mark_name) { + /* (*CUT:NAME) - Set up to search for the name as we + collapse the stack*/ + popmark = ST.mark_name; + } else { + /* (*CUT) - No name, we cut here.*/ + if (ST.mark_loc > startpoint) + reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); + /* but we set sv_commit to latest mark_name if there + is one so they can test to see how things lead to this + cut */ + if (mark_state) + sv_commit=mark_state->u.mark.mark_name; + } + no_final = 1; + sayNO; + assert(0); /* NOTREACHED */ +#undef ST + + case LNBREAK: /* \R */ + if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) { + locinput += n; + } else + sayNO; + break; + + default: + PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", + PTR2UV(scan), OP(scan)); + Perl_croak(aTHX_ "regexp memory corruption"); + + /* this is a point to jump to in order to increment + * locinput by one character */ + increment_locinput: + assert(!NEXTCHR_IS_EOS); + if (utf8_target) { + locinput += PL_utf8skip[nextchr]; + /* locinput is allowed to go 1 char off the end, but not 2+ */ + if (locinput > reginfo->strend) + sayNO; + } + else + locinput++; + break; + + } /* end switch */ + + /* switch break jumps here */ + scan = next; /* prepare to execute the next op and ... */ + continue; /* ... jump back to the top, reusing st */ + assert(0); /* NOTREACHED */ + + push_yes_state: + /* push a state that backtracks on success */ + st->u.yes.prev_yes_state = yes_state; + yes_state = st; + /* FALLTHROUGH */ + push_state: + /* push a new regex state, then continue at scan */ + { + regmatch_state *newst; + + DEBUG_STACK_r({ + regmatch_state *cur = st; + regmatch_state *curyes = yes_state; + int curd = depth; + regmatch_slab *slab = PL_regmatch_slab; + for (;curd > -1;cur--,curd--) { + if (cur < SLAB_FIRST(slab)) { + slab = slab->prev; + cur = SLAB_LAST(slab); + } + PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n", + REPORT_CODE_OFF + 2 + depth * 2,"", + curd, PL_reg_name[cur->resume_state], + (curyes == cur) ? "yes" : "" + ); + if (curyes == cur) + curyes = cur->u.yes.prev_yes_state; + } + } else + DEBUG_STATE_pp("push") + ); + depth++; + st->locinput = locinput; + newst = st+1; + if (newst > SLAB_LAST(PL_regmatch_slab)) + newst = S_push_slab(aTHX); + PL_regmatch_state = newst; + + locinput = pushinput; + st = newst; + continue; + assert(0); /* NOTREACHED */ + } + } + + /* + * We get here only if there's trouble -- normally "case END" is + * the terminating point. + */ + Perl_croak(aTHX_ "corrupted regexp pointers"); + /*NOTREACHED*/ + sayNO; + +yes: + if (yes_state) { + /* we have successfully completed a subexpression, but we must now + * pop to the state marked by yes_state and continue from there */ + assert(st != yes_state); +#ifdef DEBUGGING + while (st != yes_state) { + st--; + if (st < SLAB_FIRST(PL_regmatch_slab)) { + PL_regmatch_slab = PL_regmatch_slab->prev; + st = SLAB_LAST(PL_regmatch_slab); + } + DEBUG_STATE_r({ + if (no_final) { + DEBUG_STATE_pp("pop (no final)"); + } else { + DEBUG_STATE_pp("pop (yes)"); + } + }); + depth--; + } +#else + while (yes_state < SLAB_FIRST(PL_regmatch_slab) + || yes_state > SLAB_LAST(PL_regmatch_slab)) + { + /* not in this slab, pop slab */ + depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1); + PL_regmatch_slab = PL_regmatch_slab->prev; + st = SLAB_LAST(PL_regmatch_slab); + } + depth -= (st - yes_state); +#endif + st = yes_state; + yes_state = st->u.yes.prev_yes_state; + PL_regmatch_state = st; + + if (no_final) + locinput= st->locinput; + state_num = st->resume_state + no_final; + goto reenter_switch; + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", + PL_colors[4], PL_colors[5])); + + if (reginfo->info_aux_eval) { + /* each successfully executed (?{...}) block does the equivalent of + * local $^R = do {...} + * When popping the save stack, all these locals would be undone; + * bypass this by setting the outermost saved $^R to the latest + * value */ + /* I dont know if this is needed or works properly now. + * see code related to PL_replgv elsewhere in this file. + * Yves + */ + if (oreplsv != GvSV(PL_replgv)) + sv_setsv(oreplsv, GvSV(PL_replgv)); + } + result = 1; + goto final_exit; + +no: + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], PL_colors[5]) + ); + +no_silent: + if (no_final) { + if (yes_state) { + goto yes; + } else { + goto final_exit; + } + } + if (depth) { + /* there's a previous state to backtrack to */ + st--; + if (st < SLAB_FIRST(PL_regmatch_slab)) { + PL_regmatch_slab = PL_regmatch_slab->prev; + st = SLAB_LAST(PL_regmatch_slab); + } + PL_regmatch_state = st; + locinput= st->locinput; + + DEBUG_STATE_pp("pop"); + depth--; + if (yes_state == st) + yes_state = st->u.yes.prev_yes_state; + + state_num = st->resume_state + 1; /* failure = success + 1 */ + goto reenter_switch; + } + result = 0; + + final_exit: + if (rex->intflags & PREGf_VERBARG_SEEN) { + SV *sv_err = get_sv("REGERROR", 1); + SV *sv_mrk = get_sv("REGMARK", 1); + if (result) { + sv_commit = &PL_sv_no; + if (!sv_yes_mark) + sv_yes_mark = &PL_sv_yes; + } else { + if (!sv_commit) + sv_commit = &PL_sv_yes; + sv_yes_mark = &PL_sv_no; + } + assert(sv_err); + assert(sv_mrk); + sv_setsv(sv_err, sv_commit); + sv_setsv(sv_mrk, sv_yes_mark); + } + + + if (last_pushed_cv) { + dSP; + POP_MULTICALL; + PERL_UNUSED_VAR(SP); + } + + assert(!result || locinput - reginfo->strbeg >= 0); + return result ? locinput - reginfo->strbeg : -1; +} + +/* + - regrepeat - repeatedly match something simple, report how many + * + * What 'simple' means is a node which can be the operand of a quantifier like + * '+', or {1,3} + * + * startposp - pointer a pointer to the start position. This is updated + * to point to the byte following the highest successful + * match. + * p - the regnode to be repeatedly matched against. + * reginfo - struct holding match state, such as strend + * max - maximum number of things to match. + * depth - (for debugging) backtracking depth. + */ +STATIC I32 +S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, + regmatch_info *const reginfo, I32 max, int depth) +{ + char *scan; /* Pointer to current position in target string */ + I32 c; + char *loceol = reginfo->strend; /* local version */ + I32 hardcount = 0; /* How many matches so far */ + bool utf8_target = reginfo->is_utf8_target; + int to_complement = 0; /* Invert the result? */ + UV utf8_flags; + _char_class_number classnum; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + PERL_ARGS_ASSERT_REGREPEAT; + + scan = *startposp; + if (max == REG_INFTY) + max = I32_MAX; + else if (! utf8_target && loceol - scan > max) + loceol = scan + max; + + /* Here, for the case of a non-UTF-8 target we have adjusted down + * to the maximum of how far we should go in it (leaving it set to the real + * end, if the maximum permissible would take us beyond that). This allows + * us to make the loop exit condition that we haven't gone past to + * also mean that we haven't exceeded the max permissible count, saving a + * test each time through the loop. But it assumes that the OP matches a + * single byte, which is true for most of the OPs below when applied to a + * non-UTF-8 target. Those relatively few OPs that don't have this + * characteristic will have to compensate. + * + * There is no adjustment for UTF-8 targets, as the number of bytes per + * character varies. OPs will have to test both that the count is less + * than the max permissible (using to keep track), and that we + * are still within the bounds of the string (using . A few OPs + * match a single byte no matter what the encoding. They can omit the max + * test if, for the UTF-8 case, they do the adjustment that was skipped + * above. + * + * Thus, the code above sets things up for the common case; and exceptional + * cases need extra work; the common case is to make sure doesn't + * go past , and for UTF-8 to also use to make sure the + * count doesn't exceed the maximum permissible */ + + switch (OP(p)) { + case REG_ANY: + if (utf8_target) { + while (scan < loceol && hardcount < max && *scan != '\n') { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && *scan != '\n') + scan++; + } + break; + case SANY: + if (utf8_target) { + while (scan < loceol && hardcount < max) { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else + scan = loceol; + break; + case CANY: /* Move forward bytes, unless goes off end */ + if (utf8_target && loceol - scan > max) { + + /* hadn't been adjusted in the UTF-8 case */ + scan += max; + } + else { + scan = loceol; + } + break; + case EXACT: + assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + + c = (U8)*STRING(p); + + /* Can use a simple loop if the pattern char to match on is invariant + * under UTF-8, or both target and pattern aren't UTF-8. Note that we + * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's + * true iff it doesn't matter if the argument is in UTF-8 or not */ + if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) { + if (utf8_target && loceol - scan > max) { + /* We didn't adjust because is UTF-8, but ok to do so, + * since here, to match at all, 1 char == 1 byte */ + loceol = scan + max; + } + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } + else if (reginfo->is_utf8_pat) { + if (utf8_target) { + STRLEN scan_char_len; + + /* When both target and pattern are UTF-8, we have to do + * string EQ */ + while (hardcount < max + && scan < loceol + && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p) + && memEQ(scan, STRING(p), scan_char_len)) + { + scan += scan_char_len; + hardcount++; + } + } + else if (! UTF8_IS_ABOVE_LATIN1(c)) { + + /* Target isn't utf8; convert the character in the UTF-8 + * pattern to non-UTF8, and do a simple loop */ + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } /* else pattern char is above Latin1, can't possibly match the + non-UTF-8 target */ + } + else { + + /* Here, the string must be utf8; pattern isn't, and is + * different in utf8 than not, so can't compare them directly. + * Outside the loop, find the two utf8 bytes that represent c, and + * then look for those in sequence in the utf8 string */ + U8 high = UTF8_TWO_BYTE_HI(c); + U8 low = UTF8_TWO_BYTE_LO(c); + + while (hardcount < max + && scan + 1 < loceol + && UCHARAT(scan) == high + && UCHARAT(scan + 1) == low) + { + scan += 2; + hardcount++; + } + } + break; + + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf; + + case EXACTFL: + utf8_flags = FOLDEQ_LOCALE; + goto do_exactf; + + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + utf8_flags = 0; + goto do_exactf; + + case EXACTFU_SS: + case EXACTFU: + utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; + + do_exactf: { + int c1, c2; + U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; + + assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + + if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8, + reginfo)) + { + if (c1 == CHRTEST_VOID) { + /* Use full Unicode fold matching */ + char *tmpeol = reginfo->strend; + STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1; + while (hardcount < max + && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, + STRING(p), NULL, pat_len, + reginfo->is_utf8_pat, utf8_flags)) + { + scan = tmpeol; + tmpeol = reginfo->strend; + hardcount++; + } + } + else if (utf8_target) { + if (c1 == c2) { + while (scan < loceol + && hardcount < max + && memEQ(scan, c1_utf8, UTF8SKIP(scan))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + while (scan < loceol + && hardcount < max + && (memEQ(scan, c1_utf8, UTF8SKIP(scan)) + || memEQ(scan, c2_utf8, UTF8SKIP(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + } + else if (c1 == c2) { + while (scan < loceol && UCHARAT(scan) == c1) { + scan++; + } + } + else { + while (scan < loceol && + (UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) + { + scan++; + } + } + } + break; + } + case ANYOF: + if (utf8_target) { + while (hardcount < max + && scan < loceol + && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && REGINCLASS(prog, p, (U8*)scan)) + scan++; + } + break; + + /* The argument (FLAGS) to all the POSIX node types is the class number */ + + case NPOSIXL: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: + if (! utf8_target) { + while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), + *scan))) + { + scan++; + } + } else { + while (hardcount < max && scan < loceol + && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p), + (U8 *) scan))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case POSIXD: + if (utf8_target) { + goto utf8_posix; + } + /* FALLTHROUGH */ + + case POSIXA: + if (utf8_target && loceol - scan > max) { + + /* We didn't adjust at the beginning of this routine + * because is UTF-8, but it is actually ok to do so, since here, to + * match, 1 char == 1 byte. */ + loceol = scan + max; + } + while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + break; + + case NPOSIXD: + if (utf8_target) { + to_complement = 1; + goto utf8_posix; + } + /* FALLTHROUGH */ + + case NPOSIXA: + if (! utf8_target) { + while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + } + else { + + /* The complement of something that matches only ASCII matches all + * non-ASCII, plus everything in ASCII that isn't in the class. */ + while (hardcount < max && scan < loceol + && (! isASCII_utf8(scan) + || ! _generic_isCC_A((U8) *scan, FLAGS(p)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case NPOSIXU: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: + if (! utf8_target) { + while (scan < loceol && to_complement + ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p)))) + { + scan++; + } + } + else { + utf8_posix: + classnum = (_char_class_number) FLAGS(p); + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Here, a swash is needed for above-Latin1 code points. + * Process as many Latin1 code points using the built-in rules. + * Go to another loop to finish processing upon encountering + * the first Latin1 code point. We could do that in this loop + * as well, but the other way saves having to test if the swash + * has been loaded every time through the loop: extra space to + * save a test. */ + while (hardcount < max && scan < loceol) { + if (UTF8_IS_INVARIANT(*scan)) { + if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan, + classnum)))) + { + break; + } + scan++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { + if (! (to_complement + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), + classnum)))) + { + break; + } + scan += 2; + } + else { + goto found_above_latin1; + } + + hardcount++; + } + } + else { + /* For these character classes, the knowledge of how to handle + * every code point is compiled in to Perl via a macro. This + * code is written for making the loops as tight as possible. + * It could be refactored to save space instead */ + switch (classnum) { + case _CC_ENUM_SPACE: /* XXX would require separate code + if we revert the change of \v + matching this */ + /* FALLTHROUGH */ + case _CC_ENUM_PSXSPC: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isSPACE_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_BLANK: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isBLANK_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_XDIGIT: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isXDIGIT_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_VERTSPACE: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isVERTWS_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_CNTRL: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isCNTRL_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + default: + Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum); + } + } + } + break; + + found_above_latin1: /* Continuation of POSIXU and NPOSIXU */ + + /* Load the swash if not already present */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = _core_swash_init( + "utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + + while (hardcount < max && scan < loceol + && to_complement ^ cBOOL(_generic_utf8( + classnum, + scan, + swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) scan, + TRUE)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + + case LNBREAK: + if (utf8_target) { + while (hardcount < max && scan < loceol && + (c=is_LNBREAK_utf8_safe(scan, loceol))) { + scan += c; + hardcount++; + } + } else { + /* LNBREAK can match one or two latin chars, which is ok, but we + * have to use hardcount in this situation, and throw away the + * adjustment to done before the switch statement */ + loceol = reginfo->strend; + while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) { + scan+=c; + hardcount++; + } + } + break; + + case BOUND: + case BOUNDA: + case BOUNDL: + case BOUNDU: + case EOS: + case GPOS: + case KEEPS: + case NBOUND: + case NBOUNDA: + case NBOUNDL: + case NBOUNDU: + case OPFAIL: + case SBOL: + case SEOL: + /* These are all 0 width, so match right here or not at all. */ + break; + + default: + Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]); + assert(0); /* NOTREACHED */ + + } + + if (hardcount) + c = hardcount; + else + c = scan - *startposp; + *startposp = scan; + + DEBUG_r({ + GET_RE_DEBUG_FLAGS_DECL; + DEBUG_EXECUTE_r({ + SV * const prop = sv_newmortal(); + regprop(prog, prop, p, reginfo); + PerlIO_printf(Perl_debug_log, + "%*s %s can match %"IVdf" times out of %"IVdf"...\n", + REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); + }); + }); + + return(c); +} + + +#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) +/* +- regclass_swash - prepare the utf8 swash. Wraps the shared core version to +create a copy so that changes the caller makes won't change the shared one. +If is non-null, will return NULL in it, for back-compat. + */ +SV * +Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp) +{ + PERL_ARGS_ASSERT_REGCLASS_SWASH; + + if (altsvp) { + *altsvp = NULL; + } + + return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL)); +} + +SV * +Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, + const regnode* node, + bool doinit, + SV** listsvp, + SV** only_utf8_locale_ptr) +{ + /* For internal core use only. + * Returns the swash for the input 'node' in the regex 'prog'. + * If is 'true', will attempt to create the swash if not already + * done. + * If is non-null, will return the printable contents of the + * swash. This can be used to get debugging information even before the + * swash exists, by calling this function with 'doinit' set to false, in + * which case the components that will be used to eventually create the + * swash are returned (in a printable form). + * Tied intimately to how regcomp.c sets up the data structure */ + + SV *sw = NULL; + SV *si = NULL; /* Input swash initialization string */ + SV* invlist = NULL; + + RXi_GET_DECL(prog,progi); + const struct reg_data * const data = prog ? progi->data : NULL; + + PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; + + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + if (data && data->count) { + const U32 n = ARG(node); + + if (data->what[n] == 's') { + SV * const rv = MUTABLE_SV(data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + + si = *ary; /* ary[0] = the string to initialize the swash with */ + + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if + * that inversion list has any user-defined properties in it. */ + if (av_tindex(av) >= 2) { + if (only_utf8_locale_ptr + && ary[2] + && ary[2] != &PL_sv_undef) + { + *only_utf8_locale_ptr = ary[2]; + } + else { + assert(only_utf8_locale_ptr); + *only_utf8_locale_ptr = NULL; + } + + if (av_tindex(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } + } + else { + invlist = NULL; + } + } + + /* Element [1] is reserved for the set-up swash. If already there, + * return it; if not, create it and store it there */ + if (ary[1] && SvROK(ary[1])) { + sw = ary[1]; + } + else if (doinit && ((si && si != &PL_sv_undef) + || (invlist && invlist != &PL_sv_undef))) { + assert(si); + sw = _core_swash_init("utf8", /* the utf8 package */ + "", /* nameless */ + si, + 1, /* binary */ + 0, /* not from tr/// */ + invlist, + &swash_init_flags); + (void)av_store(av, 1, sw); + } + } + } + + /* If requested, return a printable version of what this swash matches */ + if (listsvp) { + SV* matches_string = newSVpvs(""); + + /* The swash should be used, if possible, to get the data, as it + * contains the resolved data. But this function can be called at + * compile-time, before everything gets resolved, in which case we + * return the currently best available information, which is the string + * that will eventually be used to do that resolving, 'si' */ + if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) + && (si && si != &PL_sv_undef)) + { + sv_catsv(matches_string, si); + } + + /* Add the inversion list to whatever we have. This may have come from + * the swash, or from an input parameter */ + if (invlist) { + sv_catsv(matches_string, _invlist_contents(invlist)); + } + *listsvp = matches_string; + } + + return sw; +} +#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ + +/* + - reginclass - determine if a character falls into a character class + + n is the ANYOF regnode + p is the target string + p_end points to one byte beyond the end of the target string + utf8_target tells whether p is in UTF-8. + + Returns true if matched; false otherwise. + + Note that this can be a synthetic start class, a combination of various + nodes, so things you think might be mutually exclusive, such as locale, + aren't. It can match both locale and non-locale + + */ + +STATIC bool +S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) +{ + dVAR; + const char flags = ANYOF_FLAGS(n); + bool match = FALSE; + UV c = *p; + + PERL_ARGS_ASSERT_REGINCLASS; + + /* If c is not already the code point, get it. Note that + * UTF8_IS_INVARIANT() works even if not in UTF-8 */ + if (! UTF8_IS_INVARIANT(c) && utf8_target) { + STRLEN c_len = 0; + c = utf8n_to_uvchr(p, p_end - p, &c_len, + (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) + | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); + /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for + * UTF8_ALLOW_FFFF */ + if (c_len == (STRLEN)-1) + Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); + } + + /* If this character is potentially in the bitmap, check it */ + if (c < 256) { + if (ANYOF_BITMAP_TEST(n, c)) + match = TRUE; + else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL + && ! utf8_target + && ! isASCII(c)) + { + match = TRUE; + } + else if (flags & ANYOF_LOCALE_FLAGS) { + if (flags & ANYOF_LOC_FOLD) { + if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { + match = TRUE; + } + } + if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { + + /* The data structure is arranged so bits 0, 2, 4, ... are set + * if the class includes the Posix character class given by + * bit/2; and 1, 3, 5, ... are set if the class includes the + * complemented Posix class given by int(bit/2). So we loop + * through the bits, each time changing whether we complement + * the result or not. Suppose for the sake of illustration + * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0 + * is set, it means there is a match for this ANYOF node if the + * character is in the class given by the expression (0 / 2 = 0 + * = \w). If it is in that class, isFOO_lc() will return 1, + * and since 'to_complement' is 0, the result will stay TRUE, + * and we exit the loop. Suppose instead that bit 0 is 0, but + * bit 1 is 1. That means there is a match if the character + * matches \W. We won't bother to call isFOO_lc() on bit 0, + * but will on bit 1. On the second iteration 'to_complement' + * will be 1, so the exclusive or will reverse things, so we + * are testing for \W. On the third iteration, 'to_complement' + * will be 0, and we would be testing for \s; the fourth + * iteration would test for \S, etc. + * + * Note that this code assumes that all the classes are closed + * under folding. For example, if a character matches \w, then + * its fold does too; and vice versa. This should be true for + * any well-behaved locale for all the currently defined Posix + * classes, except for :lower: and :upper:, which are handled + * by the pseudo-class :cased: which matches if either of the + * other two does. To get rid of this assumption, an outer + * loop could be used below to iterate over both the source + * character, and its fold (if different) */ + + int count = 0; + int to_complement = 0; + + while (count < ANYOF_MAX) { + if (ANYOF_POSIXL_TEST(n, count) + && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) + { + match = TRUE; + break; + } + count++; + to_complement ^= 1; + } + } + } + } + + + /* If the bitmap didn't (or couldn't) match, and something outside the + * bitmap could match, try that. */ + if (!match) { + if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { + match = TRUE; /* Everything above 255 matches */ + } + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && (flags & ANYOF_UTF8)) + || ((flags & ANYOF_LOC_FOLD) + && IN_UTF8_CTYPE_LOCALE + && ARG(n) != ANYOF_NONBITMAP_EMPTY)) + { + SV* only_utf8_locale = NULL; + SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale); + if (sw) { + U8 utf8_buffer[2]; + U8 * utf8_p; + if (utf8_target) { + utf8_p = (U8 *) p; + } else { /* Convert to utf8 */ + utf8_p = utf8_buffer; + append_utf8_from_native_byte(*p, &utf8_p); + utf8_p = utf8_buffer; + } + + if (swash_fetch(sw, utf8_p, TRUE)) { + match = TRUE; + } + } + if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { + match = _invlist_contains_cp(only_utf8_locale, c); + } + } + + if (UNICODE_IS_SUPER(c) + && (flags & ANYOF_WARN_SUPER) + && ckWARN_d(WARN_NON_UNICODE)) + { + Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), + "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c); + } + } + +#if ANYOF_INVERT != 1 + /* Depending on compiler optimization cBOOL takes time, so if don't have to + * use it, don't */ +# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, +#endif + + /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ + return (flags & ANYOF_INVERT) ^ match; +} + +STATIC U8 * +S_reghop3(U8 *s, SSize_t off, const U8* lim) +{ + /* return the position 'off' UTF-8 characters away from 's', forward if + * 'off' >= 0, backwards if negative. But don't go outside of position + * 'lim', which better be < s if off < 0 */ + + PERL_ARGS_ASSERT_REGHOP3; + + if (off >= 0) { + while (off-- && s < lim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + } + else { + while (off++ && s > lim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > lim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + } + return s; +} + +STATIC U8 * +S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) +{ + PERL_ARGS_ASSERT_REGHOP4; + + if (off >= 0) { + while (off-- && s < rlim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + } + else { + while (off++ && s > llim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > llim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + } + return s; +} + +/* like reghop3, but returns NULL on overrun, rather than returning last + * char pos */ + +STATIC U8 * +S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) +{ + PERL_ARGS_ASSERT_REGHOPMAYBE3; + + if (off >= 0) { + while (off-- && s < lim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + if (off >= 0) + return NULL; + } + else { + while (off++ && s > lim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > lim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + if (off <= 0) + return NULL; + } + return s; +} + + +/* when executing a regex that may have (?{}), extra stuff needs setting + up that will be visible to the called code, even before the current + match has finished. In particular: + + * $_ is localised to the SV currently being matched; + * pos($_) is created if necessary, ready to be updated on each call-out + to code; + * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm + isn't set until the current pattern is successfully finished), so that + $1 etc of the match-so-far can be seen; + * save the old values of subbeg etc of the current regex, and set then + to the current string (again, this is normally only done at the end + of execution) +*/ + +static void +S_setup_eval_state(pTHX_ regmatch_info *const reginfo) +{ + MAGIC *mg; + regexp *const rex = ReANY(reginfo->prog); + regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; + + eval_state->rex = rex; + + if (reginfo->sv) { + /* Make $_ available to executed code. */ + if (reginfo->sv != DEFSV) { + SAVE_DEFSV; + DEFSV_set(reginfo->sv); + } + + if (!(mg = mg_find_mglob(reginfo->sv))) { + /* prepare for quick setting of pos */ + mg = sv_magicext_mglob(reginfo->sv); + mg->mg_len = -1; + } + eval_state->pos_magic = mg; + eval_state->pos = mg->mg_len; + eval_state->pos_flags = mg->mg_flags; + } + else + eval_state->pos_magic = NULL; + + if (!PL_reg_curpm) { + /* PL_reg_curpm is a fake PMOP that we can attach the current + * regex to and point PL_curpm at, so that $1 et al are visible + * within a /(?{})/. It's just allocated once per interpreter the + * first time its needed */ + Newxz(PL_reg_curpm, 1, PMOP); +#ifdef USE_ITHREADS + { + SV* const repointer = &PL_sv_undef; + /* this regexp is also owned by the new PL_reg_curpm, which + will try to free it. */ + av_push(PL_regex_padav, repointer); + PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); + PL_regex_pad = AvARRAY(PL_regex_padav); + } +#endif + } + SET_reg_curpm(reginfo->prog); + eval_state->curpm = PL_curpm; + PL_curpm = PL_reg_curpm; + if (RXp_MATCH_COPIED(rex)) { + /* Here is a serious problem: we cannot rewrite subbeg, + since it may be needed if this match fails. Thus + $` inside (?{}) could fail... */ + eval_state->subbeg = rex->subbeg; + eval_state->sublen = rex->sublen; + eval_state->suboffset = rex->suboffset; + eval_state->subcoffset = rex->subcoffset; +#ifdef PERL_ANY_COW + eval_state->saved_copy = rex->saved_copy; +#endif + RXp_MATCH_COPIED_off(rex); + } + else + eval_state->subbeg = NULL; + rex->subbeg = (char *)reginfo->strbeg; + rex->suboffset = 0; + rex->subcoffset = 0; + rex->sublen = reginfo->strend - reginfo->strbeg; +} + + +/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */ + +static void +S_cleanup_regmatch_info_aux(pTHX_ void *arg) +{ + regmatch_info_aux *aux = (regmatch_info_aux *) arg; + regmatch_info_aux_eval *eval_state = aux->info_aux_eval; + regmatch_slab *s; + + Safefree(aux->poscache); + + if (eval_state) { + + /* undo the effects of S_setup_eval_state() */ + + if (eval_state->subbeg) { + regexp * const rex = eval_state->rex; + rex->subbeg = eval_state->subbeg; + rex->sublen = eval_state->sublen; + rex->suboffset = eval_state->suboffset; + rex->subcoffset = eval_state->subcoffset; +#ifdef PERL_ANY_COW + rex->saved_copy = eval_state->saved_copy; +#endif + RXp_MATCH_COPIED_on(rex); + } + if (eval_state->pos_magic) + { + eval_state->pos_magic->mg_len = eval_state->pos; + eval_state->pos_magic->mg_flags = + (eval_state->pos_magic->mg_flags & ~MGf_BYTES) + | (eval_state->pos_flags & MGf_BYTES); + } + + PL_curpm = eval_state->curpm; + } + + PL_regmatch_state = aux->old_regmatch_state; + PL_regmatch_slab = aux->old_regmatch_slab; + + /* free all slabs above current one - this must be the last action + * of this function, as aux and eval_state are allocated within + * slabs and may be freed here */ + + s = PL_regmatch_slab->next; + if (s) { + PL_regmatch_slab->next = NULL; + while (s) { + regmatch_slab * const osl = s; + s = s->next; + Safefree(osl); + } + } +} + + +STATIC void +S_to_utf8_substr(pTHX_ regexp *prog) +{ + /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile + * on the converted value */ + + int i = 1; + + PERL_ARGS_ASSERT_TO_UTF8_SUBSTR; + + do { + if (prog->substrs->data[i].substr + && !prog->substrs->data[i].utf8_substr) { + SV* const sv = newSVsv(prog->substrs->data[i].substr); + prog->substrs->data[i].utf8_substr = sv; + sv_utf8_upgrade(sv); + if (SvVALID(prog->substrs->data[i].substr)) { + if (SvTAIL(prog->substrs->data[i].substr)) { + /* Trim the trailing \n that fbm_compile added last + time. */ + SvCUR_set(sv, SvCUR(sv) - 1); + /* Whilst this makes the SV technically "invalid" (as its + buffer is no longer followed by "\0") when fbm_compile() + adds the "\n" back, a "\0" is restored. */ + fbm_compile(sv, FBMcf_TAIL); + } else + fbm_compile(sv, 0); + } + if (prog->substrs->data[i].substr == prog->check_substr) + prog->check_utf8 = sv; + } + } while (i--); +} + +STATIC bool +S_to_byte_substr(pTHX_ regexp *prog) +{ + /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile + * on the converted value; returns FALSE if can't be converted. */ + + int i = 1; + + PERL_ARGS_ASSERT_TO_BYTE_SUBSTR; + + do { + if (prog->substrs->data[i].utf8_substr + && !prog->substrs->data[i].substr) { + SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); + if (! sv_utf8_downgrade(sv, TRUE)) { + return FALSE; + } + if (SvVALID(prog->substrs->data[i].utf8_substr)) { + if (SvTAIL(prog->substrs->data[i].utf8_substr)) { + /* Trim the trailing \n that fbm_compile added last + time. */ + SvCUR_set(sv, SvCUR(sv) - 1); + fbm_compile(sv, FBMcf_TAIL); + } else + fbm_compile(sv, 0); + } + prog->substrs->data[i].substr = sv; + if (prog->substrs->data[i].utf8_substr == prog->check_utf8) + prog->check_substr = sv; + } + } while (i--); + + return TRUE; +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * ex: set ts=8 sts=4 sw=4 et: + */ diff --git a/src/5021003/dquote_static.c b/src/5021003/dquote_static.c new file mode 100644 index 0000000..dd47f14 --- /dev/null +++ b/src/5021003/dquote_static.c @@ -0,0 +1,328 @@ +/* dquote_static.c + * + * This file contains static functions that are related to + * parsing double-quotish expressions, but are used in more than + * one file. + * + * It is currently #included by regcomp.c and toke.c. +*/ + +#define PERL_IN_DQUOTE_STATIC_C +#include "embed.h" + +/* + - regcurly - a little FSA that accepts {\d+,?\d*} + Pulled from regcomp.c. + */ +PERL_STATIC_INLINE I32 +S_regcurly(const char *s) +{ + PERL_ARGS_ASSERT_REGCURLY; + + if (*s++ != '{') + return FALSE; + if (!isDIGIT(*s)) + return FALSE; + while (isDIGIT(*s)) + s++; + if (*s == ',') { + s++; + while (isDIGIT(*s)) + s++; + } + + return *s == '}'; +} + +/* XXX Add documentation after final interface and behavior is decided */ +/* May want to show context for error, so would pass Perl_bslash_c(pTHX_ const char* current, const char* start, const bool output_warning) + U8 source = *current; +*/ + +STATIC char +S_grok_bslash_c(pTHX_ const char source, const bool output_warning) +{ + + U8 result; + + if (! isPRINT_A(source)) { + Perl_croak(aTHX_ "%s", + "Character following \"\\c\" must be printable ASCII"); + } + else if (source == '{') { + assert(isPRINT_A(toCTRL('{'))); + + /* diag_listed_as: Use "%s" instead of "%s" */ + Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", toCTRL('{')); + } + + result = toCTRL(source); + if (output_warning && isPRINT_A(result)) { + U8 clearer[3]; + U8 i = 0; + if (! isWORDCHAR(result)) { + clearer[i++] = '\\'; + } + clearer[i++] = result; + clearer[i++] = '\0'; + + Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), + "\"\\c%c\" is more clearly written simply as \"%s\"", + source, + clearer); + } + + return result; +} + +STATIC bool +S_grok_bslash_o(pTHX_ char **s, UV *uv, const char** error_msg, + const bool output_warning, const bool strict, + const bool silence_non_portable, + const bool UTF) +{ + +/* Documentation to be supplied when interface nailed down finally + * This returns FALSE if there is an error which the caller need not recover + * from; , otherwise TRUE. In either case the caller should look at *len + * On input: + * s is the address of a pointer to a NULL terminated string that begins + * with 'o', and the previous character was a backslash. At exit, *s + * will be advanced to the byte just after those absorbed by this + * function. Hence the caller can continue parsing from there. In + * the case of an error, this routine has generally positioned *s to + * point just to the right of the first bad spot, so that a message + * that has a "<--" to mark the spot will be correctly positioned. + * uv points to a UV that will hold the output value, valid only if the + * return from the function is TRUE + * error_msg is a pointer that will be set to an internal buffer giving an + * error message upon failure (the return is FALSE). Untouched if + * function succeeds + * output_warning says whether to output any warning messages, or suppress + * them + * strict is true if this should fail instead of warn if there are + * non-octal digits within the braces + * silence_non_portable is true if to suppress warnings about the code + * point returned being too large to fit on all platforms. + * UTF is true iff the string *s is encoded in UTF-8. + */ + char* e; + STRLEN numbers_len; + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + /* XXX Until the message is improved in grok_oct, handle errors + * ourselves */ + | PERL_SCAN_SILENT_ILLDIGIT; + + PERL_ARGS_ASSERT_GROK_BSLASH_O; + + + assert(**s == 'o'); + (*s)++; + + if (**s != '{') { + *error_msg = "Missing braces on \\o{}"; + return FALSE; + } + + e = strchr(*s, '}'); + if (!e) { + (*s)++; /* Move past the '{' */ + while (isOCTAL(**s)) { /* Position beyond the legal digits */ + (*s)++; + } + *error_msg = "Missing right brace on \\o{"; + return FALSE; + } + + (*s)++; /* Point to expected first digit (could be first byte of utf8 + sequence if not a digit) */ + numbers_len = e - *s; + if (numbers_len == 0) { + (*s)++; /* Move past the } */ + *error_msg = "Number with no digits"; + return FALSE; + } + + if (silence_non_portable) { + flags |= PERL_SCAN_SILENT_NON_PORTABLE; + } + + *uv = grok_oct(*s, &numbers_len, &flags, NULL); + /* Note that if has non-octal, will ignore everything starting with that up + * to the '}' */ + + if (numbers_len != (STRLEN) (e - *s)) { + if (strict) { + *s += numbers_len; + *s += (UTF) ? UTF8SKIP(*s) : (STRLEN) 1; + *error_msg = "Non-octal character"; + return FALSE; + } + else if (output_warning) { + Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT), + /* diag_listed_as: Non-octal character '%c'. Resolved as "%s" */ + "Non-octal character '%c'. Resolved as \"\\o{%.*s}\"", + *(*s + numbers_len), + (int) numbers_len, + *s); + } + } + + /* Return past the '}' */ + *s = e + 1; + + return TRUE; +} + +PERL_STATIC_INLINE bool +S_grok_bslash_x(pTHX_ char **s, UV *uv, const char** error_msg, + const bool output_warning, const bool strict, + const bool silence_non_portable, + const bool UTF) +{ + +/* Documentation to be supplied when interface nailed down finally + * This returns FALSE if there is an error which the caller need not recover + * from; , otherwise TRUE. In either case the caller should look at *len + * On input: + * s is the address of a pointer to a NULL terminated string that begins + * with 'x', and the previous character was a backslash. At exit, *s + * will be advanced to the byte just after those absorbed by this + * function. Hence the caller can continue parsing from there. In + * the case of an error, this routine has generally positioned *s to + * point just to the right of the first bad spot, so that a message + * that has a "<--" to mark the spot will be correctly positioned. + * uv points to a UV that will hold the output value, valid only if the + * return from the function is TRUE + * error_msg is a pointer that will be set to an internal buffer giving an + * error message upon failure (the return is FALSE). Untouched if + * function succeeds + * output_warning says whether to output any warning messages, or suppress + * them + * strict is true if anything out of the ordinary should cause this to + * fail instead of warn or be silent. For example, it requires + * exactly 2 digits following the \x (when there are no braces). + * 3 digits could be a mistake, so is forbidden in this mode. + * silence_non_portable is true if to suppress warnings about the code + * point returned being too large to fit on all platforms. + * UTF is true iff the string *s is encoded in UTF-8. + */ + char* e; + STRLEN numbers_len; + I32 flags = PERL_SCAN_DISALLOW_PREFIX; + + PERL_ARGS_ASSERT_GROK_BSLASH_X; + + PERL_UNUSED_ARG(output_warning); + + assert(**s == 'x'); + (*s)++; + + if (strict) { + flags |= PERL_SCAN_SILENT_ILLDIGIT; + } + + if (**s != '{') { + STRLEN len = (strict) ? 3 : 2; + + *uv = grok_hex(*s, &len, &flags, NULL); + *s += len; + if (strict && len != 2) { + if (len < 2) { + *s += (UTF) ? UTF8SKIP(*s) : 1; + *error_msg = "Non-hex character"; + } + else { + *error_msg = "Use \\x{...} for more than two hex characters"; + } + return FALSE; + } + return TRUE; + } + + e = strchr(*s, '}'); + if (!e) { + (*s)++; /* Move past the '{' */ + while (isXDIGIT(**s)) { /* Position beyond the legal digits */ + (*s)++; + } + /* XXX The corresponding message above for \o is just '\\o{'; other + * messages for other constructs include the '}', so are inconsistent. + */ + *error_msg = "Missing right brace on \\x{}"; + return FALSE; + } + + (*s)++; /* Point to expected first digit (could be first byte of utf8 + sequence if not a digit) */ + numbers_len = e - *s; + if (numbers_len == 0) { + if (strict) { + (*s)++; /* Move past the } */ + *error_msg = "Number with no digits"; + return FALSE; + } + return TRUE; + } + + flags |= PERL_SCAN_ALLOW_UNDERSCORES; + if (silence_non_portable) { + flags |= PERL_SCAN_SILENT_NON_PORTABLE; + } + + *uv = grok_hex(*s, &numbers_len, &flags, NULL); + /* Note that if has non-hex, will ignore everything starting with that up + * to the '}' */ + + if (strict && numbers_len != (STRLEN) (e - *s)) { + *s += numbers_len; + *s += (UTF) ? UTF8SKIP(*s) : 1; + *error_msg = "Non-hex character"; + return FALSE; + } + + /* Return past the '}' */ + *s = e + 1; + + return TRUE; +} + +STATIC char* +S_form_short_octal_warning(pTHX_ + const char * const s, /* Points to first non-octal */ + const STRLEN len /* Length of octals string, so + (s-len) points to first + octal */ +) { + /* Return a character string consisting of a warning message for when a + * string constant in octal is weird, like "\078". */ + + const char * sans_leading_zeros = s - len; + + PERL_ARGS_ASSERT_FORM_SHORT_OCTAL_WARNING; + + assert(*s == '8' || *s == '9'); + + /* Remove the leading zeros, retaining one zero so won't be zero length */ + while (*sans_leading_zeros == '0') sans_leading_zeros++; + if (sans_leading_zeros == s) { + sans_leading_zeros--; + } + + return Perl_form(aTHX_ + "'%.*s' resolved to '\\o{%.*s}%c'", + (int) (len + 2), s - len - 1, + (int) (s - sans_leading_zeros), sans_leading_zeros, + *s); +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * ex: set ts=8 sts=4 sw=4 et: + */ diff --git a/src/5021003/inline_invlist.c b/src/5021003/inline_invlist.c new file mode 100644 index 0000000..6cdeff4 --- /dev/null +++ b/src/5021003/inline_invlist.c @@ -0,0 +1,66 @@ +/* inline_invlist.c + * + * Copyright (C) 2012 by Larry Wall and others + * + * You may distribute under the terms of either the GNU General Public + * License or the Artistic License, as specified in the README file. + */ + +#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) + +/* An element is in an inversion list iff its index is even numbered: 0, 2, 4, + * etc */ +#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) +#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) + +/* This converts to/from our UVs to what the SV code is expecting: bytes. */ +#define TO_INTERNAL_SIZE(x) ((x) * sizeof(UV)) +#define FROM_INTERNAL_SIZE(x) ((x)/ sizeof(UV)) + +PERL_STATIC_INLINE bool* +S_get_invlist_offset_addr(SV* invlist) +{ + /* Return the address of the field that says whether the inversion list is + * offset (it contains 1) or not (contains 0) */ + PERL_ARGS_ASSERT_GET_INVLIST_OFFSET_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->is_offset); +} + +PERL_STATIC_INLINE UV +S__invlist_len(SV* const invlist) +{ + /* Returns the current number of elements stored in the inversion list's + * array */ + + PERL_ARGS_ASSERT__INVLIST_LEN; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return (SvCUR(invlist) == 0) + ? 0 + : FROM_INTERNAL_SIZE(SvCUR(invlist)) - *get_invlist_offset_addr(invlist); +} + +PERL_STATIC_INLINE bool +S__invlist_contains_cp(SV* const invlist, const UV cp) +{ + /* Does contain code point as part of the set? */ + + IV index = _invlist_search(invlist, cp); + + PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP; + + return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index); +} + +# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGEXEC_C) + +/* These symbols are only needed later in regcomp.c */ +# undef TO_INTERNAL_SIZE +# undef FROM_INTERNAL_SIZE +# endif + +#endif diff --git a/src/5021003/orig/dquote_static.c b/src/5021003/orig/dquote_static.c new file mode 100644 index 0000000..802d83b --- /dev/null +++ b/src/5021003/orig/dquote_static.c @@ -0,0 +1,328 @@ +/* dquote_static.c + * + * This file contains static functions that are related to + * parsing double-quotish expressions, but are used in more than + * one file. + * + * It is currently #included by regcomp.c and toke.c. +*/ + +#define PERL_IN_DQUOTE_STATIC_C +#include "embed.h" + +/* + - regcurly - a little FSA that accepts {\d+,?\d*} + Pulled from regcomp.c. + */ +PERL_STATIC_INLINE I32 +S_regcurly(const char *s) +{ + PERL_ARGS_ASSERT_REGCURLY; + + if (*s++ != '{') + return FALSE; + if (!isDIGIT(*s)) + return FALSE; + while (isDIGIT(*s)) + s++; + if (*s == ',') { + s++; + while (isDIGIT(*s)) + s++; + } + + return *s == '}'; +} + +/* XXX Add documentation after final interface and behavior is decided */ +/* May want to show context for error, so would pass Perl_bslash_c(pTHX_ const char* current, const char* start, const bool output_warning) + U8 source = *current; +*/ + +STATIC char +S_grok_bslash_c(pTHX_ const char source, const bool output_warning) +{ + + U8 result; + + if (! isPRINT_A(source)) { + Perl_croak(aTHX_ "%s", + "Character following \"\\c\" must be printable ASCII"); + } + else if (source == '{') { + assert(isPRINT_A(toCTRL('{'))); + + /* diag_listed_as: Use "%s" instead of "%s" */ + Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", toCTRL('{')); + } + + result = toCTRL(source); + if (output_warning && isPRINT_A(result)) { + U8 clearer[3]; + U8 i = 0; + if (! isWORDCHAR(result)) { + clearer[i++] = '\\'; + } + clearer[i++] = result; + clearer[i++] = '\0'; + + Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), + "\"\\c%c\" is more clearly written simply as \"%s\"", + source, + clearer); + } + + return result; +} + +STATIC bool +S_grok_bslash_o(pTHX_ char **s, UV *uv, const char** error_msg, + const bool output_warning, const bool strict, + const bool silence_non_portable, + const bool UTF) +{ + +/* Documentation to be supplied when interface nailed down finally + * This returns FALSE if there is an error which the caller need not recover + * from; , otherwise TRUE. In either case the caller should look at *len + * On input: + * s is the address of a pointer to a NULL terminated string that begins + * with 'o', and the previous character was a backslash. At exit, *s + * will be advanced to the byte just after those absorbed by this + * function. Hence the caller can continue parsing from there. In + * the case of an error, this routine has generally positioned *s to + * point just to the right of the first bad spot, so that a message + * that has a "<--" to mark the spot will be correctly positioned. + * uv points to a UV that will hold the output value, valid only if the + * return from the function is TRUE + * error_msg is a pointer that will be set to an internal buffer giving an + * error message upon failure (the return is FALSE). Untouched if + * function succeeds + * output_warning says whether to output any warning messages, or suppress + * them + * strict is true if this should fail instead of warn if there are + * non-octal digits within the braces + * silence_non_portable is true if to suppress warnings about the code + * point returned being too large to fit on all platforms. + * UTF is true iff the string *s is encoded in UTF-8. + */ + char* e; + STRLEN numbers_len; + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + /* XXX Until the message is improved in grok_oct, handle errors + * ourselves */ + | PERL_SCAN_SILENT_ILLDIGIT; + + PERL_ARGS_ASSERT_GROK_BSLASH_O; + + + assert(**s == 'o'); + (*s)++; + + if (**s != '{') { + *error_msg = "Missing braces on \\o{}"; + return FALSE; + } + + e = strchr(*s, '}'); + if (!e) { + (*s)++; /* Move past the '{' */ + while (isOCTAL(**s)) { /* Position beyond the legal digits */ + (*s)++; + } + *error_msg = "Missing right brace on \\o{"; + return FALSE; + } + + (*s)++; /* Point to expected first digit (could be first byte of utf8 + sequence if not a digit) */ + numbers_len = e - *s; + if (numbers_len == 0) { + (*s)++; /* Move past the } */ + *error_msg = "Number with no digits"; + return FALSE; + } + + if (silence_non_portable) { + flags |= PERL_SCAN_SILENT_NON_PORTABLE; + } + + *uv = grok_oct(*s, &numbers_len, &flags, NULL); + /* Note that if has non-octal, will ignore everything starting with that up + * to the '}' */ + + if (numbers_len != (STRLEN) (e - *s)) { + if (strict) { + *s += numbers_len; + *s += (UTF) ? UTF8SKIP(*s) : (STRLEN) 1; + *error_msg = "Non-octal character"; + return FALSE; + } + else if (output_warning) { + Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT), + /* diag_listed_as: Non-octal character '%c'. Resolved as "%s" */ + "Non-octal character '%c'. Resolved as \"\\o{%.*s}\"", + *(*s + numbers_len), + (int) numbers_len, + *s); + } + } + + /* Return past the '}' */ + *s = e + 1; + + return TRUE; +} + +PERL_STATIC_INLINE bool +S_grok_bslash_x(pTHX_ char **s, UV *uv, const char** error_msg, + const bool output_warning, const bool strict, + const bool silence_non_portable, + const bool UTF) +{ + +/* Documentation to be supplied when interface nailed down finally + * This returns FALSE if there is an error which the caller need not recover + * from; , otherwise TRUE. In either case the caller should look at *len + * On input: + * s is the address of a pointer to a NULL terminated string that begins + * with 'x', and the previous character was a backslash. At exit, *s + * will be advanced to the byte just after those absorbed by this + * function. Hence the caller can continue parsing from there. In + * the case of an error, this routine has generally positioned *s to + * point just to the right of the first bad spot, so that a message + * that has a "<--" to mark the spot will be correctly positioned. + * uv points to a UV that will hold the output value, valid only if the + * return from the function is TRUE + * error_msg is a pointer that will be set to an internal buffer giving an + * error message upon failure (the return is FALSE). Untouched if + * function succeeds + * output_warning says whether to output any warning messages, or suppress + * them + * strict is true if anything out of the ordinary should cause this to + * fail instead of warn or be silent. For example, it requires + * exactly 2 digits following the \x (when there are no braces). + * 3 digits could be a mistake, so is forbidden in this mode. + * silence_non_portable is true if to suppress warnings about the code + * point returned being too large to fit on all platforms. + * UTF is true iff the string *s is encoded in UTF-8. + */ + char* e; + STRLEN numbers_len; + I32 flags = PERL_SCAN_DISALLOW_PREFIX; + + PERL_ARGS_ASSERT_GROK_BSLASH_X; + + PERL_UNUSED_ARG(output_warning); + + assert(**s == 'x'); + (*s)++; + + if (strict) { + flags |= PERL_SCAN_SILENT_ILLDIGIT; + } + + if (**s != '{') { + STRLEN len = (strict) ? 3 : 2; + + *uv = grok_hex(*s, &len, &flags, NULL); + *s += len; + if (strict && len != 2) { + if (len < 2) { + *s += (UTF) ? UTF8SKIP(*s) : 1; + *error_msg = "Non-hex character"; + } + else { + *error_msg = "Use \\x{...} for more than two hex characters"; + } + return FALSE; + } + return TRUE; + } + + e = strchr(*s, '}'); + if (!e) { + (*s)++; /* Move past the '{' */ + while (isXDIGIT(**s)) { /* Position beyond the legal digits */ + (*s)++; + } + /* XXX The corresponding message above for \o is just '\\o{'; other + * messages for other constructs include the '}', so are inconsistent. + */ + *error_msg = "Missing right brace on \\x{}"; + return FALSE; + } + + (*s)++; /* Point to expected first digit (could be first byte of utf8 + sequence if not a digit) */ + numbers_len = e - *s; + if (numbers_len == 0) { + if (strict) { + (*s)++; /* Move past the } */ + *error_msg = "Number with no digits"; + return FALSE; + } + return TRUE; + } + + flags |= PERL_SCAN_ALLOW_UNDERSCORES; + if (silence_non_portable) { + flags |= PERL_SCAN_SILENT_NON_PORTABLE; + } + + *uv = grok_hex(*s, &numbers_len, &flags, NULL); + /* Note that if has non-hex, will ignore everything starting with that up + * to the '}' */ + + if (strict && numbers_len != (STRLEN) (e - *s)) { + *s += numbers_len; + *s += (UTF) ? UTF8SKIP(*s) : 1; + *error_msg = "Non-hex character"; + return FALSE; + } + + /* Return past the '}' */ + *s = e + 1; + + return TRUE; +} + +STATIC char* +S_form_short_octal_warning(pTHX_ + const char * const s, /* Points to first non-octal */ + const STRLEN len /* Length of octals string, so + (s-len) points to first + octal */ +) { + /* Return a character string consisting of a warning message for when a + * string constant in octal is weird, like "\078". */ + + const char * sans_leading_zeros = s - len; + + PERL_ARGS_ASSERT_FORM_SHORT_OCTAL_WARNING; + + assert(*s == '8' || *s == '9'); + + /* Remove the leading zeros, retaining one zero so won't be zero length */ + while (*sans_leading_zeros == '0') sans_leading_zeros++; + if (sans_leading_zeros == s) { + sans_leading_zeros--; + } + + return Perl_form(aTHX_ + "'%.*s' resolved to '\\o{%.*s}%c'", + (int) (len + 2), s - len - 1, + (int) (s - sans_leading_zeros), sans_leading_zeros, + *s); +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * ex: set ts=8 sts=4 sw=4 et: + */ diff --git a/src/5021003/orig/inline_invlist.c b/src/5021003/orig/inline_invlist.c new file mode 100644 index 0000000..1875a05 --- /dev/null +++ b/src/5021003/orig/inline_invlist.c @@ -0,0 +1,66 @@ +/* inline_invlist.c + * + * Copyright (C) 2012 by Larry Wall and others + * + * You may distribute under the terms of either the GNU General Public + * License or the Artistic License, as specified in the README file. + */ + +#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) + +/* An element is in an inversion list iff its index is even numbered: 0, 2, 4, + * etc */ +#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) +#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) + +/* This converts to/from our UVs to what the SV code is expecting: bytes. */ +#define TO_INTERNAL_SIZE(x) ((x) * sizeof(UV)) +#define FROM_INTERNAL_SIZE(x) ((x)/ sizeof(UV)) + +PERL_STATIC_INLINE bool* +S_get_invlist_offset_addr(SV* invlist) +{ + /* Return the address of the field that says whether the inversion list is + * offset (it contains 1) or not (contains 0) */ + PERL_ARGS_ASSERT_GET_INVLIST_OFFSET_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->is_offset); +} + +PERL_STATIC_INLINE UV +S__invlist_len(SV* const invlist) +{ + /* Returns the current number of elements stored in the inversion list's + * array */ + + PERL_ARGS_ASSERT__INVLIST_LEN; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return (SvCUR(invlist) == 0) + ? 0 + : FROM_INTERNAL_SIZE(SvCUR(invlist)) - *get_invlist_offset_addr(invlist); +} + +PERL_STATIC_INLINE bool +S__invlist_contains_cp(SV* const invlist, const UV cp) +{ + /* Does contain code point as part of the set? */ + + IV index = _invlist_search(invlist, cp); + + PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP; + + return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index); +} + +# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGEXEC_C) + +/* These symbols are only needed later in regcomp.c */ +# undef TO_INTERNAL_SIZE +# undef FROM_INTERNAL_SIZE +# endif + +#endif diff --git a/src/5021003/orig/regcomp.c b/src/5021003/orig/regcomp.c new file mode 100644 index 0000000..991d2f8 --- /dev/null +++ b/src/5021003/orig/regcomp.c @@ -0,0 +1,16890 @@ +/* regcomp.c + */ + +/* + * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee + * + * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] + */ + +/* This file contains functions for compiling a regular expression. See + * also regexec.c which funnily enough, contains functions for executing + * a regular expression. + * + * This file is also copied at build time to ext/re/re_comp.c, where + * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. + * This causes the main functions to be compiled under new names and with + * debugging support added, which makes "use re 'debug'" work. + */ + +/* NOTE: this is derived from Henry Spencer's regexp code, and should not + * confused with the original package (see point 3 below). Thanks, Henry! + */ + +/* Additional note: this code is very heavily munged from Henry's version + * in places. In some spots I've traded clarity for efficiency, so don't + * blame Henry for some of the lack of readability. + */ + +/* The names of the functions have been changed from regcomp and + * regexec to pregcomp and pregexec in order to avoid conflicts + * with the POSIX routines of the same names. +*/ + +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +/* + * pregcomp and pregexec -- regsub and regerror are not used in perl + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. + * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * + * + **** Alterations to Henry's code are... + **** + **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + **** by Larry Wall and others + **** + **** You may distribute under the terms of either the GNU General Public + **** License or the Artistic License, as specified in the README file. + + * + * Beware that some of this code is subtly aware of the way operator + * precedence is structured in regular expressions. Serious changes in + * regular-expression syntax might require a total rethink. + */ +#include "EXTERN.h" +#define PERL_IN_REGCOMP_C +#include "perl.h" + +#ifndef PERL_IN_XSUB_RE +# include "INTERN.h" +#endif + +#define REG_COMP_C +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +EXTERN_C const struct regexp_engine my_reg_engine; +#else +# include "regcomp.h" +#endif + +#include "dquote_static.c" +#include "charclass_invlists.h" +#include "inline_invlist.c" +#include "unicode_constants.h" + +#define HAS_NONLATIN1_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) +#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) + +#ifndef STATIC +#define STATIC static +#endif + + +struct RExC_state_t { + U32 flags; /* RXf_* are we folding, multilining? */ + U32 pm_flags; /* PMf_* stuff from the calling PMOP */ + char *precomp; /* uncompiled string. */ + REGEXP *rx_sv; /* The SV that is the regexp. */ + regexp *rx; /* perl core regexp structure */ + regexp_internal *rxi; /* internal data for regexp object + pprivate field */ + char *start; /* Start of input for compile */ + char *end; /* End of input for compile */ + char *parse; /* Input-scan pointer. */ + SSize_t whilem_seen; /* number of WHILEM in this expr */ + regnode *emit_start; /* Start of emitted-code area */ + regnode *emit_bound; /* First regnode outside of the + allocated space */ + regnode *emit; /* Code-emit pointer; if = &emit_dummy, + implies compiling, so don't emit */ + regnode_ssc emit_dummy; /* placeholder for emit to point to; + large enough for the largest + non-EXACTish node, so can use it as + scratch in pass1 */ + I32 naughty; /* How bad is this pattern? */ + I32 sawback; /* Did we see \1, ...? */ + U32 seen; + SSize_t size; /* Code size. */ + I32 npar; /* Capture buffer count, (OPEN) plus + one. ("par" 0 is the whole + pattern)*/ + I32 nestroot; /* root parens we are in - used by + accept */ + I32 extralen; + I32 seen_zerolen; + regnode **open_parens; /* pointers to open parens */ + regnode **close_parens; /* pointers to close parens */ + regnode *opend; /* END node in program */ + I32 utf8; /* whether the pattern is utf8 or not */ + I32 orig_utf8; /* whether the pattern was originally in utf8 */ + /* XXX use this for future optimisation of case + * where pattern must be upgraded to utf8. */ + I32 uni_semantics; /* If a d charset modifier should use unicode + rules, even if the pattern is not in + utf8 */ + HV *paren_names; /* Paren names */ + + regnode **recurse; /* Recurse regops */ + I32 recurse_count; /* Number of recurse regops */ + U8 *study_chunk_recursed; /* bitmap of which parens we have moved + through */ + U32 study_chunk_recursed_bytes; /* bytes in bitmap */ + I32 in_lookbehind; + I32 contains_locale; + I32 contains_i; + I32 override_recoding; + I32 in_multi_char_class; + struct reg_code_block *code_blocks; /* positions of literal (?{}) + within pattern */ + int num_code_blocks; /* size of code_blocks[] */ + int code_index; /* next code_blocks[] slot */ + SSize_t maxlen; /* mininum possible number of chars in string to match */ +#ifdef ADD_TO_REGEXEC + char *starttry; /* -Dr: where regtry was called. */ +#define RExC_starttry (pRExC_state->starttry) +#endif + SV *runtime_code_qr; /* qr with the runtime code blocks */ +#ifdef DEBUGGING + const char *lastparse; + I32 lastnum; + AV *paren_name_list; /* idx -> name */ +#define RExC_lastparse (pRExC_state->lastparse) +#define RExC_lastnum (pRExC_state->lastnum) +#define RExC_paren_name_list (pRExC_state->paren_name_list) +#endif +}; + +#define RExC_flags (pRExC_state->flags) +#define RExC_pm_flags (pRExC_state->pm_flags) +#define RExC_precomp (pRExC_state->precomp) +#define RExC_rx_sv (pRExC_state->rx_sv) +#define RExC_rx (pRExC_state->rx) +#define RExC_rxi (pRExC_state->rxi) +#define RExC_start (pRExC_state->start) +#define RExC_end (pRExC_state->end) +#define RExC_parse (pRExC_state->parse) +#define RExC_whilem_seen (pRExC_state->whilem_seen) +#ifdef RE_TRACK_PATTERN_OFFSETS +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the + others */ +#endif +#define RExC_emit (pRExC_state->emit) +#define RExC_emit_dummy (pRExC_state->emit_dummy) +#define RExC_emit_start (pRExC_state->emit_start) +#define RExC_emit_bound (pRExC_state->emit_bound) +#define RExC_naughty (pRExC_state->naughty) +#define RExC_sawback (pRExC_state->sawback) +#define RExC_seen (pRExC_state->seen) +#define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) +#define RExC_npar (pRExC_state->npar) +#define RExC_nestroot (pRExC_state->nestroot) +#define RExC_extralen (pRExC_state->extralen) +#define RExC_seen_zerolen (pRExC_state->seen_zerolen) +#define RExC_utf8 (pRExC_state->utf8) +#define RExC_uni_semantics (pRExC_state->uni_semantics) +#define RExC_orig_utf8 (pRExC_state->orig_utf8) +#define RExC_open_parens (pRExC_state->open_parens) +#define RExC_close_parens (pRExC_state->close_parens) +#define RExC_opend (pRExC_state->opend) +#define RExC_paren_names (pRExC_state->paren_names) +#define RExC_recurse (pRExC_state->recurse) +#define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) +#define RExC_study_chunk_recursed_bytes \ + (pRExC_state->study_chunk_recursed_bytes) +#define RExC_in_lookbehind (pRExC_state->in_lookbehind) +#define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_contains_i (pRExC_state->contains_i) +#define RExC_override_recoding (pRExC_state->override_recoding) +#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) + + +#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') +#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ + ((*s) == '{' && regcurly(s))) + +/* + * Flags to be passed up and down. + */ +#define WORST 0 /* Worst case. */ +#define HASWIDTH 0x01 /* Known to match non-null strings. */ + +/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single + * character. (There needs to be a case: in the switch statement in regexec.c + * for any node marked SIMPLE.) Note that this is not the same thing as + * REGNODE_SIMPLE */ +#define SIMPLE 0x02 +#define SPSTART 0x04 /* Starts with * or + */ +#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ +#define TRYAGAIN 0x10 /* Weeded out a declaration. */ +#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */ + +#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) + +/* whether trie related optimizations are enabled */ +#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION +#define TRIE_STUDY_OPT +#define FULL_TRIE_STUDY +#define TRIE_STCLASS +#endif + + + +#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] +#define PBITVAL(paren) (1 << ((paren) & 7)) +#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) +#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) +#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) + +#define REQUIRE_UTF8 STMT_START { \ + if (!UTF) { \ + *flagp = RESTART_UTF8; \ + return NULL; \ + } \ + } STMT_END + +/* This converts the named class defined in regcomp.h to its equivalent class + * number defined in handy.h. */ +#define namedclass_to_classnum(class) ((int) ((class) / 2)) +#define classnum_to_namedclass(classnum) ((classnum) * 2) + +#define _invlist_union_complement_2nd(a, b, output) \ + _invlist_union_maybe_complement_2nd(a, b, TRUE, output) +#define _invlist_intersection_complement_2nd(a, b, output) \ + _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) + +/* About scan_data_t. + + During optimisation we recurse through the regexp program performing + various inplace (keyhole style) optimisations. In addition study_chunk + and scan_commit populate this data structure with information about + what strings MUST appear in the pattern. We look for the longest + string that must appear at a fixed location, and we look for the + longest string that may appear at a floating location. So for instance + in the pattern: + + /FOO[xX]A.*B[xX]BAR/ + + Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating + strings (because they follow a .* construct). study_chunk will identify + both FOO and BAR as being the longest fixed and floating strings respectively. + + The strings can be composites, for instance + + /(f)(o)(o)/ + + will result in a composite fixed substring 'foo'. + + For each string some basic information is maintained: + + - offset or min_offset + This is the position the string must appear at, or not before. + It also implicitly (when combined with minlenp) tells us how many + characters must match before the string we are searching for. + Likewise when combined with minlenp and the length of the string it + tells us how many characters must appear after the string we have + found. + + - max_offset + Only used for floating strings. This is the rightmost point that + the string can appear at. If set to SSize_t_MAX it indicates that the + string can occur infinitely far to the right. + + - minlenp + A pointer to the minimum number of characters of the pattern that the + string was found inside. This is important as in the case of positive + lookahead or positive lookbehind we can have multiple patterns + involved. Consider + + /(?=FOO).*F/ + + The minimum length of the pattern overall is 3, the minimum length + of the lookahead part is 3, but the minimum length of the part that + will actually match is 1. So 'FOO's minimum length is 3, but the + minimum length for the F is 1. This is important as the minimum length + is used to determine offsets in front of and behind the string being + looked for. Since strings can be composites this is the length of the + pattern at the time it was committed with a scan_commit. Note that + the length is calculated by study_chunk, so that the minimum lengths + are not known until the full pattern has been compiled, thus the + pointer to the value. + + - lookbehind + + In the case of lookbehind the string being searched for can be + offset past the start point of the final matching string. + If this value was just blithely removed from the min_offset it would + invalidate some of the calculations for how many chars must match + before or after (as they are derived from min_offset and minlen and + the length of the string being searched for). + When the final pattern is compiled and the data is moved from the + scan_data_t structure into the regexp structure the information + about lookbehind is factored in, with the information that would + have been lost precalculated in the end_shift field for the + associated string. + + The fields pos_min and pos_delta are used to store the minimum offset + and the delta to the maximum offset at the current point in the pattern. + +*/ + +typedef struct scan_data_t { + /*I32 len_min; unused */ + /*I32 len_delta; unused */ + SSize_t pos_min; + SSize_t pos_delta; + SV *last_found; + SSize_t last_end; /* min value, <0 unless valid. */ + SSize_t last_start_min; + SSize_t last_start_max; + SV **longest; /* Either &l_fixed, or &l_float. */ + SV *longest_fixed; /* longest fixed string found in pattern */ + SSize_t offset_fixed; /* offset where it starts */ + SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ + I32 lookbehind_fixed; /* is the position of the string modfied by LB */ + SV *longest_float; /* longest floating string found in pattern */ + SSize_t offset_float_min; /* earliest point in string it can appear */ + SSize_t offset_float_max; /* latest point in string it can appear */ + SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ + SSize_t lookbehind_float; /* is the pos of the string modified by LB */ + I32 flags; + I32 whilem_c; + SSize_t *last_closep; + regnode_ssc *start_class; +} scan_data_t; + +/* The below is perhaps overboard, but this allows us to save a test at the + * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' + * and 'a' differ by a single bit; the same with the upper and lower case of + * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; + * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and + * then inverts it to form a mask, with just a single 0, in the bit position + * where the upper- and lowercase differ. XXX There are about 40 other + * instances in the Perl core where this micro-optimization could be used. + * Should decide if maintenance cost is worse, before changing those + * + * Returns a boolean as to whether or not 'v' is either a lowercase or + * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a + * compile-time constant, the generated code is better than some optimizing + * compilers figure out, amounting to a mask and test. The results are + * meaningless if 'c' is not one of [A-Za-z] */ +#define isARG2_lower_or_UPPER_ARG1(c, v) \ + (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) + +/* + * Forward declarations for pregcomp()'s friends. + */ + +static const scan_data_t zero_scan_data = + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0}; + +#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) +#define SF_BEFORE_SEOL 0x0001 +#define SF_BEFORE_MEOL 0x0002 +#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) +#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) + +#define SF_FIX_SHIFT_EOL (+2) +#define SF_FL_SHIFT_EOL (+4) + +#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) +#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) + +#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) +#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ +#define SF_IS_INF 0x0040 +#define SF_HAS_PAR 0x0080 +#define SF_IN_PAR 0x0100 +#define SF_HAS_EVAL 0x0200 +#define SCF_DO_SUBSTR 0x0400 +#define SCF_DO_STCLASS_AND 0x0800 +#define SCF_DO_STCLASS_OR 0x1000 +#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) +#define SCF_WHILEM_VISITED_POS 0x2000 + +#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ +#define SCF_SEEN_ACCEPT 0x8000 +#define SCF_TRIE_DOING_RESTUDY 0x10000 + +#define UTF cBOOL(RExC_utf8) + +/* The enums for all these are ordered so things work out correctly */ +#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ + == REGEX_DEPENDS_CHARSET) +#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ + >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_MORE_RESTRICTED_CHARSET) + +#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) + +/* For programs that want to be strictly Unicode compatible by dying if any + * attempt is made to match a non-Unicode code point against a Unicode + * property. */ +#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) + +#define OOB_NAMEDCLASS -1 + +/* There is no code point that is out-of-bounds, so this is problematic. But + * its only current use is to initialize a variable that is always set before + * looked at. */ +#define OOB_UNICODE 0xDEADBEEF + +#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) +#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) + + +/* length of regex to show in messages that don't mark a position within */ +#define RegexLengthToShowInErrorMessages 127 + +/* + * If MARKER[12] are adjusted, be sure to adjust the constants at the top + * of t/op/regmesg.t, the tests in t/op/re_tests, and those in + * op/pragma/warn/regcomp. + */ +#define MARKER1 "<-- HERE" /* marker as it appears in the description */ +#define MARKER2 " <-- HERE " /* marker as it appears within the regex */ + +#define REPORT_LOCATION " in regex; marked by " MARKER1 \ + " in m/%"UTF8f MARKER2 "%"UTF8f"/" + +#define REPORT_LOCATION_ARGS(offset) \ + UTF8fARG(UTF, offset, RExC_precomp), \ + UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * arg. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define _FAIL(code) STMT_START { \ + const char *ellipses = ""; \ + IV len = RExC_end - RExC_precomp; \ + \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + code; \ +} STMT_END + +#define FAIL(msg) _FAIL( \ + Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ + msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +#define FAIL2(msg,arg) _FAIL( \ + Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ + arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +/* + * Simple_vFAIL -- like FAIL, but marks the current location in the scan + */ +#define Simple_vFAIL(m) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() + */ +#define vFAIL(m) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL(m); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts two arguments. + */ +#define Simple_vFAIL2(m,a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). + */ +#define vFAIL2(m,a1) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL2(m, a1); \ +} STMT_END + + +/* + * Like Simple_vFAIL(), but accepts three arguments. + */ +#define Simple_vFAIL3(m, a1, a2) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). + */ +#define vFAIL3(m,a1,a2) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL3(m, a1, a2); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts four arguments. + */ +#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vFAIL4(m,a1,a2,a3) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL4(m, a1, a2, a3); \ +} STMT_END + +/* A specialized version of vFAIL2 that works with UTF8f */ +#define vFAIL2utf8f(m, a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + +/* m is not necessarily a "literal string", in this macro */ +#define reg_warn_non_literal_string(loc, m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARNreg(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN_dep(loc, m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARNdep(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARNregdep(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN2reg_d(loc,m, a1) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN2reg(loc, m, a1) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN3(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN4(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + +/* Allow for side effects in s */ +#define REGC(c,s) STMT_START { \ + if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ +} STMT_END + +/* Macros for recording node offsets. 20001227 mjd@plover.com + * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in + * element 2*n-1 of the array. Element #2n holds the byte length node #n. + * Element 0 holds the number n. + * Position is 1 indexed. + */ +#ifndef RE_TRACK_PATTERN_OFFSETS +#define Set_Node_Offset_To_R(node,byte) +#define Set_Node_Offset(node,byte) +#define Set_Cur_Node_Offset +#define Set_Node_Length_To_R(node,len) +#define Set_Node_Length(node,len) +#define Set_Node_Cur_Length(node,start) +#define Node_Offset(n) +#define Node_Length(n) +#define Set_Node_Offset_Length(node,offset,len) +#define ProgLen(ri) ri->u.proglen +#define SetProgLen(ri,x) ri->u.proglen = x +#else +#define ProgLen(ri) ri->u.offsets[0] +#define SetProgLen(ri,x) ri->u.offsets[0] = x +#define Set_Node_Offset_To_R(node,byte) STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ + __LINE__, (int)(node), (int)(byte))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", \ + (int)(node)); \ + } else { \ + RExC_offsets[2*(node)-1] = (byte); \ + } \ + } \ +} STMT_END + +#define Set_Node_Offset(node,byte) \ + Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) +#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) + +#define Set_Node_Length_To_R(node,len) STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ + __LINE__, (int)(node), (int)(len))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Length macro", \ + (int)(node)); \ + } else { \ + RExC_offsets[2*(node)] = (len); \ + } \ + } \ +} STMT_END + +#define Set_Node_Length(node,len) \ + Set_Node_Length_To_R((node)-RExC_emit_start, len) +#define Set_Node_Cur_Length(node, start) \ + Set_Node_Length(node, RExC_parse - start) + +/* Get offsets and lengths */ +#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) +#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) + +#define Set_Node_Offset_Length(node,offset,len) STMT_START { \ + Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ + Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ +} STMT_END +#endif + +#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS +#define EXPERIMENTAL_INPLACESCAN +#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ + +#define DEBUG_RExC_seen() \ + DEBUG_OPTIMISE_MORE_r({ \ + PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ + \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ + \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ + \ + if (RExC_seen & REG_GPOS_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ + \ + if (RExC_seen & REG_CANY_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ + \ + if (RExC_seen & REG_RECURSE_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ + \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ + \ + if (RExC_seen & REG_VERBARG_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ + \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ + \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ + \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ + \ + if (RExC_seen & REG_GOSTART_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ + PerlIO_printf(Perl_debug_log,"\n"); \ + }); + +#define DEBUG_STUDYDATA(str,data,depth) \ +DEBUG_OPTIMISE_MORE_r(if(data){ \ + PerlIO_printf(Perl_debug_log, \ + "%*s" str "Pos:%"IVdf"/%"IVdf \ + " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ + (int)(depth)*2, "", \ + (IV)((data)->pos_min), \ + (IV)((data)->pos_delta), \ + (UV)((data)->flags), \ + (IV)((data)->whilem_c), \ + (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ + is_inf ? "INF " : "" \ + ); \ + if ((data)->last_found) \ + PerlIO_printf(Perl_debug_log, \ + "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ + " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ + SvPVX_const((data)->last_found), \ + (IV)((data)->last_end), \ + (IV)((data)->last_start_min), \ + (IV)((data)->last_start_max), \ + ((data)->longest && \ + (data)->longest==&((data)->longest_fixed)) ? "*" : "", \ + SvPVX_const((data)->longest_fixed), \ + (IV)((data)->offset_fixed), \ + ((data)->longest && \ + (data)->longest==&((data)->longest_float)) ? "*" : "", \ + SvPVX_const((data)->longest_float), \ + (IV)((data)->offset_float_min), \ + (IV)((data)->offset_float_max) \ + ); \ + PerlIO_printf(Perl_debug_log,"\n"); \ +}); + +/* Mark that we cannot extend a found fixed substring at this point. + Update the longest found anchored substring and the longest found + floating substrings if needed. */ + +STATIC void +S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, + SSize_t *minlenp, int is_inf) +{ + const STRLEN l = CHR_SVLEN(data->last_found); + const STRLEN old_l = CHR_SVLEN(*data->longest); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_SCAN_COMMIT; + + if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { + SvSetMagicSV(*data->longest, data->last_found); + if (*data->longest == data->longest_fixed) { + data->offset_fixed = l ? data->last_start_min : data->pos_min; + if (data->flags & SF_BEFORE_EOL) + data->flags + |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); + else + data->flags &= ~SF_FIX_BEFORE_EOL; + data->minlen_fixed=minlenp; + data->lookbehind_fixed=0; + } + else { /* *data->longest == data->longest_float */ + data->offset_float_min = l ? data->last_start_min : data->pos_min; + data->offset_float_max = (l + ? data->last_start_max + : (data->pos_delta == SSize_t_MAX + ? SSize_t_MAX + : data->pos_min + data->pos_delta)); + if (is_inf + || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) + data->offset_float_max = SSize_t_MAX; + if (data->flags & SF_BEFORE_EOL) + data->flags + |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); + else + data->flags &= ~SF_FL_BEFORE_EOL; + data->minlen_float=minlenp; + data->lookbehind_float=0; + } + } + SvCUR_set(data->last_found, 0); + { + SV * const sv = data->last_found; + if (SvUTF8(sv) && SvMAGICAL(sv)) { + MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); + if (mg) + mg->mg_len = 0; + } + } + data->last_end = -1; + data->flags &= ~SF_BEFORE_EOL; + DEBUG_STUDYDATA("commit: ",data,0); +} + +/* An SSC is just a regnode_charclass_posix with an extra field: the inversion + * list that describes which code points it matches */ + +STATIC void +S_ssc_anything(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to match an empty string or any code point */ + + PERL_ARGS_ASSERT_SSC_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ + _append_range_to_invlist(ssc->invlist, 0, UV_MAX); + ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ +} + +STATIC int +S_ssc_is_anything(const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' can match the empty string and any code + * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys + * us anything: if the function returns TRUE, 'ssc' hasn't been restricted + * in any way, so there's no point in using it */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { + return FALSE; + } + + /* See if the list consists solely of the range 0 - Infinity */ + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (ret) { + return TRUE; + } + + /* If e.g., both \w and \W are set, matches everything */ + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { + return TRUE; + } + } + } + + return FALSE; +} + +STATIC void +S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* Initializes the SSC 'ssc'. This includes setting it to match an empty + * string, any code point, or any posix class under locale */ + + PERL_ARGS_ASSERT_SSC_INIT; + + Zero(ssc, 1, regnode_ssc); + set_ANYOF_SYNTHETIC(ssc); + ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); + ssc_anything(ssc); + + /* If any portion of the regex is to operate under locale rules, + * initialization includes it. The reason this isn't done for all regexes + * is that the optimizer was written under the assumption that locale was + * all-or-nothing. Given the complexity and lack of documentation in the + * optimizer, and that there are inadequate test cases for locale, many + * parts of it may not work properly, it is safest to avoid locale unless + * necessary. */ + if (RExC_contains_locale) { + ANYOF_POSIXL_SETALL(ssc); + } + else { + ANYOF_POSIXL_ZERO(ssc); + } +} + +STATIC int +S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, + const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only + * to the list of code points matched, and locale posix classes; hence does + * not check its flags) */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (! ret) { + return FALSE; + } + + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { + return FALSE; + } + + return TRUE; +} + +STATIC SV* +S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, + const regnode_charclass* const node) +{ + /* Returns a mortal inversion list defining which code points are matched + * by 'node', which is of type ANYOF. Handles complementing the result if + * appropriate. If some code points aren't knowable at this time, the + * returned list must, and will, contain every code point that is a + * possibility. */ + + SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; + unsigned int i; + const U32 n = ARG(node); + bool new_node_has_latin1 = FALSE; + + PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; + + /* Look at the data structure created by S_set_ANYOF_arg() */ + if (n != ANYOF_NONBITMAP_EMPTY) { + SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + assert(RExC_rxi->data->what[n] == 's'); + + if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ + invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]))); + } + else if (ary[0] && ary[0] != &PL_sv_undef) { + + /* Here, no compile-time swash, and there are things that won't be + * known until runtime -- we have to assume it could be anything */ + return _add_range_to_invlist(invlist, 0, UV_MAX); + } + else if (ary[3] && ary[3] != &PL_sv_undef) { + + /* Here no compile-time swash, and no run-time only data. Use the + * node's inversion list */ + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; + } + } + + /* An ANYOF node contains a bitmap for the first 256 code points, and an + * inversion list for the others, but if there are code points that should + * match only conditionally on the target string being UTF-8, those are + * placed in the inversion list, and not the bitmap. Since there are + * circumstances under which they could match, they are included in the + * SSC. But if the ANYOF node is to be inverted, we have to exclude them + * here, so that when we invert below, the end result actually does include + * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here + * before we add the unconditionally matched code points */ + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_intersection_complement_2nd(invlist, + PL_UpperLatin1, + &invlist); + } + + /* Add in the points from the bit map */ + for (i = 0; i < 256; i++) { + if (ANYOF_BITMAP_TEST(node, i)) { + invlist = add_cp_to_invlist(invlist, i); + new_node_has_latin1 = TRUE; + } + } + + /* If this can match all upper Latin1 code points, have to add them + * as well */ + if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + _invlist_union(invlist, PL_UpperLatin1, &invlist); + } + + /* Similarly for these */ + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + } + + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_invert(invlist); + } + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + + /* Under /li, any 0-255 could fold to any other 0-255, depending on the + * locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { + _invlist_union_maybe_complement_2nd(invlist, + only_utf8_locale_invlist, + ANYOF_FLAGS(node) & ANYOF_INVERT, + &invlist); + } + + return invlist; +} + +/* These two functions currently do the exact same thing */ +#define ssc_init_zero ssc_init + +#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) +#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) + +/* 'AND' a given class with another one. Can create false positives. 'ssc' + * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ + +STATIC void +S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *and_with) +{ + /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either + * another SSC or a regular ANYOF class. Can create false positives. */ + + SV* anded_cp_list; + U8 anded_flags; + + PERL_ARGS_ASSERT_SSC_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(and_with)) { + anded_cp_list = ((regnode_ssc *)and_with)->invlist; + anded_flags = ANYOF_FLAGS(and_with); + + /* XXX This is a kludge around what appears to be deficiencies in the + * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, + * there are paths through the optimizer where it doesn't get weeded + * out when it should. And if we don't make some extra provision for + * it like the code just below, it doesn't get added when it should. + * This solution is to add it only when AND'ing, which is here, and + * only when what is being AND'ed is the pristine, original node + * matching anything. Thus it is like adding it to ssc_anything() but + * only when the result is to be AND'ed. Probably the same solution + * could be adopted for the same problem we have with /l matching, + * which is solved differently in S_ssc_init(), and that would lead to + * fewer false positives than that solution has. But if this solution + * creates bugs, the consequences are only that a warning isn't raised + * that should be; while the consequences for having /l bugs is + * incorrect matches */ + if (ssc_is_anything((regnode_ssc *)and_with)) { + anded_flags |= ANYOF_WARN_SUPER; + } + } + else { + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } + + ANYOF_FLAGS(ssc) &= anded_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'and-with'; P2, its posix classes. + * 'and_with' may be inverted. When not inverted, we have the situation of + * computing: + * (C1 | P1) & (C2 | P2) + * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) + * <= ((C1 & C2) | P1 | P2) + * Alternatively, the last few steps could be: + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) + * <= (C1 | C2 | (P1 & P2)) + * We favor the second approach if either P1 or P2 is non-empty. This is + * because these components are a barrier to doing optimizations, as what + * they match cannot be known until the moment of matching as they are + * dependent on the current locale, 'AND"ing them likely will reduce or + * eliminate them. + * But we can do better if we know that C1,P1 are in their initial state (a + * frequent occurrence), each matching everything: + * () & (C2 | P2) = C2 | P2 + * Similarly, if C2,P2 are in their initial state (again a frequent + * occurrence), the result is a no-op + * (C1 | P1) & () = C1 | P1 + * + * Inverted, we have + * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) + * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) + * <= (C1 & ~C2) | (P1 & ~P2) + * */ + + if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(and_with)) + { + unsigned int i; + + ssc_intersection(ssc, + anded_cp_list, + FALSE /* Has already been inverted */ + ); + + /* If either P1 or P2 is empty, the intersection will be also; can skip + * the loop */ + if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { + ANYOF_POSIXL_ZERO(ssc); + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + + /* Note that the Posix class component P from 'and_with' actually + * looks like: + * P = Pa | Pb | ... | Pn + * where each component is one posix class, such as in [\w\s]. + * Thus + * ~P = ~(Pa | Pb | ... | Pn) + * = ~Pa & ~Pb & ... & ~Pn + * <= ~Pa | ~Pb | ... | ~Pn + * The last is something we can easily calculate, but unfortunately + * is likely to have many false positives. We could do better + * in some (but certainly not all) instances if two classes in + * P have known relationships. For example + * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: + * So + * :lower: & :print: = :lower: + * And similarly for classes that must be disjoint. For example, + * since \s and \w can have no elements in common based on rules in + * the POSIX standard, + * \w & ^\S = nothing + * Unfortunately, some vendor locales do not meet the Posix + * standard, in particular almost everything by Microsoft. + * The loop below just changes e.g., \w into \W and vice versa */ + + regnode_charclass_posixl temp; + int add = 1; /* To calculate the index of the complement */ + + ANYOF_POSIXL_ZERO(&temp); + for (i = 0; i < ANYOF_MAX; i++) { + assert(i % 2 != 0 + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); + + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { + ANYOF_POSIXL_SET(&temp, i + add); + } + add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ + } + ANYOF_POSIXL_AND(&temp, ssc); + + } /* else ssc already has no posixes */ + } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC + in its initial state */ + else if (! is_ANYOF_SYNTHETIC(and_with) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) + { + /* But if 'ssc' is in its initial state, the result is just 'and_with'; + * copy it over 'ssc' */ + if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { + if (is_ANYOF_SYNTHETIC(and_with)) { + StructCopy(and_with, ssc, regnode_ssc); + } + else { + ssc->invlist = anded_cp_list; + ANYOF_POSIXL_ZERO(ssc); + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); + } + } + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + { + /* One or the other of P1, P2 is non-empty. */ + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } + ssc_union(ssc, anded_cp_list, FALSE); + } + else { /* P1 = P2 = empty */ + ssc_intersection(ssc, anded_cp_list, FALSE); + } + } +} + +STATIC void +S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *or_with) +{ + /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either + * another SSC or a regular ANYOF class. Can create false positives if + * 'or_with' is to be inverted. */ + + SV* ored_cp_list; + U8 ored_flags; + + PERL_ARGS_ASSERT_SSC_OR; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(or_with)) { + ored_cp_list = ((regnode_ssc*) or_with)->invlist; + ored_flags = ANYOF_FLAGS(or_with); + } + else { + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); + ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + } + + ANYOF_FLAGS(ssc) |= ored_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'or-with'; P2, its posix classes. + * 'or_with' may be inverted. When not inverted, we have the simple + * situation of computing: + * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) + * If P1|P2 yields a situation with both a class and its complement are + * set, like having both \w and \W, this matches all code points, and we + * can delete these from the P component of the ssc going forward. XXX We + * might be able to delete all the P components, but I (khw) am not certain + * about this, and it is better to be safe. + * + * Inverted, we have + * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) + * <= (C1 | P1) | ~C2 + * <= (C1 | ~C2) | P1 + * (which results in actually simpler code than the non-inverted case) + * */ + + if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(or_with)) + { + /* We ignore P2, leaving P1 going forward */ + } /* else Not inverted */ + else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + unsigned int i; + for (i = 0; i < ANYOF_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) + { + ssc_match_all_cp(ssc); + ANYOF_POSIXL_CLEAR(ssc, i); + ANYOF_POSIXL_CLEAR(ssc, i+1); + } + } + } + } + + ssc_union(ssc, + ored_cp_list, + FALSE /* Already has been inverted */ + ); +} + +PERL_STATIC_INLINE void +S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_UNION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_union_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_intersection(pTHX_ regnode_ssc *ssc, + SV* const invlist, + const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_INTERSECTION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_intersection_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) +{ + PERL_ARGS_ASSERT_SSC_ADD_RANGE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); +} + +PERL_STATIC_INLINE void +S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) +{ + /* AND just the single code point 'cp' into the SSC 'ssc' */ + + SV* cp_list = _new_invlist(2); + + PERL_ARGS_ASSERT_SSC_CP_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + cp_list = add_cp_to_invlist(cp_list, cp); + ssc_intersection(ssc, cp_list, + FALSE /* Not inverted */ + ); + SvREFCNT_dec_NN(cp_list); +} + +PERL_STATIC_INLINE void +S_ssc_clear_locale(regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to not match any locale things */ + PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ANYOF_POSIXL_ZERO(ssc); + ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; +} + +STATIC void +S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* The inversion list in the SSC is marked mortal; now we need a more + * permanent copy, which is stored the same way that is done in a regular + * ANYOF node, with the first 256 code points in a bit map */ + + SV* invlist = invlist_clone(ssc->invlist); + + PERL_ARGS_ASSERT_SSC_FINALIZE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* The code in this file assumes that all but these flags aren't relevant + * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the + * time we reach here */ + assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + + populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); + + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); + + /* Make sure is clone-safe */ + ssc->invlist = NULL; + + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; + } + + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); +} + +#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] +#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) +#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) +#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ + ? (TRIE_LIST_CUR( idx ) - 1) \ + : 0 ) + + +#ifdef DEBUGGING +/* + dump_trie(trie,widecharmap,revcharmap) + dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) + dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) + + These routines dump out a trie in a somewhat readable format. + The _interim_ variants are used for debugging the interim + tables that are used to generate the final compressed + representation which is what dump_trie expects. + + Part of the reason for their existence is to provide a form + of documentation as to how the different representations function. + +*/ + +/* + Dumps the final compressed table form of the trie to Perl_debug_log. + Used for debugging make_trie(). +*/ + +STATIC void +S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, + AV *revcharmap, U32 depth) +{ + U32 state; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + U16 word; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE; + + PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", + (int)depth * 2 + 2,"", + "Match","Base","Ofs" ); + + for( state = 0 ; state < trie->uniquecharcount ; state++ ) { + SV ** const tmp = av_fetch( revcharmap, state, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + } + } + PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------", + (int)depth * 2 + 2,""); + + for( state = 0 ; state < trie->uniquecharcount ; state++ ) + PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); + PerlIO_printf( Perl_debug_log, "\n"); + + for( state = 1 ; state < trie->statecount ; state++ ) { + const U32 base = trie->states[ state ].trans.base; + + PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", + (int)depth * 2 + 2,"", (UV)state); + + if ( trie->states[ state ].wordnum ) { + PerlIO_printf( Perl_debug_log, " W%4X", + trie->states[ state ].wordnum ); + } else { + PerlIO_printf( Perl_debug_log, "%6s", "" ); + } + + PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base ); + + if ( base ) { + U32 ofs = 0; + + while( ( base + ofs < trie->uniquecharcount ) || + ( base + ofs - trie->uniquecharcount < trie->lasttrans + && trie->trans[ base + ofs - trie->uniquecharcount ].check + != state)) + ofs++; + + PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); + + for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { + if ( ( base + ofs >= trie->uniquecharcount ) + && ( base + ofs - trie->uniquecharcount + < trie->lasttrans ) + && trie->trans[ base + ofs + - trie->uniquecharcount ].check == state ) + { + PerlIO_printf( Perl_debug_log, "%*"UVXf, + colwidth, + (UV)trie->trans[ base + ofs + - trie->uniquecharcount ].next ); + } else { + PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); + } + } + + PerlIO_printf( Perl_debug_log, "]"); + + } + PerlIO_printf( Perl_debug_log, "\n" ); + } + PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", + (int)depth*2, ""); + for (word=1; word <= trie->wordcount; word++) { + PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", + (int)word, (int)(trie->wordinfo[word].prev), + (int)(trie->wordinfo[word].len)); + } + PerlIO_printf(Perl_debug_log, "\n" ); +} +/* + Dumps a fully constructed but uncompressed trie in list form. + List tries normally only are used for construction when the number of + possible chars (trie->uniquecharcount) is very high. + Used for debugging make_trie(). +*/ +STATIC void +S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, + HV *widecharmap, AV *revcharmap, U32 next_alloc, + U32 depth) +{ + U32 state; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; + + /* print out the table precompression. */ + PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", + (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", + "------:-----+-----------------\n" ); + + for( state=1 ; state < next_alloc ; state ++ ) { + U16 charid; + + PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", + (int)depth * 2 + 2,"", (UV)state ); + if ( ! trie->states[ state ].wordnum ) { + PerlIO_printf( Perl_debug_log, "%5s| ",""); + } else { + PerlIO_printf( Perl_debug_log, "W%4x| ", + trie->states[ state ].wordnum + ); + } + for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { + SV ** const tmp = av_fetch( revcharmap, + TRIE_LIST_ITEM(state,charid).forid, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), + colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + | PERL_PV_ESCAPE_FIRSTCHAR + ) , + TRIE_LIST_ITEM(state,charid).forid, + (UV)TRIE_LIST_ITEM(state,charid).newstate + ); + if (!(charid % 10)) + PerlIO_printf(Perl_debug_log, "\n%*s| ", + (int)((depth * 2) + 14), ""); + } + } + PerlIO_printf( Perl_debug_log, "\n"); + } +} + +/* + Dumps a fully constructed but uncompressed trie in table form. + This is the normal DFA style state transition table, with a few + twists to facilitate compression later. + Used for debugging make_trie(). +*/ +STATIC void +S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, + HV *widecharmap, AV *revcharmap, U32 next_alloc, + U32 depth) +{ + U32 state; + U16 charid; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; + + /* + print out the table precompression so that we can do a visual check + that they are identical. + */ + + PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); + + for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { + SV ** const tmp = av_fetch( revcharmap, charid, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + } + } + + PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" ); + + for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { + PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------"); + } + + PerlIO_printf( Perl_debug_log, "\n" ); + + for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { + + PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", + (int)depth * 2 + 2,"", + (UV)TRIE_NODENUM( state ) ); + + for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { + UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); + if (v) + PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v ); + else + PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); + } + if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { + PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", + (UV)trie->trans[ state ].check ); + } else { + PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", + (UV)trie->trans[ state ].check, + trie->states[ TRIE_NODENUM( state ) ].wordnum ); + } + } +} + +#endif + + +/* make_trie(startbranch,first,last,tail,word_count,flags,depth) + startbranch: the first branch in the whole branch sequence + first : start branch of sequence of branch-exact nodes. + May be the same as startbranch + last : Thing following the last branch. + May be the same as tail. + tail : item following the branch sequence + count : words in the sequence + flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/ + depth : indent depth + +Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. + +A trie is an N'ary tree where the branches are determined by digital +decomposition of the key. IE, at the root node you look up the 1st character and +follow that branch repeat until you find the end of the branches. Nodes can be +marked as "accepting" meaning they represent a complete word. Eg: + + /he|she|his|hers/ + +would convert into the following structure. Numbers represent states, letters +following numbers represent valid transitions on the letter from that state, if +the number is in square brackets it represents an accepting state, otherwise it +will be in parenthesis. + + +-h->+-e->[3]-+-r->(8)-+-s->[9] + | | + | (2) + | | + (1) +-i->(6)-+-s->[7] + | + +-s->(3)-+-h->(4)-+-e->[5] + + Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) + +This shows that when matching against the string 'hers' we will begin at state 1 +read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, +then read 'r' and go to state 8 followed by 's' which takes us to state 9 which +is also accepting. Thus we know that we can match both 'he' and 'hers' with a +single traverse. We store a mapping from accepting to state to which word was +matched, and then when we have multiple possibilities we try to complete the +rest of the regex in the order in which they occured in the alternation. + +The only prior NFA like behaviour that would be changed by the TRIE support is +the silent ignoring of duplicate alternations which are of the form: + + / (DUPE|DUPE) X? (?{ ... }) Y /x + +Thus EVAL blocks following a trie may be called a different number of times with +and without the optimisation. With the optimisations dupes will be silently +ignored. This inconsistent behaviour of EVAL type nodes is well established as +the following demonstrates: + + 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ + +which prints out 'word' three times, but + + 'words'=~/(word|word|word)(?{ print $1 })S/ + +which doesnt print it out at all. This is due to other optimisations kicking in. + +Example of what happens on a structural level: + +The regexp /(ac|ad|ab)+/ will produce the following debug output: + + 1: CURLYM[1] {1,32767}(18) + 5: BRANCH(8) + 6: EXACT (16) + 8: BRANCH(11) + 9: EXACT (16) + 11: BRANCH(14) + 12: EXACT (16) + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +This would be optimizable with startbranch=5, first=5, last=16, tail=16 +and should turn into: + + 1: CURLYM[1] {1,32767}(18) + 5: TRIE(16) + [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] + + + + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +Cases where tail != last would be like /(?foo|bar)baz/: + + 1: BRANCH(4) + 2: EXACT (8) + 4: BRANCH(7) + 5: EXACT (8) + 7: TAIL(8) + 8: EXACT (10) + 10: END(0) + +which would be optimizable with startbranch=1, first=1, last=7, tail=8 +and would end up looking like: + + 1: TRIE(8) + [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] + + + 7: TAIL(8) + 8: EXACT (10) + 10: END(0) + + d = uvchr_to_utf8_flags(d, uv, 0); + +is the recommended Unicode-aware way of saying + + *(d++) = uv; +*/ + +#define TRIE_STORE_REVCHAR(val) \ + STMT_START { \ + if (UTF) { \ + SV *zlopp = newSV(7); /* XXX: optimize me */ \ + unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ + unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ + SvCUR_set(zlopp, kapow - flrbbbbb); \ + SvPOK_on(zlopp); \ + SvUTF8_on(zlopp); \ + av_push(revcharmap, zlopp); \ + } else { \ + char ooooff = (char)val; \ + av_push(revcharmap, newSVpvn(&ooooff, 1)); \ + } \ + } STMT_END + +/* This gets the next character from the input, folding it if not already + * folded. */ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need \ + * folding */ \ + uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* This folder implies Unicode rules, which in the range expressible \ + * by not UTF is the lower case, with the two exceptions, one of \ + * which should have been taken care of before calling this */ \ + assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ + uvc = toLOWER_L1(*uc); \ + if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ + len = 1; \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ +} STMT_END + + + +#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ + if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ + U32 ging = TRIE_LIST_LEN( state ) *= 2; \ + Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ + } \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ + TRIE_LIST_CUR( state )++; \ +} STMT_END + +#define TRIE_LIST_NEW(state) STMT_START { \ + Newxz( trie->states[ state ].trans.list, \ + 4, reg_trie_trans_le ); \ + TRIE_LIST_CUR( state ) = 1; \ + TRIE_LIST_LEN( state ) = 4; \ +} STMT_END + +#define TRIE_HANDLE_WORD(state) STMT_START { \ + U16 dupe= trie->states[ state ].wordnum; \ + regnode * const noper_next = regnext( noper ); \ + \ + DEBUG_r({ \ + /* store the word for dumping */ \ + SV* tmp; \ + if (OP(noper) != NOTHING) \ + tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ + else \ + tmp = newSVpvn_utf8( "", 0, UTF ); \ + av_push( trie_words, tmp ); \ + }); \ + \ + curword++; \ + trie->wordinfo[curword].prev = 0; \ + trie->wordinfo[curword].len = wordlen; \ + trie->wordinfo[curword].accept = state; \ + \ + if ( noper_next < tail ) { \ + if (!trie->jump) \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ + trie->jump[curword] = (U16)(noper_next - convert); \ + if (!jumper) \ + jumper = noper_next; \ + if (!nextbranch) \ + nextbranch= regnext(cur); \ + } \ + \ + if ( dupe ) { \ + /* It's a dupe. Pre-insert into the wordinfo[].prev */\ + /* chain, so that when the bits of chain are later */\ + /* linked together, the dups appear in the chain */\ + trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ + trie->wordinfo[dupe].prev = curword; \ + } else { \ + /* we haven't inserted this word yet. */ \ + trie->states[ state ].wordnum = curword; \ + } \ +} STMT_END + + +#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ + ( ( base + charid >= ucharcount \ + && base + charid < ubound \ + && state == trie->trans[ base - ucharcount + charid ].check \ + && trie->trans[ base - ucharcount + charid ].next ) \ + ? trie->trans[ base - ucharcount + charid ].next \ + : ( state==1 ? special : 0 ) \ + ) + +#define MADE_TRIE 1 +#define MADE_JUMP_TRIE 2 +#define MADE_EXACT_TRIE 4 + +STATIC I32 +S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + regnode *first, regnode *last, regnode *tail, + U32 word_count, U32 flags, U32 depth) +{ + /* first pass, loop through and scan words */ + reg_trie_data *trie; + HV *widecharmap = NULL; + AV *revcharmap = newAV(); + regnode *cur; + STRLEN len = 0; + UV uvc = 0; + U16 curword = 0; + U32 next_alloc = 0; + regnode *jumper = NULL; + regnode *nextbranch = NULL; + regnode *convert = NULL; + U32 *prev_states; /* temp array mapping each state to previous one */ + /* we just use folder as a flag in utf8 */ + const U8 * folder = NULL; + +#ifdef DEBUGGING + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu")); + AV *trie_words = NULL; + /* along with revcharmap, this only used during construction but both are + * useful during debugging so we store them in the struct when debugging. + */ +#else + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); + STRLEN trie_charcount=0; +#endif + SV *re_trie_maxbuff; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + switch (flags) { + case EXACT: break; + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: folder = PL_fold_latin1; break; + case EXACTF: folder = PL_fold; break; + default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); + } + + trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); + trie->refcount = 1; + trie->startstate = 1; + trie->wordcount = word_count; + RExC_rxi->data->data[ data_slot ] = (void*)trie; + trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); + if (flags == EXACT) + trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); + trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( + trie->wordcount+1, sizeof(reg_trie_wordinfo)); + + DEBUG_r({ + trie_words = newAV(); + }); + + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + assert(re_trie_maxbuff); + if (!SvIOK(re_trie_maxbuff)) { + sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); + } + DEBUG_TRIE_COMPILE_r({ + PerlIO_printf( Perl_debug_log, + "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + (int)depth * 2 + 2, "", + REG_NODE_NUM(startbranch),REG_NODE_NUM(first), + REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); + }); + + /* Find the node we are going to overwrite */ + if ( first == startbranch && OP( last ) != BRANCH ) { + /* whole branch chain */ + convert = first; + } else { + /* branch sub-chain */ + convert = NEXTOPER( first ); + } + + /* -- First loop and Setup -- + + We first traverse the branches and scan each word to determine if it + contains widechars, and how many unique chars there are, this is + important as we have to build a table with at least as many columns as we + have unique chars. + + We use an array of integers to represent the character codes 0..255 + (trie->charmap) and we use a an HV* to store Unicode characters. We use + the native representation of the character value as the key and IV's for + the coded index. + + *TODO* If we keep track of how many times each character is used we can + remap the columns so that the table compression later on is more + efficient in terms of memory by ensuring the most common value is in the + middle and the least common are on the outside. IMO this would be better + than a most to least common mapping as theres a decent chance the most + common letter will share a node with the least common, meaning the node + will not be compressible. With a middle is most common approach the worst + case is when we have the least common nodes twice. + + */ + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + regnode *noper = NEXTOPER( cur ); + const U8 *uc = (U8*)STRING( noper ); + const U8 *e = uc + STR_LEN( noper ); + int foldlen = 0; + U32 wordlen = 0; /* required init */ + STRLEN minchars = 0; + STRLEN maxchars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the + bitmap?*/ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + trie->minlen= STR_LEN(noper); + } else { + trie->minlen= 0; + continue; + } + } + + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ + TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte + regardless of encoding */ + if (OP( noper ) == EXACTFU_SS) { + /* false positives are ok, so just set this */ + TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); + } + } + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ + TRIE_CHARCOUNT(trie)++; + TRIE_READ_CHAR; + + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; + } + else { + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because for + * non-UTF, the latin1_safe macro is smart enough to account + * for all the unfolded characters, and because for UTF, the + * string will already have been folded earlier in the + * compilation process */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); + } + } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } + } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ + if ( uvc < 256 ) { + if ( folder ) { + U8 folded= folder[ (U8) uvc ]; + if ( !trie->charmap[ folded ] ) { + trie->charmap[ folded ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( folded ); + } + } + if ( !trie->charmap[ uvc ] ) { + trie->charmap[ uvc ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( uvc ); + } + if ( set_bit ) { + /* store the codepoint in the bitmap, and its folded + * equivalent. */ + TRIE_BITMAP_SET(trie, uvc); + + /* store the folded codepoint */ + if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); + + if ( !UTF ) { + /* store first byte of utf8 representation of + variant codepoints */ + if (! UVCHR_IS_INVARIANT(uvc)) { + TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); + } + } + set_bit = 0; /* We've done our bit :-) */ + } + } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + + SV** svpp; + if ( !widecharmap ) + widecharmap = newHV(); + + svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); + + if ( !svpp ) + Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc ); + + if ( !SvTRUE( *svpp ) ) { + sv_setiv( *svpp, ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR(uvc); + } + } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ + if( cur == first ) { + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; + } + } /* end first pass */ + DEBUG_TRIE_COMPILE_r( + PerlIO_printf( Perl_debug_log, + "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + (int)depth * 2 + 2,"", + ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, + (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, + (int)trie->minlen, (int)trie->maxlen ) + ); + + /* + We now know what we are dealing with in terms of unique chars and + string sizes so we can calculate how much memory a naive + representation using a flat table will take. If it's over a reasonable + limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory + conservative but potentially much slower representation using an array + of lists. + + At the end we convert both representations into the same compressed + form that will be used in regexec.c for matching with. The latter + is a form that cannot be used to construct with but has memory + properties similar to the list form and access properties similar + to the table form making it both suitable for fast searches and + small enough that its feasable to store for the duration of a program. + + See the comment in the code where the compressed table is produced + inplace from the flat tabe representation for an explanation of how + the compression works. + + */ + + + Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); + prev_states[1] = 0; + + if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) + > SvIV(re_trie_maxbuff) ) + { + /* + Second Pass -- Array Of Lists Representation + + Each state will be represented by a list of charid:state records + (reg_trie_trans_le) the first such element holds the CUR and LEN + points of the allocated array. (See defines above). + + We build the initial structure using the lists, and then convert + it into the compressed table form which allows faster lookups + (but cant be modified once converted). + */ + + STRLEN transcount = 1; + + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + "%*sCompiling trie using list compiler\n", + (int)depth * 2 + 2, "")); + + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); + TRIE_LIST_NEW(1); + next_alloc = 2; + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + + regnode *noper = NEXTOPER( cur ); + U8 *uc = (U8*)STRING( noper ); + const U8 *e = uc + STR_LEN( noper ); + U32 state = 1; /* required init */ + U16 charid = 0; /* sanity init */ + U32 wordlen = 0; /* required init */ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } + } + + if (OP(noper) != NOTHING) { + for ( ; uc < e ; uc += len ) { + + TRIE_READ_CHAR; + + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV** const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); + if ( !svpp ) { + charid = 0; + } else { + charid=(U16)SvIV( *svpp ); + } + } + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ + if ( charid ) { + + U16 check; + U32 newstate = 0; + + charid--; + if ( !trie->states[ state ].trans.list ) { + TRIE_LIST_NEW( state ); + } + for ( check = 1; + check <= TRIE_LIST_USED( state ); + check++ ) + { + if ( TRIE_LIST_ITEM( state, check ).forid + == charid ) + { + newstate = TRIE_LIST_ITEM( state, check ).newstate; + break; + } + } + if ( ! newstate ) { + newstate = next_alloc++; + prev_states[newstate] = state; + TRIE_LIST_PUSH( state, charid, newstate ); + transcount++; + } + state = newstate; + } else { + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); + } + } + } + TRIE_HANDLE_WORD(state); + + } /* end second pass */ + + /* next alloc is the NEXT state to be allocated */ + trie->statecount = next_alloc; + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, + next_alloc + * sizeof(reg_trie_state) ); + + /* and now dump it out before we compress it */ + DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, + revcharmap, next_alloc, + depth+1) + ); + + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); + { + U32 state; + U32 tp = 0; + U32 zp = 0; + + + for( state=1 ; state < next_alloc ; state ++ ) { + U32 base=0; + + /* + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp) + ); + */ + + if (trie->states[state].trans.list) { + U16 minid=TRIE_LIST_ITEM( state, 1).forid; + U16 maxid=minid; + U16 idx; + + for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { + const U16 forid = TRIE_LIST_ITEM( state, idx).forid; + if ( forid < minid ) { + minid=forid; + } else if ( forid > maxid ) { + maxid=forid; + } + } + if ( transcount < tp + maxid - minid + 1) { + transcount *= 2; + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, + transcount + * sizeof(reg_trie_trans) ); + Zero( trie->trans + (transcount / 2), + transcount / 2, + reg_trie_trans ); + } + base = trie->uniquecharcount + tp - minid; + if ( maxid == minid ) { + U32 set = 0; + for ( ; zp < tp ; zp++ ) { + if ( ! trie->trans[ zp ].next ) { + base = trie->uniquecharcount + zp - minid; + trie->trans[ zp ].next = TRIE_LIST_ITEM( state, + 1).newstate; + trie->trans[ zp ].check = state; + set = 1; + break; + } + } + if ( !set ) { + trie->trans[ tp ].next = TRIE_LIST_ITEM( state, + 1).newstate; + trie->trans[ tp ].check = state; + tp++; + zp = tp; + } + } else { + for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { + const U32 tid = base + - trie->uniquecharcount + + TRIE_LIST_ITEM( state, idx ).forid; + trie->trans[ tid ].next = TRIE_LIST_ITEM( state, + idx ).newstate; + trie->trans[ tid ].check = state; + } + tp += ( maxid - minid + 1 ); + } + Safefree(trie->states[ state ].trans.list); + } + /* + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, " base: %d\n",base); + ); + */ + trie->states[ state ].trans.base=base; + } + trie->lasttrans = tp + 1; + } + } else { + /* + Second Pass -- Flat Table Representation. + + we dont use the 0 slot of either trans[] or states[] so we add 1 to + each. We know that we will need Charcount+1 trans at most to store + the data (one row per char at worst case) So we preallocate both + structures assuming worst case. + + We then construct the trie using only the .next slots of the entry + structs. + + We use the .check field of the first entry of the node temporarily + to make compression both faster and easier by keeping track of how + many non zero fields are in the node. + + Since trans are numbered from 1 any 0 pointer in the table is a FAIL + transition. + + There are two terms at use here: state as a TRIE_NODEIDX() which is + a number representing the first entry of the node, and state as a + TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) + and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) + if there are 2 entrys per node. eg: + + A B A B + 1. 2 4 1. 3 7 + 2. 0 3 3. 0 5 + 3. 0 0 5. 0 0 + 4. 0 0 7. 0 0 + + The table is internally in the right hand, idx form. However as we + also have to deal with the states array which is indexed by nodenum + we have to use TRIE_NODENUM() to convert. + + */ + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + "%*sCompiling trie using table compiler\n", + (int)depth * 2 + 2, "")); + + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) + * trie->uniquecharcount + 1, + sizeof(reg_trie_trans) ); + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); + next_alloc = trie->uniquecharcount + 1; + + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + + regnode *noper = NEXTOPER( cur ); + const U8 *uc = (U8*)STRING( noper ); + const U8 *e = uc + STR_LEN( noper ); + + U32 state = 1; /* required init */ + + U16 charid = 0; /* sanity init */ + U32 accept_state = 0; /* sanity init */ + + U32 wordlen = 0; /* required init */ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } + } + + if ( OP(noper) != NOTHING ) { + for ( ; uc < e ; uc += len ) { + + TRIE_READ_CHAR; + + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV* const * const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); + charid = svpp ? (U16)SvIV(*svpp) : 0; + } + if ( charid ) { + charid--; + if ( !trie->trans[ state + charid ].next ) { + trie->trans[ state + charid ].next = next_alloc; + trie->trans[ state ].check++; + prev_states[TRIE_NODENUM(next_alloc)] + = TRIE_NODENUM(state); + next_alloc += trie->uniquecharcount; + } + state = trie->trans[ state + charid ].next; + } else { + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); + } + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ + } + } + accept_state = TRIE_NODENUM( state ); + TRIE_HANDLE_WORD(accept_state); + + } /* end second pass */ + + /* and now dump it out before we compress it */ + DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, + revcharmap, + next_alloc, depth+1)); + + { + /* + * Inplace compress the table.* + + For sparse data sets the table constructed by the trie algorithm will + be mostly 0/FAIL transitions or to put it another way mostly empty. + (Note that leaf nodes will not contain any transitions.) + + This algorithm compresses the tables by eliminating most such + transitions, at the cost of a modest bit of extra work during lookup: + + - Each states[] entry contains a .base field which indicates the + index in the state[] array wheres its transition data is stored. + + - If .base is 0 there are no valid transitions from that node. + + - If .base is nonzero then charid is added to it to find an entry in + the trans array. + + -If trans[states[state].base+charid].check!=state then the + transition is taken to be a 0/Fail transition. Thus if there are fail + transitions at the front of the node then the .base offset will point + somewhere inside the previous nodes data (or maybe even into a node + even earlier), but the .check field determines if the transition is + valid. + + XXX - wrong maybe? + The following process inplace converts the table to the compressed + table: We first do not compress the root node 1,and mark all its + .check pointers as 1 and set its .base pointer as 1 as well. This + allows us to do a DFA construction from the compressed table later, + and ensures that any .base pointers we calculate later are greater + than 0. + + - We set 'pos' to indicate the first entry of the second node. + + - We then iterate over the columns of the node, finding the first and + last used entry at l and m. We then copy l..m into pos..(pos+m-l), + and set the .check pointers accordingly, and advance pos + appropriately and repreat for the next node. Note that when we copy + the next pointers we have to convert them from the original + NODEIDX form to NODENUM form as the former is not valid post + compression. + + - If a node has no transitions used we mark its base as 0 and do not + advance the pos pointer. + + - If a node only has one transition we use a second pointer into the + structure to fill in allocated fail transitions from other states. + This pointer is independent of the main pointer and scans forward + looking for null transitions that are allocated to a state. When it + finds one it writes the single transition into the "hole". If the + pointer doesnt find one the single transition is appended as normal. + + - Once compressed we can Renew/realloc the structures to release the + excess space. + + See "Table-Compression Methods" in sec 3.9 of the Red Dragon, + specifically Fig 3.47 and the associated pseudocode. + + demq + */ + const U32 laststate = TRIE_NODENUM( next_alloc ); + U32 state, charid; + U32 pos = 0, zp=0; + trie->statecount = laststate; + + for ( state = 1 ; state < laststate ; state++ ) { + U8 flag = 0; + const U32 stateidx = TRIE_NODEIDX( state ); + const U32 o_used = trie->trans[ stateidx ].check; + U32 used = trie->trans[ stateidx ].check; + trie->trans[ stateidx ].check = 0; + + for ( charid = 0; + used && charid < trie->uniquecharcount; + charid++ ) + { + if ( flag || trie->trans[ stateidx + charid ].next ) { + if ( trie->trans[ stateidx + charid ].next ) { + if (o_used == 1) { + for ( ; zp < pos ; zp++ ) { + if ( ! trie->trans[ zp ].next ) { + break; + } + } + trie->states[ state ].trans.base + = zp + + trie->uniquecharcount + - charid ; + trie->trans[ zp ].next + = SAFE_TRIE_NODENUM( trie->trans[ stateidx + + charid ].next ); + trie->trans[ zp ].check = state; + if ( ++zp > pos ) pos = zp; + break; + } + used--; + } + if ( !flag ) { + flag = 1; + trie->states[ state ].trans.base + = pos + trie->uniquecharcount - charid ; + } + trie->trans[ pos ].next + = SAFE_TRIE_NODENUM( + trie->trans[ stateidx + charid ].next ); + trie->trans[ pos ].check = state; + pos++; + } + } + } + trie->lasttrans = pos + 1; + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, laststate + * sizeof(reg_trie_state) ); + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, + "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + (int)depth * 2 + 2,"", + (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + + 1 ), + (IV)next_alloc, + (IV)pos, + ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); + ); + + } /* end table compress */ + } + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf(Perl_debug_log, + "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", + (int)depth * 2 + 2, "", + (UV)trie->statecount, + (UV)trie->lasttrans) + ); + /* resize the trans array to remove unused space */ + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, trie->lasttrans + * sizeof(reg_trie_trans) ); + + { /* Modify the program and insert the new TRIE node */ + U8 nodetype =(U8)(flags & 0xFF); + char *str=NULL; + +#ifdef DEBUGGING + regnode *optimize = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + + U32 mjd_offset = 0; + U32 mjd_nodelen = 0; +#endif /* RE_TRACK_PATTERN_OFFSETS */ +#endif /* DEBUGGING */ + /* + This means we convert either the first branch or the first Exact, + depending on whether the thing following (in 'last') is a branch + or not and whther first is the startbranch (ie is it a sub part of + the alternation or is it the whole thing.) + Assuming its a sub part we convert the EXACT otherwise we convert + the whole branch sequence, including the first. + */ + /* Find the node we are going to overwrite */ + if ( first != startbranch || OP( last ) == BRANCH ) { + /* branch sub-chain */ + NEXT_OFF( first ) = (U16)(last - first); +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_r({ + mjd_offset= Node_Offset((convert)); + mjd_nodelen= Node_Length((convert)); + }); +#endif + /* whole branch chain */ + } +#ifdef RE_TRACK_PATTERN_OFFSETS + else { + DEBUG_r({ + const regnode *nop = NEXTOPER( convert ); + mjd_offset= Node_Offset((nop)); + mjd_nodelen= Node_Length((nop)); + }); + } + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", + (int)depth * 2 + 2, "", + (UV)mjd_offset, (UV)mjd_nodelen) + ); +#endif + /* But first we check to see if there is a common prefix we can + split out as an EXACT and put in front of the TRIE node. */ + trie->startstate= 1; + if ( trie->bitmap && !widecharmap && !trie->jump ) { + U32 state; + for ( state = 1 ; state < trie->statecount-1 ; state++ ) { + U32 ofs = 0; + I32 idx = -1; + U32 count = 0; + const U32 base = trie->states[ state ].trans.base; + + if ( trie->states[state].wordnum ) + count = 1; + + for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { + if ( ( base + ofs >= trie->uniquecharcount ) && + ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && + trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + { + if ( ++count > 1 ) { + SV **tmp = av_fetch( revcharmap, ofs, 0); + const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); + if ( state == 1 ) break; + if ( count == 2 ) { + Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*sNew Start State=%"UVuf" Class: [", + (int)depth * 2 + 2, "", + (UV)state)); + if (idx >= 0) { + SV ** const tmp = av_fetch( revcharmap, idx, 0); + const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); + + TRIE_BITMAP_SET(trie,*ch); + if ( folder ) + TRIE_BITMAP_SET(trie, folder[ *ch ]); + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, "%s", (char*)ch) + ); + } + } + TRIE_BITMAP_SET(trie,*ch); + if ( folder ) + TRIE_BITMAP_SET(trie,folder[ *ch ]); + DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch)); + } + idx = ofs; + } + } + if ( count == 1 ) { + SV **tmp = av_fetch( revcharmap, idx, 0); + STRLEN len; + char *ch = SvPV( *tmp, len ); + DEBUG_OPTIMISE_r({ + SV *sv=sv_newmortal(); + PerlIO_printf( Perl_debug_log, + "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", + (int)depth * 2 + 2, "", + (UV)state, (UV)idx, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + }); + if ( state==1 ) { + OP( convert ) = nodetype; + str=STRING(convert); + STR_LEN(convert)=0; + } + STR_LEN(convert) += len; + while (len--) + *str++ = *ch++; + } else { +#ifdef DEBUGGING + if (state>1) + DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); +#endif + break; + } + } + trie->prefixlen = (state-1); + if (str) { + regnode *n = convert+NODE_SZ_STR(convert); + NEXT_OFF(convert) = NODE_SZ_STR(convert); + trie->startstate = state; + trie->minlen -= (state - 1); + trie->maxlen -= (state - 1); +#ifdef DEBUGGING + /* At least the UNICOS C compiler choked on this + * being argument to DEBUG_r(), so let's just have + * it right here. */ + if ( +#ifdef PERL_EXT_RE_BUILD + 1 +#else + DEBUG_r_TEST +#endif + ) { + regnode *fix = convert; + U32 word = trie->wordcount; + mjd_nodelen++; + Set_Node_Offset_Length(convert, mjd_offset, state - 1); + while( ++fix < n ) { + Set_Node_Offset_Length(fix, 0, 0); + } + while (word--) { + SV ** const tmp = av_fetch( trie_words, word, 0 ); + if (tmp) { + if ( STR_LEN(convert) <= SvCUR(*tmp) ) + sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); + else + sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); + } + } + } +#endif + if (trie->maxlen) { + convert = n; + } else { + NEXT_OFF(convert) = (U16)(tail - convert); + DEBUG_r(optimize= n); + } + } + } + if (!jumper) + jumper = last; + if ( trie->maxlen ) { + NEXT_OFF( convert ) = (U16)(tail - convert); + ARG_SET( convert, data_slot ); + /* Store the offset to the first unabsorbed branch in + jump[0], which is otherwise unused by the jump logic. + We use this when dumping a trie and during optimisation. */ + if (trie->jump) + trie->jump[0] = (U16)(nextbranch - convert); + + /* If the start state is not accepting (meaning there is no empty string/NOTHING) + * and there is a bitmap + * and the first "jump target" node we found leaves enough room + * then convert the TRIE node into a TRIEC node, with the bitmap + * embedded inline in the opcode - this is hypothetically faster. + */ + if ( !trie->states[trie->startstate].wordnum + && trie->bitmap + && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) + { + OP( convert ) = TRIEC; + Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); + PerlMemShared_free(trie->bitmap); + trie->bitmap= NULL; + } else + OP( convert ) = TRIE; + + /* store the type in the flags */ + convert->flags = nodetype; + DEBUG_r({ + optimize = convert + + NODE_STEP_REGNODE + + regarglen[ OP( convert ) ]; + }); + /* XXX We really should free up the resource in trie now, + as we won't use them - (which resources?) dmq */ + } + /* needed for dumping*/ + DEBUG_r(if (optimize) { + regnode *opt = convert; + + while ( ++opt < optimize) { + Set_Node_Offset_Length(opt,0,0); + } + /* + Try to clean up some of the debris left after the + optimisation. + */ + while( optimize < jumper ) { + mjd_nodelen += Node_Length((optimize)); + OP( optimize ) = OPTIMIZED; + Set_Node_Offset_Length(optimize,0,0); + optimize++; + } + Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen); + }); + } /* end node insert */ + + /* Finish populating the prev field of the wordinfo array. Walk back + * from each accept state until we find another accept state, and if + * so, point the first word's .prev field at the second word. If the + * second already has a .prev field set, stop now. This will be the + * case either if we've already processed that word's accept state, + * or that state had multiple words, and the overspill words were + * already linked up earlier. + */ + { + U16 word; + U32 state; + U16 prev; + + for (word=1; word <= trie->wordcount; word++) { + prev = 0; + if (trie->wordinfo[word].prev) + continue; + state = trie->wordinfo[word].accept; + while (state) { + state = prev_states[state]; + if (!state) + break; + prev = trie->states[state].wordnum; + if (prev) + break; + } + trie->wordinfo[word].prev = prev; + } + Safefree(prev_states); + } + + + /* and now dump out the compressed format */ + DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); + + RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; +#ifdef DEBUGGING + RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; + RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; +#else + SvREFCNT_dec_NN(revcharmap); +#endif + return trie->jump + ? MADE_JUMP_TRIE + : trie->startstate>1 + ? MADE_EXACT_TRIE + : MADE_TRIE; +} + +STATIC regnode * +S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) +{ +/* The Trie is constructed and compressed now so we can build a fail array if + * it's needed + + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and + 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, + Ullman 1985/88 + ISBN 0-201-10088-6 + + We find the fail state for each state in the trie, this state is the longest + proper suffix of the current state's 'word' that is also a proper prefix of + another word in our trie. State 1 represents the word '' and is thus the + default fail state. This allows the DFA not to have to restart after its + tried and failed a word at a given point, it simply continues as though it + had been matching the other word in the first place. + Consider + 'abcdgu'=~/abcdefg|cdgu/ + When we get to 'd' we are still matching the first word, we would encounter + 'g' which would fail, which would bring us to the state representing 'd' in + the second word where we would try 'g' and succeed, proceeding to match + 'cdgu'. + */ + /* add a fail transition */ + const U32 trie_offset = ARG(source); + reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; + U32 *q; + const U32 ucharcount = trie->uniquecharcount; + const U32 numstates = trie->statecount; + const U32 ubound = trie->lasttrans + ucharcount; + U32 q_read = 0; + U32 q_write = 0; + U32 charid; + U32 base = trie->states[ 1 ].trans.base; + U32 *fail; + reg_ac_data *aho; + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); + regnode *stclass; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; + PERL_UNUSED_CONTEXT; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + if ( OP(source) == TRIE ) { + struct regnode_1 *op = (struct regnode_1 *) + PerlMemShared_calloc(1, sizeof(struct regnode_1)); + StructCopy(source,op,struct regnode_1); + stclass = (regnode *)op; + } else { + struct regnode_charclass *op = (struct regnode_charclass *) + PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); + StructCopy(source,op,struct regnode_charclass); + stclass = (regnode *)op; + } + OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */ + + ARG_SET( stclass, data_slot ); + aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); + RExC_rxi->data->data[ data_slot ] = (void*)aho; + aho->trie=trie_offset; + aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); + Copy( trie->states, aho->states, numstates, reg_trie_state ); + Newxz( q, numstates, U32); + aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); + aho->refcount = 1; + fail = aho->fail; + /* initialize fail[0..1] to be 1 so that we always have + a valid final fail state */ + fail[ 0 ] = fail[ 1 ] = 1; + + for ( charid = 0; charid < ucharcount ; charid++ ) { + const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); + if ( newstate ) { + q[ q_write ] = newstate; + /* set to point at the root */ + fail[ q[ q_write++ ] ]=1; + } + } + while ( q_read < q_write) { + const U32 cur = q[ q_read++ % numstates ]; + base = trie->states[ cur ].trans.base; + + for ( charid = 0 ; charid < ucharcount ; charid++ ) { + const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); + if (ch_state) { + U32 fail_state = cur; + U32 fail_base; + do { + fail_state = fail[ fail_state ]; + fail_base = aho->states[ fail_state ].trans.base; + } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); + + fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); + fail[ ch_state ] = fail_state; + if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) + { + aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; + } + q[ q_write++ % numstates] = ch_state; + } + } + } + /* restore fail[0..1] to 0 so that we "fall out" of the AC loop + when we fail in state 1, this allows us to use the + charclass scan to find a valid start char. This is based on the principle + that theres a good chance the string being searched contains lots of stuff + that cant be a start char. + */ + fail[ 0 ] = fail[ 1 ] = 0; + DEBUG_TRIE_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%*sStclass Failtable (%"UVuf" states): 0", + (int)(depth * 2), "", (UV)numstates + ); + for( q_read=1; q_read%3d: %s (%d)\n", \ + (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ + Next ? (REG_NODE_NUM(Next)) : 0 ); \ + }}); + + +/* The below joins as many adjacent EXACTish nodes as possible into a single + * one. The regop may be changed if the node(s) contain certain sequences that + * require special handling. The joining is only done if: + * 1) there is room in the current conglomerated node to entirely contain the + * next one. + * 2) they are the exact same node type + * + * The adjacent nodes actually may be separated by NOTHING-kind nodes, and + * these get optimized out + * + * If a node is to match under /i (folded), the number of characters it matches + * can be different than its character length if it contains a multi-character + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. + * + * And *unfolded_multi_char is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when whether the fold is valid or + * not won't be known until runtime; namely for EXACTF nodes that contain LATIN + * SMALL LETTER SHARP S, as only if the target string being matched against + * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose + * folding rules depend on the locale in force at runtime. (Multi-char folds + * whose components are all above the Latin1 range are not run-time locale + * dependent, and have already been folded by the time this function is + * called.) + * + * This is as good a place as any to discuss the design of handling these + * multi-character fold sequences. It's been wrong in Perl for a very long + * time. There are three code points in Unicode whose multi-character folds + * were long ago discovered to mess things up. The previous designs for + * dealing with these involved assigning a special node for them. This + * approach doesn't always work, as evidenced by this example: + * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches + * Both sides fold to "sss", but if the pattern is parsed to create a node that + * would match just the \xDF, it won't be able to handle the case where a + * successful match would have to cross the node's boundary. The new approach + * that hopefully generally solves the problem generates an EXACTFU_SS node + * that is "sss" in this case. + * + * It turns out that there are problems with all multi-character folds, and not + * just these three. Now the code is general, for all such cases. The + * approach taken is: + * 1) This routine examines each EXACTFish node that could contain multi- + * character folded sequences. Since a single character can fold into + * such a sequence, the minimum match length for this node is less than + * the number of characters in the node. This routine returns in + * *min_subtract how many characters to subtract from the the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. + * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS + * is used for an EXACTFU node that contains at least one "ss" sequence in + * it. For non-UTF-8 patterns and strings, this is the only case where + * there is a possible fold length change. That means that a regular + * EXACTFU node without UTF-8 involvement doesn't have to concern itself + * with length changes, and so can be processed faster. regexec.c takes + * advantage of this. Generally, an EXACTFish node that is in UTF-8 is + * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't + * known until runtime). This saves effort in regex matching. However, + * the pre-folding isn't done for non-UTF8 patterns because the fold of + * the MICRO SIGN requires UTF-8, and we don't want to slow things down by + * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, + * again, EXACTFL) nodes fold to isn't known until runtime. The fold + * possibilities for the non-UTF8 patterns are quite simple, except for + * the sharp s. All the ones that don't involve a UTF-8 target string are + * members of a fold-pair, and arrays are set up for all of them so that + * the other member of the pair can be found quickly. Code elsewhere in + * this file makes sure that in EXACTFU nodes, the sharp s gets folded to + * 'ss', even if the pattern isn't UTF-8. This avoids the issues + * described in the next item. + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes + * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL + * nodes, violate the assumption, and they are the only instances where it + * is violated. I'm reluctant to try to change the assumption, as the + * code involved is impenetrable to me (khw), so instead the code here + * punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid + * assumption. Thus, there is no optimization based on string lengths for + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFA where it never does. In an EXACTFA node in + * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) + * + * Similarly, the code that generates tries doesn't currently handle + * not-already-folded multi-char folds, and it looks like a pain to change + * that. Therefore, trie generation of EXACTFA nodes with the sharp s + * doesn't work. Instead, such an EXACTFA is turned into a new regnode, + * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people + * using /iaa matching will be doing so almost entirely with ASCII + * strings, so this should rarely be encountered in practice */ + +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ + if (PL_regkind[OP(scan)] == EXACT) \ + join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) + +STATIC U32 +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, + UV *min_subtract, bool *unfolded_multi_char, + U32 flags,regnode *val, U32 depth) +{ + /* Merge several consecutive EXACTish nodes into one. */ + regnode *n = regnext(scan); + U32 stringok = 1; + regnode *next = scan + NODE_SZ_STR(scan); + U32 merged = 0; + U32 stopnow = 0; +#ifdef DEBUGGING + regnode *stop = scan; + GET_RE_DEBUG_FLAGS_DECL; +#else + PERL_UNUSED_ARG(depth); +#endif + + PERL_ARGS_ASSERT_JOIN_EXACT; +#ifndef EXPERIMENTAL_INPLACESCAN + PERL_UNUSED_ARG(flags); + PERL_UNUSED_ARG(val); +#endif + DEBUG_PEEP("join",scan,depth); + + /* Look through the subsequent nodes in the chain. Skip NOTHING, merge + * EXACT ones that are mergeable to the current one. */ + while (n + && (PL_regkind[OP(n)] == NOTHING + || (stringok && OP(n) == OP(scan))) + && NEXT_OFF(n) + && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) + { + + if (OP(n) == TAIL || n > next) + stringok = 0; + if (PL_regkind[OP(n)] == NOTHING) { + DEBUG_PEEP("skip:",n,depth); + NEXT_OFF(scan) += NEXT_OFF(n); + next = n + NODE_STEP_REGNODE; +#ifdef DEBUGGING + if (stringok) + stop = n; +#endif + n = regnext(n); + } + else if (stringok) { + const unsigned int oldl = STR_LEN(scan); + regnode * const nnext = regnext(n); + + /* XXX I (khw) kind of doubt that this works on platforms (should + * Perl ever run on one) where U8_MAX is above 255 because of lots + * of other assumptions */ + /* Don't join if the sum can't fit into a single node */ + if (oldl + STR_LEN(n) > U8_MAX) + break; + + DEBUG_PEEP("merg",n,depth); + merged++; + + NEXT_OFF(scan) += NEXT_OFF(n); + STR_LEN(scan) += STR_LEN(n); + next = n + NODE_SZ_STR(n); + /* Now we can overwrite *n : */ + Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); +#ifdef DEBUGGING + stop = next - 1; +#endif + n = nnext; + if (stopnow) break; + } + +#ifdef EXPERIMENTAL_INPLACESCAN + if (flags && !NEXT_OFF(n)) { + DEBUG_PEEP("atch", val, depth); + if (reg_off_by_arg[OP(n)]) { + ARG_SET(n, val - n); + } + else { + NEXT_OFF(n) = val - n; + } + stopnow = 1; + } +#endif + } + + *min_subtract = 0; + *unfolded_multi_char = FALSE; + + /* Here, all the adjacent mergeable EXACTish nodes have been merged. We + * can now analyze for sequences of problematic code points. (Prior to + * this final joining, sequences could have been split over boundaries, and + * hence missed). The sequences only happen in folding, hence for any + * non-EXACT EXACTish node */ + if (OP(scan) != EXACT) { + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ + + /* One pass is made over the node's string looking for all the + * possibilities. To avoid some tests in the loop, there are two main + * cases, for UTF-8 patterns (which can't have EXACTF nodes) and + * non-UTF-8 */ + if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *unfolded_multi_char = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ + + /* Examine the string for a multi-character fold sequence. UTF-8 + * patterns have all characters pre-folded by the time this code is + * executed */ + while (s < s_end - 1) /* Can stop 1 before the end, as minimum + length sequence we are looking for is 2 */ + { + int count = 0; /* How many characters in a multi-char fold */ + int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); + if (! len) { /* Not a multi-char fold: get next char */ + s += UTF8SKIP(s); + continue; + } + + /* Nodes with 'ss' require special handling, except for + * EXACTFA-ish for which there is no multi-char fold to this */ + if (len == 2 && *s == 's' && *(s+1) == 's' + && OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE) + { + count = 2; + if (OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; + } + s += 2; + } + else { /* Here is a generic multi-char fold. */ + U8* multi_end = s + len; + + /* Count how many characters are in it. In the case of + * /aa, no folds which contain ASCII code points are + * allowed, so check for those, and skip if found. */ + if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { + count = utf8_length(s, multi_end); + s = multi_end; + } + else { + while (s < multi_end) { + if (isASCII(*s)) { + s++; + goto next_iteration; + } + else { + s += UTF8SKIP(s); + } + count++; + } + } + } + + /* The delta is how long the sequence is minus 1 (1 is how long + * the character that folds to the sequence is) */ + total_count_delta += count - 1; + next_iteration: ; + } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); + } + else if (OP(scan) == EXACTFA) { + + /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char + * fold to the ASCII range (and there are no existing ones in the + * upper latin1 range). But, as outlined in the comments preceding + * this function, we need to flag any occurrences of the sharp s. + * This character forbids trie formation (because of added + * complexity) */ + while (s < s_end) { + if (*s == LATIN_SMALL_LETTER_SHARP_S) { + OP(scan) = EXACTFA_NO_TRIE; + *unfolded_multi_char = TRUE; + break; + } + s++; + continue; + } + } + else { + + /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; + + while (s < upper) { + int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); + if (! len) { /* Not a multi-char fold. */ + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) + { + *unfolded_multi_char = TRUE; + } + s++; + continue; + } + + if (len == 2 + && isARG2_lower_or_UPPER_ARG1('s', *s) + && isARG2_lower_or_UPPER_ARG1('s', *(s+1))) + { + + /* EXACTF nodes need to know that the minimum length + * changed so that a sharp s in the string can match this + * ss in the pattern, but they remain EXACTF nodes, as they + * won't match this unless the target string is is UTF-8, + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; + } + } + + *min_subtract += len - 1; + s += len; + } + } + } + +#ifdef DEBUGGING + /* Allow dumping but overwriting the collection of skipped + * ops and/or strings with fake optimized ops */ + n = scan + NODE_SZ_STR(scan); + while (n <= stop) { + OP(n) = OPTIMIZED; + FLAGS(n) = 0; + NEXT_OFF(n) = 0; + n++; + } +#endif + DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)}); + return stopnow; +} + +/* REx optimizer. Converts nodes into quicker variants "in place". + Finds fixed substrings. */ + +/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set + to the position after last scanned or to NULL. */ + +#define INIT_AND_WITHP \ + assert(!and_withp); \ + Newx(and_withp,1, regnode_ssc); \ + SAVEFREEPV(and_withp) + +/* this is a chain of data about sub patterns we are processing that + need to be handled separately/specially in study_chunk. Its so + we can simulate recursion without losing state. */ +struct scan_frame; +typedef struct scan_frame { + regnode *last; /* last node to process in this frame */ + regnode *next; /* next node to process when last is reached */ + struct scan_frame *prev; /*previous frame*/ + U32 prev_recursed_depth; + I32 stop; /* what stopparen do we use */ +} scan_frame; + + +STATIC SSize_t +S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, + SSize_t *minlenp, SSize_t *deltap, + regnode *last, + scan_data_t *data, + I32 stopparen, + U32 recursed_depth, + regnode_ssc *and_withp, + U32 flags, U32 depth) + /* scanp: Start here (read-write). */ + /* deltap: Write maxlen-minlen here. */ + /* last: Stop before this one. */ + /* data: string data about the pattern */ + /* stopparen: treat close N as END */ + /* recursed: which subroutines have we recursed into */ + /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ +{ + /* There must be at least this number of characters to match */ + SSize_t min = 0; + I32 pars = 0, code; + regnode *scan = *scanp, *next; + SSize_t delta = 0; + int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); + int is_inf_internal = 0; /* The studied chunk is infinite */ + I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; + scan_data_t data_fake; + SV *re_trie_maxbuff = NULL; + regnode *first_non_open = scan; + SSize_t stopmin = SSize_t_MAX; + scan_frame *frame = NULL; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_STUDY_CHUNK; + +#ifdef DEBUGGING + StructCopy(&zero_scan_data, &data_fake, scan_data_t); +#endif + if ( depth == 0 ) { + while (first_non_open && OP(first_non_open) == OPEN) + first_non_open=regnext(first_non_open); + } + + + fake_study_recurse: + while ( scan && OP(scan) != END && scan < last ){ + UV min_subtract = 0; /* How mmany chars to subtract from the minimum + node length to get a real minimum (because + the folded version may be shorter) */ + bool unfolded_multi_char = FALSE; + /* Peephole optimizer: */ + DEBUG_OPTIMISE_MORE_r( + { + PerlIO_printf(Perl_debug_log, + "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ", + ((int) depth*2), "", (long)stopparen, + (unsigned long)depth, (unsigned long)recursed_depth); + if (recursed_depth) { + U32 i; + U32 j; + for ( j = 0 ; j < recursed_depth ; j++ ) { + PerlIO_printf(Perl_debug_log,"["); + for ( i = 0 ; i < (U32)RExC_npar ; i++ ) + PerlIO_printf(Perl_debug_log,"%d", + PAREN_TEST(RExC_study_chunk_recursed + + (j * RExC_study_chunk_recursed_bytes), i) + ? 1 : 0 + ); + PerlIO_printf(Perl_debug_log,"]"); + } + } + PerlIO_printf(Perl_debug_log,"\n"); + } + ); + DEBUG_STUDYDATA("Peep:", data, depth); + DEBUG_PEEP("Peep", scan, depth); + + + /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ + * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled + * by a different invocation of reg() -- Yves + */ + JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); + + /* Follow the next-chain of the current node and optimize + away all the NOTHINGs from it. */ + if (OP(scan) != CURLYX) { + const int max = (reg_off_by_arg[OP(scan)] + ? I32_MAX + /* I32 may be smaller than U16 on CRAYs! */ + : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); + int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); + int noff; + regnode *n = scan; + + /* Skip NOTHING and LONGJMP. */ + while ((n = regnext(n)) + && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) + || ((OP(n) == LONGJMP) && (noff = ARG(n)))) + && off + noff < max) + off += noff; + if (reg_off_by_arg[OP(scan)]) + ARG(scan) = off; + else + NEXT_OFF(scan) = off; + } + + + + /* The principal pseudo-switch. Cannot be a switch, since we + look into several different things. */ + if (OP(scan) == BRANCH || OP(scan) == BRANCHJ + || OP(scan) == IFTHEN) { + next = regnext(scan); + code = OP(scan); + /* demq: the op(next)==code check is to see if we have + * "branch-branch" AFAICT */ + + if (OP(next) == code || code == IFTHEN) { + /* NOTE - There is similar code to this block below for + * handling TRIE nodes on a re-study. If you change stuff here + * check there too. */ + SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; + regnode_ssc accum; + regnode * const startbranch=scan; + + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); + + while (OP(scan) == code) { + SSize_t deltanext, minnext, fake; + I32 f = 0; + regnode_ssc this_class; + + num++; + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + + data_fake.pos_delta = delta; + next = regnext(scan); + scan = NEXTOPER(scan); + if (code != BRANCH) + scan = NEXTOPER(scan); + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f = SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + /* we suppose the run is continuous, last=next...*/ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f,depth+1); + if (min1 > minnext) + min1 = minnext; + if (deltanext == SSize_t_MAX) { + is_inf = is_inf_internal = 1; + max1 = SSize_t_MAX; + } else if (max1 < minnext + deltanext) + max1 = minnext + deltanext; + scan = next; + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > minnext) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); + } + if (code == IFTHEN && num < 2) /* Empty ELSE branch */ + min1 = 0; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) + data->pos_delta = SSize_t_MAX; + else + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->longest = &(data->longest_float); + } + min += min1; + if (delta == SSize_t_MAX + || SSize_t_MAX - delta - (max1 - min1) < 0) + delta = SSize_t_MAX; + else + delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + } + } + + if (PERL_ENABLE_TRIE_OPTIMISATION && + OP( startbranch ) == BRANCH ) + { + /* demq. + + Assuming this was/is a branch we are dealing with: 'scan' + now points at the item that follows the branch sequence, + whatever it is. We now start at the beginning of the + sequence and look for subsequences of + + BRANCH->EXACT=>x1 + BRANCH->EXACT=>x2 + tail + + which would be constructed from a pattern like + /A|LIST|OF|WORDS/ + + If we can find such a subsequence we need to turn the first + element into a trie and then add the subsequent branch exact + strings to the trie. + + We have two cases + + 1. patterns where the whole set of branches can be + converted. + + 2. patterns where only a subset can be converted. + + In case 1 we can replace the whole set with a single regop + for the trie. In case 2 we need to keep the start and end + branches so + + 'BRANCH EXACT; BRANCH EXACT; BRANCH X' + becomes BRANCH TRIE; BRANCH X; + + There is an additional case, that being where there is a + common prefix, which gets split out into an EXACT like node + preceding the TRIE node. + + If x(1..n)==tail then we can do a simple trie, if not we make + a "jump" trie, such that when we match the appropriate word + we "jump" to the appropriate tail node. Essentially we turn + a nested if into a case structure of sorts. + + */ + + int made=0; + if (!re_trie_maxbuff) { + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + if (!SvIOK(re_trie_maxbuff)) + sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); + } + if ( SvIV(re_trie_maxbuff)>=0 ) { + regnode *cur; + regnode *first = (regnode *)NULL; + regnode *last = (regnode *)NULL; + regnode *tail = scan; + U8 trietype = 0; + U32 count=0; + +#ifdef DEBUGGING + SV * const mysv = sv_newmortal(); /* for dumping */ +#endif + /* var tail is used because there may be a TAIL + regop in the way. Ie, the exacts will point to the + thing following the TAIL, but the last branch will + point at the TAIL. So we advance tail. If we + have nested (?:) we may have to move through several + tails. + */ + + while ( OP( tail ) == TAIL ) { + /* this is the TAIL generated by (?:) */ + tail = regnext( tail ); + } + + + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, tail, NULL); + PerlIO_printf( Perl_debug_log, "%*s%s%s\n", + (int)depth * 2 + 2, "", + "Looking for TRIE'able sequences. Tail node is: ", + SvPV_nolen_const( mysv ) + ); + }); + + /* + + Step through the branches + cur represents each branch, + noper is the first thing to be matched as part + of that branch + noper_next is the regnext() of that node. + + We normally handle a case like this + /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also + support building with NOJUMPTRIE, which restricts + the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is + a possible optimization target. If we are building + under NOJUMPTRIE then we require that noper_next is + the same as scan (our current position in the regex + program). + + Once we have two or more consecutive such branches + we can create a trie of the EXACT's contents and + stitch it in place into the program. + + If the sequence represents all of the branches in + the alternation we replace the entire thing with a + single TRIE node. + + Otherwise when it is a subsequence we need to + stitch it in place and replace only the relevant + branches. This means the first branch has to remain + as it is used by the alternation logic, and its + next pointer, and needs to be repointed at the item + on the branch chain following the last branch we + have optimized away. + + This could be either a BRANCH, in which case the + subsequence is internal, or it could be the item + following the branch sequence in which case the + subsequence is at the end (which does not + necessarily mean the first node is the start of the + alternation). + + TRIE_TYPE(X) is a define which maps the optype to a + trietype. + + optype | trietype + ----------------+----------- + NOTHING | NOTHING + EXACT | EXACT + EXACTFU | EXACTFU + EXACTFU_SS | EXACTFU + EXACTFA | EXACTFA + + + */ +#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ + ( EXACT == (X) ) ? EXACT : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ + ( EXACTFA == (X) ) ? EXACTFA : \ + 0 ) + + /* dont use tail as the end marker for this traverse */ + for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { + regnode * const noper = NEXTOPER( cur ); + U8 noper_type = OP( noper ); + U8 noper_trietype = TRIE_TYPE( noper_type ); +#if defined(DEBUGGING) || defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0; +#endif + + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur, NULL); + PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", + (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); + + regprop(RExC_rx, mysv, noper, NULL); + PerlIO_printf( Perl_debug_log, " -> %s", + SvPV_nolen_const(mysv)); + + if ( noper_next ) { + regprop(RExC_rx, mysv, noper_next, NULL); + PerlIO_printf( Perl_debug_log,"\t=> %s\t", + SvPV_nolen_const(mysv)); + } + PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", + REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), + PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] + ); + }); + + /* Is noper a trieable nodetype that can be merged + * with the current trie (if there is one)? */ + if ( noper_trietype + && + ( + ( noper_trietype == NOTHING) + || ( trietype == NOTHING ) + || ( trietype == noper_trietype ) + ) +#ifdef NOJUMPTRIE + && noper_next == tail +#endif + && count < U16_MAX) + { + /* Handle mergable triable node Either we are + * the first node in a new trieable sequence, + * in which case we do some bookkeeping, + * otherwise we update the end pointer. */ + if ( !first ) { + first = cur; + if ( noper_trietype == NOTHING ) { +#if !defined(DEBUGGING) && !defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; +#endif + + if ( noper_next_trietype ) { + trietype = noper_next_trietype; + } else if (noper_next_type) { + /* a NOTHING regop is 1 regop wide. + * We need at least two for a trie + * so we can't merge this in */ + first = NULL; + } + } else { + trietype = noper_trietype; + } + } else { + if ( trietype == NOTHING ) + trietype = noper_trietype; + last = cur; + } + if (first) + count++; + } /* end handle mergable triable node */ + else { + /* handle unmergable node - + * noper may either be a triable node which can + * not be tried together with the current trie, + * or a non triable node */ + if ( last ) { + /* If last is set and trietype is not + * NOTHING then we have found at least two + * triable branch sequences in a row of a + * similar trietype so we can turn them + * into a trie. If/when we allow NOTHING to + * start a trie sequence this condition + * will be required, and it isn't expensive + * so we leave it in for now. */ + if ( trietype && trietype != NOTHING ) + make_trie( pRExC_state, + startbranch, first, cur, tail, + count, trietype, depth+1 ); + last = NULL; /* note: we clear/update + first, trietype etc below, + so we dont do it here */ + } + if ( noper_trietype +#ifdef NOJUMPTRIE + && noper_next == tail +#endif + ){ + /* noper is triable, so we can start a new + * trie sequence */ + count = 1; + first = cur; + trietype = noper_trietype; + } else if (first) { + /* if we already saw a first but the + * current node is not triable then we have + * to reset the first information. */ + count = 0; + first = NULL; + trietype = 0; + } + } /* end handle unmergable node */ + } /* loop over branches */ + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur, NULL); + PerlIO_printf( Perl_debug_log, + "%*s- %s (%d) \n", + (int)depth * 2 + 2, + "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + + }); + if ( last && trietype ) { + if ( trietype != NOTHING ) { + /* the last branch of the sequence was part of + * a trie, so we have to construct it here + * outside of the loop */ + made= make_trie( pRExC_state, startbranch, + first, scan, tail, count, + trietype, depth+1 ); +#ifdef TRIE_STUDY_OPT + if ( ((made == MADE_EXACT_TRIE && + startbranch == first) + || ( first_non_open == first )) && + depth==0 ) { + flags |= SCF_TRIE_RESTUDY; + if ( startbranch == first + && scan == tail ) + { + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; + } + } +#endif + } else { + /* at this point we know whatever we have is a + * NOTHING sequence/branch AND if 'startbranch' + * is 'first' then we can turn the whole thing + * into a NOTHING + */ + if ( startbranch == first ) { + regnode *opt; + /* the entire thing is a NOTHING sequence, + * something like this: (?:|) So we can + * turn it into a plain NOTHING op. */ + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur, NULL); + PerlIO_printf( Perl_debug_log, + "%*s- %s (%d) \n", (int)depth * 2 + 2, + "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + + }); + OP(startbranch)= NOTHING; + NEXT_OFF(startbranch)= tail - startbranch; + for ( opt= startbranch + 1; opt < tail ; opt++ ) + OP(opt)= OPTIMIZED; + } + } + } /* end if ( last) */ + } /* TRIE_MAXBUF is non zero */ + + } /* do trie */ + + } + else if ( code == BRANCHJ ) { /* single branch is optimized. */ + scan = NEXTOPER(NEXTOPER(scan)); + } else /* single branch is optimized. */ + scan = NEXTOPER(scan); + continue; + } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) { + scan_frame *newframe = NULL; + I32 paren; + regnode *start; + regnode *end; + U32 my_recursed_depth= recursed_depth; + + if (OP(scan) != SUSPEND) { + /* set the pointer */ + if (OP(scan) == GOSUB) { + paren = ARG(scan); + RExC_recurse[ARG2L(scan)] = scan; + start = RExC_open_parens[paren-1]; + end = RExC_close_parens[paren-1]; + } else { + paren = 0; + start = RExC_rxi->program + 1; + end = RExC_opend; + } + if (!recursed_depth + || + !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) + ) { + if (!recursed_depth) { + Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); + } else { + Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed_bytes, U8); + } + /* we havent recursed into this paren yet, so recurse into it */ + DEBUG_STUDYDATA("set:", data,depth); + PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); + my_recursed_depth= recursed_depth + 1; + Newx(newframe,1,scan_frame); + } else { + DEBUG_STUDYDATA("inf:", data,depth); + /* some form of infinite recursion, assume infinite length + * */ + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; + } + } else { + Newx(newframe,1,scan_frame); + paren = stopparen; + start = scan+2; + end = regnext(scan); + } + if (newframe) { + assert(start); + assert(end); + SAVEFREEPV(newframe); + newframe->next = regnext(scan); + newframe->last = last; + newframe->stop = stopparen; + newframe->prev = frame; + newframe->prev_recursed_depth = recursed_depth; + + DEBUG_STUDYDATA("frame-new:",data,depth); + DEBUG_PEEP("fnew", scan, depth); + + frame = newframe; + scan = start; + stopparen = paren; + last = end; + depth = depth + 1; + recursed_depth= my_recursed_depth; + + continue; + } + } + else if (OP(scan) == EXACT) { + SSize_t l = STR_LEN(scan); + UV uc; + if (UTF) { + const U8 * const s = (U8*)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + l, NULL); + l = utf8_length(s, s + l); + } else { + uc = *((U8*)STRING(scan)); + } + min += l; + if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ + /* The code below prefers earlier match for fixed + offset, later match for variable offset. */ + if (data->last_end == -1) { /* Update the start info. */ + data->last_start_min = data->pos_min; + data->last_start_max = is_inf + ? SSize_t_MAX : data->pos_min + data->pos_delta; + } + sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); + if (UTF) + SvUTF8_on(data->last_found); + { + SV * const sv = data->last_found; + MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += utf8_length((U8*)STRING(scan), + (U8*)STRING(scan)+STR_LEN(scan)); + } + data->last_end = data->pos_min + l; + data->pos_min += l; /* As in the first entry. */ + data->flags &= ~SF_BEFORE_EOL; + } + + /* ANDing the code point leaves at most it, and not in locale, and + * can't match null string */ + if (flags & SCF_DO_STCLASS_AND) { + ssc_cp_and(data->start_class, uc); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ssc_clear_locale(data->start_class); + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_add_cp(data->start_class, uc); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + } + else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is + EXACTFish */ + SSize_t l = STR_LEN(scan); + UV uc = *((U8*)STRING(scan)); + SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 + separate code points */ + const U8 * s = (U8*)STRING(scan); + + /* Search for fixed substrings supports EXACT only. */ + if (flags & SCF_DO_SUBSTR) { + assert(data); + scan_commit(pRExC_state, data, minlenp, is_inf); + } + if (UTF) { + uc = utf8_to_uvchr_buf(s, s + l, NULL); + l = utf8_length(s, s + l); + } + if (unfolded_multi_char) { + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; + } + min += l - min_subtract; + assert (min >= 0); + delta += min_subtract; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += l - min_subtract; + if (data->pos_min < 0) { + data->pos_min = 0; + } + data->pos_delta += min_subtract; + if (min_subtract) { + data->longest = &(data->longest_float); + } + } + + if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) { + ssc_clear_locale(data->start_class); + } + + if (! UTF) { + + /* We punt and assume can match anything if the node begins + * with a multi-character fold. Things are complicated. For + * example, /ffi/i could match any of: + * "\N{LATIN SMALL LIGATURE FFI}" + * "\N{LATIN SMALL LIGATURE FF}I" + * "F\N{LATIN SMALL LIGATURE FI}" + * plus several other things; and making sure we have all the + * possibilities is hard. */ + if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); + } + else { + + /* Any Latin1 range character can potentially match any + * other depending on the locale */ + if (OP(scan) == EXACTFL) { + _invlist_union(EXACTF_invlist, PL_Latin1, + &EXACTF_invlist); + } + else { + /* But otherwise, it matches at least itself. We can + * quickly tell if it has a distinct fold, and if so, + * it matches that as well */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (IS_IN_SOME_FOLD_L1(uc)) { + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, + PL_fold_latin1[uc]); + } + } + + /* Some characters match above-Latin1 ones under /i. This + * is true of EXACTFL ones when the locale is UTF-8 */ + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) + && (! isASCII(uc) || (OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE))) + { + add_above_Latin1_folds(pRExC_state, + (U8) uc, + &EXACTF_invlist); + } + } + } + else { /* Pattern is UTF-8 */ + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + STRLEN foldlen = UTF8SKIP(s); + const U8* e = s + STR_LEN(scan); + SV** listp; + + /* The only code points that aren't folded in a UTF EXACTFish + * node are are the problematic ones in EXACTFL nodes */ + if (OP(scan) == EXACTFL + && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) + { + /* We need to check for the possibility that this EXACTFL + * node begins with a multi-char fold. Therefore we fold + * the first few characters of it so that we can make that + * check */ + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD(*s); + s++; + } + else { + STRLEN len; + to_utf8_fold(s, d, &len); + d += len; + s += UTF8SKIP(s); + } + } + + /* And set up so the code below that looks in this folded + * buffer instead of the node's string */ + e = d; + foldlen = UTF8SKIP(folded); + s = folded; + } + + /* When we reach here 's' points to the fold of the first + * character(s) of the node; and 'e' points to far enough along + * the folded string to be just past any possible multi-char + * fold. 'foldlen' is the length in bytes of the first + * character in 's' + * + * Unlike the non-UTF-8 case, the macro for determining if a + * string is a multi-char fold requires all the characters to + * already be folded. This is because of all the complications + * if not. Note that they are folded anyway, except in EXACTFL + * nodes. Like the non-UTF case above, we punt if the node + * begins with a multi-char fold */ + + if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); + } + else { /* Single char fold */ + + /* It matches all the things that fold to it, which are + * found in PL_utf8_foldclosures (including itself) */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) s, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE) + && isASCII(c) != isASCII(uc)) + { + continue; + } + + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c); + } + } + } + } + if (flags & SCF_DO_STCLASS_AND) { + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); + } + else if (REGNODE_VARIES(OP(scan))) { + SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; + I32 fl = 0, f = flags; + regnode * const oscan = scan; + regnode_ssc this_class; + regnode_ssc *oclass = NULL; + I32 next_is_eval = 0; + + switch (PL_regkind[OP(scan)]) { + case WHILEM: /* End of (?:...)* . */ + scan = NEXTOPER(scan); + goto finish; + case PLUS: + if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { + next = NEXTOPER(scan); + if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { + mincount = 1; + maxcount = REG_INFTY; + next = regnext(scan); + scan = NEXTOPER(scan); + goto do_curly; + } + } + if (flags & SCF_DO_SUBSTR) + data->pos_min++; + min++; + /* FALLTHROUGH */ + case STAR: + if (flags & SCF_DO_STCLASS) { + mincount = 0; + maxcount = REG_INFTY; + next = regnext(scan); + scan = NEXTOPER(scan); + goto do_curly; + } + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + scan = regnext(scan); + goto optimize_curly_tail; + case CURLY: + if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) + && (scan->flags == stopparen)) + { + mincount = 1; + maxcount = 1; + } else { + mincount = ARG1(scan); + maxcount = ARG2(scan); + } + next = regnext(scan); + if (OP(scan) == CURLYX) { + I32 lp = (data ? *(data->last_closep) : 0); + scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); + } + scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; + next_is_eval = (OP(scan) == EVAL); + do_curly: + if (flags & SCF_DO_SUBSTR) { + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ + pos_before = data->pos_min; + } + if (data) { + fl = data->flags; + data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); + if (is_inf) + data->flags |= SF_IS_INF; + } + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + oclass = data->start_class; + data->start_class = &this_class; + f |= SCF_DO_STCLASS_AND; + f &= ~SCF_DO_STCLASS_OR; + } + /* Exclude from super-linear cache processing any {n,m} + regops for which the combination of input pos and regex + pos is not enough information to determine if a match + will be possible. + + For example, in the regex /foo(bar\s*){4,8}baz/ with the + regex pos at the \s*, the prospects for a match depend not + only on the input position but also on how many (bar\s*) + repeats into the {4,8} we are. */ + if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) + f &= ~SCF_WHILEM_VISITED_POS; + + /* This will finish on WHILEM, setting scan, or on NULL: */ + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, + last, data, stopparen, recursed_depth, NULL, + (mincount == 0 + ? (f & ~SCF_DO_SUBSTR) + : f) + ,depth+1); + + if (flags & SCF_DO_STCLASS) + data->start_class = oclass; + if (mincount == 0 || minnext == 0) { + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + } + else if (flags & SCF_DO_STCLASS_AND) { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&this_class, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + } + } else { /* Non-zero len */ + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + } + else if (flags & SCF_DO_STCLASS_AND) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + flags &= ~SCF_DO_STCLASS; + } + if (!scan) /* It was not CURLYX, but CURLY. */ + scan = next; + if (!(flags & SCF_TRIE_DOING_RESTUDY) + /* ? quantifier ok, except for (?{ ... }) */ + && (next_is_eval || !(mincount == 0 && maxcount == 1)) + && (minnext == 0) && (deltanext == 0) + && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) + && maxcount <= REG_INFTY/3) /* Complement check for big + count */ + { + /* Fatal warnings may leak the regexp without this: */ + SAVEFREESV(RExC_rx_sv); + ckWARNreg(RExC_parse, + "Quantifier unexpected on zero-length expression"); + (void)ReREFCNT_inc(RExC_rx_sv); + } + + min += minnext * mincount; + is_inf_internal |= deltanext == SSize_t_MAX + || (maxcount == REG_INFTY && minnext + deltanext > 0); + is_inf |= is_inf_internal; + if (is_inf) { + delta = SSize_t_MAX; + } else { + delta += (minnext + deltanext) * maxcount + - minnext * mincount; + } + /* Try powerful optimization CURLYX => CURLYN. */ + if ( OP(oscan) == CURLYX && data + && data->flags & SF_IN_PAR + && !(data->flags & SF_HAS_EVAL) + && !deltanext && minnext == 1 ) { + /* Try to optimize to CURLYN. */ + regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; + regnode * const nxt1 = nxt; +#ifdef DEBUGGING + regnode *nxt2; +#endif + + /* Skip open. */ + nxt = regnext(nxt); + if (!REGNODE_SIMPLE(OP(nxt)) + && !(PL_regkind[OP(nxt)] == EXACT + && STR_LEN(nxt) == 1)) + goto nogo; +#ifdef DEBUGGING + nxt2 = nxt; +#endif + nxt = regnext(nxt); + if (OP(nxt) != CLOSE) + goto nogo; + if (RExC_open_parens) { + RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ + } + /* Now we know that nxt2 is the only contents: */ + oscan->flags = (U8)ARG(nxt); + OP(oscan) = CURLYN; + OP(nxt1) = NOTHING; /* was OPEN. */ + +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ +#endif + } + nogo: + + /* Try optimization CURLYX => CURLYM. */ + if ( OP(oscan) == CURLYX && data + && !(data->flags & SF_HAS_PAR) + && !(data->flags & SF_HAS_EVAL) + && !deltanext /* atom is fixed width */ + && minnext != 0 /* CURLYM can't handle zero width */ + + /* Nor characters whose fold at run-time may be + * multi-character */ + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) + ) { + /* XXXX How to optimize if data == 0? */ + /* Optimize to a simpler form. */ + regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ + regnode *nxt2; + + OP(oscan) = CURLYM; + while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ + && (OP(nxt2) != WHILEM)) + nxt = nxt2; + OP(nxt2) = SUCCEED; /* Whas WHILEM */ + /* Need to optimize away parenths. */ + if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { + /* Set the parenth number. */ + regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ + + oscan->flags = (U8)ARG(nxt); + if (RExC_open_parens) { + RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ + } + OP(nxt1) = OPTIMIZED; /* was OPEN. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ +#endif +#if 0 + while ( nxt1 && (OP(nxt1) != WHILEM)) { + regnode *nnxt = regnext(nxt1); + if (nnxt == nxt) { + if (reg_off_by_arg[OP(nxt1)]) + ARG_SET(nxt1, nxt2 - nxt1); + else if (nxt2 - nxt1 < U16_MAX) + NEXT_OFF(nxt1) = nxt2 - nxt1; + else + OP(nxt) = NOTHING; /* Cannot beautify */ + } + nxt1 = nnxt; + } +#endif + /* Optimize again: */ + study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, + NULL, stopparen, recursed_depth, NULL, 0,depth+1); + } + else + oscan->flags = 0; + } + else if ((OP(oscan) == CURLYX) + && (flags & SCF_WHILEM_VISITED_POS) + /* See the comment on a similar expression above. + However, this time it's not a subexpression + we care about, but the expression itself. */ + && (maxcount == REG_INFTY) + && data && ++data->whilem_c < 16) { + /* This stays as CURLYX, we can put the count/of pair. */ + /* Find WHILEM (as in regexec.c) */ + regnode *nxt = oscan + NEXT_OFF(oscan); + + if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ + nxt += ARG(nxt); + PREVOPER(nxt)->flags = (U8)(data->whilem_c + | (RExC_whilem_seen << 4)); /* On WHILEM */ + } + if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (flags & SCF_DO_SUBSTR) { + SV *last_str = NULL; + STRLEN last_chrs = 0; + int counted = mincount != 0; + + if (data->last_end > 0 && mincount != 0) { /* Ends with a + string. */ + SSize_t b = pos_before >= data->last_start_min + ? pos_before : data->last_start_min; + STRLEN l; + const char * const s = SvPV_const(data->last_found, l); + SSize_t old = b - data->last_start_min; + + if (UTF) + old = utf8_hop((U8*)s, old) - (U8*)s; + l -= old; + /* Get the added string: */ + last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; + if (deltanext == 0 && pos_before == b) { + /* What was added is a constant string */ + if (mincount > 1) { + + SvGROW(last_str, (mincount * l) + 1); + repeatcpy(SvPVX(last_str) + l, + SvPVX_const(last_str), l, + mincount - 1); + SvCUR_set(last_str, SvCUR(last_str) * mincount); + /* Add additional parts. */ + SvCUR_set(data->last_found, + SvCUR(data->last_found) - l); + sv_catsv(data->last_found, last_str); + { + SV * sv = data->last_found; + MAGIC *mg = + SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += last_chrs * (mincount-1); + } + last_chrs *= mincount; + data->last_end += l * (mincount - 1); + } + } else { + /* start offset must point into the last copy */ + data->last_start_min += minnext * (mincount - 1); + data->last_start_max += is_inf ? SSize_t_MAX + : (maxcount - 1) * (minnext + data->pos_delta); + } + } + /* It is counted once already... */ + data->pos_min += minnext * (mincount - counted); +#if 0 +PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf + " SSize_t_MAX=%"UVuf" minnext=%"UVuf + " maxcount=%"UVuf" mincount=%"UVuf"\n", + (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, + (UV)mincount); +if (deltanext != SSize_t_MAX) +PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", + (UV)(-counted * deltanext + (minnext + deltanext) * maxcount + - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); +#endif + if (deltanext == SSize_t_MAX + || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) + data->pos_delta = SSize_t_MAX; + else + data->pos_delta += - counted * deltanext + + (minnext + deltanext) * maxcount - minnext * mincount; + if (mincount != maxcount) { + /* Cannot extend fixed substrings found inside + the group. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + if (mincount && last_str) { + SV * const sv = data->last_found; + MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + + if (mg) + mg->mg_len = -1; + sv_setsv(sv, last_str); + data->last_end = data->pos_min; + data->last_start_min = data->pos_min - last_chrs; + data->last_start_max = is_inf + ? SSize_t_MAX + : data->pos_min + data->pos_delta - last_chrs; + } + data->longest = &(data->longest_float); + } + SvREFCNT_dec(last_str); + } + if (data && (fl & SF_HAS_EVAL)) + data->flags |= SF_HAS_EVAL; + optimize_curly_tail: + if (OP(oscan) != CURLYX) { + while (PL_regkind[OP(next = regnext(oscan))] == NOTHING + && NEXT_OFF(next)) + NEXT_OFF(oscan) += NEXT_OFF(next); + } + continue; + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", + OP(scan)); +#endif + case REF: + case CLUMP: + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) { + if (OP(scan) == CLUMP) { + /* Actually is any start char, but very few code points + * aren't start characters */ + ssc_match_all_cp(data->start_class); + } + else { + ssc_anything(data->start_class); + } + } + flags &= ~SCF_DO_STCLASS; + break; + } + } + else if (OP(scan) == LNBREAK) { + if (flags & SCF_DO_STCLASS) { + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], + FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg for + * 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + } + min++; + delta++; /* Because of the 2 char string cr-lf */ + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min += 1; + data->pos_delta += 1; + data->longest = &(data->longest_float); + } + } + else if (REGNODE_SIMPLE(OP(scan))) { + + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min++; + } + min++; + if (flags & SCF_DO_STCLASS) { + bool invert = 0; + SV* my_invlist = sv_2mortal(_new_invlist(0)); + U8 namedclass; + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + + /* Some of the logic below assumes that switching + locale on will only add false positives. */ + switch (OP(scan)) { + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", + OP(scan)); +#endif + case CANY: + case SANY: + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_match_all_cp(data->start_class); + break; + + case REG_ANY: + { + SV* REG_ANY_invlist = _new_invlist(2); + REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, + '\n'); + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert, hence all but \n + */ + ); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert */ + ); + ssc_clear_locale(data->start_class); + } + SvREFCNT_dec_NN(REG_ANY_invlist); + } + break; + + case ANYOF: + if (flags & SCF_DO_STCLASS_AND) + ssc_and(pRExC_state, data->start_class, + (regnode_charclass *) scan); + else + ssc_or(pRExC_state, data->start_class, + (regnode_charclass *) scan); + break; + + case NPOSIXL: + invert = 1; + /* FALLTHROUGH */ + + case POSIXL: + namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; + if (flags & SCF_DO_STCLASS_AND) { + bool was_there = cBOOL( + ANYOF_POSIXL_TEST(data->start_class, + namedclass)); + ANYOF_POSIXL_ZERO(data->start_class); + if (was_there) { /* Do an AND */ + ANYOF_POSIXL_SET(data->start_class, namedclass); + } + /* No individual code points can now match */ + data->start_class->invlist + = sv_2mortal(_new_invlist(0)); + } + else { + int complement = namedclass + ((invert) ? -1 : 1); + + assert(flags & SCF_DO_STCLASS_OR); + + /* If the complement of this class was already there, + * the result is that they match all code points, + * (\d + \D == everything). Remove the classes from + * future consideration. Locale is not relevant in + * this case */ + if (ANYOF_POSIXL_TEST(data->start_class, complement)) { + ssc_match_all_cp(data->start_class); + ANYOF_POSIXL_CLEAR(data->start_class, namedclass); + ANYOF_POSIXL_CLEAR(data->start_class, complement); + } + else { /* The usual case; just add this class to the + existing set */ + ANYOF_POSIXL_SET(data->start_class, namedclass); + } + } + break; + + case NPOSIXA: /* For these, we always know the exact set of + what's matched */ + invert = 1; + /* FALLTHROUGH */ + case POSIXA: + if (FLAGS(scan) == _CC_ASCII) { + my_invlist = PL_XPosix_ptrs[_CC_ASCII]; + } + else { + _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)], + PL_XPosix_ptrs[_CC_ASCII], + &my_invlist); + } + goto join_posix; + + case NPOSIXD: + case NPOSIXU: + invert = 1; + /* FALLTHROUGH */ + case POSIXD: + case POSIXU: + my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); + + /* NPOSIXD matches all upper Latin1 code points unless the + * target string being matched is UTF-8, which is + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); + } + + join_posix: + + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, my_invlist, invert); + ssc_clear_locale(data->start_class); + } + else { + assert(flags & SCF_DO_STCLASS_OR); + ssc_union(data->start_class, my_invlist, invert); + } + } + if (flags & SCF_DO_STCLASS_OR) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { + data->flags |= (OP(scan) == MEOL + ? SF_BEFORE_MEOL + : SF_BEFORE_SEOL); + scan_commit(pRExC_state, data, minlenp, is_inf); + + } + else if ( PL_regkind[OP(scan)] == BRANCHJ + /* Lookbehind, or need to calculate parens/evals/stclass: */ + && (scan->flags || data || (flags & SCF_DO_STCLASS)) + && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) + { + if ( OP(scan) == UNLESSM && + scan->flags == 0 && + OP(NEXTOPER(NEXTOPER(scan))) == NOTHING && + OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED + ) { + regnode *opt; + regnode *upto= regnext(scan); + DEBUG_PARSE_r({ + SV * const mysv_val=sv_newmortal(); + DEBUG_STUDYDATA("OPFAIL",data,depth); + + /*DEBUG_PARSE_MSG("opfail");*/ + regprop(RExC_rx, mysv_val, upto, NULL); + PerlIO_printf(Perl_debug_log, + "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(upto), + (IV)(upto - scan) + ); + }); + OP(scan) = OPFAIL; + NEXT_OFF(scan) = upto - scan; + for (opt= scan + 1; opt < upto ; opt++) + OP(opt) = OPTIMIZED; + scan= upto; + continue; + } + if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY + || OP(scan) == UNLESSM ) + { + /* Negative Lookahead/lookbehind + In this case we can't do fixed string optimisation. + */ + + SSize_t deltanext, minnext, fake = 0; + regnode *nscan; + regnode_ssc intrnl; + int f = 0; + + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + data_fake.pos_delta = delta; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + ssc_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, + last, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1); + if (scan->flags) { + if (deltanext) { + FAIL("Variable length lookbehind not implemented"); + } + else if (minnext > (I32)U8_MAX) { + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); + } + scan->flags = (U8)minnext; + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (f & SCF_DO_STCLASS_AND) { + if (flags & SCF_DO_STCLASS_OR) { + /* OR before, AND after: ideally we would recurse with + * data_fake to get the AND applied by study of the + * remainder of the pattern, and then derecurse; + * *** HACK *** for now just treat as "no information". + * See [perl #56690]. + */ + ssc_init(pRExC_state, data->start_class); + } else { + /* AND before and after: combine and continue. These + * assertions are zero-length, so can match an EMPTY + * string */ + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + } + } + } +#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY + else { + /* Positive Lookahead/lookbehind + In this case we can do fixed string optimisation, + but we must be careful about it. Note in the case of + lookbehind the positions will be offset by the minimum + length of the pattern, something we won't know about + until after the recurse. + */ + SSize_t deltanext, fake = 0; + regnode *nscan; + regnode_ssc intrnl; + int f = 0; + /* We use SAVEFREEPV so that when the full compile + is finished perl will clean up the allocated + minlens when it's all done. This way we don't + have to worry about freeing them when we know + they wont be used, which would be a pain. + */ + SSize_t *minnextp; + Newx( minnextp, 1, SSize_t ); + SAVEFREEPV(minnextp); + + if (data) { + StructCopy(data, &data_fake, scan_data_t); + if ((flags & SCF_DO_SUBSTR) && data->last_found) { + f |= SCF_DO_SUBSTR; + if (scan->flags) + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); + data_fake.last_found=newSVsv(data->last_found); + } + } + else + data_fake.last_closep = &fake; + data_fake.flags = 0; + data_fake.pos_delta = delta; + if (is_inf) + data_fake.flags |= SF_IS_INF; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + ssc_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, + &deltanext, last, &data_fake, + stopparen, recursed_depth, NULL, + f,depth+1); + if (scan->flags) { + if (deltanext) { + FAIL("Variable length lookbehind not implemented"); + } + else if (*minnextp > (I32)U8_MAX) { + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); + } + scan->flags = (U8)*minnextp; + } + + *minnextp += min; + + if (f & SCF_DO_STCLASS_AND) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { + if (RExC_rx->minlen<*minnextp) + RExC_rx->minlen=*minnextp; + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); + SvREFCNT_dec_NN(data_fake.last_found); + + if ( data_fake.minlen_fixed != minlenp ) + { + data->offset_fixed= data_fake.offset_fixed; + data->minlen_fixed= data_fake.minlen_fixed; + data->lookbehind_fixed+= scan->flags; + } + if ( data_fake.minlen_float != minlenp ) + { + data->minlen_float= data_fake.minlen_float; + data->offset_float_min=data_fake.offset_float_min; + data->offset_float_max=data_fake.offset_float_max; + data->lookbehind_float+= scan->flags; + } + } + } + } +#endif + } + else if (OP(scan) == OPEN) { + if (stopparen != (I32)ARG(scan)) + pars++; + } + else if (OP(scan) == CLOSE) { + if (stopparen == (I32)ARG(scan)) { + break; + } + if ((I32)ARG(scan) == is_par) { + next = regnext(scan); + + if ( next && (OP(next) != WHILEM) && next < last) + is_par = 0; /* Disable optimization */ + } + if (data) + *(data->last_closep) = ARG(scan); + } + else if (OP(scan) == EVAL) { + if (data) + data->flags |= SF_HAS_EVAL; + } + else if ( PL_regkind[OP(scan)] == ENDLIKE ) { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + flags &= ~SCF_DO_SUBSTR; + } + if (data && OP(scan)==ACCEPT) { + data->flags |= SCF_SEEN_ACCEPT; + if (stopmin > min) + stopmin = min; + } + } + else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ + { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; + } + else if (OP(scan) == GPOS) { + if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && + !(delta || is_inf || (data && data->pos_delta))) + { + if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->intflags |= PREGf_ANCH_GPOS; + if (RExC_rx->gofs < (STRLEN)min) + RExC_rx->gofs = min; + } else { + RExC_rx->intflags |= PREGf_GPOS_FLOAT; + RExC_rx->gofs = 0; + } + } +#ifdef TRIE_STUDY_OPT +#ifdef FULL_TRIE_STUDY + else if (PL_regkind[OP(scan)] == TRIE) { + /* NOTE - There is similar code to this block above for handling + BRANCH nodes on the initial study. If you change stuff here + check there too. */ + regnode *trie_node= scan; + regnode *tail= regnext(scan); + reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; + SSize_t max1 = 0, min1 = SSize_t_MAX; + regnode_ssc accum; + + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); + + if (!trie->jump) { + min1= trie->minlen; + max1= trie->maxlen; + } else { + const regnode *nextbranch= NULL; + U32 word; + + for ( word=1 ; word <= trie->wordcount ; word++) + { + SSize_t deltanext=0, minnext=0, f = 0, fake; + regnode_ssc this_class; + + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + data_fake.pos_delta = delta; + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f = SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + if (trie->jump[word]) { + if (!nextbranch) + nextbranch = trie_node + trie->jump[0]; + scan= trie_node + trie->jump[word]; + /* We go from the jump point to the branch that follows + it. Note this means we need the vestigal unused + branches even though they arent otherwise used. */ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, (regnode *)nextbranch, &data_fake, + stopparen, recursed_depth, NULL, f,depth+1); + } + if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) + nextbranch= regnext((regnode*)nextbranch); + + if (min1 > (SSize_t)(minnext + trie->minlen)) + min1 = minnext + trie->minlen; + if (deltanext == SSize_t_MAX) { + is_inf = is_inf_internal = 1; + max1 = SSize_t_MAX; + } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) + max1 = minnext + deltanext + trie->maxlen; + + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > min + min1) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); + } + } + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->longest = &(data->longest_float); + } + min += min1; + delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + } + } + scan= tail; + continue; + } +#else + else if (PL_regkind[OP(scan)] == TRIE) { + reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; + U8*bang=NULL; + + min += trie->minlen; + delta += (trie->maxlen - trie->minlen); + flags &= ~SCF_DO_STCLASS; /* xxx */ + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min += trie->minlen; + data->pos_delta += (trie->maxlen - trie->minlen); + if (trie->maxlen != trie->minlen) + data->longest = &(data->longest_float); + } + if (trie->jump) /* no more substrings -- for now /grr*/ + flags &= ~SCF_DO_SUBSTR; + } +#endif /* old or new */ +#endif /* TRIE_STUDY_OPT */ + + /* Else: zero-length, ignore. */ + scan = regnext(scan); + } + /* If we are exiting a recursion we can unset its recursed bit + * and allow ourselves to enter it again - no danger of an + * infinite loop there. + if (stopparen > -1 && recursed) { + DEBUG_STUDYDATA("unset:", data,depth); + PAREN_UNSET( recursed, stopparen); + } + */ + if (frame) { + DEBUG_STUDYDATA("frame-end:",data,depth); + DEBUG_PEEP("fend", scan, depth); + /* restore previous context */ + last = frame->last; + scan = frame->next; + stopparen = frame->stop; + recursed_depth = frame->prev_recursed_depth; + depth = depth - 1; + + frame = frame->prev; + goto fake_study_recurse; + } + + finish: + assert(!frame); + DEBUG_STUDYDATA("pre-fin:",data,depth); + + *scanp = scan; + *deltap = is_inf_internal ? SSize_t_MAX : delta; + + if (flags & SCF_DO_SUBSTR && is_inf) + data->pos_delta = SSize_t_MAX - data->pos_min; + if (is_par > (I32)U8_MAX) + is_par = 0; + if (is_par && pars==1 && data) { + data->flags |= SF_IN_PAR; + data->flags &= ~SF_HAS_PAR; + } + else if (pars && data) { + data->flags |= SF_HAS_PAR; + data->flags &= ~SF_IN_PAR; + } + if (flags & SCF_DO_STCLASS_OR) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + if (flags & SCF_TRIE_RESTUDY) + data->flags |= SCF_TRIE_RESTUDY; + + DEBUG_STUDYDATA("post-fin:",data,depth); + + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { + RExC_maxlen = final_minlen + delta; + } + return final_minlen; + } + /* not-reached */ +} + +STATIC U32 +S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) +{ + U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; + + PERL_ARGS_ASSERT_ADD_DATA; + + Renewc(RExC_rxi->data, + sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), + char, struct reg_data); + if(count) + Renew(RExC_rxi->data->what, count + n, U8); + else + Newx(RExC_rxi->data->what, n, U8); + RExC_rxi->data->count = count + n; + Copy(s, RExC_rxi->data->what + count, n, U8); + return count; +} + +/*XXX: todo make this not included in a non debugging perl, but appears to be + * used anyway there, in 'use re' */ +#ifndef PERL_IN_XSUB_RE +void +Perl_reginitcolors(pTHX) +{ + const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); + if (s) { + char *t = savepv(s); + int i = 0; + PL_colors[0] = t; + while (++i < 6) { + t = strchr(t, '\t'); + if (t) { + *t = '\0'; + PL_colors[i] = ++t; + } + else + PL_colors[i] = t = (char *)""; + } + } else { + int i = 0; + while (i < 6) + PL_colors[i++] = (char *)""; + } + PL_colorset = 1; +} +#endif + + +#ifdef TRIE_STUDY_OPT +#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \ + STMT_START { \ + if ( \ + (data.flags & SCF_TRIE_RESTUDY) \ + && ! restudied++ \ + ) { \ + dOsomething; \ + goto reStudy; \ + } \ + } STMT_END +#else +#define CHECK_RESTUDY_GOTO_butfirst +#endif + +/* + * pregcomp - compile a regular expression into internal code + * + * Decides which engine's compiler to call based on the hint currently in + * scope + */ + +#ifndef PERL_IN_XSUB_RE + +/* return the currently in-scope regex engine (or the default if none) */ + +regexp_engine const * +Perl_current_re_engine(pTHX) +{ + if (IN_PERL_COMPILETIME) { + HV * const table = GvHV(PL_hintgv); + SV **ptr; + + if (!table || !(PL_hints & HINT_LOCALIZE_HH)) + return &PL_core_reg_engine; + ptr = hv_fetchs(table, "regcomp", FALSE); + if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) + return &PL_core_reg_engine; + return INT2PTR(regexp_engine*,SvIV(*ptr)); + } + else { + SV *ptr; + if (!PL_curcop->cop_hints_hash) + return &PL_core_reg_engine; + ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); + if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) + return &PL_core_reg_engine; + return INT2PTR(regexp_engine*,SvIV(ptr)); + } +} + + +REGEXP * +Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) +{ + regexp_engine const *eng = current_re_engine(); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_PREGCOMP; + + /* Dispatch a request to compile a regexp to correct regexp engine. */ + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", + PTR2UV(eng)); + }); + return CALLREGCOMP_ENG(eng, pattern, flags); +} +#endif + +/* public(ish) entry point for the perl core's own regex compiling code. + * It's actually a wrapper for Perl_re_op_compile that only takes an SV + * pattern rather than a list of OPs, and uses the internal engine rather + * than the current one */ + +REGEXP * +Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) +{ + SV *pat = pattern; /* defeat constness! */ + PERL_ARGS_ASSERT_RE_COMPILE; + return Perl_re_op_compile(aTHX_ &pat, 1, NULL, +#ifdef PERL_IN_XSUB_RE + &my_reg_engine, +#else + &PL_core_reg_engine, +#endif + NULL, NULL, rx_flags, 0); +} + + +/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code + * blocks, recalculate the indices. Update pat_p and plen_p in-place to + * point to the realloced string and length. + * + * This is essentially a copy of Perl_bytes_to_utf8() with the code index + * stuff added */ + +static void +S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, + char **pat_p, STRLEN *plen_p, int num_code_blocks) +{ + U8 *const src = (U8*)*pat_p; + U8 *dst; + int n=0; + STRLEN s = 0, d = 0; + bool do_end = 0; + GET_RE_DEBUG_FLAGS_DECL; + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); + + Newx(dst, *plen_p * 2 + 1, U8); + + while (s < *plen_p) { + if (NATIVE_BYTE_IS_INVARIANT(src[s])) + dst[d] = src[s]; + else { + dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); + dst[d] = UTF8_EIGHT_BIT_LO(src[s]); + } + if (n < num_code_blocks) { + if (!do_end && pRExC_state->code_blocks[n].start == s) { + pRExC_state->code_blocks[n].start = d; + assert(dst[d] == '('); + do_end = 1; + } + else if (do_end && pRExC_state->code_blocks[n].end == s) { + pRExC_state->code_blocks[n].end = d; + assert(dst[d] == ')'); + do_end = 0; + n++; + } + } + s++; + d++; + } + dst[d] = '\0'; + *plen_p = d; + *pat_p = (char*) dst; + SAVEFREEPV(*pat_p); + RExC_orig_utf8 = RExC_utf8 = 1; +} + + + +/* S_concat_pat(): concatenate a list of args to the pattern string pat, + * while recording any code block indices, and handling overloading, + * nested qr// objects etc. If pat is null, it will allocate a new + * string, or just return the first arg, if there's only one. + * + * Returns the malloced/updated pat. + * patternp and pat_count is the array of SVs to be concatted; + * oplist is the optional list of ops that generated the SVs; + * recompile_p is a pointer to a boolean that will be set if + * the regex will need to be recompiled. + * delim, if non-null is an SV that will be inserted between each element + */ + +static SV* +S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, + SV *pat, SV ** const patternp, int pat_count, + OP *oplist, bool *recompile_p, SV *delim) +{ + SV **svp; + int n = 0; + bool use_delim = FALSE; + bool alloced = FALSE; + + /* if we know we have at least two args, create an empty string, + * then concatenate args to that. For no args, return an empty string */ + if (!pat && pat_count != 1) { + pat = newSVpvs(""); + SAVEFREESV(pat); + alloced = TRUE; + } + + for (svp = patternp; svp < patternp + pat_count; svp++) { + SV *sv; + SV *rx = NULL; + STRLEN orig_patlen = 0; + bool code = 0; + SV *msv = use_delim ? delim : *svp; + if (!msv) msv = &PL_sv_undef; + + /* if we've got a delimiter, we go round the loop twice for each + * svp slot (except the last), using the delimiter the second + * time round */ + if (use_delim) { + svp--; + use_delim = FALSE; + } + else if (delim) + use_delim = TRUE; + + if (SvTYPE(msv) == SVt_PVAV) { + /* we've encountered an interpolated array within + * the pattern, e.g. /...@a..../. Expand the list of elements, + * then recursively append elements. + * The code in this block is based on S_pushav() */ + + AV *const av = (AV*)msv; + const SSize_t maxarg = AvFILL(av) + 1; + SV **array; + + if (oplist) { + assert(oplist->op_type == OP_PADAV + || oplist->op_type == OP_RV2AV); + oplist = OP_SIBLING(oplist); + } + + if (SvRMAGICAL(av)) { + SSize_t i; + + Newx(array, maxarg, SV*); + SAVEFREEPV(array); + for (i=0; i < maxarg; i++) { + SV ** const svp = av_fetch(av, i, FALSE); + array[i] = svp ? *svp : &PL_sv_undef; + } + } + else + array = AvARRAY(av); + + pat = S_concat_pat(aTHX_ pRExC_state, pat, + array, maxarg, NULL, recompile_p, + /* $" */ + GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV)))); + + continue; + } + + + /* we make the assumption here that each op in the list of + * op_siblings maps to one SV pushed onto the stack, + * except for code blocks, with have both an OP_NULL and + * and OP_CONST. + * This allows us to match up the list of SVs against the + * list of OPs to find the next code block. + * + * Note that PUSHMARK PADSV PADSV .. + * is optimised to + * PADRANGE PADSV PADSV .. + * so the alignment still works. */ + + if (oplist) { + if (oplist->op_type == OP_NULL + && (oplist->op_flags & OPf_SPECIAL)) + { + assert(n < pRExC_state->num_code_blocks); + pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0; + pRExC_state->code_blocks[n].block = oplist; + pRExC_state->code_blocks[n].src_regex = NULL; + n++; + code = 1; + oplist = OP_SIBLING(oplist); /* skip CONST */ + assert(oplist); + } + oplist = OP_SIBLING(oplist);; + } + + /* apply magic and QR overloading to arg */ + + SvGETMAGIC(msv); + if (SvROK(msv) && SvAMAGIC(msv)) { + SV *sv = AMG_CALLunary(msv, regexp_amg); + if (sv) { + if (SvROK(sv)) + sv = SvRV(sv); + if (SvTYPE(sv) != SVt_REGEXP) + Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP"); + msv = sv; + } + } + + /* try concatenation overload ... */ + if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) && + (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) + { + sv_setsv(pat, sv); + /* overloading involved: all bets are off over literal + * code. Pretend we haven't seen it */ + pRExC_state->num_code_blocks -= n; + n = 0; + } + else { + /* ... or failing that, try "" overload */ + while (SvAMAGIC(msv) + && (sv = AMG_CALLunary(msv, string_amg)) + && sv != msv + && !( SvROK(msv) + && SvROK(sv) + && SvRV(msv) == SvRV(sv)) + ) { + msv = sv; + SvGETMAGIC(msv); + } + if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) + msv = SvRV(msv); + + if (pat) { + /* this is a partially unrolled + * sv_catsv_nomg(pat, msv); + * that allows us to adjust code block indices if + * needed */ + STRLEN dlen; + char *dst = SvPV_force_nomg(pat, dlen); + orig_patlen = dlen; + if (SvUTF8(msv) && !SvUTF8(pat)) { + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); + sv_setpvn(pat, dst, dlen); + SvUTF8_on(pat); + } + sv_catsv_nomg(pat, msv); + rx = msv; + } + else + pat = msv; + + if (code) + pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; + } + + /* extract any code blocks within any embedded qr//'s */ + if (rx && SvTYPE(rx) == SVt_REGEXP + && RX_ENGINE((REGEXP*)rx)->op_comp) + { + + RXi_GET_DECL(ReANY((REGEXP *)rx), ri); + if (ri->num_code_blocks) { + int i; + /* the presence of an embedded qr// with code means + * we should always recompile: the text of the + * qr// may not have changed, but it may be a + * different closure than last time */ + *recompile_p = 1; + Renew(pRExC_state->code_blocks, + pRExC_state->num_code_blocks + ri->num_code_blocks, + struct reg_code_block); + pRExC_state->num_code_blocks += ri->num_code_blocks; + + for (i=0; i < ri->num_code_blocks; i++) { + struct reg_code_block *src, *dst; + STRLEN offset = orig_patlen + + ReANY((REGEXP *)rx)->pre_prefix; + assert(n < pRExC_state->num_code_blocks); + src = &ri->code_blocks[i]; + dst = &pRExC_state->code_blocks[n]; + dst->start = src->start + offset; + dst->end = src->end + offset; + dst->block = src->block; + dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) + src->src_regex + ? src->src_regex + : (REGEXP*)rx); + n++; + } + } + } + } + /* avoid calling magic multiple times on a single element e.g. =~ $qr */ + if (alloced) + SvSETMAGIC(pat); + + return pat; +} + + + +/* see if there are any run-time code blocks in the pattern. + * False positives are allowed */ + +static bool +S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + int n = 0; + STRLEN s; + + PERL_UNUSED_CONTEXT; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + s = pRExC_state->code_blocks[n].end; + n++; + continue; + } + /* TODO ideally should handle [..], (#..), /#.../x to reduce false + * positives here */ + if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' && + (pat[s+2] == '{' + || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{')) + ) + return 1; + } + return 0; +} + +/* Handle run-time code blocks. We will already have compiled any direct + * or indirect literal code blocks. Now, take the pattern 'pat' and make a + * copy of it, but with any literal code blocks blanked out and + * appropriate chars escaped; then feed it into + * + * eval "qr'modified_pattern'" + * + * For example, + * + * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno + * + * becomes + * + * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno' + * + * After eval_sv()-ing that, grab any new code blocks from the returned qr + * and merge them with any code blocks of the original regexp. + * + * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; + * instead, just save the qr and return FALSE; this tells our caller that + * the original pattern needs upgrading to utf8. + */ + +static bool +S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + SV *qr; + + GET_RE_DEBUG_FLAGS_DECL; + + if (pRExC_state->runtime_code_qr) { + /* this is the second time we've been called; this should + * only happen if the main pattern got upgraded to utf8 + * during compilation; re-use the qr we compiled first time + * round (which should be utf8 too) + */ + qr = pRExC_state->runtime_code_qr; + pRExC_state->runtime_code_qr = NULL; + assert(RExC_utf8 && SvUTF8(qr)); + } + else { + int n = 0; + STRLEN s; + char *p, *newpat; + int newlen = plen + 6; /* allow for "qr''x\0" extra chars */ + SV *sv, *qr_ref; + dSP; + + /* determine how many extra chars we need for ' and \ escaping */ + for (s = 0; s < plen; s++) { + if (pat[s] == '\'' || pat[s] == '\\') + newlen++; + } + + Newx(newpat, newlen, char); + p = newpat; + *p++ = 'q'; *p++ = 'r'; *p++ = '\''; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + /* blank out literal code block */ + assert(pat[s] == '('); + while (s <= pRExC_state->code_blocks[n].end) { + *p++ = '_'; + s++; + } + s--; + n++; + continue; + } + if (pat[s] == '\'' || pat[s] == '\\') + *p++ = '\\'; + *p++ = pat[s]; + } + *p++ = '\''; + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) + *p++ = 'x'; + *p++ = '\0'; + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%sre-parsing pattern for runtime code:%s %s\n", + PL_colors[4],PL_colors[5],newpat); + }); + + sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); + Safefree(newpat); + + ENTER; + SAVETMPS; + save_re_context(); + PUSHSTACKi(PERLSI_REQUIRE); + /* G_RE_REPARSING causes the toker to collapse \\ into \ when + * parsing qr''; normally only q'' does this. It also alters + * hints handling */ + eval_sv(sv, G_SCALAR|G_RE_REPARSING); + SvREFCNT_dec_NN(sv); + SPAGAIN; + qr_ref = POPs; + PUTBACK; + { + SV * const errsv = ERRSV; + if (SvTRUE_NN(errsv)) + { + Safefree(pRExC_state->code_blocks); + /* use croak_sv ? */ + Perl_croak_nocontext("%"SVf, SVfARG(errsv)); + } + } + assert(SvROK(qr_ref)); + qr = SvRV(qr_ref); + assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); + /* the leaving below frees the tmp qr_ref. + * Give qr a life of its own */ + SvREFCNT_inc(qr); + POPSTACK; + FREETMPS; + LEAVE; + + } + + if (!RExC_utf8 && SvUTF8(qr)) { + /* first time through; the pattern got upgraded; save the + * qr for the next time through */ + assert(!pRExC_state->runtime_code_qr); + pRExC_state->runtime_code_qr = qr; + return 0; + } + + + /* extract any code blocks within the returned qr// */ + + + /* merge the main (r1) and run-time (r2) code blocks into one */ + { + RXi_GET_DECL(ReANY((REGEXP *)qr), r2); + struct reg_code_block *new_block, *dst; + RExC_state_t * const r1 = pRExC_state; /* convenient alias */ + int i1 = 0, i2 = 0; + + if (!r2->num_code_blocks) /* we guessed wrong */ + { + SvREFCNT_dec_NN(qr); + return 1; + } + + Newx(new_block, + r1->num_code_blocks + r2->num_code_blocks, + struct reg_code_block); + dst = new_block; + + while ( i1 < r1->num_code_blocks + || i2 < r2->num_code_blocks) + { + struct reg_code_block *src; + bool is_qr = 0; + + if (i1 == r1->num_code_blocks) { + src = &r2->code_blocks[i2++]; + is_qr = 1; + } + else if (i2 == r2->num_code_blocks) + src = &r1->code_blocks[i1++]; + else if ( r1->code_blocks[i1].start + < r2->code_blocks[i2].start) + { + src = &r1->code_blocks[i1++]; + assert(src->end < r2->code_blocks[i2].start); + } + else { + assert( r1->code_blocks[i1].start + > r2->code_blocks[i2].start); + src = &r2->code_blocks[i2++]; + is_qr = 1; + assert(src->end < r1->code_blocks[i1].start); + } + + assert(pat[src->start] == '('); + assert(pat[src->end] == ')'); + dst->start = src->start; + dst->end = src->end; + dst->block = src->block; + dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) + : src->src_regex; + dst++; + } + r1->num_code_blocks += r2->num_code_blocks; + Safefree(r1->code_blocks); + r1->code_blocks = new_block; + } + + SvREFCNT_dec_NN(qr); + return 1; +} + + +STATIC bool +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, + SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, + SSize_t lookbehind, SSize_t offset, SSize_t *minlen, + STRLEN longest_length, bool eol, bool meol) +{ + /* This is the common code for setting up the floating and fixed length + * string data extracted from Perl_re_op_compile() below. Returns a boolean + * as to whether succeeded or not */ + + I32 t; + SSize_t ml; + + if (! (longest_length + || (eol /* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) + ) + /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ + || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) + { + return FALSE; + } + + /* copy the information about the longest from the reg_scan_data + over to the program. */ + if (SvUTF8(sv_longest)) { + *rx_utf8 = sv_longest; + *rx_substr = NULL; + } else { + *rx_substr = sv_longest; + *rx_utf8 = NULL; + } + /* end_shift is how many chars that must be matched that + follow this item. We calculate it ahead of time as once the + lookbehind offset is added in we lose the ability to correctly + calculate it.*/ + ml = minlen ? *(minlen) : (SSize_t)longest_length; + *rx_end_shift = ml - offset + - longest_length + (SvTAIL(sv_longest) != 0) + + lookbehind; + + t = (eol/* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))); + fbm_compile(sv_longest, t ? FBMcf_TAIL : 0); + + return TRUE; +} + +/* + * Perl_re_op_compile - the perl internal RE engine's function to compile a + * regular expression into internal code. + * The pattern may be passed either as: + * a list of SVs (patternp plus pat_count) + * a list of OPs (expr) + * If both are passed, the SV list is used, but the OP list indicates + * which SVs are actually pre-compiled code blocks + * + * The SVs in the list have magic and qr overloading applied to them (and + * the list may be modified in-place with replacement SVs in the latter + * case). + * + * If the pattern hasn't changed from old_re, then old_re will be + * returned. + * + * eng is the current engine. If that engine has an op_comp method, then + * handle directly (i.e. we assume that op_comp was us); otherwise, just + * do the initial concatenation of arguments and pass on to the external + * engine. + * + * If is_bare_re is not null, set it to a boolean indicating whether the + * arg list reduced (after overloading) to a single bare regex which has + * been returned (i.e. /$qr/). + * + * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. + * + * pm_flags contains the PMf_* flags, typically based on those from the + * pm_flags field of the related PMOP. Currently we're only interested in + * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL. + * + * We can't allocate space until we know how big the compiled form will be, + * but we can't compile it (and thus know how big it is) until we've got a + * place to put the code. So we cheat: we compile it twice, once with code + * generation turned off and size counting turned on, and once "for real". + * This also means that we don't allocate space until we are sure that the + * thing really will compile successfully, and we never have to move the + * code and thus invalidate pointers into it. (Note that it has to be in + * one piece because free() must be able to free it all.) [NB: not true in perl] + * + * Beware that the optimization-preparation code in here knows about some + * of the structure of the compiled regexp. [I'll say.] + */ + +REGEXP * +Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, + OP *expr, const regexp_engine* eng, REGEXP *old_re, + bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags) +{ + REGEXP *rx; + struct regexp *r; + regexp_internal *ri; + STRLEN plen; + char *exp; + regnode *scan; + I32 flags; + SSize_t minlen = 0; + U32 rx_flags; + SV *pat; + SV *code_blocksv = NULL; + SV** new_patternp = patternp; + + /* these are all flags - maybe they should be turned + * into a single int with different bit masks */ + I32 sawlookahead = 0; + I32 sawplus = 0; + I32 sawopen = 0; + I32 sawminmod = 0; + + regex_charset initial_charset = get_regex_charset(orig_rx_flags); + bool recompile = 0; + bool runtime_code = 0; + scan_data_t data; + RExC_state_t RExC_state; + RExC_state_t * const pRExC_state = &RExC_state; +#ifdef TRIE_STUDY_OPT + int restudied = 0; + RExC_state_t copyRExC_state; +#endif + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_OP_COMPILE; + + DEBUG_r(if (!PL_colorset) reginitcolors()); + +#ifndef PERL_IN_XSUB_RE + /* Initialize these here instead of as-needed, as is quick and avoids + * having to test them each time otherwise */ + if (! PL_AboveLatin1) { + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); + PL_Latin1 = _new_invlist_C_array(Latin1_invlist); + PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); + PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); + PL_HasMultiCharFold = + _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); + } +#endif + + pRExC_state->code_blocks = NULL; + pRExC_state->num_code_blocks = 0; + + if (is_bare_re) + *is_bare_re = FALSE; + + if (expr && (expr->op_type == OP_LIST || + (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { + /* allocate code_blocks if needed */ + OP *o; + int ncode = 0; + + for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) + if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) + ncode++; /* count of DO blocks */ + if (ncode) { + pRExC_state->num_code_blocks = ncode; + Newx(pRExC_state->code_blocks, ncode, struct reg_code_block); + } + } + + if (!pat_count) { + /* compile-time pattern with just OP_CONSTs and DO blocks */ + + int n; + OP *o; + + /* find how many CONSTs there are */ + assert(expr); + n = 0; + if (expr->op_type == OP_CONST) + n = 1; + else + for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) { + if (o->op_type == OP_CONST) + n++; + } + + /* fake up an SV array */ + + assert(!new_patternp); + Newx(new_patternp, n, SV*); + SAVEFREEPV(new_patternp); + pat_count = n; + + n = 0; + if (expr->op_type == OP_CONST) + new_patternp[n] = cSVOPx_sv(expr); + else + for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) { + if (o->op_type == OP_CONST) + new_patternp[n++] = cSVOPo_sv; + } + + } + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "Assembling pattern from %d elements%s\n", pat_count, + orig_rx_flags & RXf_SPLIT ? " for split" : "")); + + /* set expr to the first arg op */ + + if (pRExC_state->num_code_blocks + && expr->op_type != OP_CONST) + { + expr = cLISTOPx(expr)->op_first; + assert( expr->op_type == OP_PUSHMARK + || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK) + || expr->op_type == OP_PADRANGE); + expr = OP_SIBLING(expr); + } + + pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count, + expr, &recompile, NULL); + + /* handle bare (possibly after overloading) regex: foo =~ $re */ + { + SV *re = pat; + if (SvROK(re)) + re = SvRV(re); + if (SvTYPE(re) == SVt_REGEXP) { + if (is_bare_re) + *is_bare_re = TRUE; + SvREFCNT_inc(re); + Safefree(pRExC_state->code_blocks); + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "Precompiled pattern%s\n", + orig_rx_flags & RXf_SPLIT ? " for split" : "")); + + return (REGEXP*)re; + } + } + + exp = SvPV_nomg(pat, plen); + + if (!eng->op_comp) { + if ((SvUTF8(pat) && IN_BYTES) + || SvGMAGICAL(pat) || SvAMAGIC(pat)) + { + /* make a temporary copy; either to convert to bytes, + * or to avoid repeating get-magic / overloaded stringify */ + pat = newSVpvn_flags(exp, plen, SVs_TEMP | + (IN_BYTES ? 0 : SvUTF8(pat))); + } + Safefree(pRExC_state->code_blocks); + return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); + } + + /* ignore the utf8ness if the pattern is 0 length */ + RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); + RExC_uni_semantics = 0; + RExC_contains_locale = 0; + RExC_contains_i = 0; + pRExC_state->runtime_code_qr = NULL; + + DEBUG_COMPILE_r({ + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", + PL_colors[4],PL_colors[5],s); + }); + + redo_first_pass: + /* we jump here if we upgrade the pattern to utf8 and have to + * recompile */ + + if ((pm_flags & PMf_USE_RE_EVAL) + /* this second condition covers the non-regex literal case, + * i.e. $foo =~ '(?{})'. */ + || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL)) + ) + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); + + /* return old regex if pattern hasn't changed */ + /* XXX: note in the below we have to check the flags as well as the + * pattern. + * + * Things get a touch tricky as we have to compare the utf8 flag + * independently from the compile flags. */ + + if ( old_re + && !recompile + && !!RX_UTF8(old_re) == !!RExC_utf8 + && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) ) + && RX_PRECOMP(old_re) + && RX_PRELEN(old_re) == plen + && memEQ(RX_PRECOMP(old_re), exp, plen) + && !runtime_code /* with runtime code, always recompile */ ) + { + Safefree(pRExC_state->code_blocks); + return old_re; + } + + rx_flags = orig_rx_flags; + + if (rx_flags & PMf_FOLD) { + RExC_contains_i = 1; + } + if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + + /* Set to use unicode semantics if the pattern is in utf8 and has the + * 'depends' charset specified, as it means unicode when utf8 */ + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); + } + + RExC_precomp = exp; + RExC_flags = rx_flags; + RExC_pm_flags = pm_flags; + + if (runtime_code) { + if (TAINTING_get && TAINT_get) + Perl_croak(aTHX_ "Eval-group in insecure regular expression"); + + if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { + /* whoops, we have a non-utf8 pattern, whilst run-time code + * got compiled as utf8. Try again with a utf8 pattern */ + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, + pRExC_state->num_code_blocks); + goto redo_first_pass; + } + } + assert(!pRExC_state->runtime_code_qr); + + RExC_sawback = 0; + + RExC_seen = 0; + RExC_maxlen = 0; + RExC_in_lookbehind = 0; + RExC_seen_zerolen = *exp == '^' ? -1 : 0; + RExC_extralen = 0; + RExC_override_recoding = 0; + RExC_in_multi_char_class = 0; + + /* First pass: determine size, legality. */ + RExC_parse = exp; + RExC_start = exp; + RExC_end = exp + plen; + RExC_naughty = 0; + RExC_npar = 1; + RExC_nestroot = 0; + RExC_size = 0L; + RExC_emit = (regnode *) &RExC_emit_dummy; + RExC_whilem_seen = 0; + RExC_open_parens = NULL; + RExC_close_parens = NULL; + RExC_opend = NULL; + RExC_paren_names = NULL; +#ifdef DEBUGGING + RExC_paren_name_list = NULL; +#endif + RExC_recurse = NULL; + RExC_study_chunk_recursed = NULL; + RExC_study_chunk_recursed_bytes= 0; + RExC_recurse_count = 0; + pRExC_state->code_index = 0; + +#if 0 /* REGC() is (currently) a NOP at the first pass. + * Clever compilers notice this and complain. --jhi */ + REGC((U8)REG_MAGIC, (char*)RExC_emit); +#endif + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); + RExC_lastnum=0; + RExC_lastparse=NULL; + ); + /* reg may croak on us, not giving us a chance to free + pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may + need it to survive as long as the regexp (qr/(?{})/). + We must check that code_blocksv is not already set, because we may + have jumped back to restart the sizing pass. */ + if (pRExC_state->code_blocks && !code_blocksv) { + code_blocksv = newSV_type(SVt_PV); + SAVEFREESV(code_blocksv); + SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks); + SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/ + } + if (reg(pRExC_state, 0, &flags,1) == NULL) { + /* It's possible to write a regexp in ascii that represents Unicode + codepoints outside of the byte range, such as via \x{100}. If we + detect such a sequence we have to convert the entire pattern to utf8 + and then recompile, as our sizing calculation will have been based + on 1 byte == 1 character, but we will need to use utf8 to encode + at least some part of the pattern, and therefore must convert the whole + thing. + -- dmq */ + if (flags & RESTART_UTF8) { + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, + pRExC_state->num_code_blocks); + goto redo_first_pass; + } + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); + } + if (code_blocksv) + SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ + + DEBUG_PARSE_r({ + PerlIO_printf(Perl_debug_log, + "Required size %"IVdf" nodes\n" + "Starting second pass (creation)\n", + (IV)RExC_size); + RExC_lastnum=0; + RExC_lastparse=NULL; + }); + + /* The first pass could have found things that force Unicode semantics */ + if ((RExC_utf8 || RExC_uni_semantics) + && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET) + { + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); + } + + /* Small enough for pointer-storage convention? + If extralen==0, this means that we will not need long jumps. */ + if (RExC_size >= 0x10000L && RExC_extralen) + RExC_size += RExC_extralen; + else + RExC_extralen = 0; + if (RExC_whilem_seen > 15) + RExC_whilem_seen = 15; + + /* Allocate space and zero-initialize. Note, the two step process + of zeroing when in debug mode, thus anything assigned has to + happen after that */ + rx = (REGEXP*) newSV_type(SVt_REGEXP); + r = ReANY(rx); + Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char, regexp_internal); + if ( r == NULL || ri == NULL ) + FAIL("Regexp out of space"); +#ifdef DEBUGGING + /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ + Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char); +#else + /* bulk initialize base fields with 0. */ + Zero(ri, sizeof(regexp_internal), char); +#endif + + /* non-zero initialization begins here */ + RXi_SET( r, ri ); + r->engine= eng; + r->extflags = rx_flags; + RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK; + + if (pm_flags & PMf_IS_QR) { + ri->code_blocks = pRExC_state->code_blocks; + ri->num_code_blocks = pRExC_state->num_code_blocks; + } + else + { + int n; + for (n = 0; n < pRExC_state->num_code_blocks; n++) + if (pRExC_state->code_blocks[n].src_regex) + SAVEFREESV(pRExC_state->code_blocks[n].src_regex); + SAVEFREEPV(pRExC_state->code_blocks); + } + + { + bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_charset = (get_regex_charset(r->extflags) + != REGEX_DEPENDS_CHARSET); + + /* The caret is output if there are any defaults: if not all the STD + * flags are set, or if no character set specifier is needed */ + bool has_default = + (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) + || ! has_charset); + bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) + == REG_RUN_ON_COMMENT_SEEN); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) + >> RXf_PMf_STD_PMMOD_SHIFT); + const char *fptr = STD_PAT_MODS; /*"msix"*/ + char *p; + /* Allocate for the worst case, which is all the std flags are turned + * on. If more precision is desired, we could do a population count of + * the flags set. This could be done with a small lookup table, or by + * shifting, masking and adding, or even, when available, assembly + * language for a machine-language population count. + * We never output a minus, as all those are defaults, so are + * covered by the caret */ + const STRLEN wraplen = plen + has_p + has_runon + + has_default /* If needs a caret */ + + /* If needs a character set specifier */ + + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) + + (sizeof(STD_PAT_MODS) - 1) + + (sizeof("(?:)") - 1); + + Newx(p, wraplen + 1, char); /* +1 for the ending NUL */ + r->xpv_len_u.xpvlenu_pv = p; + if (RExC_utf8) + SvFLAGS(rx) |= SVf_UTF8; + *p++='('; *p++='?'; + + /* If a default, cover it using the caret */ + if (has_default) { + *p++= DEFAULT_PAT_MOD; + } + if (has_charset) { + STRLEN len; + const char* const name = get_regex_charset_name(r->extflags, &len); + Copy(name, p, len, char); + p += len; + } + if (has_p) + *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ + { + char ch; + while((ch = *fptr++)) { + if(reganch & 1) + *p++ = ch; + reganch >>= 1; + } + } + + *p++ = ':'; + Copy(RExC_precomp, p, plen, char); + assert ((RX_WRAPPED(rx) - p) < 16); + r->pre_prefix = p - RX_WRAPPED(rx); + p += plen; + if (has_runon) + *p++ = '\n'; + *p++ = ')'; + *p = 0; + SvCUR_set(rx, p - RX_WRAPPED(rx)); + } + + r->intflags = 0; + r->nparens = RExC_npar - 1; /* set early to validate backrefs */ + + /* setup various meta data about recursion, this all requires + * RExC_npar to be correctly set, and a bit later on we clear it */ + if (RExC_seen & REG_RECURSE_SEEN) { + Newxz(RExC_open_parens, RExC_npar,regnode *); + SAVEFREEPV(RExC_open_parens); + Newxz(RExC_close_parens,RExC_npar,regnode *); + SAVEFREEPV(RExC_close_parens); + } + if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { + /* Note, RExC_npar is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + + ((RExC_npar & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } + + /* Useful during FAIL. */ +#ifdef RE_TRACK_PATTERN_OFFSETS + Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ + DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, + "%s %"UVuf" bytes for offset annotations.\n", + ri->u.offsets ? "Got" : "Couldn't get", + (UV)((2*RExC_size+1) * sizeof(U32)))); +#endif + SetProgLen(ri,RExC_size); + RExC_rx_sv = rx; + RExC_rx = r; + RExC_rxi = ri; + + /* Second pass: emit code. */ + RExC_flags = rx_flags; /* don't let top level (?i) bleed */ + RExC_pm_flags = pm_flags; + RExC_parse = exp; + RExC_end = exp + plen; + RExC_naughty = 0; + RExC_npar = 1; + RExC_emit_start = ri->program; + RExC_emit = ri->program; + RExC_emit_bound = ri->program + RExC_size + 1; + pRExC_state->code_index = 0; + + REGC((U8)REG_MAGIC, (char*) RExC_emit++); + if (reg(pRExC_state, 0, &flags,1) == NULL) { + ReREFCNT_dec(rx); + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); + } + /* XXXX To minimize changes to RE engine we always allocate + 3-units-long substrs field. */ + Newx(r->substrs, 1, struct reg_substr_data); + if (RExC_recurse_count) { + Newxz(RExC_recurse,RExC_recurse_count,regnode *); + SAVEFREEPV(RExC_recurse); + } + +reStudy: + r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; + Zero(r->substrs, 1, struct reg_substr_data); + if (RExC_study_chunk_recursed) + Zero(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + +#ifdef TRIE_STUDY_OPT + if (!restudied) { + StructCopy(&zero_scan_data, &data, scan_data_t); + copyRExC_state = RExC_state; + } else { + U32 seen=RExC_seen; + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); + + RExC_state = copyRExC_state; + if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; + else + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; + StructCopy(&zero_scan_data, &data, scan_data_t); + } +#else + StructCopy(&zero_scan_data, &data, scan_data_t); +#endif + + /* Dig out information for optimizations. */ + r->extflags = RExC_flags; /* was pm_op */ + /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ + + if (UTF) + SvUTF8_on(rx); /* Unicode in it? */ + ri->regstclass = NULL; + if (RExC_naughty >= 10) /* Probably an expensive pattern. */ + r->intflags |= PREGf_NAUGHTY; + scan = ri->program + 1; /* First BRANCH. */ + + /* testing for BRANCH here tells us whether there is "must appear" + data in the pattern. If there is then we can use it for optimisations */ + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. + */ + SSize_t fake; + STRLEN longest_float_length, longest_fixed_length; + regnode_ssc ch_class; /* pointed to by data */ + int stclass_flag; + SSize_t last_close = 0; /* pointed to by data */ + regnode *first= scan; + regnode *first_next= regnext(first); + /* + * Skip introductions and multiplicators >= 1 + * so that we can extract the 'meat' of the pattern that must + * match in the large if() sequence following. + * NOTE that EXACT is NOT covered here, as it is normally + * picked up by the optimiser separately. + * + * This is unfortunate as the optimiser isnt handling lookahead + * properly currently. + * + */ + while ((OP(first) == OPEN && (sawopen = 1)) || + /* An OR of *one* alternative - should not happen now. */ + (OP(first) == BRANCH && OP(first_next) != BRANCH) || + /* for now we can't handle lookbehind IFMATCH*/ + (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) || + (OP(first) == PLUS) || + (OP(first) == MINMOD) || + /* An {n,m} with n>0 */ + (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || + (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) + { + /* + * the only op that could be a regnode is PLUS, all the rest + * will be regnode_1 or regnode_2. + * + * (yves doesn't think this is true) + */ + if (OP(first) == PLUS) + sawplus = 1; + else { + if (OP(first) == MINMOD) + sawminmod = 1; + first += regarglen[OP(first)]; + } + first = NEXTOPER(first); + first_next= regnext(first); + } + + /* Starting-point info. */ + again: + DEBUG_PEEP("first:",first,0); + /* Ignore EXACT as we deal with it later. */ + if (PL_regkind[OP(first)] == EXACT) { + if (OP(first) == EXACT) + NOOP; /* Empty, get anchored substr later. */ + else + ri->regstclass = first; + } +#ifdef TRIE_STCLASS + else if (PL_regkind[OP(first)] == TRIE && + ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) + { + /* this can happen only on restudy */ + ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); + } +#endif + else if (REGNODE_SIMPLE(OP(first))) + ri->regstclass = first; + else if (PL_regkind[OP(first)] == BOUND || + PL_regkind[OP(first)] == NBOUND) + ri->regstclass = first; + else if (PL_regkind[OP(first)] == BOL) { + r->intflags |= (OP(first) == MBOL + ? PREGf_ANCH_MBOL + : (OP(first) == SBOL + ? PREGf_ANCH_SBOL + : PREGf_ANCH_BOL)); + first = NEXTOPER(first); + goto again; + } + else if (OP(first) == GPOS) { + r->intflags |= PREGf_ANCH_GPOS; + first = NEXTOPER(first); + goto again; + } + else if ((!sawopen || !RExC_sawback) && + !sawlookahead && + (OP(first) == STAR && + PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && + !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) + { + /* turn .* into ^.* with an implied $*=1 */ + const int type = + (OP(NEXTOPER(first)) == REG_ANY) + ? PREGf_ANCH_MBOL + : PREGf_ANCH_SBOL; + r->intflags |= (type | PREGf_IMPLICIT); + first = NEXTOPER(first); + goto again; + } + if (sawplus && !sawminmod && !sawlookahead + && (!sawopen || !RExC_sawback) + && !pRExC_state->num_code_blocks) /* May examine pos and $& */ + /* x+ must match at the 1st pos of run of x's */ + r->intflags |= PREGf_SKIP; + + /* Scan is after the zeroth branch, first is atomic matcher. */ +#ifdef TRIE_STUDY_OPT + DEBUG_PARSE_r( + if (!restudied) + PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + (IV)(first - scan + 1)) + ); +#else + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + (IV)(first - scan + 1)) + ); +#endif + + + /* + * If there's something expensive in the r.e., find the + * longest literal string that must appear and make it the + * regmust. Resolve ties in favor of later strings, since + * the regstart check works with the beginning of the r.e. + * and avoiding duplication strengthens checking. Not a + * strong reason, but sufficient in the absence of others. + * [Now we resolve ties in favor of the earlier string if + * it happens that c_offset_min has been invalidated, since the + * earlier string may buy us something the later one won't.] + */ + + data.longest_fixed = newSVpvs(""); + data.longest_float = newSVpvs(""); + data.last_found = newSVpvs(""); + data.longest = &(data.longest_fixed); + ENTER_with_name("study_chunk"); + SAVEFREESV(data.longest_fixed); + SAVEFREESV(data.longest_float); + SAVEFREESV(data.last_found); + first = scan; + if (!ri->regstclass) { + ssc_init(pRExC_state, &ch_class); + data.start_class = &ch_class; + stclass_flag = SCF_DO_STCLASS_AND; + } else /* XXXX Check for BOUND? */ + stclass_flag = 0; + data.last_closep = &last_close; + + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, + scan + RExC_size, /* Up to end */ + &data, -1, 0, NULL, + SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag + | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), + 0); + + + CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); + + + if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) + && data.last_start_min == 0 && data.last_end > 0 + && !RExC_seen_zerolen + && !(RExC_seen & REG_VERBARG_SEEN) + && !(RExC_seen & REG_GPOS_SEEN) + ){ + r->extflags |= RXf_CHECK_ALL; + } + scan_commit(pRExC_state, &data,&minlen,0); + + longest_float_length = CHR_SVLEN(data.longest_float); + + if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) + && S_setup_longest (aTHX_ pRExC_state, + data.longest_float, + &(r->float_utf8), + &(r->float_substr), + &(r->float_end_shift), + data.lookbehind_float, + data.offset_float_min, + data.minlen_float, + longest_float_length, + cBOOL(data.flags & SF_FL_BEFORE_EOL), + cBOOL(data.flags & SF_FL_BEFORE_MEOL))) + { + r->float_min_offset = data.offset_float_min - data.lookbehind_float; + r->float_max_offset = data.offset_float_max; + if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ + r->float_max_offset -= data.lookbehind_float; + SvREFCNT_inc_simple_void_NN(data.longest_float); + } + else { + r->float_substr = r->float_utf8 = NULL; + longest_float_length = 0; + } + + longest_fixed_length = CHR_SVLEN(data.longest_fixed); + + if (S_setup_longest (aTHX_ pRExC_state, + data.longest_fixed, + &(r->anchored_utf8), + &(r->anchored_substr), + &(r->anchored_end_shift), + data.lookbehind_fixed, + data.offset_fixed, + data.minlen_fixed, + longest_fixed_length, + cBOOL(data.flags & SF_FIX_BEFORE_EOL), + cBOOL(data.flags & SF_FIX_BEFORE_MEOL))) + { + r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; + SvREFCNT_inc_simple_void_NN(data.longest_fixed); + } + else { + r->anchored_substr = r->anchored_utf8 = NULL; + longest_fixed_length = 0; + } + LEAVE_with_name("study_chunk"); + + if (ri->regstclass + && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) + ri->regstclass = NULL; + + if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) + && stclass_flag + && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && !ssc_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); + StructCopy(data.start_class, + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); + ri->regstclass = (regnode*)RExC_rxi->data->data[n]; + r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); + regprop(r, sv, (regnode*)data.start_class, NULL); + PerlIO_printf(Perl_debug_log, + "synthetic stclass \"%s\".\n", + SvPVX_const(sv));}); + data.start_class = NULL; + } + + /* A temporary algorithm prefers floated substr to fixed one to dig + * more info. */ + if (longest_fixed_length > longest_float_length) { + r->substrs->check_ix = 0; + r->check_end_shift = r->anchored_end_shift; + r->check_substr = r->anchored_substr; + r->check_utf8 = r->anchored_utf8; + r->check_offset_min = r->check_offset_max = r->anchored_offset; + if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) + r->intflags |= PREGf_NOSCAN; + } + else { + r->substrs->check_ix = 1; + r->check_end_shift = r->float_end_shift; + r->check_substr = r->float_substr; + r->check_utf8 = r->float_utf8; + r->check_offset_min = r->float_min_offset; + r->check_offset_max = r->float_max_offset; + } + if ((r->check_substr || r->check_utf8) ) { + r->extflags |= RXf_USE_INTUIT; + if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) + r->extflags |= RXf_INTUIT_TAIL; + } + r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; + + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) + if ( (STRLEN)minlen < longest_float_length ) + minlen= longest_float_length; + if ( (STRLEN)minlen < longest_fixed_length ) + minlen= longest_fixed_length; + */ + } + else { + /* Several toplevels. Best we can is to set minlen. */ + SSize_t fake; + regnode_ssc ch_class; + SSize_t last_close = 0; + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); + + scan = ri->program + 1; + ssc_init(pRExC_state, &ch_class); + data.start_class = &ch_class; + data.last_closep = &last_close; + + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, + &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, + SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied + ? SCF_TRIE_DOING_RESTUDY + : 0), + 0); + + CHECK_RESTUDY_GOTO_butfirst(NOOP); + + r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 + = r->float_substr = r->float_utf8 = NULL; + + if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && ! ssc_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); + StructCopy(data.start_class, + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); + ri->regstclass = (regnode*)RExC_rxi->data->data[n]; + r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); + regprop(r, sv, (regnode*)data.start_class, NULL); + PerlIO_printf(Perl_debug_log, + "synthetic stclass \"%s\".\n", + SvPVX_const(sv));}); + data.start_class = NULL; + } + } + + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + r->maxlen = REG_INFTY; + } + else { + r->maxlen = RExC_maxlen; + } + + /* Guard against an embedded (?=) or (?<=) with a longer minlen than + the "real" pattern. */ + DEBUG_OPTIMISE_r({ + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); + }); + r->minlenret = minlen; + if (r->minlen < minlen) + r->minlen = minlen; + + if (RExC_seen & REG_GPOS_SEEN) + r->intflags |= PREGf_GPOS_SEEN; + if (RExC_seen & REG_LOOKBEHIND_SEEN) + r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the + lookbehind */ + if (pRExC_state->num_code_blocks) + r->extflags |= RXf_EVAL_SEEN; + if (RExC_seen & REG_CANY_SEEN) + r->intflags |= PREGf_CANY_SEEN; + if (RExC_seen & REG_VERBARG_SEEN) + { + r->intflags |= PREGf_VERBARG_SEEN; + r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ + } + if (RExC_seen & REG_CUTGROUP_SEEN) + r->intflags |= PREGf_CUTGROUP_SEEN; + if (pm_flags & PMf_USE_RE_EVAL) + r->intflags |= PREGf_USE_RE_EVAL; + if (RExC_paren_names) + RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); + else + RXp_PAREN_NAMES(r) = NULL; + + /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED + * so it can be used in pp.c */ + if (r->intflags & PREGf_ANCH) + r->extflags |= RXf_IS_ANCHORED; + + + { + /* this is used to identify "special" patterns that might result + * in Perl NOT calling the regex engine and instead doing the match "itself", + * particularly special cases in split//. By having the regex compiler + * do this pattern matching at a regop level (instead of by inspecting the pattern) + * we avoid weird issues with equivalent patterns resulting in different behavior, + * AND we allow non Perl engines to get the same optimizations by the setting the + * flags appropriately - Yves */ + regnode *first = ri->program + 1; + U8 fop = OP(first); + regnode *next = NEXTOPER(first); + U8 nop = OP(next); + + if (PL_regkind[fop] == NOTHING && nop == END) + r->extflags |= RXf_NULL; + else if (PL_regkind[fop] == BOL && nop == END) + r->extflags |= RXf_START_ONLY; + else if (fop == PLUS + && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE + && OP(regnext(first)) == END) + r->extflags |= RXf_WHITE; + else if ( r->extflags & RXf_SPLIT + && fop == EXACT + && STR_LEN(first) == 1 + && *(STRING(first)) == ' ' + && OP(regnext(first)) == END ) + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + + } + + if (RExC_contains_locale) { + RXp_EXTFLAGS(r) |= RXf_TAINTED; + } + +#ifdef DEBUGGING + if (RExC_paren_names) { + ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); + ri->data->data[ri->name_list_idx] + = (void*)SvREFCNT_inc(RExC_paren_name_list); + } else +#endif + ri->name_list_idx = 0; + + if (RExC_recurse_count) { + for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { + const regnode *scan = RExC_recurse[RExC_recurse_count-1]; + ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); + } + } + Newxz(r->offs, RExC_npar, regexp_paren_pair); + /* assume we don't need to swap parens around before we match */ + + DEBUG_DUMP_r({ + DEBUG_RExC_seen(); + PerlIO_printf(Perl_debug_log,"Final program:\n"); + regdump(r); + }); +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_OFFSETS_r(if (ri->u.offsets) { + const STRLEN len = ri->u.offsets[0]; + STRLEN i; + GET_RE_DEBUG_FLAGS_DECL; + PerlIO_printf(Perl_debug_log, + "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); + for (i = 1; i <= len; i++) { + if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) + PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", + (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); + } + PerlIO_printf(Perl_debug_log, "\n"); + }); +#endif + +#ifdef USE_ITHREADS + /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated + * by setting the regexp SV to readonly-only instead. If the + * pattern's been recompiled, the USEDness should remain. */ + if (old_re && SvREADONLY(old_re)) + SvREADONLY_on(rx); +#endif + return rx; +} + + +SV* +Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF; + + PERL_UNUSED_ARG(value); + + if (flags & RXapif_FETCH) { + return reg_named_buff_fetch(rx, key, flags); + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { + Perl_croak_no_modify(); + return NULL; + } else if (flags & RXapif_EXISTS) { + return reg_named_buff_exists(rx, key, flags) + ? &PL_sv_yes + : &PL_sv_no; + } else if (flags & RXapif_REGNAMES) { + return reg_named_buff_all(rx, flags); + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { + return reg_named_buff_scalar(rx, flags); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; + PERL_UNUSED_ARG(lastkey); + + if (flags & RXapif_FIRSTKEY) + return reg_named_buff_firstkey(rx, flags); + else if (flags & RXapif_NEXTKEY) + return reg_named_buff_nextkey(rx, flags); + else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", + (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, + const U32 flags) +{ + AV *retarray = NULL; + SV *ret; + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; + + if (flags & RXapif_ALL) + retarray=newAV(); + + if (rx && RXp_PAREN_NAMES(rx)) { + HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); + if (he_str) { + IV i; + SV* sv_dat=HeVAL(he_str); + I32 *nums=(I32*)SvPVX(sv_dat); + for ( i=0; inparens) >= nums[i] + && rx->offs[nums[i]].start != -1 + && rx->offs[nums[i]].end != -1) + { + ret = newSVpvs(""); + CALLREG_NUMBUF_FETCH(r,nums[i],ret); + if (!retarray) + return ret; + } else { + if (retarray) + ret = newSVsv(&PL_sv_undef); + } + if (retarray) + av_push(retarray, ret); + } + if (retarray) + return newRV_noinc(MUTABLE_SV(retarray)); + } + } + return NULL; +} + +bool +Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, + const U32 flags) +{ + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & RXapif_ALL) { + return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); + } else { + SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); + if (sv) { + SvREFCNT_dec_NN(sv); + return TRUE; + } else { + return FALSE; + } + } + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; + + if ( rx && RXp_PAREN_NAMES(rx) ) { + (void)hv_iterinit(RXp_PAREN_NAMES(rx)); + + return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv = RXp_PAREN_NAMES(rx); + HE *temphe; + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + return newSVhek(HeKEY_hek(temphe)); + } + } + } + return NULL; +} + +SV* +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) +{ + SV *ret; + AV *av; + SSize_t length; + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { + return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); + av = MUTABLE_AV(SvRV(ret)); + length = av_tindex(av); + SvREFCNT_dec_NN(ret); + return newSViv(length + 1); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", + (int)flags); + return NULL; + } + } + return &PL_sv_undef; +} + +SV* +Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + AV *av = newAV(); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv= RXp_PAREN_NAMES(rx); + HE *temphe; + (void)hv_iterinit(hv); + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + av_push(av, newSVhek(HeKEY_hek(temphe))); + } + } + } + + return newRV_noinc(MUTABLE_SV(av)); +} + +void +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, + SV * const sv) +{ + struct regexp *const rx = ReANY(r); + char *s = NULL; + SSize_t i = 0; + SSize_t s1, t1; + I32 n = paren; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; + + if ( n == RX_BUFF_IDX_CARET_PREMATCH + || n == RX_BUFF_IDX_CARET_FULLMATCH + || n == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } + + if (!rx->subbeg) + goto ret_undef; + + if (n == RX_BUFF_IDX_CARET_FULLMATCH) + /* no need to distinguish between them any more */ + n = RX_BUFF_IDX_FULLMATCH; + + if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) + && rx->offs[0].start != -1) + { + /* $`, ${^PREMATCH} */ + i = rx->offs[0].start; + s = rx->subbeg; + } + else + if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) + && rx->offs[0].end != -1) + { + /* $', ${^POSTMATCH} */ + s = rx->subbeg - rx->suboffset + rx->offs[0].end; + i = rx->sublen + rx->suboffset - rx->offs[0].end; + } + else + if ( 0 <= n && n <= (I32)rx->nparens && + (s1 = rx->offs[n].start) != -1 && + (t1 = rx->offs[n].end) != -1) + { + /* $&, ${^MATCH}, $1 ... */ + i = t1 - s1; + s = rx->subbeg + s1 - rx->suboffset; + } else { + goto ret_undef; + } + + assert(s >= rx->subbeg); + assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); + if (i >= 0) { +#ifdef NO_TAINT_SUPPORT + sv_setpvn(sv, s, i); +#else + const int oldtainted = TAINT_get; + TAINT_NOT; + sv_setpvn(sv, s, i); + TAINT_set(oldtainted); +#endif + if ( (rx->intflags & PREGf_CANY_SEEN) + ? (RXp_MATCH_UTF8(rx) + && (!i || is_utf8_string((U8*)s, i))) + : (RXp_MATCH_UTF8(rx)) ) + { + SvUTF8_on(sv); + } + else + SvUTF8_off(sv); + if (TAINTING_get) { + if (RXp_MATCH_TAINTED(rx)) { + if (SvTYPE(sv) >= SVt_PVMG) { + MAGIC* const mg = SvMAGIC(sv); + MAGIC* mgt; + TAINT; + SvMAGIC_set(sv, mg->mg_moremagic); + SvTAINT(sv); + if ((mgt = SvMAGIC(sv))) { + mg->mg_moremagic = mgt; + SvMAGIC_set(sv, mg); + } + } else { + TAINT; + SvTAINT(sv); + } + } else + SvTAINTED_off(sv); + } + } else { + ret_undef: + sv_setsv(sv,&PL_sv_undef); + return; + } +} + +void +Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, + SV const * const value) +{ + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; + + PERL_UNUSED_ARG(rx); + PERL_UNUSED_ARG(paren); + PERL_UNUSED_ARG(value); + + if (!PL_localizing) + Perl_croak_no_modify(); +} + +I32 +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, + const I32 paren) +{ + struct regexp *const rx = ReANY(r); + I32 i; + I32 s1, t1; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + + /* Some of this code was originally in C in F */ + switch (paren) { + case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ + case RX_BUFF_IDX_PREMATCH: /* $` */ + if (rx->offs[0].start != -1) { + i = rx->offs[0].start; + if (i > 0) { + s1 = 0; + t1 = i; + goto getlen; + } + } + return 0; + + case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ + case RX_BUFF_IDX_POSTMATCH: /* $' */ + if (rx->offs[0].end != -1) { + i = rx->sublen - rx->offs[0].end; + if (i > 0) { + s1 = rx->offs[0].end; + t1 = rx->sublen; + goto getlen; + } + } + return 0; + + default: /* $& / ${^MATCH}, $1, $2, ... */ + if (paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + i = t1 - s1; + goto getlen; + } else { + warn_undef: + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit((const SV *)sv); + return 0; + } + } + getlen: + if (i > 0 && RXp_MATCH_UTF8(rx)) { + const char * const s = rx->subbeg - rx->suboffset + s1; + const U8 *ep; + STRLEN el; + + i = t1 - s1; + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; + } + return i; +} + +SV* +Perl_reg_qr_package(pTHX_ REGEXP * const rx) +{ + PERL_ARGS_ASSERT_REG_QR_PACKAGE; + PERL_UNUSED_ARG(rx); + if (0) + return NULL; + else + return newSVpvs("Regexp"); +} + +/* Scans the name of a named buffer from the pattern. + * If flags is REG_RSN_RETURN_NULL returns null. + * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name + * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding + * to the parsed name as looked up in the RExC_paren_names hash. + * If there is an error throws a vFAIL().. type exception. + */ + +#define REG_RSN_RETURN_NULL 0 +#define REG_RSN_RETURN_NAME 1 +#define REG_RSN_RETURN_DATA 2 + +STATIC SV* +S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) +{ + char *name_start = RExC_parse; + + PERL_ARGS_ASSERT_REG_SCAN_NAME; + + assert (RExC_parse <= RExC_end); + if (RExC_parse == RExC_end) NOOP; + else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + /* skip IDFIRST by using do...while */ + if (UTF) + do { + RExC_parse += UTF8SKIP(RExC_parse); + } while (isWORDCHAR_utf8((U8*)RExC_parse)); + else + do { + RExC_parse++; + } while (isWORDCHAR(*RExC_parse)); + } else { + RExC_parse++; /* so the <- from the vFAIL is after the offending + character */ + vFAIL("Group name must start with a non-digit word character"); + } + if ( flags ) { + SV* sv_name + = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), + SVs_TEMP | (UTF ? SVf_UTF8 : 0)); + if ( flags == REG_RSN_RETURN_NAME) + return sv_name; + else if (flags==REG_RSN_RETURN_DATA) { + HE *he_str = NULL; + SV *sv_dat = NULL; + if ( ! sv_name ) /* should not happen*/ + Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); + if (RExC_paren_names) + he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! sv_dat ) + vFAIL("Reference to nonexistent named group"); + return sv_dat; + } + else { + Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", + (unsigned long) flags); + } + assert(0); /* NOT REACHED */ + } + return NULL; +} + +#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ + int rem=(int)(RExC_end - RExC_parse); \ + int cut; \ + int num; \ + int iscut=0; \ + if (rem>10) { \ + rem=10; \ + iscut=1; \ + } \ + cut=10-rem; \ + if (RExC_lastparse!=RExC_parse) \ + PerlIO_printf(Perl_debug_log," >%.*s%-*s", \ + rem, RExC_parse, \ + cut + 4, \ + iscut ? "..." : "<" \ + ); \ + else \ + PerlIO_printf(Perl_debug_log,"%16s",""); \ + \ + if (SIZE_ONLY) \ + num = RExC_size + 1; \ + else \ + num=REG_NODE_NUM(RExC_emit); \ + if (RExC_lastnum!=num) \ + PerlIO_printf(Perl_debug_log,"|%4d",num); \ + else \ + PerlIO_printf(Perl_debug_log,"|%4s",""); \ + PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ + (int)((depth*2)), "", \ + (funcname) \ + ); \ + RExC_lastnum=num; \ + RExC_lastparse=RExC_parse; \ +}) + + + +#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ + DEBUG_PARSE_MSG((funcname)); \ + PerlIO_printf(Perl_debug_log,"%4s","\n"); \ +}) +#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ + DEBUG_PARSE_MSG((funcname)); \ + PerlIO_printf(Perl_debug_log,fmt "\n",args); \ +}) + +/* This section of code defines the inversion list object and its methods. The + * interfaces are highly subject to change, so as much as possible is static to + * this file. An inversion list is here implemented as a malloc'd C UV array + * as an SVt_INVLIST scalar. + * + * An inversion list for Unicode is an array of code points, sorted by ordinal + * number. The zeroth element is the first code point in the list. The 1th + * element is the first element beyond that not in the list. In other words, + * the first range is + * invlist[0]..(invlist[1]-1) + * The other ranges follow. Thus every element whose index is divisible by two + * marks the beginning of a range that is in the list, and every element not + * divisible by two marks the beginning of a range not in the list. A single + * element inversion list that contains the single code point N generally + * consists of two elements + * invlist[0] == N + * invlist[1] == N+1 + * (The exception is when N is the highest representable value on the + * machine, in which case the list containing just it would be a single + * element, itself. By extension, if the last range in the list extends to + * infinity, then the first element of that range will be in the inversion list + * at a position that is divisible by two, and is the final element in the + * list.) + * Taking the complement (inverting) an inversion list is quite simple, if the + * first element is 0, remove it; otherwise add a 0 element at the beginning. + * This implementation reserves an element at the beginning of each inversion + * list to always contain 0; there is an additional flag in the header which + * indicates if the list begins at the 0, or is offset to begin at the next + * element. + * + * More about inversion lists can be found in "Unicode Demystified" + * Chapter 13 by Richard Gillam, published by Addison-Wesley. + * More will be coming when functionality is added later. + * + * The inversion list data structure is currently implemented as an SV pointing + * to an array of UVs that the SV thinks are bytes. This allows us to have an + * array of UV whose memory management is automatically handled by the existing + * facilities for SV's. + * + * Some of the methods should always be private to the implementation, and some + * should eventually be made public */ + +/* The header definitions are in F */ + +PERL_STATIC_INLINE UV* +S__invlist_array_init(SV* const invlist, const bool will_have_0) +{ + /* Returns a pointer to the first element in the inversion list's array. + * This is called upon initialization of an inversion list. Where the + * array begins depends on whether the list has the code point U+0000 in it + * or not. The other parameter tells it whether the code that follows this + * call is about to put a 0 in the inversion list or not. The first + * element is either the element reserved for 0, if TRUE, or the element + * after it, if FALSE */ + + bool* offset = get_invlist_offset_addr(invlist); + UV* zero_addr = (UV *) SvPVX(invlist); + + PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; + + /* Must be empty */ + assert(! _invlist_len(invlist)); + + *zero_addr = 0; + + /* 1^1 = 0; 1^0 = 1 */ + *offset = 1 ^ will_have_0; + return zero_addr + *offset; +} + +PERL_STATIC_INLINE UV* +S_invlist_array(SV* const invlist) +{ + /* Returns the pointer to the inversion list's array. Every time the + * length changes, this needs to be called in case malloc or realloc moved + * it */ + + PERL_ARGS_ASSERT_INVLIST_ARRAY; + + /* Must not be empty. If these fail, you probably didn't check for + * being non-zero before trying to get the array */ + assert(_invlist_len(invlist)); + + /* The very first element always contains zero, The array begins either + * there, or if the inversion list is offset, at the element after it. + * The offset header field determines which; it contains 0 or 1 to indicate + * how much additionally to add */ + assert(0 == *(SvPVX(invlist))); + return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist)); +} + +PERL_STATIC_INLINE void +S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) +{ + /* Sets the current number of elements stored in the inversion list. + * Updates SvCUR correspondingly */ + PERL_UNUSED_CONTEXT; + PERL_ARGS_ASSERT_INVLIST_SET_LEN; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + SvCUR_set(invlist, + (len == 0) + ? 0 + : TO_INTERNAL_SIZE(len + offset)); + assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); +} + +PERL_STATIC_INLINE IV* +S_get_invlist_previous_index_addr(SV* invlist) +{ + /* Return the address of the IV that is reserved to hold the cached index + * */ + PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->prev_index); +} + +PERL_STATIC_INLINE IV +S_invlist_previous_index(SV* const invlist) +{ + /* Returns cached index of previous search */ + + PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; + + return *get_invlist_previous_index_addr(invlist); +} + +PERL_STATIC_INLINE void +S_invlist_set_previous_index(SV* const invlist, const IV index) +{ + /* Caches for later retrieval */ + + PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; + + assert(index == 0 || index < (int) _invlist_len(invlist)); + + *get_invlist_previous_index_addr(invlist) = index; +} + +PERL_STATIC_INLINE UV +S_invlist_max(SV* const invlist) +{ + /* Returns the maximum number of elements storable in the inversion list's + * array, without having to realloc() */ + + PERL_ARGS_ASSERT_INVLIST_MAX; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Assumes worst case, in which the 0 element is not counted in the + * inversion list, so subtracts 1 for that */ + return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ + ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 + : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; +} + +#ifndef PERL_IN_XSUB_RE +SV* +Perl__new_invlist(pTHX_ IV initial_size) +{ + + /* Return a pointer to a newly constructed inversion list, with enough + * space to store 'initial_size' elements. If that number is negative, a + * system default is used instead */ + + SV* new_list; + + if (initial_size < 0) { + initial_size = 10; + } + + /* Allocate the initial space */ + new_list = newSV_type(SVt_INVLIST); + + /* First 1 is in case the zero element isn't in the list; second 1 is for + * trailing NUL */ + SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1); + invlist_set_len(new_list, 0, 0); + + /* Force iterinit() to be used to get iteration to work */ + *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX; + + *get_invlist_previous_index_addr(new_list) = 0; + + return new_list; +} + +SV* +Perl__new_invlist_C_array(pTHX_ const UV* const list) +{ + /* Return a pointer to a newly constructed inversion list, initialized to + * point to , which has to be in the exact correct inversion list + * form, including internal fields. Thus this is a dangerous routine that + * should not be used in the wrong hands. The passed in 'list' contains + * several header fields at the beginning that are not part of the + * inversion list body proper */ + + const STRLEN length = (STRLEN) list[0]; + const UV version_id = list[1]; + const bool offset = cBOOL(list[2]); +#define HEADER_LENGTH 3 + /* If any of the above changes in any way, you must change HEADER_LENGTH + * (if appropriate) and regenerate INVLIST_VERSION_ID by running + * perl -E 'say int(rand 2**31-1)' + */ +#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and + data structure type, so that one being + passed in can be validated to be an + inversion list of the correct vintage. + */ + + SV* invlist = newSV_type(SVt_INVLIST); + + PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; + + if (version_id != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + /* The generated array passed in includes header elements that aren't part + * of the list proper, so start it just after them */ + SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); + + SvLEN_set(invlist, 0); /* Means we own the contents, and the system + shouldn't touch it */ + + *(get_invlist_offset_addr(invlist)) = offset; + + /* The 'length' passed to us is the physical number of elements in the + * inversion list. But if there is an offset the logical number is one + * less than that */ + invlist_set_len(invlist, length - offset, offset); + + invlist_set_previous_index(invlist, 0); + + /* Initialize the iteration pointer. */ + invlist_iterfinish(invlist); + + SvREADONLY_on(invlist); + + return invlist; +} +#endif /* ifndef PERL_IN_XSUB_RE */ + +STATIC void +S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) +{ + /* Grow the maximum size of an inversion list */ + + PERL_ARGS_ASSERT_INVLIST_EXTEND; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Add one to account for the zero element at the beginning which may not + * be counted by the calling parameters */ + SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); +} + +PERL_STATIC_INLINE void +S_invlist_trim(SV* const invlist) +{ + PERL_ARGS_ASSERT_INVLIST_TRIM; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Change the length of the inversion list to how many entries it currently + * has */ + SvPV_shrink_to_cur((SV *) invlist); +} + +STATIC void +S__append_range_to_invlist(pTHX_ SV* const invlist, + const UV start, const UV end) +{ + /* Subject to change or removal. Append the range from 'start' to 'end' at + * the end of the inversion list. The range must be above any existing + * ones. */ + + UV* array; + UV max = invlist_max(invlist); + UV len = _invlist_len(invlist); + bool offset; + + PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; + + if (len == 0) { /* Empty lists must be initialized */ + offset = start != 0; + array = _invlist_array_init(invlist, ! offset); + } + else { + /* Here, the existing list is non-empty. The current max entry in the + * list is generally the first value not in the set, except when the + * set extends to the end of permissible values, in which case it is + * the first entry in that final set, and so this call is an attempt to + * append out-of-order */ + + UV final_element = len - 1; + array = invlist_array(invlist); + if (array[final_element] > start + || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) + { + Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", + array[final_element], start, + ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); + } + + /* Here, it is a legal append. If the new range begins with the first + * value not in the set, it is extending the set, so the new first + * value not in the set is one greater than the newly extended range. + * */ + offset = *get_invlist_offset_addr(invlist); + if (array[final_element] == start) { + if (end != UV_MAX) { + array[final_element] = end + 1; + } + else { + /* But if the end is the maximum representable on the machine, + * just let the range that this would extend to have no end */ + invlist_set_len(invlist, len - 1, offset); + } + return; + } + } + + /* Here the new range doesn't extend any existing set. Add it */ + + len += 2; /* Includes an element each for the start and end of range */ + + /* If wll overflow the existing space, extend, which may cause the array to + * be moved */ + if (max < len) { + invlist_extend(invlist, len); + + /* Have to set len here to avoid assert failure in invlist_array() */ + invlist_set_len(invlist, len, offset); + + array = invlist_array(invlist); + } + else { + invlist_set_len(invlist, len, offset); + } + + /* The next item on the list starts the range, the one after that is + * one past the new range. */ + array[len - 2] = start; + if (end != UV_MAX) { + array[len - 1] = end + 1; + } + else { + /* But if the end is the maximum representable on the machine, just let + * the range have no end */ + invlist_set_len(invlist, len - 1, offset); + } +} + +#ifndef PERL_IN_XSUB_RE + +IV +Perl__invlist_search(SV* const invlist, const UV cp) +{ + /* Searches the inversion list for the entry that contains the input code + * point . If is not in the list, -1 is returned. Otherwise, the + * return value is the index into the list's array of the range that + * contains */ + + IV low = 0; + IV mid; + IV high = _invlist_len(invlist); + const IV highest_element = high - 1; + const UV* array; + + PERL_ARGS_ASSERT__INVLIST_SEARCH; + + /* If list is empty, return failure. */ + if (high == 0) { + return -1; + } + + /* (We can't get the array unless we know the list is non-empty) */ + array = invlist_array(invlist); + + mid = invlist_previous_index(invlist); + assert(mid >=0 && mid <= highest_element); + + /* contains the cache of the result of the previous call to this + * function (0 the first time). See if this call is for the same result, + * or if it is for mid-1. This is under the theory that calls to this + * function will often be for related code points that are near each other. + * And benchmarks show that caching gives better results. We also test + * here if the code point is within the bounds of the list. These tests + * replace others that would have had to be made anyway to make sure that + * the array bounds were not exceeded, and these give us extra information + * at the same time */ + if (cp >= array[mid]) { + if (cp >= array[highest_element]) { + return highest_element; + } + + /* Here, array[mid] <= cp < array[highest_element]. This means that + * the final element is not the answer, so can exclude it; it also + * means that is not the final element, so can refer to 'mid + 1' + * safely */ + if (cp < array[mid + 1]) { + return mid; + } + high--; + low = mid + 1; + } + else { /* cp < aray[mid] */ + if (cp < array[0]) { /* Fail if outside the array */ + return -1; + } + high = mid; + if (cp >= array[mid - 1]) { + goto found_entry; + } + } + + /* Binary search. What we are looking for is such that + * array[i] <= cp < array[i+1] + * The loop below converges on the i+1. Note that there may not be an + * (i+1)th element in the array, and things work nonetheless */ + while (low < high) { + mid = (low + high) / 2; + assert(mid <= highest_element); + if (array[mid] <= cp) { /* cp >= array[mid] */ + low = mid + 1; + + /* We could do this extra test to exit the loop early. + if (cp < array[low]) { + return mid; + } + */ + } + else { /* cp < array[mid] */ + high = mid; + } + } + + found_entry: + high--; + invlist_set_previous_index(invlist, high); + return high; +} + +void +Perl__invlist_populate_swatch(SV* const invlist, + const UV start, const UV end, U8* swatch) +{ + /* populates a swatch of a swash the same way swatch_get() does in utf8.c, + * but is used when the swash has an inversion list. This makes this much + * faster, as it uses a binary search instead of a linear one. This is + * intimately tied to that function, and perhaps should be in utf8.c, + * except it is intimately tied to inversion lists as well. It assumes + * that is all 0's on input */ + + UV current = start; + const IV len = _invlist_len(invlist); + IV i; + const UV * array; + + PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH; + + if (len == 0) { /* Empty inversion list */ + return; + } + + array = invlist_array(invlist); + + /* Find which element it is */ + i = _invlist_search(invlist, start); + + /* We populate from to */ + while (current < end) { + UV upper; + + /* The inversion list gives the results for every possible code point + * after the first one in the list. Only those ranges whose index is + * even are ones that the inversion list matches. For the odd ones, + * and if the initial code point is not in the list, we have to skip + * forward to the next element */ + if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) { + i++; + if (i >= len) { /* Finished if beyond the end of the array */ + return; + } + current = array[i]; + if (current >= end) { /* Finished if beyond the end of what we + are populating */ + if (LIKELY(end < UV_MAX)) { + return; + } + + /* We get here when the upper bound is the maximum + * representable on the machine, and we are looking for just + * that code point. Have to special case it */ + i = len; + goto join_end_of_list; + } + } + assert(current >= start); + + /* The current range ends one below the next one, except don't go past + * */ + i++; + upper = (i < len && array[i] < end) ? array[i] : end; + + /* Here we are in a range that matches. Populate a bit in the 3-bit U8 + * for each code point in it */ + for (; current < upper; current++) { + const STRLEN offset = (STRLEN)(current - start); + swatch[offset >> 3] |= 1 << (offset & 7); + } + + join_end_of_list: + + /* Quit if at the end of the list */ + if (i >= len) { + + /* But first, have to deal with the highest possible code point on + * the platform. The previous code assumes that is one + * beyond where we want to populate, but that is impossible at the + * platform's infinity, so have to handle it specially */ + if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1))) + { + const STRLEN offset = (STRLEN)(end - start); + swatch[offset >> 3] |= 1 << (offset & 7); + } + return; + } + + /* Advance to the next range, which will be for code points not in the + * inversion list */ + current = array[i]; + } + + return; +} + +void +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** output) +{ + /* Take the union of two inversion lists and point to it. *output + * SHOULD BE DEFINED upon input, and if it points to one of the two lists, + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *output will be made correspondingly + * mortal. The first list, , may be NULL, in which case a copy of the + * second list is returned. If is TRUE, the union is taken + * of the complement (inversion) of instead of b itself. + * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. + * + * The algorithm is like a merge sort. + * + * XXX A potential performance improvement is to keep track as we go along + * if only one of the inputs contributes to the result, meaning the other + * is a subset of that one. In that case, we can skip the final copy and + * return the larger of the input lists, but then outside code might need + * to keep track of whether to free the input list or not */ + + const UV* array_a; /* a's array */ + const UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; + + SV* u; /* the resulting union */ + UV* array_u; + UV len_u; + + UV i_a = 0; /* current index into a's array */ + UV i_b = 0; + UV i_u = 0; + + /* running count, as explained in the algorithm source book; items are + * stopped accumulating and are output when the count changes to/from 0. + * The count is incremented when we start a range that's in the set, and + * decremented when we start a range that's not in the set. So its range + * is 0 to 2. Only when the count is zero is something not in the set. + */ + UV count = 0; + + PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + + /* If either one is empty, the union is the other one */ + if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + bool make_temp = FALSE; /* Should we mortalize the result? */ + + if (*output == a) { + if (a != NULL) { + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } + } + } + if (*output != b) { + *output = invlist_clone(b); + if (complement_b) { + _invlist_invert(*output); + } + } /* else *output already = b; */ + + if (make_temp) { + sv_2mortal(*output); + } + return; + } + else if ((len_b = _invlist_len(b)) == 0) { + bool make_temp = FALSE; + if (*output == b) { + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } + } + + /* The complement of an empty list is a list that has everything in it, + * so the union with includes everything too */ + if (complement_b) { + if (a == *output) { + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } + } + *output = _new_invlist(1); + _append_range_to_invlist(*output, 0, UV_MAX); + } + else if (*output != a) { + *output = invlist_clone(a); + } + /* else *output already = a; */ + + if (make_temp) { + sv_2mortal(*output); + } + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the union of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Size the union for the worst case: that the sets are completely + * disjoint */ + u = _new_invlist(len_a + len_b); + + /* Will contain U+0000 if either component does */ + array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0) + || (len_b > 0 && array_b[0] == 0)); + + /* Go through each list item by item, stopping when exhausted one of + * them */ + while (i_a < len_a && i_b < len_b) { + UV cp; /* The element to potentially add to the union's array */ + bool cp_in_set; /* is it in the the input list's set or not */ + + /* We need to take one or the other of the two inputs for the union. + * Since we are merging two sorted lists, we take the smaller of the + * next items. In case of a tie, we take the one that is in its set + * first. If we took one not in the set first, it would decrement the + * count, possibly to 0 which would cause it to be output as ending the + * range, and the next time through we would take the same number, and + * output it again as beginning the next range. By doing it the + * opposite way, there is no possibility that the count will be + * momentarily decremented to 0, and thus the two adjoining ranges will + * be seamlessly merged. (In a tie and both are in the set or both not + * in the set, it doesn't matter which we take first.) */ + if (array_a[i_a] < array_b[i_b] + || (array_a[i_a] == array_b[i_b] + && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) + { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); + cp= array_a[i_a++]; + } + else { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); + cp = array_b[i_b++]; + } + + /* Here, have chosen which of the two inputs to look at. Only output + * if the running count changes to/from 0, which marks the + * beginning/end of a range in that's in the set */ + if (cp_in_set) { + if (count == 0) { + array_u[i_u++] = cp; + } + count++; + } + else { + count--; + if (count == 0) { + array_u[i_u++] = cp; + } + } + } + + /* Here, we are finished going through at least one of the lists, which + * means there is something remaining in at most one. We check if the list + * that hasn't been exhausted is positioned such that we are in the middle + * of a range in its set or not. (i_a and i_b point to the element beyond + * the one we care about.) If in the set, we decrement 'count'; if 0, there + * is potentially more to output. + * There are four cases: + * 1) Both weren't in their sets, count is 0, and remains 0. What's left + * in the union is entirely from the non-exhausted set. + * 2) Both were in their sets, count is 2. Nothing further should + * be output, as everything that remains will be in the exhausted + * list's set, hence in the union; decrementing to 1 but not 0 insures + * that + * 3) the exhausted was in its set, non-exhausted isn't, count is 1. + * Nothing further should be output because the union includes + * everything from the exhausted set. Not decrementing ensures that. + * 4) the exhausted wasn't in its set, non-exhausted is, count is 1; + * decrementing to 0 insures that we look at the remainder of the + * non-exhausted set */ + if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + { + count--; + } + + /* The final length is what we've output so far, plus what else is about to + * be output. (If 'count' is non-zero, then the input list we exhausted + * has everything remaining up to the machine's limit in its set, and hence + * in the union, so there will be no further output. */ + len_u = i_u; + if (count == 0) { + /* At most one of the subexpressions will be non-zero */ + len_u += (len_a - i_a) + (len_b - i_b); + } + + /* Set result to final length, which can change the pointer to array_u, so + * re-find it */ + if (len_u != _invlist_len(u)) { + invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); + invlist_trim(u); + array_u = invlist_array(u); + } + + /* When 'count' is 0, the list that was exhausted (if one was shorter than + * the other) ended with everything above it not in its set. That means + * that the remaining part of the union is precisely the same as the + * non-exhausted list, so can just copy it unchanged. (If both list were + * exhausted at the same time, then the operations below will be both 0.) + */ + if (count == 0) { + IV copy_count; /* At most one will have a non-zero copy count */ + if ((copy_count = len_a - i_a) > 0) { + Copy(array_a + i_a, array_u + i_u, copy_count, UV); + } + else if ((copy_count = len_b - i_b) > 0) { + Copy(array_b + i_b, array_u + i_u, copy_count, UV); + } + } + + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ + if (a == *output || b == *output) { + assert(! invlist_is_iterating(*output)); + if ((SvTEMP(*output))) { + sv_2mortal(u); + } + else { + SvREFCNT_dec_NN(*output); + } + } + + *output = u; + + return; +} + +void +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** i) +{ + /* Take the intersection of two inversion lists and point to it. *i + * SHOULD BE DEFINED upon input, and if it points to one of the two lists, + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *i will be made correspondingly mortal. + * The first list, , may be NULL, in which case an empty list is + * returned. If is TRUE, the result will be the + * intersection of and the complement (or inversion) of instead of + * directly. + * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. In fact, it had bugs + * + * The algorithm is like a merge sort, and is essentially the same as the + * union above + */ + + const UV* array_a; /* a's array */ + const UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; + + SV* r; /* the resulting intersection */ + UV* array_r; + UV len_r; + + UV i_a = 0; /* current index into a's array */ + UV i_b = 0; + UV i_r = 0; + + /* running count, as explained in the algorithm source book; items are + * stopped accumulating and are output when the count changes to/from 2. + * The count is incremented when we start a range that's in the set, and + * decremented when we start a range that's not in the set. So its range + * is 0 to 2. Only when the count is 2 is something in the intersection. + */ + UV count = 0; + + PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + + /* Special case if either one is empty */ + len_a = (a == NULL) ? 0 : _invlist_len(a); + if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { + bool make_temp = FALSE; + + if (len_a != 0 && complement_b) { + + /* Here, 'a' is not empty, therefore from the above 'if', 'b' must + * be empty. Here, also we are using 'b's complement, which hence + * must be every possible code point. Thus the intersection is + * simply 'a'. */ + if (*i != a) { + if (*i == b) { + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } + } + + *i = invlist_clone(a); + } + /* else *i is already 'a' */ + + if (make_temp) { + sv_2mortal(*i); + } + return; + } + + /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The + * intersection must be empty */ + if (*i == a) { + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } + } + else if (*i == b) { + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } + } + *i = _new_invlist(0); + if (make_temp) { + sv_2mortal(*i); + } + + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the intersection of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Size the intersection for the worst case: that the intersection ends up + * fragmenting everything to be completely disjoint */ + r= _new_invlist(len_a + len_b); + + /* Will contain U+0000 iff both components do */ + array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 + && len_b > 0 && array_b[0] == 0); + + /* Go through each list item by item, stopping when exhausted one of + * them */ + while (i_a < len_a && i_b < len_b) { + UV cp; /* The element to potentially add to the intersection's + array */ + bool cp_in_set; /* Is it in the input list's set or not */ + + /* We need to take one or the other of the two inputs for the + * intersection. Since we are merging two sorted lists, we take the + * smaller of the next items. In case of a tie, we take the one that + * is not in its set first (a difference from the union algorithm). If + * we took one in the set first, it would increment the count, possibly + * to 2 which would cause it to be output as starting a range in the + * intersection, and the next time through we would take that same + * number, and output it again as ending the set. By doing it the + * opposite of this, there is no possibility that the count will be + * momentarily incremented to 2. (In a tie and both are in the set or + * both not in the set, it doesn't matter which we take first.) */ + if (array_a[i_a] < array_b[i_b] + || (array_a[i_a] == array_b[i_b] + && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) + { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); + cp= array_a[i_a++]; + } + else { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); + cp= array_b[i_b++]; + } + + /* Here, have chosen which of the two inputs to look at. Only output + * if the running count changes to/from 2, which marks the + * beginning/end of a range that's in the intersection */ + if (cp_in_set) { + count++; + if (count == 2) { + array_r[i_r++] = cp; + } + } + else { + if (count == 2) { + array_r[i_r++] = cp; + } + count--; + } + } + + /* Here, we are finished going through at least one of the lists, which + * means there is something remaining in at most one. We check if the list + * that has been exhausted is positioned such that we are in the middle + * of a range in its set or not. (i_a and i_b point to elements 1 beyond + * the ones we care about.) There are four cases: + * 1) Both weren't in their sets, count is 0, and remains 0. There's + * nothing left in the intersection. + * 2) Both were in their sets, count is 2 and perhaps is incremented to + * above 2. What should be output is exactly that which is in the + * non-exhausted set, as everything it has is also in the intersection + * set, and everything it doesn't have can't be in the intersection + * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and + * gets incremented to 2. Like the previous case, the intersection is + * everything that remains in the non-exhausted set. + * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and + * remains 1. And the intersection has nothing more. */ + if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + { + count++; + } + + /* The final length is what we've output so far plus what else is in the + * intersection. At most one of the subexpressions below will be non-zero + * */ + len_r = i_r; + if (count >= 2) { + len_r += (len_a - i_a) + (len_b - i_b); + } + + /* Set result to final length, which can change the pointer to array_r, so + * re-find it */ + if (len_r != _invlist_len(r)) { + invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); + invlist_trim(r); + array_r = invlist_array(r); + } + + /* Finish outputting any remaining */ + if (count >= 2) { /* At most one will have a non-zero copy count */ + IV copy_count; + if ((copy_count = len_a - i_a) > 0) { + Copy(array_a + i_a, array_r + i_r, copy_count, UV); + } + else if ((copy_count = len_b - i_b) > 0) { + Copy(array_b + i_b, array_r + i_r, copy_count, UV); + } + } + + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ + if (a == *i || b == *i) { + assert(! invlist_is_iterating(*i)); + if (SvTEMP(*i)) { + sv_2mortal(r); + } + else { + SvREFCNT_dec_NN(*i); + } + } + + *i = r; + + return; +} + +SV* +Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) +{ + /* Add the range from 'start' to 'end' inclusive to the inversion list's + * set. A pointer to the inversion list is returned. This may actually be + * a new list, in which case the passed in one has been destroyed. The + * passed in inversion list can be NULL, in which case a new one is created + * with just the one range in it */ + + SV* range_invlist; + UV len; + + if (invlist == NULL) { + invlist = _new_invlist(2); + len = 0; + } + else { + len = _invlist_len(invlist); + } + + /* If comes after the final entry actually in the list, can just append it + * to the end, */ + if (len == 0 + || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1) + && start >= invlist_array(invlist)[len - 1])) + { + _append_range_to_invlist(invlist, start, end); + return invlist; + } + + /* Here, can't just append things, create and return a new inversion list + * which is the union of this range and the existing inversion list */ + range_invlist = _new_invlist(2); + _append_range_to_invlist(range_invlist, start, end); + + _invlist_union(invlist, range_invlist, &invlist); + + /* The temporary can be freed */ + SvREFCNT_dec_NN(range_invlist); + + return invlist; +} + +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, + UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + _append_range_to_invlist(invlist, element0, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; +} + +#endif + +PERL_STATIC_INLINE SV* +S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { + return _add_range_to_invlist(invlist, cp, cp); +} + +#ifndef PERL_IN_XSUB_RE +void +Perl__invlist_invert(pTHX_ SV* const invlist) +{ + /* Complement the input inversion list. This adds a 0 if the list didn't + * have a zero; removes it otherwise. As described above, the data + * structure is set up so that this is very efficient */ + + PERL_ARGS_ASSERT__INVLIST_INVERT; + + assert(! invlist_is_iterating(invlist)); + + /* The inverse of matching nothing is matching everything */ + if (_invlist_len(invlist) == 0) { + _append_range_to_invlist(invlist, 0, UV_MAX); + return; + } + + *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); +} + +#endif + +PERL_STATIC_INLINE SV* +S_invlist_clone(pTHX_ SV* const invlist) +{ + + /* Return a new inversion list that is a copy of the input one, which is + * unchanged. The new list will not be mortal even if the old one was. */ + + /* Need to allocate extra space to accommodate Perl's addition of a + * trailing NUL to SvPV's, since it thinks they are always strings */ + SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); + STRLEN physical_length = SvCUR(invlist); + bool offset = *(get_invlist_offset_addr(invlist)); + + PERL_ARGS_ASSERT_INVLIST_CLONE; + + *(get_invlist_offset_addr(new_invlist)) = offset; + invlist_set_len(new_invlist, _invlist_len(invlist), offset); + Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); + + return new_invlist; +} + +PERL_STATIC_INLINE STRLEN* +S_get_invlist_iter_addr(SV* invlist) +{ + /* Return the address of the UV that contains the current iteration + * position */ + + PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->iterator); +} + +PERL_STATIC_INLINE void +S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */ +{ + PERL_ARGS_ASSERT_INVLIST_ITERINIT; + + *get_invlist_iter_addr(invlist) = 0; +} + +PERL_STATIC_INLINE void +S_invlist_iterfinish(SV* invlist) +{ + /* Terminate iterator for invlist. This is to catch development errors. + * Any iteration that is interrupted before completed should call this + * function. Functions that add code points anywhere else but to the end + * of an inversion list assert that they are not in the middle of an + * iteration. If they were, the addition would make the iteration + * problematical: if the iteration hadn't reached the place where things + * were being added, it would be ok */ + + PERL_ARGS_ASSERT_INVLIST_ITERFINISH; + + *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; +} + +STATIC bool +S_invlist_iternext(SV* invlist, UV* start, UV* end) +{ + /* An C call on must be used to set this up. + * This call sets in <*start> and <*end>, the next range in . + * Returns if successful and the next call will return the next + * range; if was already at the end of the list. If the latter, + * <*start> and <*end> are unchanged, and the next call to this function + * will start over at the beginning of the list */ + + STRLEN* pos = get_invlist_iter_addr(invlist); + UV len = _invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_ITERNEXT; + + if (*pos >= len) { + *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ + return FALSE; + } + + array = invlist_array(invlist); + + *start = array[(*pos)++]; + + if (*pos >= len) { + *end = UV_MAX; + } + else { + *end = array[(*pos)++] - 1; + } + + return TRUE; +} + +PERL_STATIC_INLINE bool +S_invlist_is_iterating(SV* const invlist) +{ + PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; + + return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; +} + +PERL_STATIC_INLINE UV +S_invlist_highest(SV* const invlist) +{ + /* Returns the highest code point that matches an inversion list. This API + * has an ambiguity, as it returns 0 under either the highest is actually + * 0, or if the list is empty. If this distinction matters to you, check + * for emptiness before calling this function */ + + UV len = _invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_HIGHEST; + + if (len == 0) { + return 0; + } + + array = invlist_array(invlist); + + /* The last element in the array in the inversion list always starts a + * range that goes to infinity. That range may be for code points that are + * matched in the inversion list, or it may be for ones that aren't + * matched. In the latter case, the highest code point in the set is one + * less than the beginning of this range; otherwise it is the final element + * of this range: infinity */ + return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) + ? UV_MAX + : array[len - 1] - 1; +} + +#ifndef PERL_IN_XSUB_RE +SV * +Perl__invlist_contents(pTHX_ SV* const invlist) +{ + /* Get the contents of an inversion list into a string SV so that they can + * be printed out. It uses the format traditionally done for debug tracing + */ + + UV start, end; + SV* output = newSVpvs("\n"); + + PERL_ARGS_ASSERT__INVLIST_CONTENTS; + + assert(! invlist_is_iterating(invlist)); + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start); + } + else if (end != start) { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n", + start, end); + } + else { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start); + } + } + + return output; +} +#endif + +#ifndef PERL_IN_XSUB_RE +void +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, + const char * const indent, SV* const invlist) +{ + /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the + * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by + * the string 'indent'. The output looks like this: + [0] 0x000A .. 0x000D + [2] 0x0085 + [4] 0x2028 .. 0x2029 + [6] 0x3104 .. INFINITY + * This means that the first range of code points matched by the list are + * 0xA through 0xD; the second range contains only the single code point + * 0x85, etc. An inversion list is an array of UVs. Two array elements + * are used to define each range (except if the final range extends to + * infinity, only a single element is needed). The array index of the + * first element for the corresponding range is given in brackets. */ + + UV start, end; + STRLEN count = 0; + + PERL_ARGS_ASSERT__INVLIST_DUMP; + + if (invlist_is_iterating(invlist)) { + Perl_dump_indent(aTHX_ level, file, + "%sCan't dump inversion list because is in middle of iterating\n", + indent); + return; + } + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", + indent, (UV)count, start); + } + else if (end != start) { + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", + indent, (UV)count, start, end); + } + else { + Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", + indent, (UV)count, start); + } + count += 2; + } +} + +void +Perl__load_PL_utf8_foldclosures (pTHX) +{ + assert(! PL_utf8_foldclosures); + + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* This string is just a short named one above \xff */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ + } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); +} +#endif + +#ifdef PERL_ARGS_ASSERT__INVLISTEQ +bool +S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) +{ + /* Return a boolean as to if the two passed in inversion lists are + * identical. The final argument, if TRUE, says to take the complement of + * the second inversion list before doing the comparison */ + + const UV* array_a = invlist_array(a); + const UV* array_b = invlist_array(b); + UV len_a = _invlist_len(a); + UV len_b = _invlist_len(b); + + UV i = 0; /* current index into the arrays */ + bool retval = TRUE; /* Assume are identical until proven otherwise */ + + PERL_ARGS_ASSERT__INVLISTEQ; + + /* If are to compare 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* The complement of nothing is everything, so would have to have + * just one element, starting at zero (ending at infinity) */ + if (len_b == 0) { + return (len_a == 1 && array_a[0] == 0); + } + else if (array_b[0] == 0) { + + /* Otherwise, to complement, we invert. Here, the first element is + * 0, just remove it. To do this, we just pretend the array starts + * one later */ + + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Make sure that the lengths are the same, as well as the final element + * before looping through the remainder. (Thus we test the length, final, + * and first elements right off the bat) */ + if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) { + retval = FALSE; + } + else for (i = 0; i < len_a - 1; i++) { + if (array_a[i] != array_b[i]) { + retval = FALSE; + break; + } + } + + return retval; +} +#endif + +#undef HEADER_LENGTH +#undef TO_INTERNAL_SIZE +#undef FROM_INTERNAL_SIZE +#undef INVLIST_VERSION_ID + +/* End of inversion list object */ + +STATIC void +S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) +{ + /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' + * constructs, and updates RExC_flags with them. On input, RExC_parse + * should point to the first flag; it is updated on output to point to the + * final ')' or ':'. There needs to be at least one flag, or this will + * abort */ + + /* for (?g), (?gc), and (?o) warnings; warning + about (?c) will warn about (?g) -- japhy */ + +#define WASTED_O 0x01 +#define WASTED_G 0x02 +#define WASTED_C 0x04 +#define WASTED_GC (WASTED_G|WASTED_C) + I32 wastedflags = 0x00; + U32 posflags = 0, negflags = 0; + U32 *flagsp = &posflags; + char has_charset_modifier = '\0'; + regex_charset cs; + bool has_use_defaults = FALSE; + const char* const seqstart = RExC_parse - 1; /* Point to the '?' */ + + PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS; + + /* '^' as an initial flag sets certain defaults */ + if (UCHARAT(RExC_parse) == '^') { + RExC_parse++; + has_use_defaults = TRUE; + STD_PMMOD_FLAGS_CLEAR(&RExC_flags); + set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics) + ? REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET); + } + + cs = get_regex_charset(RExC_flags); + if (cs == REGEX_DEPENDS_CHARSET + && (RExC_utf8 || RExC_uni_semantics)) + { + cs = REGEX_UNICODE_CHARSET; + } + + while (*RExC_parse) { + /* && strchr("iogcmsx", *RExC_parse) */ + /* (?g), (?gc) and (?o) are useless here + and must be globally applied -- japhy */ + switch (*RExC_parse) { + + /* Code for the imsx flags */ + CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + + case LOCALE_PAT_MOD: + if (has_charset_modifier) { + goto excess_modifier; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + cs = REGEX_LOCALE_CHARSET; + has_charset_modifier = LOCALE_PAT_MOD; + break; + case UNICODE_PAT_MOD: + if (has_charset_modifier) { + goto excess_modifier; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + cs = REGEX_UNICODE_CHARSET; + has_charset_modifier = UNICODE_PAT_MOD; + break; + case ASCII_RESTRICT_PAT_MOD: + if (flagsp == &negflags) { + goto neg_modifier; + } + if (has_charset_modifier) { + if (cs != REGEX_ASCII_RESTRICTED_CHARSET) { + goto excess_modifier; + } + /* Doubled modifier implies more restricted */ + cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; + } + else { + cs = REGEX_ASCII_RESTRICTED_CHARSET; + } + has_charset_modifier = ASCII_RESTRICT_PAT_MOD; + break; + case DEPENDS_PAT_MOD: + if (has_use_defaults) { + goto fail_modifiers; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + else if (has_charset_modifier) { + goto excess_modifier; + } + + /* The dual charset means unicode semantics if the + * pattern (or target, not known until runtime) are + * utf8, or something in the pattern indicates unicode + * semantics */ + cs = (RExC_utf8 || RExC_uni_semantics) + ? REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET; + has_charset_modifier = DEPENDS_PAT_MOD; + break; + excess_modifier: + RExC_parse++; + if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { + vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); + } + else if (has_charset_modifier == *(RExC_parse - 1)) { + vFAIL2("Regexp modifier \"%c\" may not appear twice", + *(RExC_parse - 1)); + } + else { + vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); + } + /*NOTREACHED*/ + neg_modifier: + RExC_parse++; + vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", + *(RExC_parse - 1)); + /*NOTREACHED*/ + case ONCE_PAT_MOD: /* 'o' */ + case GLOBAL_PAT_MOD: /* 'g' */ + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + const I32 wflagbit = *RExC_parse == 'o' + ? WASTED_O + : WASTED_G; + if (! (wastedflags & wflagbit) ) { + wastedflags |= wflagbit; + /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ + vWARN5( + RExC_parse + 1, + "Useless (%s%c) - %suse /%c modifier", + flagsp == &negflags ? "?-" : "?", + *RExC_parse, + flagsp == &negflags ? "don't " : "", + *RExC_parse + ); + } + } + break; + + case CONTINUE_PAT_MOD: /* 'c' */ + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + if (! (wastedflags & WASTED_C) ) { + wastedflags |= WASTED_GC; + /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ + vWARN3( + RExC_parse + 1, + "Useless (%sc) - %suse /gc modifier", + flagsp == &negflags ? "?-" : "?", + flagsp == &negflags ? "don't " : "" + ); + } + } + break; + case KEEPCOPY_PAT_MOD: /* 'p' */ + if (flagsp == &negflags) { + if (SIZE_ONLY) + ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); + } else { + *flagsp |= RXf_PMf_KEEPCOPY; + } + break; + case '-': + /* A flag is a default iff it is following a minus, so + * if there is a minus, it means will be trying to + * re-specify a default which is an error */ + if (has_use_defaults || flagsp == &negflags) { + goto fail_modifiers; + } + flagsp = &negflags; + wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case ':': + case ')': + RExC_flags |= posflags; + RExC_flags &= ~negflags; + set_regex_charset(&RExC_flags, cs); + if (RExC_flags & RXf_PMf_FOLD) { + RExC_contains_i = 1; + } + return; + /*NOTREACHED*/ + default: + fail_modifiers: + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); + /*NOTREACHED*/ + } + + ++RExC_parse; + } +} + +/* + - reg - regular expression, i.e. main body or parenthesized thing + * + * Caller must absorb opening parenthesis. + * + * Combining parenthesis handling with the base level of regular expression + * is a trifle forced, but the need to tie the tails of the branches to what + * follows makes it hard to avoid. + */ +#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) +#ifdef DEBUGGING +#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) +#else +#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) +#endif + +/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets + flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan + needs to be restarted. + Otherwise would only return NULL if regbranch() returns NULL, which + cannot happen. */ +STATIC regnode * +S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) + /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. + * 2 is like 1, but indicates that nextchar() has been called to advance + * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and + * this flag alerts us to the need to check for that */ +{ + regnode *ret; /* Will be the head of the group. */ + regnode *br; + regnode *lastbr; + regnode *ender = NULL; + I32 parno = 0; + I32 flags; + U32 oregflags = RExC_flags; + bool have_branch = 0; + bool is_open = 0; + I32 freeze_paren = 0; + I32 after_freeze = 0; + I32 num; /* numeric backreferences */ + + char * parse_start = RExC_parse; /* MJD */ + char * const oregcomp_parse = RExC_parse; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG; + DEBUG_PARSE("reg "); + + *flagp = 0; /* Tentatively. */ + + + /* Make an OPEN node, if parenthesized. */ + if (paren) { + + /* Under /x, space and comments can be gobbled up between the '(' and + * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such + * intervening space, as the sequence is a token, and a token should be + * indivisible */ + bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '('; + + if ( *RExC_parse == '*') { /* (*VERB:ARG) */ + char *start_verb = RExC_parse; + STRLEN verb_len = 0; + char *start_arg = NULL; + unsigned char op = 0; + int argok = 1; + int internal_argval = 0; /* internal_argval is only useful if + !argok */ + + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); + } + while ( *RExC_parse && *RExC_parse != ')' ) { + if ( *RExC_parse == ':' ) { + start_arg = RExC_parse + 1; + break; + } + RExC_parse++; + } + ++start_verb; + verb_len = RExC_parse - start_verb; + if ( start_arg ) { + RExC_parse++; + while ( *RExC_parse && *RExC_parse != ')' ) + RExC_parse++; + if ( *RExC_parse != ')' ) + vFAIL("Unterminated verb pattern argument"); + if ( RExC_parse == start_arg ) + start_arg = NULL; + } else { + if ( *RExC_parse != ')' ) + vFAIL("Unterminated verb pattern"); + } + + switch ( *start_verb ) { + case 'A': /* (*ACCEPT) */ + if ( memEQs(start_verb,verb_len,"ACCEPT") ) { + op = ACCEPT; + internal_argval = RExC_nestroot; + } + break; + case 'C': /* (*COMMIT) */ + if ( memEQs(start_verb,verb_len,"COMMIT") ) + op = COMMIT; + break; + case 'F': /* (*FAIL) */ + if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { + op = OPFAIL; + argok = 0; + } + break; + case ':': /* (*:NAME) */ + case 'M': /* (*MARK:NAME) */ + if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { + op = MARKPOINT; + argok = -1; + } + break; + case 'P': /* (*PRUNE) */ + if ( memEQs(start_verb,verb_len,"PRUNE") ) + op = PRUNE; + break; + case 'S': /* (*SKIP) */ + if ( memEQs(start_verb,verb_len,"SKIP") ) + op = SKIP; + break; + case 'T': /* (*THEN) */ + /* [19:06] :: is then */ + if ( memEQs(start_verb,verb_len,"THEN") ) { + op = CUTGROUP; + RExC_seen |= REG_CUTGROUP_SEEN; + } + break; + } + if ( ! op ) { + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL2utf8f( + "Unknown verb pattern '%"UTF8f"'", + UTF8fARG(UTF, verb_len, start_verb)); + } + if ( argok ) { + if ( start_arg && internal_argval ) { + vFAIL3("Verb pattern '%.*s' may not have an argument", + verb_len, start_verb); + } else if ( argok < 0 && !start_arg ) { + vFAIL3("Verb pattern '%.*s' has a mandatory argument", + verb_len, start_verb); + } else { + ret = reganode(pRExC_state, op, internal_argval); + if ( ! internal_argval && ! SIZE_ONLY ) { + if (start_arg) { + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); + RExC_rxi->data->data[ARG(ret)]=(void*)sv; + ret->flags = 0; + } else { + ret->flags = 1; + } + } + } + if (!internal_argval) + RExC_seen |= REG_VERBARG_SEEN; + } else if ( start_arg ) { + vFAIL3("Verb pattern '%.*s' may not have an argument", + verb_len, start_verb); + } else { + ret = reg_node(pRExC_state, op); + } + nextchar(pRExC_state); + return ret; + } + else if (*RExC_parse == '?') { /* (?...) */ + bool is_logical = 0; + const char * const seqstart = RExC_parse; + const char * endptr; + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(?...)', the '(' and '?' must be adjacent"); + } + + RExC_parse++; + paren = *RExC_parse++; + ret = NULL; /* For look-ahead/behind. */ + switch (paren) { + + case 'P': /* (?P...) variants for those used to PCRE/Python */ + paren = *RExC_parse++; + if ( paren == '<') /* (?P<...>) named capture */ + goto named_capture; + else if (paren == '>') { /* (?P>name) named recursion */ + goto named_recursion; + } + else if (paren == '=') { /* (?P=...) named backref */ + /* this pretty much dupes the code for \k in + * regatom(), if you change this make sure you change that + * */ + char* name_start = RExC_parse; + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + if (RExC_parse == name_start || *RExC_parse != ')') + /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.3s... not terminated",parse_start); + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (ASCII_FOLD_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); + *flagp |= HASWIDTH; + + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + + nextchar(pRExC_state); + return ret; + } + RExC_parse++; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL3("Sequence (%.*s...) not recognized", + RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + case '<': /* (?<...) */ + if (*RExC_parse == '!') + paren = ','; + else if (*RExC_parse != '=') + named_capture: + { /* (?<...>) */ + char *name_start; + SV *svname; + paren= '>'; + case '\'': /* (?'...') */ + name_start= RExC_parse; + svname = reg_scan_name(pRExC_state, + SIZE_ONLY /* reverse test from the others */ + ? REG_RSN_RETURN_NAME + : REG_RSN_RETURN_NULL); + if (RExC_parse == name_start || *RExC_parse != paren) + vFAIL2("Sequence (?%c... not terminated", + paren=='>' ? '<' : paren); + if (SIZE_ONLY) { + HE *he_str; + SV *sv_dat = NULL; + if (!svname) /* shouldn't happen */ + Perl_croak(aTHX_ + "panic: reg_scan_name returned NULL"); + if (!RExC_paren_names) { + RExC_paren_names= newHV(); + sv_2mortal(MUTABLE_SV(RExC_paren_names)); +#ifdef DEBUGGING + RExC_paren_name_list= newAV(); + sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); +#endif + } + he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! sv_dat ) { + /* croak baby croak */ + Perl_croak(aTHX_ + "panic: paren_name hash element allocation failed"); + } else if ( SvPOK(sv_dat) ) { + /* (?|...) can mean we have dupes so scan to check + its already been stored. Maybe a flag indicating + we are inside such a construct would be useful, + but the arrays are likely to be quite small, so + for now we punt -- dmq */ + IV count = SvIV(sv_dat); + I32 *pv = (I32*)SvPVX(sv_dat); + IV i; + for ( i = 0 ; i < count ; i++ ) { + if ( pv[i] == RExC_npar ) { + count = 0; + break; + } + } + if ( count ) { + pv = (I32*)SvGROW(sv_dat, + SvCUR(sv_dat) + sizeof(I32)+1); + SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); + pv[count] = RExC_npar; + SvIV_set(sv_dat, SvIVX(sv_dat) + 1); + } + } else { + (void)SvUPGRADE(sv_dat,SVt_PVNV); + sv_setpvn(sv_dat, (char *)&(RExC_npar), + sizeof(I32)); + SvIOK_on(sv_dat); + SvIV_set(sv_dat, 1); + } +#ifdef DEBUGGING + /* Yes this does cause a memory leak in debugging Perls + * */ + if (!av_store(RExC_paren_name_list, + RExC_npar, SvREFCNT_inc(svname))) + SvREFCNT_dec_NN(svname); +#endif + + /*sv_dump(sv_dat);*/ + } + nextchar(pRExC_state); + paren = 1; + goto capturing_parens; + } + RExC_seen |= REG_LOOKBEHIND_SEEN; + RExC_in_lookbehind++; + RExC_parse++; + /* FALLTHROUGH */ + case '=': /* (?=...) */ + RExC_seen_zerolen++; + break; + case '!': /* (?!...) */ + RExC_seen_zerolen++; + if (*RExC_parse == ')') { + ret=reg_node(pRExC_state, OPFAIL); + nextchar(pRExC_state); + return ret; + } + break; + case '|': /* (?|...) */ + /* branch reset, behave like a (?:...) except that + buffers in alternations share the same numbers */ + paren = ':'; + after_freeze = freeze_paren = RExC_npar; + break; + case ':': /* (?:...) */ + case '>': /* (?>...) */ + break; + case '$': /* (?$...) */ + case '@': /* (?@...) */ + vFAIL2("Sequence (?%c...) not implemented", (int)paren); + break; + case '0' : /* (?0) */ + case 'R' : /* (?R) */ + if (*RExC_parse != ')') + FAIL("Sequence (?R) not terminated"); + ret = reg_node(pRExC_state, GOSTART); + RExC_seen |= REG_GOSTART_SEEN; + *flagp |= POSTPONED; + nextchar(pRExC_state); + return ret; + /*notreached*/ + /* named and numeric backreferences */ + case '&': /* (?&NAME) */ + parse_start = RExC_parse - 1; + named_recursion: + { + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + } + if (RExC_parse == RExC_end || *RExC_parse != ')') + vFAIL("Sequence (?&... not terminated"); + goto gen_recurse_regop; + assert(0); /* NOT REACHED */ + case '+': + if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { + RExC_parse++; + vFAIL("Illegal pattern"); + } + goto parse_recursion; + /* NOT REACHED*/ + case '-': /* (?-1) */ + if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { + RExC_parse--; /* rewind to let it be handled later */ + goto parse_flags; + } + /* FALLTHROUGH */ + case '1': case '2': case '3': case '4': /* (?1) */ + case '5': case '6': case '7': case '8': case '9': + RExC_parse--; + parse_recursion: + { + bool is_neg = FALSE; + parse_start = RExC_parse - 1; /* MJD */ + if (*RExC_parse == '-') { + RExC_parse++; + is_neg = TRUE; + } + num = grok_atou(RExC_parse, &endptr); + if (endptr) + RExC_parse = (char*)endptr; + if (is_neg) { + /* Some limit for num? */ + num = -num; + } + } + if (*RExC_parse!=')') + vFAIL("Expecting close bracket"); + + gen_recurse_regop: + if ( paren == '-' ) { + /* + Diagram of capture buffer numbering. + Top line is the normal capture buffer numbers + Bottom line is the negative indexing as from + the X (the (?-2)) + + + 1 2 3 4 5 X 6 7 + /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ + - 5 4 3 2 1 X x x + + */ + num = RExC_npar + num; + if (num < 1) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + } else if ( paren == '+' ) { + num = RExC_npar + num - 1; + } + + ret = reganode(pRExC_state, GOSUB, num); + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + ARG2L_SET( ret, RExC_recurse_count++); + RExC_emit++; + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Recurse #%"UVuf" to %"IVdf"\n", + (UV)ARG(ret), (IV)ARG2L(ret))); + } else { + RExC_size++; + } + RExC_seen |= REG_RECURSE_SEEN; + Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ + Set_Node_Offset(ret, parse_start); /* MJD */ + + *flagp |= POSTPONED; + nextchar(pRExC_state); + return ret; + + assert(0); /* NOT REACHED */ + + case '?': /* (??...) */ + is_logical = 1; + if (*RExC_parse != '{') { + RExC_parse++; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f( + "Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); + /*NOTREACHED*/ + } + *flagp |= POSTPONED; + paren = *RExC_parse++; + /* FALLTHROUGH */ + case '{': /* (?{...}) */ + { + U32 n = 0; + struct reg_code_block *cb; + + RExC_seen_zerolen++; + + if ( !pRExC_state->num_code_blocks + || pRExC_state->code_index >= pRExC_state->num_code_blocks + || pRExC_state->code_blocks[pRExC_state->code_index].start + != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) + - RExC_start) + ) { + if (RExC_pm_flags & PMf_USE_RE_EVAL) + FAIL("panic: Sequence (?{...}): no code block found\n"); + FAIL("Eval-group not allowed at runtime, use re 'eval'"); + } + /* this is a pre-compiled code block (?{...}) */ + cb = &pRExC_state->code_blocks[pRExC_state->code_index]; + RExC_parse = RExC_start + cb->end; + if (!SIZE_ONLY) { + OP *o = cb->block; + if (cb->src_regex) { + n = add_data(pRExC_state, STR_WITH_LEN("rl")); + RExC_rxi->data->data[n] = + (void*)SvREFCNT_inc((SV*)cb->src_regex); + RExC_rxi->data->data[n+1] = (void*)o; + } + else { + n = add_data(pRExC_state, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); + RExC_rxi->data->data[n] = (void*)o; + } + } + pRExC_state->code_index++; + nextchar(pRExC_state); + + if (is_logical) { + regnode *eval; + ret = reg_node(pRExC_state, LOGICAL); + eval = reganode(pRExC_state, EVAL, n); + if (!SIZE_ONLY) { + ret->flags = 2; + /* for later propagation into (??{}) return value */ + eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME); + } + REGTAIL(pRExC_state, ret, eval); + /* deal with the length of this later - MJD */ + return ret; + } + ret = reganode(pRExC_state, EVAL, n); + Set_Node_Length(ret, RExC_parse - parse_start + 1); + Set_Node_Offset(ret, parse_start); + return ret; + } + case '(': /* (?(?{...})...) and (?(?=...)...) */ + { + int is_define= 0; + if (RExC_parse[0] == '?') { /* (?(?...)) */ + if (RExC_parse[1] == '=' || RExC_parse[1] == '!' + || RExC_parse[1] == '<' + || RExC_parse[1] == '{') { /* Lookahead or eval. */ + I32 flag; + regnode *tail; + + ret = reg_node(pRExC_state, LOGICAL); + if (!SIZE_ONLY) + ret->flags = 1; + + tail = reg(pRExC_state, 1, &flag, depth+1); + if (flag & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + REGTAIL(pRExC_state, ret, tail); + goto insert_if; + } + /* Fall through to ‘Unknown switch condition’ at the + end of the if/else chain. */ + } + else if ( RExC_parse[0] == '<' /* (?()...) */ + || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ + { + char ch = RExC_parse[0] == '<' ? '>' : '\''; + char *name_start= RExC_parse++; + U32 num = 0; + SV *sv_dat=reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + if (RExC_parse == name_start || *RExC_parse != ch) + vFAIL2("Sequence (?(%c... not terminated", + (ch == '>' ? '<' : ch)); + RExC_parse++; + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + ret = reganode(pRExC_state,NGROUPP,num); + goto insert_if_check_paren; + } + else if (RExC_parse[0] == 'D' && + RExC_parse[1] == 'E' && + RExC_parse[2] == 'F' && + RExC_parse[3] == 'I' && + RExC_parse[4] == 'N' && + RExC_parse[5] == 'E') + { + ret = reganode(pRExC_state,DEFINEP,0); + RExC_parse +=6 ; + is_define = 1; + goto insert_if_check_paren; + } + else if (RExC_parse[0] == 'R') { + RExC_parse++; + parno = 0; + if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + parno = grok_atou(RExC_parse, &endptr); + if (endptr) + RExC_parse = (char*)endptr; + } else if (RExC_parse[0] == '&') { + SV *sv_dat; + RExC_parse++; + sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY + ? REG_RSN_RETURN_NULL + : REG_RSN_RETURN_DATA); + parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + } + ret = reganode(pRExC_state,INSUBP,parno); + goto insert_if_check_paren; + } + else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + /* (?(1)...) */ + char c; + char *tmp; + parno = grok_atou(RExC_parse, &endptr); + if (endptr) + RExC_parse = (char*)endptr; + ret = reganode(pRExC_state, GROUPP, parno); + + insert_if_check_paren: + if (*(tmp = nextchar(pRExC_state)) != ')') { + /* nextchar also skips comments, so undo its work + * and skip over the the next character. + */ + RExC_parse = tmp; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Switch condition not recognized"); + } + insert_if: + REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); + br = regbranch(pRExC_state, &flags, 1,depth+1); + if (br == NULL) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", + (UV) flags); + } else + REGTAIL(pRExC_state, br, reganode(pRExC_state, + LONGJMP, 0)); + c = *nextchar(pRExC_state); + if (flags&HASWIDTH) + *flagp |= HASWIDTH; + if (c == '|') { + if (is_define) + vFAIL("(?(DEFINE)....) does not allow branches"); + + /* Fake one for optimizer. */ + lastbr = reganode(pRExC_state, IFTHEN, 0); + + if (!regbranch(pRExC_state, &flags, 1,depth+1)) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", + (UV) flags); + } + REGTAIL(pRExC_state, ret, lastbr); + if (flags&HASWIDTH) + *flagp |= HASWIDTH; + c = *nextchar(pRExC_state); + } + else + lastbr = NULL; + if (c != ')') + vFAIL("Switch (?(condition)... contains too many branches"); + ender = reg_node(pRExC_state, TAIL); + REGTAIL(pRExC_state, br, ender); + if (lastbr) { + REGTAIL(pRExC_state, lastbr, ender); + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); + } + else + REGTAIL(pRExC_state, ret, ender); + RExC_size++; /* XXX WHY do we need this?!! + For large programs it seems to be required + but I can't figure out why. -- dmq*/ + return ret; + } + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unknown switch condition (?(...))"); + } + case '[': /* (?[ ... ]) */ + return handle_regex_sets(pRExC_state, NULL, flagp, depth, + oregcomp_parse); + case 0: + RExC_parse--; /* for vFAIL to print correctly */ + vFAIL("Sequence (? incomplete"); + break; + default: /* e.g., (?i) */ + --RExC_parse; + parse_flags: + parse_lparen_question_flags(pRExC_state); + if (UCHARAT(RExC_parse) != ':') { + nextchar(pRExC_state); + *flagp = TRYAGAIN; + return NULL; + } + paren = ':'; + nextchar(pRExC_state); + ret = NULL; + goto parse_rest; + } /* end switch */ + } + else { /* (...) */ + capturing_parens: + parno = RExC_npar; + RExC_npar++; + + ret = reganode(pRExC_state, OPEN, parno); + if (!SIZE_ONLY ){ + if (!RExC_nestroot) + RExC_nestroot = parno; + if (RExC_seen & REG_RECURSE_SEEN + && !RExC_open_parens[parno-1]) + { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Setting open paren #%"IVdf" to %d\n", + (IV)parno, REG_NODE_NUM(ret))); + RExC_open_parens[parno-1]= ret; + } + } + Set_Node_Length(ret, 1); /* MJD */ + Set_Node_Offset(ret, RExC_parse); /* MJD */ + is_open = 1; + } + } + else /* ! paren */ + ret = NULL; + + parse_rest: + /* Pick up the branches, linking them together. */ + parse_start = RExC_parse; /* MJD */ + br = regbranch(pRExC_state, &flags, 1,depth+1); + + /* branch_len = (paren != 0); */ + + if (br == NULL) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + } + if (*RExC_parse == '|') { + if (!SIZE_ONLY && RExC_extralen) { + reginsert(pRExC_state, BRANCHJ, br, depth+1); + } + else { /* MJD */ + reginsert(pRExC_state, BRANCH, br, depth+1); + Set_Node_Length(br, paren != 0); + Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); + } + have_branch = 1; + if (SIZE_ONLY) + RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ + } + else if (paren == ':') { + *flagp |= flags&SIMPLE; + } + if (is_open) { /* Starts with OPEN. */ + REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */ + } + else if (paren != '?') /* Not Conditional */ + ret = br; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); + lastbr = br; + while (*RExC_parse == '|') { + if (!SIZE_ONLY && RExC_extralen) { + ender = reganode(pRExC_state, LONGJMP,0); + + /* Append to the previous. */ + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); + } + if (SIZE_ONLY) + RExC_extralen += 2; /* Account for LONGJMP. */ + nextchar(pRExC_state); + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } + br = regbranch(pRExC_state, &flags, 0, depth+1); + + if (br == NULL) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + } + REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ + lastbr = br; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); + } + + if (have_branch || paren != ':') { + /* Make a closing node, and hook it on the end. */ + switch (paren) { + case ':': + ender = reg_node(pRExC_state, TAIL); + break; + case 1: case 2: + ender = reganode(pRExC_state, CLOSE, parno); + if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Setting close paren #%"IVdf" to %d\n", + (IV)parno, REG_NODE_NUM(ender))); + RExC_close_parens[parno-1]= ender; + if (RExC_nestroot == parno) + RExC_nestroot = 0; + } + Set_Node_Offset(ender,RExC_parse+1); /* MJD */ + Set_Node_Length(ender,1); /* MJD */ + break; + case '<': + case ',': + case '=': + case '!': + *flagp &= ~HASWIDTH; + /* FALLTHROUGH */ + case '>': + ender = reg_node(pRExC_state, SUCCEED); + break; + case 0: + ender = reg_node(pRExC_state, END); + if (!SIZE_ONLY) { + assert(!RExC_opend); /* there can only be one! */ + RExC_opend = ender; + } + break; + } + DEBUG_PARSE_r(if (!SIZE_ONLY) { + SV * const mysv_val1=sv_newmortal(); + SV * const mysv_val2=sv_newmortal(); + DEBUG_PARSE_MSG("lsbr"); + regprop(RExC_rx, mysv_val1, lastbr, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); + PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val1), + (IV)REG_NODE_NUM(lastbr), + SvPV_nolen_const(mysv_val2), + (IV)REG_NODE_NUM(ender), + (IV)(ender - lastbr) + ); + }); + REGTAIL(pRExC_state, lastbr, ender); + + if (have_branch && !SIZE_ONLY) { + char is_nothing= 1; + if (depth==1) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; + + /* Hook the tails of the branches to the closing node. */ + for (br = ret; br; br = regnext(br)) { + const U8 op = PL_regkind[OP(br)]; + if (op == BRANCH) { + REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); + if ( OP(NEXTOPER(br)) != NOTHING + || regnext(NEXTOPER(br)) != ender) + is_nothing= 0; + } + else if (op == BRANCHJ) { + REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); + /* for now we always disable this optimisation * / + if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING + || regnext(NEXTOPER(NEXTOPER(br))) != ender) + */ + is_nothing= 0; + } + } + if (is_nothing) { + br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret; + DEBUG_PARSE_r(if (!SIZE_ONLY) { + SV * const mysv_val1=sv_newmortal(); + SV * const mysv_val2=sv_newmortal(); + DEBUG_PARSE_MSG("NADA"); + regprop(RExC_rx, mysv_val1, ret, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); + PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val1), + (IV)REG_NODE_NUM(ret), + SvPV_nolen_const(mysv_val2), + (IV)REG_NODE_NUM(ender), + (IV)(ender - ret) + ); + }); + OP(br)= NOTHING; + if (OP(ender) == TAIL) { + NEXT_OFF(br)= 0; + RExC_emit= br + 1; + } else { + regnode *opt; + for ( opt= br + 1; opt < ender ; opt++ ) + OP(opt)= OPTIMIZED; + NEXT_OFF(br)= ender - br; + } + } + } + } + + { + const char *p; + static const char parens[] = "=!<,>"; + + if (paren && (p = strchr(parens, paren))) { + U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; + int flag = (p - parens) > 1; + + if (paren == '>') + node = SUSPEND, flag = 0; + reginsert(pRExC_state, node,ret, depth+1); + Set_Node_Cur_Length(ret, parse_start); + Set_Node_Offset(ret, parse_start + 1); + ret->flags = flag; + REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); + } + } + + /* Check for proper termination. */ + if (paren) { + /* restore original flags, but keep (?p) */ + RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); + if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ("); + } + } + else if (!paren && RExC_parse < RExC_end) { + if (*RExC_parse == ')') { + RExC_parse++; + vFAIL("Unmatched )"); + } + else + FAIL("Junk on end of regexp"); /* "Can't happen". */ + assert(0); /* NOTREACHED */ + } + + if (RExC_in_lookbehind) { + RExC_in_lookbehind--; + } + if (after_freeze > RExC_npar) + RExC_npar = after_freeze; + return(ret); +} + +/* + - regbranch - one alternative of an | operator + * + * Implements the concatenation operator. + * + * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be + * restarted. + */ +STATIC regnode * +S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) +{ + regnode *ret; + regnode *chain = NULL; + regnode *latest; + I32 flags = 0, c = 0; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGBRANCH; + + DEBUG_PARSE("brnc"); + + if (first) + ret = NULL; + else { + if (!SIZE_ONLY && RExC_extralen) + ret = reganode(pRExC_state, BRANCHJ,0); + else { + ret = reg_node(pRExC_state, BRANCH); + Set_Node_Length(ret, 1); + } + } + + if (!first && SIZE_ONLY) + RExC_extralen += 1; /* BRANCHJ */ + + *flagp = WORST; /* Tentatively. */ + + RExC_parse--; + nextchar(pRExC_state); + while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { + flags &= ~TRYAGAIN; + latest = regpiece(pRExC_state, &flags,depth+1); + if (latest == NULL) { + if (flags & TRYAGAIN) + continue; + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); + } + else if (ret == NULL) + ret = latest; + *flagp |= flags&(HASWIDTH|POSTPONED); + if (chain == NULL) /* First piece. */ + *flagp |= flags&SPSTART; + else { + RExC_naughty++; + REGTAIL(pRExC_state, chain, latest); + } + chain = latest; + c++; + } + if (chain == NULL) { /* Loop ran zero times. */ + chain = reg_node(pRExC_state, NOTHING); + if (ret == NULL) + ret = chain; + } + if (c == 1) { + *flagp |= flags&SIMPLE; + } + + return ret; +} + +/* + - regpiece - something followed by possible [*+?] + * + * Note that the branching code sequences used for ? and the general cases + * of * and + are somewhat optimized: they use the same NOTHING node as + * both the endmarker for their branch list and the body of the last branch. + * It might seem that this node could be dispensed with entirely, but the + * endmarker role is not redundant. + * + * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with + * TRYAGAIN. + * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be + * restarted. + */ +STATIC regnode * +S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) +{ + regnode *ret; + char op; + char *next; + I32 flags; + const char * const origparse = RExC_parse; + I32 min; + I32 max = REG_INFTY; +#ifdef RE_TRACK_PATTERN_OFFSETS + char *parse_start; +#endif + const char *maxpos = NULL; + + /* Save the original in case we change the emitted regop to a FAIL. */ + regnode * const orig_emit = RExC_emit; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPIECE; + + DEBUG_PARSE("piec"); + + ret = regatom(pRExC_state, &flags,depth+1); + if (ret == NULL) { + if (flags & (TRYAGAIN|RESTART_UTF8)) + *flagp |= flags & (TRYAGAIN|RESTART_UTF8); + else + FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); + return(NULL); + } + + op = *RExC_parse; + + if (op == '{' && regcurly(RExC_parse)) { + maxpos = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + parse_start = RExC_parse; /* MJD */ +#endif + next = RExC_parse + 1; + while (isDIGIT(*next) || *next == ',') { + if (*next == ',') { + if (maxpos) + break; + else + maxpos = next; + } + next++; + } + if (*next == '}') { /* got one */ + const char* endptr; + if (!maxpos) + maxpos = next; + RExC_parse++; + min = grok_atou(RExC_parse, &endptr); + if (*maxpos == ',') + maxpos++; + else + maxpos = RExC_parse; + max = grok_atou(maxpos, &endptr); + if (!max && *maxpos != '0') + max = REG_INFTY; /* meaning "infinity" */ + else if (max >= REG_INFTY) + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + RExC_parse = next; + nextchar(pRExC_state); + if (max < min) { /* If can't match, warn and optimize to fail + unconditionally */ + if (SIZE_ONLY) { + ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); + + /* We can't back off the size because we have to reserve + * enough space for all the things we are about to throw + * away, but we can shrink it by the ammount we are about + * to re-use here */ + RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; + } + else { + RExC_emit = orig_emit; + } + ret = reg_node(pRExC_state, OPFAIL); + return ret; + } + else if (min == max + && RExC_parse < RExC_end + && (*RExC_parse == '?' || *RExC_parse == '+')) + { + if (SIZE_ONLY) { + ckWARN2reg(RExC_parse + 1, + "Useless use of greediness modifier '%c'", + *RExC_parse); + } + /* Absorb the modifier, so later code doesn't see nor use + * it */ + nextchar(pRExC_state); + } + + do_curly: + if ((flags&SIMPLE)) { + RExC_naughty += 2 + RExC_naughty / 2; + reginsert(pRExC_state, CURLY, ret, depth+1); + Set_Node_Offset(ret, parse_start+1); /* MJD */ + Set_Node_Cur_Length(ret, parse_start); + } + else { + regnode * const w = reg_node(pRExC_state, WHILEM); + + w->flags = 0; + REGTAIL(pRExC_state, ret, w); + if (!SIZE_ONLY && RExC_extralen) { + reginsert(pRExC_state, LONGJMP,ret, depth+1); + reginsert(pRExC_state, NOTHING,ret, depth+1); + NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ + } + reginsert(pRExC_state, CURLYX,ret, depth+1); + /* MJD hk */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Length(ret, + op == '{' ? (RExC_parse - parse_start) : 1); + + if (!SIZE_ONLY && RExC_extralen) + NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ + REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); + if (SIZE_ONLY) + RExC_whilem_seen++, RExC_extralen += 3; + RExC_naughty += 4 + RExC_naughty; /* compound interest */ + } + ret->flags = 0; + + if (min > 0) + *flagp = WORST; + if (max > 0) + *flagp |= HASWIDTH; + if (!SIZE_ONLY) { + ARG1_SET(ret, (U16)min); + ARG2_SET(ret, (U16)max); + } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + + goto nest_check; + } + } + + if (!ISMULT1(op)) { + *flagp = flags; + return(ret); + } + +#if 0 /* Now runtime fix should be reliable. */ + + /* if this is reinstated, don't forget to put this back into perldiag: + + =item Regexp *+ operand could be empty at {#} in regex m/%s/ + + (F) The part of the regexp subject to either the * or + quantifier + could match an empty string. The {#} shows in the regular + expression about where the problem was discovered. + + */ + + if (!(flags&HASWIDTH) && op != '?') + vFAIL("Regexp *+ operand could be empty"); +#endif + +#ifdef RE_TRACK_PATTERN_OFFSETS + parse_start = RExC_parse; +#endif + nextchar(pRExC_state); + + *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); + + if (op == '*' && (flags&SIMPLE)) { + reginsert(pRExC_state, STAR, ret, depth+1); + ret->flags = 0; + RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + } + else if (op == '*') { + min = 0; + goto do_curly; + } + else if (op == '+' && (flags&SIMPLE)) { + reginsert(pRExC_state, PLUS, ret, depth+1); + ret->flags = 0; + RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + } + else if (op == '+') { + min = 1; + goto do_curly; + } + else if (op == '?') { + min = 0; max = 1; + goto do_curly; + } + nest_check: + if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { + SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ + ckWARN2reg(RExC_parse, + "%"UTF8f" matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); + (void)ReREFCNT_inc(RExC_rx_sv); + } + + if (RExC_parse < RExC_end && *RExC_parse == '?') { + nextchar(pRExC_state); + reginsert(pRExC_state, MINMOD, ret, depth+1); + REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); + } + else + if (RExC_parse < RExC_end && *RExC_parse == '+') { + regnode *ender; + nextchar(pRExC_state); + ender = reg_node(pRExC_state, SUCCEED); + REGTAIL(pRExC_state, ret, ender); + reginsert(pRExC_state, SUSPEND, ret, depth+1); + ret->flags = 0; + ender = reg_node(pRExC_state, TAIL); + REGTAIL(pRExC_state, ret, ender); + } + + if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { + RExC_parse++; + vFAIL("Nested quantifiers"); + } + + return(ret); +} + +STATIC bool +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, + UV *valuep, I32 *flagp, U32 depth, bool in_char_class, + const bool strict /* Apply stricter parsing rules? */ + ) +{ + + /* This is expected to be called by a parser routine that has recognized '\N' + and needs to handle the rest. RExC_parse is expected to point at the first + char following the N at the time of the call. On successful return, + RExC_parse has been updated to point to just after the sequence identified + by this routine, and <*flagp> has been updated. + + The \N may be inside (indicated by the boolean ) or outside a + character class. + + \N may begin either a named sequence, or if outside a character class, mean + to match a non-newline. For non single-quoted regexes, the tokenizer has + attempted to decide which, and in the case of a named sequence, converted it + into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, + where c1... are the characters in the sequence. For single-quoted regexes, + the tokenizer passes the \N sequence through unchanged; this code will not + attempt to determine this nor expand those, instead raising a syntax error. + The net effect is that if the beginning of the passed-in pattern isn't '{U+' + or there is no '}', it signals that this \N occurrence means to match a + non-newline. + + Only the \N{U+...} form should occur in a character class, for the same + reason that '.' inside a character class means to just match a period: it + just doesn't make sense. + + The function raises an error (via vFAIL), and doesn't return for various + syntax errors. Otherwise it returns TRUE and sets or on + success; it returns FALSE otherwise. Returns FALSE, setting *flagp to + RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is + only possible if node_p is non-NULL. + + + If is non-null, it means the caller can accept an input sequence + consisting of a just a single code point; <*valuep> is set to that value + if the input is such. + + If is non-null it signifies that the caller can accept any other + legal sequence (i.e., one that isn't just a single code point). <*node_p> + is set as follows: + 1) \N means not-a-NL: points to a newly created REG_ANY node; + 2) \N{}: points to a new NOTHING node; + 3) otherwise: points to a new EXACT node containing the resolved + string. + Note that FALSE is returned for single code point sequences if is + null. + */ + + char * endbrace; /* '}' following the name */ + char* p; + char *endchar; /* Points to '.' or '}' ending cur char in the input + stream */ + bool has_multiple_chars; /* true if the input stream contains a sequence of + more than one character */ + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_GROK_BSLASH_N; + + GET_RE_DEBUG_FLAGS; + + assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ + + /* The [^\n] meaning of \N ignores spaces and comments under the /x + * modifier. The other meaning does not, so use a temporary until we find + * out which we are being called with */ + p = (RExC_flags & RXf_PMf_EXTENDED) + ? regpatws(pRExC_state, RExC_parse, + TRUE) /* means recognize comments */ + : RExC_parse; + + /* Disambiguate between \N meaning a named character versus \N meaning + * [^\n]. The former is assumed when it can't be the latter. */ + if (*p != '{' || regcurly(p)) { + RExC_parse = p; + if (! node_p) { + /* no bare \N allowed in a charclass */ + if (in_char_class) { + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + return FALSE; + } + RExC_parse--; /* Need to back off so nextchar() doesn't skip the + current char */ + nextchar(pRExC_state); + *node_p = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + Set_Node_Length(*node_p, 1); /* MJD */ + return TRUE; + } + + /* Here, we have decided it should be a named character or sequence */ + + /* The test above made sure that the next real character is a '{', but + * under the /x modifier, it could be separated by space (or a comment and + * \n) and this is not allowed (for consistency with \x{...} and the + * tokenizer handling of \N{NAME}). */ + if (*RExC_parse != '{') { + vFAIL("Missing braces on \\N{}"); + } + + RExC_parse++; /* Skip past the '{' */ + + if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ + || ! (endbrace == RExC_parse /* nothing between the {} */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below + */ + && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) + */ + { + if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ + vFAIL("\\N{NAME} must be resolved by the lexer"); + } + + if (endbrace == RExC_parse) { /* empty: \N{} */ + bool ret = TRUE; + if (node_p) { + *node_p = reg_node(pRExC_state,NOTHING); + } + else if (in_char_class) { + if (SIZE_ONLY && in_char_class) { + if (strict) { + RExC_parse++; /* Position after the "}" */ + vFAIL("Zero length \\N{}"); + } + else { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class"); + } + } + ret = FALSE; + } + else { + return FALSE; + } + nextchar(pRExC_state); + return ret; + } + + RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ + RExC_parse += 2; /* Skip past the 'U+' */ + + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + + /* Code points are separated by dots. If none, there is only one code + * point, and is terminated by the brace */ + has_multiple_chars = (endchar < endbrace); + + if (valuep && (! has_multiple_chars || in_char_class)) { + /* We only pay attention to the first char of + multichar strings being returned in char classes. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. XXX Solution is to recharacterize as + [rest-of-class]|multi1|multi2... */ + + STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); + I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); + + *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); + + /* The tokenizer should have guaranteed validity, but it's possible to + * bypass it by using single quoting, so check */ + if (length_of_hex == 0 + || length_of_hex != (STRLEN)(endchar - RExC_parse) ) + { + RExC_parse += length_of_hex; /* Includes all the valid */ + RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ + ? UTF8SKIP(RExC_parse) + : 1; + /* Guard against malformed utf8 */ + if (RExC_parse >= endchar) { + RExC_parse = endchar; + } + vFAIL("Invalid hexadecimal number in \\N{U+...}"); + } + + if (in_char_class && has_multiple_chars) { + if (strict) { + RExC_parse = endbrace; + vFAIL("\\N{} in character class restricted to one character"); + } + else { + ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + } + } + + RExC_parse = endbrace + 1; + } + else if (! node_p || ! has_multiple_chars) { + + /* Here, the input is legal, but not according to the caller's + * options. We fail without advancing the parse, so that the + * caller can try again */ + RExC_parse = p; + return FALSE; + } + else { + + /* What is done here is to convert this to a sub-pattern of the form + * (?:\x{char1}\x{char2}...) + * and then call reg recursively. That way, it retains its atomicness, + * while not having to worry about special handling that some code + * points may have. toke.c has converted the original Unicode values + * to native, so that we can just pass on the hex values unchanged. We + * do have to set a flag to keep recoding from happening in the + * recursion */ + + SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); + STRLEN len; + char *orig_end = RExC_end; + I32 flags; + + while (RExC_parse < endbrace) { + + /* Convert to notation the rest of the code understands */ + sv_catpv(substitute_parse, "\\x{"); + sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); + sv_catpv(substitute_parse, "}"); + + /* Point to the beginning of the next character in the sequence. */ + RExC_parse = endchar + 1; + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + } + sv_catpv(substitute_parse, ")"); + + RExC_parse = SvPV(substitute_parse, len); + + /* Don't allow empty number */ + if (len < 8) { + vFAIL("Invalid hexadecimal number in \\N{U+...}"); + } + RExC_end = RExC_parse + len; + + /* The values are Unicode, and therefore not subject to recoding */ + RExC_override_recoding = 1; + + if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return FALSE; + } + FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", + (UV) flags); + } + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); + + RExC_parse = endbrace; + RExC_end = orig_end; + RExC_override_recoding = 0; + + nextchar(pRExC_state); + } + + return TRUE; +} + + +/* + * reg_recode + * + * It returns the code point in utf8 for the value in *encp. + * value: a code value in the source encoding + * encp: a pointer to an Encode object + * + * If the result from Encode is not a single character, + * it returns U+FFFD (Replacement character) and sets *encp to NULL. + */ +STATIC UV +S_reg_recode(pTHX_ const char value, SV **encp) +{ + STRLEN numlen = 1; + SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); + const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); + const STRLEN newlen = SvCUR(sv); + UV uv = UNICODE_REPLACEMENT; + + PERL_ARGS_ASSERT_REG_RECODE; + + if (newlen) + uv = SvUTF8(sv) + ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) + : *(U8*)s; + + if (!newlen || numlen != newlen) { + uv = UNICODE_REPLACEMENT; + *encp = NULL; + } + return uv; +} + +PERL_STATIC_INLINE U8 +S_compute_EXACTish(RExC_state_t *pRExC_state) +{ + U8 op; + + PERL_ARGS_ASSERT_COMPUTE_EXACTISH; + + if (! FOLD) { + return EXACT; + } + + op = get_regex_charset(RExC_flags); + if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { + op--; /* /a is same as /u, and map /aa's offset to what /a's would have + been, so there is no hole */ + } + + return op + EXACTF; +} + +PERL_STATIC_INLINE void +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, + regnode *node, I32* flagp, STRLEN len, UV code_point, + bool downgradable) +{ + /* This knows the details about sizing an EXACTish node, setting flags for + * it (by setting <*flagp>, and potentially populating it with a single + * character. + * + * If (the length in bytes) is non-zero, this function assumes that + * the node has already been populated, and just does the sizing. In this + * case should be the final code point that has already been + * placed into the node. This value will be ignored except that under some + * circumstances <*flagp> is set based on it. + * + * If is zero, the function assumes that the node is to contain only + * the single character given by and calculates what + * should be. In pass 1, it sizes the node appropriately. In pass 2, it + * additionally will populate the node's STRING with or its + * fold if folding. + * + * In both cases <*flagp> is appropriately set + * + * It knows that under FOLD, the Latin Sharp S and UTF characters above + * 255, must be folded (the former only when the rules indicate it can + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ + + bool len_passed_in = cBOOL(len != 0); + U8 character[UTF8_MAXBYTES_CASE+1]; + + PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + + /* Don't bother to check for downgrading in PASS1, as it doesn't make any + * sizing difference, and is extra work that is thrown away */ + if (downgradable && ! PASS2) { + downgradable = FALSE; + } + + if (! len_passed_in) { + if (UTF) { + if (UNI_IS_INVARIANT(code_point)) { + if (LOC || ! FOLD) { /* /l defers folding until runtime */ + *character = (U8) code_point; + } + else { /* Here is /i and not /l (toFOLD() is defined on just + ASCII, which isn't the same thing as INVARIANT on + EBCDIC, but it works there, as the extra invariants + fold to themselves) */ + *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } + } + len = 1; + } + else if (FOLD && (! LOC + || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) + { /* Folding, and ok to do so now */ + UV folded = _to_uni_fold_flags( + code_point, + character, + &len, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } + } + else if (code_point <= MAX_UTF8_TWO_BYTE) { + + /* Not folding this cp, and can output it directly */ + *character = UTF8_TWO_BYTE_HI(code_point); + *(character + 1) = UTF8_TWO_BYTE_LO(code_point); + len = 2; + } + else { + uvchr_to_utf8( character, code_point); + len = UTF8SKIP(character); + } + } /* Else pattern isn't UTF8. */ + else if (! FOLD) { + *character = (U8) code_point; + len = 1; + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { + *character = 's'; + *(character + 1) = 's'; + len = 2; + } + else { + *character = LATIN_SMALL_LETTER_SHARP_S; + len = 1; + } + } + + if (SIZE_ONLY) { + RExC_size += STR_SZ(len); + } + else { + RExC_emit += STR_SZ(len); + STR_LEN(node) = len; + if (! len_passed_in) { + Copy((char *) character, STRING(node), len, char); + } + } + + *flagp |= HASWIDTH; + + /* A single character node is SIMPLE, except for the special-cased SHARP S + * under /di. */ + if ((len == 1 || (UTF && len == UNISKIP(code_point))) + && (code_point != LATIN_SMALL_LETTER_SHARP_S + || ! FOLD || ! DEPENDS_SEMANTICS)) + { + *flagp |= SIMPLE; + } + + /* The OP may not be well defined in PASS1 */ + if (PASS2 && OP(node) == EXACTFL) { + RExC_contains_locale = 1; + } +} + + +/* Parse backref decimal value, unless it's too big to sensibly be a backref, + * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ + +static I32 +S_backref_value(char *p) +{ + const char* endptr; + UV val = grok_atou(p, &endptr); + if (endptr == p || endptr == NULL || val > I32_MAX) + return I32_MAX; + return (I32)val; +} + + +/* + - regatom - the lowest level + + Try to identify anything special at the start of the pattern. If there + is, then handle it as required. This may involve generating a single regop, + such as for an assertion; or it may involve recursing, such as to + handle a () structure. + + If the string doesn't start with something special then we gobble up + as much literal text as we can. + + Once we have been able to handle whatever type of thing started the + sequence, we return. + + Note: we have to be careful with escapes, as they can be both literal + and special, and in the case of \10 and friends, context determines which. + + A summary of the code structure is: + + switch (first_byte) { + cases for each special: + handle this special; + break; + case '\\': + switch (2nd byte) { + cases for each unambiguous special: + handle this special; + break; + cases for each ambigous special/literal: + disambiguate; + if (special) handle here + else goto defchar; + default: // unambiguously literal: + goto defchar; + } + default: // is a literal char + // FALL THROUGH + defchar: + create EXACTish node for literal; + while (more input and node isn't full) { + switch (input_byte) { + cases for each special; + make sure parse pointer is set so that the next call to + regatom will see this special first + goto loopdone; // EXACTish node terminated by prev. char + default: + append char to EXACTISH node; + } + get next input byte; + } + loopdone: + } + return the generated node; + + Specifically there are two separate switches for handling + escape sequences, with the one for handling literal escapes requiring + a dummy entry for all of the special escapes that are actually handled + by the other. + + Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with + TRYAGAIN. + Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be + restarted. + Otherwise does not return NULL. +*/ + +STATIC regnode * +S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) +{ + regnode *ret = NULL; + I32 flags = 0; + char *parse_start = RExC_parse; + U8 op; + int invert = 0; + U8 arg; + + GET_RE_DEBUG_FLAGS_DECL; + + *flagp = WORST; /* Tentatively. */ + + DEBUG_PARSE("atom"); + + PERL_ARGS_ASSERT_REGATOM; + +tryagain: + switch ((U8)*RExC_parse) { + case '^': + RExC_seen_zerolen++; + nextchar(pRExC_state); + if (RExC_flags & RXf_PMf_MULTILINE) + ret = reg_node(pRExC_state, MBOL); + else if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SBOL); + else + ret = reg_node(pRExC_state, BOL); + Set_Node_Length(ret, 1); /* MJD */ + break; + case '$': + nextchar(pRExC_state); + if (*RExC_parse) + RExC_seen_zerolen++; + if (RExC_flags & RXf_PMf_MULTILINE) + ret = reg_node(pRExC_state, MEOL); + else if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SEOL); + else + ret = reg_node(pRExC_state, EOL); + Set_Node_Length(ret, 1); /* MJD */ + break; + case '.': + nextchar(pRExC_state); + if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SANY); + else + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + Set_Node_Length(ret, 1); /* MJD */ + break; + case '[': + { + char * const oregcomp_parse = ++RExC_parse; + ret = regclass(pRExC_state, flagp,depth+1, + FALSE, /* means parse the whole char class */ + TRUE, /* allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. */ + NULL); + if (*RExC_parse != ']') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ["); + } + if (ret == NULL) { + if (*flagp & RESTART_UTF8) + return NULL; + FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", + (UV) *flagp); + } + nextchar(pRExC_state); + Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ + break; + } + case '(': + nextchar(pRExC_state); + ret = reg(pRExC_state, 2, &flags,depth+1); + if (ret == NULL) { + if (flags & TRYAGAIN) { + if (RExC_parse == RExC_end) { + /* Make parent create an empty node if needed. */ + *flagp |= TRYAGAIN; + return(NULL); + } + goto tryagain; + } + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", + (UV) flags); + } + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); + break; + case '|': + case ')': + if (flags & TRYAGAIN) { + *flagp |= TRYAGAIN; + return NULL; + } + vFAIL("Internal urp"); + /* Supposed to be caught earlier. */ + break; + case '?': + case '+': + case '*': + RExC_parse++; + vFAIL("Quantifier follows nothing"); + break; + case '\\': + /* Special Escapes + + This switch handles escape sequences that resolve to some kind + of special regop and not to literal text. Escape sequnces that + resolve to literal text are handled below in the switch marked + "Literal Escapes". + + Every entry in this switch *must* have a corresponding entry + in the literal escape switch. However, the opposite is not + required, as the default for this switch is to jump to the + literal text handling code. + */ + switch ((U8)*++RExC_parse) { + /* Special Escapes */ + case 'A': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, SBOL); + *flagp |= SIMPLE; + goto finish_meta_pat; + case 'G': + ret = reg_node(pRExC_state, GPOS); + RExC_seen |= REG_GPOS_SEEN; + *flagp |= SIMPLE; + goto finish_meta_pat; + case 'K': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, KEEPS); + *flagp |= SIMPLE; + /* XXX:dmq : disabling in-place substitution seems to + * be necessary here to avoid cases of memory corruption, as + * with: C<$_="x" x 80; s/x\K/y/> -- rgs + */ + RExC_seen |= REG_LOOKBEHIND_SEEN; + goto finish_meta_pat; + case 'Z': + ret = reg_node(pRExC_state, SEOL); + *flagp |= SIMPLE; + RExC_seen_zerolen++; /* Do not optimize RE away */ + goto finish_meta_pat; + case 'z': + ret = reg_node(pRExC_state, EOS); + *flagp |= SIMPLE; + RExC_seen_zerolen++; /* Do not optimize RE away */ + goto finish_meta_pat; + case 'C': + ret = reg_node(pRExC_state, CANY); + RExC_seen |= REG_CANY_SEEN; + *flagp |= HASWIDTH|SIMPLE; + if (SIZE_ONLY) { + ckWARNdep(RExC_parse+1, "\\C is deprecated"); + } + goto finish_meta_pat; + case 'X': + ret = reg_node(pRExC_state, CLUMP); + *flagp |= HASWIDTH; + goto finish_meta_pat; + + case 'W': + invert = 1; + /* FALLTHROUGH */ + case 'w': + arg = ANYOF_WORDCHAR; + goto join_posix; + + case 'b': + RExC_seen_zerolen++; + RExC_seen |= REG_LOOKBEHIND_SEEN; + op = BOUND + get_regex_charset(RExC_flags); + if (op > BOUNDA) { /* /aa is same as /a */ + op = BOUNDA; + } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } + ret = reg_node(pRExC_state, op); + FLAGS(ret) = get_regex_charset(RExC_flags); + *flagp |= SIMPLE; + if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); + } + goto finish_meta_pat; + case 'B': + RExC_seen_zerolen++; + RExC_seen |= REG_LOOKBEHIND_SEEN; + op = NBOUND + get_regex_charset(RExC_flags); + if (op > NBOUNDA) { /* /aa is same as /a */ + op = NBOUNDA; + } + else if (op == NBOUNDL) { + RExC_contains_locale = 1; + } + ret = reg_node(pRExC_state, op); + FLAGS(ret) = get_regex_charset(RExC_flags); + *flagp |= SIMPLE; + if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); + } + goto finish_meta_pat; + + case 'D': + invert = 1; + /* FALLTHROUGH */ + case 'd': + arg = ANYOF_DIGIT; + goto join_posix; + + case 'R': + ret = reg_node(pRExC_state, LNBREAK); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + + case 'H': + invert = 1; + /* FALLTHROUGH */ + case 'h': + arg = ANYOF_BLANK; + op = POSIXU; + goto join_posix_op_known; + + case 'V': + invert = 1; + /* FALLTHROUGH */ + case 'v': + arg = ANYOF_VERTWS; + op = POSIXU; + goto join_posix_op_known; + + case 'S': + invert = 1; + /* FALLTHROUGH */ + case 's': + arg = ANYOF_SPACE; + + join_posix: + + op = POSIXD + get_regex_charset(RExC_flags); + if (op > POSIXA) { /* /aa is same as /a */ + op = POSIXA; + } + else if (op == POSIXL) { + RExC_contains_locale = 1; + } + + join_posix_op_known: + + if (invert) { + op += NPOSIXD - POSIXD; + } + + ret = reg_node(pRExC_state, op); + if (! SIZE_ONLY) { + FLAGS(ret) = namedclass_to_classnum(arg); + } + + *flagp |= HASWIDTH|SIMPLE; + /* FALLTHROUGH */ + + finish_meta_pat: + nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ + break; + case 'p': + case 'P': + { +#ifdef DEBUGGING + char* parse_start = RExC_parse - 2; +#endif + + RExC_parse--; + + ret = regclass(pRExC_state, flagp,depth+1, + TRUE, /* means just parse this element */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. + It would be a bug if these returned + non-portables */ + NULL); + /* regclass() can only return RESTART_UTF8 if multi-char folds + are allowed. */ + if (!ret) + FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", + (UV) *flagp); + + RExC_parse--; + + Set_Node_Offset(ret, parse_start + 2); + Set_Node_Cur_Length(ret, parse_start); + nextchar(pRExC_state); + } + break; + case 'N': + /* Handle \N and \N{NAME} with multiple code points here and not + * below because it can be multicharacter. join_exact() will join + * them up later on. Also this makes sure that things like + * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq. + * The options to the grok function call causes it to fail if the + * sequence is just a single code point. We then go treat it as + * just another character in the current EXACT node, and hence it + * gets uniform treatment with all the other characters. The + * special treatment for quantifiers is not needed for such single + * character sequences */ + ++RExC_parse; + if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE, + FALSE /* not strict */ )) { + if (*flagp & RESTART_UTF8) + return NULL; + RExC_parse--; + goto defchar; + } + break; + case 'k': /* Handle \k and \k'NAME' */ + parse_named_seq: + { + char ch= RExC_parse[1]; + if (ch != '<' && ch != '\'' && ch != '{') { + RExC_parse++; + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.2s... not terminated",parse_start); + } else { + /* this pretty much dupes the code for (?P=...) in reg(), if + you change this make sure you change that */ + char* name_start = (RExC_parse += 2); + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; + if (RExC_parse == name_start || *RExC_parse != ch) + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.3s... not terminated",parse_start); + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (ASCII_FOLD_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); + *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + nextchar(pRExC_state); + + } + break; + } + case 'g': + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + { + I32 num; + bool hasbrace = 0; + + if (*RExC_parse == 'g') { + bool isrel = 0; + + RExC_parse++; + if (*RExC_parse == '{') { + RExC_parse++; + hasbrace = 1; + } + if (*RExC_parse == '-') { + RExC_parse++; + isrel = 1; + } + if (hasbrace && !isDIGIT(*RExC_parse)) { + if (isrel) RExC_parse--; + RExC_parse -= 2; + goto parse_named_seq; + } + + num = S_backref_value(RExC_parse); + if (num == 0) + vFAIL("Reference to invalid group 0"); + else if (num == I32_MAX) { + if (isDIGIT(*RExC_parse)) + vFAIL("Reference to nonexistent group"); + else + vFAIL("Unterminated \\g... pattern"); + } + + if (isrel) { + num = RExC_npar - num; + if (num < 1) + vFAIL("Reference to nonexistent or unclosed group"); + } + } + else { + num = S_backref_value(RExC_parse); + /* bare \NNN might be backref or octal - if it is larger than or equal + * RExC_npar then it is assumed to be and octal escape. + * Note RExC_npar is +1 from the actual number of parens*/ + if (num == I32_MAX || (num > 9 && num >= RExC_npar + && *RExC_parse != '8' && *RExC_parse != '9')) + { + /* Probably a character specified in octal, e.g. \35 */ + goto defchar; + } + } + + /* at this point RExC_parse definitely points to a backref + * number */ + { +#ifdef RE_TRACK_PATTERN_OFFSETS + char * const parse_start = RExC_parse - 1; /* MJD */ +#endif + while (isDIGIT(*RExC_parse)) + RExC_parse++; + if (hasbrace) { + if (*RExC_parse != '}') + vFAIL("Unterminated \\g{...} pattern"); + RExC_parse++; + } + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) + vFAIL("Reference to nonexistent group"); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? REF + : (ASCII_FOLD_RESTRICTED) + ? REFFA + : (AT_LEAST_UNI_SEMANTICS) + ? REFFU + : (LOC) + ? REFFL + : REFF), + num); + *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + RExC_parse--; + nextchar(pRExC_state); + } + } + break; + case '\0': + if (RExC_parse >= RExC_end) + FAIL("Trailing \\"); + /* FALLTHROUGH */ + default: + /* Do not generate "unrecognized" warnings here, we fall + back into the quick-grab loop below */ + parse_start--; + goto defchar; + } + break; + + case '#': + if (RExC_flags & RXf_PMf_EXTENDED) { + RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); + if (RExC_parse < RExC_end) + goto tryagain; + } + /* FALLTHROUGH */ + + default: + + parse_start = RExC_parse - 1; + + RExC_parse++; + + defchar: { + STRLEN len = 0; + UV ender = 0; + char *p; + char *s; +#define MAX_NODE_STRING_SIZE 127 + char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; + char *s0; + U8 upper_parse = MAX_NODE_STRING_SIZE; + U8 node_type = compute_EXACTish(pRExC_state); + bool next_is_quantifier; + char * oldp = NULL; + + /* We can convert EXACTF nodes to EXACTFU if they contain only + * characters that match identically regardless of the target + * string's UTF8ness. The reason to do this is that EXACTF is not + * trie-able, EXACTFU is. + * + * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * contain only above-Latin1 characters (hence must be in UTF8), + * which don't participate in folds with Latin1-range characters, + * as the latter's folds aren't known until runtime. (We don't + * need to figure this out until pass 2) */ + bool maybe_exactfu = PASS2 + && (node_type == EXACTF || node_type == EXACTFL); + + /* If a folding node contains only code points that don't + * participate in folds, it can be changed into an EXACT node, + * which allows the optimizer more things to look for */ + bool maybe_exact; + + ret = reg_node(pRExC_state, node_type); + + /* In pass1, folded, we use a temporary buffer instead of the + * actual node, as the node doesn't exist yet */ + s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret); + + s0 = s; + + reparse: + + /* We do the EXACTFish to EXACT node only if folding. (And we + * don't need to figure this out until pass 2) */ + maybe_exact = FOLD && PASS2; + + /* XXX The node can hold up to 255 bytes, yet this only goes to + * 127. I (khw) do not know why. Keeping it somewhat less than + * 255 allows us to not have to worry about overflow due to + * converting to utf8 and fold expansion, but that value is + * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes + * split up by this limit into a single one using the real max of + * 255. Even at 127, this breaks under rare circumstances. If + * folding, we do not want to split a node at a character that is a + * non-final in a multi-char fold, as an input string could just + * happen to want to match across the node boundary. The join + * would solve that problem if the join actually happens. But a + * series of more than two nodes in a row each of 127 would cause + * the first join to succeed to get to 254, but then there wouldn't + * be room for the next one, which could at be one of those split + * multi-char folds. I don't know of any fool-proof solution. One + * could back off to end with only a code point that isn't such a + * non-final, but it is possible for there not to be any in the + * entire node. */ + for (p = RExC_parse - 1; + len < upper_parse && p < RExC_end; + len++) + { + oldp = p; + + if (RExC_flags & RXf_PMf_EXTENDED) + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ + switch ((U8)*p) { + case '^': + case '$': + case '.': + case '[': + case '(': + case ')': + case '|': + goto loopdone; + case '\\': + /* Literal Escapes Switch + + This switch is meant to handle escape sequences that + resolve to a literal character. + + Every escape sequence that represents something + else, like an assertion or a char class, is handled + in the switch marked 'Special Escapes' above in this + routine, but also has an entry here as anything that + isn't explicitly mentioned here will be treated as + an unescaped equivalent literal. + */ + + switch ((U8)*++p) { + /* These are all the special escapes. */ + case 'A': /* Start assertion */ + case 'b': case 'B': /* Word-boundary assertion*/ + case 'C': /* Single char !DANGEROUS! */ + case 'd': case 'D': /* digit class */ + case 'g': case 'G': /* generic-backref, pos assertion */ + case 'h': case 'H': /* HORIZWS */ + case 'k': case 'K': /* named backref, keep marker */ + case 'p': case 'P': /* Unicode property */ + case 'R': /* LNBREAK */ + case 's': case 'S': /* space class */ + case 'v': case 'V': /* VERTWS */ + case 'w': case 'W': /* word class */ + case 'X': /* eXtended Unicode "combining + character sequence" */ + case 'z': case 'Z': /* End of line/string assertion */ + --p; + goto loopdone; + + /* Anything after here is an escape that resolves to a + literal. (Except digits, which may or may not) + */ + case 'n': + ender = '\n'; + p++; + break; + case 'N': /* Handle a single-code point named character. */ + /* The options cause it to fail if a multiple code + * point sequence. Handle those in the switch() above + * */ + RExC_parse = p + 1; + if (! grok_bslash_N(pRExC_state, NULL, &ender, + flagp, depth, FALSE, + FALSE /* not strict */ )) + { + if (*flagp & RESTART_UTF8) + FAIL("panic: grok_bslash_N set RESTART_UTF8"); + RExC_parse = p = oldp; + goto loopdone; + } + p = RExC_parse; + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + case 'r': + ender = '\r'; + p++; + break; + case 't': + ender = '\t'; + p++; + break; + case 'f': + ender = '\f'; + p++; + break; + case 'e': + ender = ASCII_TO_NATIVE('\033'); + p++; + break; + case 'a': + ender = '\a'; + p++; + break; + case 'o': + { + UV result; + const char* error_msg; + + bool valid = grok_bslash_o(&p, + &result, + &error_msg, + TRUE, /* out warnings */ + FALSE, /* not strict */ + TRUE, /* Output warnings + for non- + portables */ + UTF); + if (! valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); + } + ender = result; + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + } + case 'x': + { + UV result = UV_MAX; /* initialize to erroneous + value */ + const char* error_msg; + + bool valid = grok_bslash_x(&p, + &result, + &error_msg, + TRUE, /* out warnings */ + FALSE, /* not strict */ + TRUE, /* Output warnings + for non- + portables */ + UTF); + if (! valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); + } + ender = result; + + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + } + case 'c': + p++; + ender = grok_bslash_c(*p++, SIZE_ONLY); + break; + case '8': case '9': /* must be a backreference */ + --p; + goto loopdone; + case '1': case '2': case '3':case '4': + case '5': case '6': case '7': + /* When we parse backslash escapes there is ambiguity + * between backreferences and octal escapes. Any escape + * from \1 - \9 is a backreference, any multi-digit + * escape which does not start with 0 and which when + * evaluated as decimal could refer to an already + * parsed capture buffer is a backslash. Anything else + * is octal. + * + * Note this implies that \118 could be interpreted as + * 118 OR as "\11" . "8" depending on whether there + * were 118 capture buffers defined already in the + * pattern. */ + + /* NOTE, RExC_npar is 1 more than the actual number of + * parens we have seen so far, hence the < RExC_npar below. */ + + if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) + { /* Not to be treated as an octal constant, go + find backref */ + --p; + goto loopdone; + } + /* FALLTHROUGH */ + case '0': + { + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; + STRLEN numlen = 3; + ender = grok_oct(p, &numlen, &flags, NULL); + if (ender > 0xff) { + REQUIRE_UTF8; + } + p += numlen; + if (SIZE_ONLY /* like \08, \178 */ + && numlen < 3 + && p < RExC_end + && isDIGIT(*p) && ckWARN(WARN_REGEXP)) + { + reg_warn_non_literal_string( + p + 1, + form_short_octal_warning(p, numlen)); + } + } + if (PL_encoding && ender < 0x100) + goto recode_encoding; + break; + recode_encoding: + if (! RExC_override_recoding) { + SV* enc = PL_encoding; + ender = reg_recode((const char)(U8)ender, &enc); + if (!enc && SIZE_ONLY) + ckWARNreg(p, "Invalid escape in the specified encoding"); + REQUIRE_UTF8; + } + break; + case '\0': + if (p >= RExC_end) + FAIL("Trailing \\"); + /* FALLTHROUGH */ + default: + if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { + /* Include any { following the alpha to emphasize + * that it could be part of an escape at some point + * in the future */ + int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1; + ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p); + } + goto normal_default; + } /* End of switch on '\' */ + break; + case '{': + /* Currently we don't warn when the lbrace is at the start + * of a construct. This catches it in the middle of a + * literal string, or when its the first thing after + * something like "\b" */ + if (! SIZE_ONLY + && (len || (p > RExC_start && isALPHA_A(*(p -1))))) + { + ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through"); + } + /*FALLTHROUGH*/ + default: /* A literal character */ + normal_default: + if (UTF8_IS_START(*p) && UTF) { + STRLEN numlen; + ender = utf8n_to_uvchr((U8*)p, RExC_end - p, + &numlen, UTF8_ALLOW_DEFAULT); + p += numlen; + } + else + ender = (U8) *p++; + break; + } /* End of switch on the literal */ + + /* Here, have looked at the literal character and + * contains its ordinal,

points to the character after it + */ + + if ( RExC_flags & RXf_PMf_EXTENDED) + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ + + /* If the next thing is a quantifier, it applies to this + * character only, which means that this character has to be in + * its own node and can't just be appended to the string in an + * existing node, so if there are already other characters in + * the node, close the node with just them, and set up to do + * this character again next time through, when it will be the + * only thing in its new node */ + if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len) + { + p = oldp; + goto loopdone; + } + + if (! FOLD /* The simple case, just append the literal */ + || (LOC /* Also don't fold for tricky chars under /l */ + && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) + { + if (UTF) { + const STRLEN unilen = reguni(pRExC_state, ender, s); + if (unilen > 0) { + s += unilen; + len += unilen; + } + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len--; + } + else { + REGC((char)ender, s++); + } + + /* Can get here if folding only if is one of the /l + * characters whose fold depends on the locale. The + * occurrence of any of these indicate that we can't + * simplify things */ + if (FOLD) { + maybe_exact = FALSE; + maybe_exactfu = FALSE; + } + } + else /* FOLD */ + if (! ( UTF + /* See comments for join_exact() as to why we fold this + * non-UTF at compile time */ + || (node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S))) + { + /* Here, are folding and are not UTF-8 encoded; therefore + * the character must be in the range 0-255, and is not /l + * (Not /l because we already handled these under /l in + * is_PROBLEMATIC_LOCALE_FOLD_cp */ + if (IS_IN_SOME_FOLD_L1(ender)) { + maybe_exact = FALSE; + + /* See if the character's fold differs between /d and + * /u. This includes the multi-char fold SHARP S to + * 'ss' */ + if (maybe_exactfu + && (PL_fold[ender] != PL_fold_latin1[ender] + || ender == LATIN_SMALL_LETTER_SHARP_S + || (len > 0 + && isARG2_lower_or_UPPER_ARG1('s', ender) + && isARG2_lower_or_UPPER_ARG1('s', + *(s-1))))) + { + maybe_exactfu = FALSE; + } + } + + /* Even when folding, we store just the input character, as + * we have an array that finds its fold quickly */ + *(s++) = (char) ender; + } + else { /* FOLD and UTF */ + /* Unlike the non-fold case, we do actually have to + * calculate the results here in pass 1. This is for two + * reasons, the folded length may be longer than the + * unfolded, and we have to calculate how many EXACTish + * nodes it will take; and we may run out of room in a node + * in the middle of a potential multi-char fold, and have + * to back off accordingly. (Hence we can't use REGC for + * the simple case just below.) */ + + UV folded; + if (isASCII(ender)) { + folded = toFOLD(ender); + *(s)++ = (U8) folded; + } + else { + STRLEN foldlen; + + folded = _to_uni_fold_flags( + ender, + (U8 *) s, + &foldlen, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + s += foldlen; + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len += foldlen - 1; + } + /* If this node only contains non-folding code points so + * far, see if this new one is also non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have to check + * further to see if anything folds to it */ + if (_invlist_contains_cp(PL_utf8_foldable, + ender)) + { + maybe_exact = FALSE; + } + } + } + ender = folded; + } + + if (next_is_quantifier) { + + /* Here, the next input is a quantifier, and to get here, + * the current character is the only one in the node. + * Also, here doesn't include the final byte for this + * character */ + len++; + goto loopdone; + } + + } /* End of loop through literal characters */ + + /* Here we have either exhausted the input or ran out of room in + * the node. (If we encountered a character that can't be in the + * node, transfer is made directly to , and so we + * wouldn't have fallen off the end of the loop.) In the latter + * case, we artificially have to split the node into two, because + * we just don't have enough space to hold everything. This + * creates a problem if the final character participates in a + * multi-character fold in the non-final position, as a match that + * should have occurred won't, due to the way nodes are matched, + * and our artificial boundary. So back off until we find a non- + * problematic character -- one that isn't at the beginning or + * middle of such a fold. (Either it doesn't participate in any + * folds, or appears only in the final position of all the folds it + * does participate in.) A better solution with far fewer false + * positives, and that would fill the nodes more completely, would + * be to actually have available all the multi-character folds to + * test against, and to back-off only far enough to be sure that + * this node isn't ending with a partial one. is set + * further below (if we need to reparse the node) to include just + * up through that final non-problematic character that this code + * identifies, so when it is set to less than the full node, we can + * skip the rest of this */ + if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { + + const STRLEN full_len = len; + + assert(len >= MAX_NODE_STRING_SIZE); + + /* Here, points to the final byte of the final character. + * Look backwards through the string until find a non- + * problematic character */ + + if (! UTF) { + + /* This has no multi-char folds to non-UTF characters */ + if (ASCII_FOLD_RESTRICTED) { + goto loopdone; + } + + while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { } + len = s - s0 + 1; + } + else { + if (! PL_NonL1NonFinalFold) { + PL_NonL1NonFinalFold = _new_invlist_C_array( + NonL1_Perl_Non_Final_Folds_invlist); + } + + /* Point to the first byte of the final character */ + s = (char *) utf8_hop((U8 *) s, -1); + + while (s >= s0) { /* Search backwards until find + non-problematic char */ + if (UTF8_IS_INVARIANT(*s)) { + + /* There are no ascii characters that participate + * in multi-char folds under /aa. In EBCDIC, the + * non-ascii invariants are all control characters, + * so don't ever participate in any folds. */ + if (ASCII_FOLD_RESTRICTED + || ! IS_NON_FINAL_FOLD(*s)) + { + break; + } + } + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + *s, *(s+1)))) + { + break; + } + } + else if (! _invlist_contains_cp( + PL_NonL1NonFinalFold, + valid_utf8_to_uvchr((U8 *) s, NULL))) + { + break; + } + + /* Here, the current character is problematic in that + * it does occur in the non-final position of some + * fold, so try the character before it, but have to + * special case the very first byte in the string, so + * we don't read outside the string */ + s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); + } /* End of loop backwards through the string */ + + /* If there were only problematic characters in the string, + * will point to before s0, in which case the length + * should be 0, otherwise include the length of the + * non-problematic character just found */ + len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); + } + + /* Here, have found the final character, if any, that is + * non-problematic as far as ending the node without splitting + * it across a potential multi-char fold. contains the + * number of bytes in the node up-to and including that + * character, or is 0 if there is no such character, meaning + * the whole node contains only problematic characters. In + * this case, give up and just take the node as-is. We can't + * do any better */ + if (len == 0) { + len = full_len; + + /* If the node ends in an 's' we make sure it stays EXACTF, + * as if it turns into an EXACTFU, it could later get + * joined with another 's' that would then wrongly match + * the sharp s */ + if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) + { + maybe_exactfu = FALSE; + } + } else { + + /* Here, the node does contain some characters that aren't + * problematic. If one such is the final character in the + * node, we are done */ + if (len == full_len) { + goto loopdone; + } + else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) { + + /* If the final character is problematic, but the + * penultimate is not, back-off that last character to + * later start a new node with it */ + p = oldp; + goto loopdone; + } + + /* Here, the final non-problematic character is earlier + * in the input than the penultimate character. What we do + * is reparse from the beginning, going up only as far as + * this final ok one, thus guaranteeing that the node ends + * in an acceptable character. The reason we reparse is + * that we know how far in the character is, but we don't + * know how to correlate its position with the input parse. + * An alternate implementation would be to build that + * correlation as we go along during the original parse, + * but that would entail extra work for every node, whereas + * this code gets executed only when the string is too + * large for the node, and the final two characters are + * problematic, an infrequent occurrence. Yet another + * possible strategy would be to save the tail of the + * string, and the next time regatom is called, initialize + * with that. The problem with this is that unless you + * back off one more character, you won't be guaranteed + * regatom will get called again, unless regbranch, + * regpiece ... are also changed. If you do back off that + * extra character, so that there is input guaranteed to + * force calling regatom, you can't handle the case where + * just the first character in the node is acceptable. I + * (khw) decided to try this method which doesn't have that + * pitfall; if performance issues are found, we can do a + * combination of the current approach plus that one */ + upper_parse = len; + len = 0; + s = s0; + goto reparse; + } + } /* End of verifying node ends with an appropriate char */ + + loopdone: /* Jumped to when encounters something that shouldn't be in + the node */ + + /* I (khw) don't know if you can get here with zero length, but the + * old code handled this situation by creating a zero-length EXACT + * node. Might as well be NOTHING instead */ + if (len == 0) { + OP(ret) = NOTHING; + } + else { + if (FOLD) { + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds; + * similarly for 'maybe_exactfu' and code points that match + * differently depending on UTF8ness of the target string + * (for /u), or depending on locale for /l */ + if (maybe_exact) { + OP(ret) = EXACT; + } + else if (maybe_exactfu) { + OP(ret) = EXACTFU; + } + } + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); + } + + RExC_parse = p - 1; + Set_Node_Cur_Length(ret, parse_start); + nextchar(pRExC_state); + { + /* len is STRLEN which is unsigned, need to copy to signed */ + IV iv = len; + if (iv < 0) + vFAIL("Internal disaster"); + } + + } /* End of label 'defchar:' */ + break; + } /* End of giant switch on input character */ + + return(ret); +} + +STATIC char * +S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) +{ + /* Returns the next non-pattern-white space, non-comment character (the + * latter only if 'recognize_comment is true) in the string p, which is + * ended by RExC_end. See also reg_skipcomment */ + const char *e = RExC_end; + + PERL_ARGS_ASSERT_REGPATWS; + + while (p < e) { + STRLEN len; + if ((len = is_PATWS_safe(p, e, UTF))) { + p += len; + } + else if (recognize_comment && *p == '#') { + p = reg_skipcomment(pRExC_state, p); + } + else + break; + } + return p; +} + +STATIC void +S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) +{ + /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It + * sets up the bitmap and any flags, removing those code points from the + * inversion list, setting it to NULL should it become completely empty */ + + PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; + assert(PL_regkind[OP(node)] == ANYOF); + + ANYOF_BITMAP_ZERO(node); + if (*invlist_ptr) { + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through *invlist_ptr */ + invlist_iterinit(*invlist_ptr); + while (invlist_iternext(*invlist_ptr, &start, &end)) { + UV high; + int i; + + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; + } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } + + /* Quit if are above what we should change */ + if (start > 255) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < 255) ? end : 255; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(node, i)) { + ANYOF_BITMAP_SET(node, i); + } + } + } + invlist_iterfinish(*invlist_ptr); + + /* Done with loop; remove any code points that are in the bitmap from + * *invlist_ptr; similarly for code points above latin1 if we have a + * flag to match all of them anyways */ + if (change_invlist) { + _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + } + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + } + + /* If have completely emptied it, remove it completely */ + if (_invlist_len(*invlist_ptr) == 0) { + SvREFCNT_dec_NN(*invlist_ptr); + *invlist_ptr = NULL; + } + } +} + +/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. + Character classes ([:foo:]) can also be negated ([:^foo:]). + Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. + Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, + but trigger failures because they are currently unimplemented. */ + +#define POSIXCC_DONE(c) ((c) == ':') +#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') +#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) + +PERL_STATIC_INLINE I32 +S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) +{ + I32 namedclass = OOB_NAMEDCLASS; + + PERL_ARGS_ASSERT_REGPPOSIXCC; + + if (value == '[' && RExC_parse + 1 < RExC_end && + /* I smell either [: or [= or [. -- POSIX has been here, right? */ + POSIXCC(UCHARAT(RExC_parse))) + { + const char c = UCHARAT(RExC_parse); + char* const s = RExC_parse++; + + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) + RExC_parse++; + if (RExC_parse == RExC_end) { + if (strict) { + + /* Try to give a better location for the error (than the end of + * the string) by looking for the matching ']' */ + RExC_parse = s; + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { + RExC_parse++; + } + vFAIL2("Unmatched '%c' in POSIX class", c); + } + /* Grandfather lone [:, [=, [. */ + RExC_parse = s; + } + else { + const char* const t = RExC_parse++; /* skip over the c */ + assert(*t == c); + + if (UCHARAT(RExC_parse) == ']') { + const char *posixcc = s + 1; + RExC_parse++; /* skip over the ending ] */ + + if (*s == ':') { + const I32 complement = *posixcc == '^' ? *posixcc++ : 0; + const I32 skip = t - posixcc; + + /* Initially switch on the length of the name. */ + switch (skip) { + case 4: + if (memEQ(posixcc, "word", 4)) /* this is not POSIX, + this is the Perl \w + */ + namedclass = ANYOF_WORDCHAR; + break; + case 5: + /* Names all of length 5. */ + /* alnum alpha ascii blank cntrl digit graph lower + print punct space upper */ + /* Offset 4 gives the best switch position. */ + switch (posixcc[4]) { + case 'a': + if (memEQ(posixcc, "alph", 4)) /* alpha */ + namedclass = ANYOF_ALPHA; + break; + case 'e': + if (memEQ(posixcc, "spac", 4)) /* space */ + namedclass = ANYOF_PSXSPC; + break; + case 'h': + if (memEQ(posixcc, "grap", 4)) /* graph */ + namedclass = ANYOF_GRAPH; + break; + case 'i': + if (memEQ(posixcc, "asci", 4)) /* ascii */ + namedclass = ANYOF_ASCII; + break; + case 'k': + if (memEQ(posixcc, "blan", 4)) /* blank */ + namedclass = ANYOF_BLANK; + break; + case 'l': + if (memEQ(posixcc, "cntr", 4)) /* cntrl */ + namedclass = ANYOF_CNTRL; + break; + case 'm': + if (memEQ(posixcc, "alnu", 4)) /* alnum */ + namedclass = ANYOF_ALPHANUMERIC; + break; + case 'r': + if (memEQ(posixcc, "lowe", 4)) /* lower */ + namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; + else if (memEQ(posixcc, "uppe", 4)) /* upper */ + namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; + break; + case 't': + if (memEQ(posixcc, "digi", 4)) /* digit */ + namedclass = ANYOF_DIGIT; + else if (memEQ(posixcc, "prin", 4)) /* print */ + namedclass = ANYOF_PRINT; + else if (memEQ(posixcc, "punc", 4)) /* punct */ + namedclass = ANYOF_PUNCT; + break; + } + break; + case 6: + if (memEQ(posixcc, "xdigit", 6)) + namedclass = ANYOF_XDIGIT; + break; + } + + if (namedclass == OOB_NAMEDCLASS) + vFAIL2utf8f( + "POSIX class [:%"UTF8f":] unknown", + UTF8fARG(UTF, t - s - 1, s + 1)); + + /* The #defines are structured so each complement is +1 to + * the normal one */ + if (complement) { + namedclass++; + } + assert (posixcc[skip] == ':'); + assert (posixcc[skip+1] == ']'); + } else if (!SIZE_ONLY) { + /* [[=foo=]] and [[.foo.]] are still future. */ + + /* adjust RExC_parse so the warning shows after + the class closes */ + while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') + RExC_parse++; + vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + } + } else { + /* Maternal grandfather: + * "[:" ending in ":" but not in ":]" */ + if (strict) { + vFAIL("Unmatched '[' in POSIX class"); + } + + /* Grandfather lone [:, [=, [. */ + RExC_parse = s; + } + } + } + + return namedclass; +} + +STATIC bool +S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state) +{ + /* This applies some heuristics at the current parse position (which should + * be at a '[') to see if what follows might be intended to be a [:posix:] + * class. It returns true if it really is a posix class, of course, but it + * also can return true if it thinks that what was intended was a posix + * class that didn't quite make it. + * + * It will return true for + * [:alphanumerics: + * [:alphanumerics] (as long as the ] isn't followed immediately by a + * ')' indicating the end of the (?[ + * [:any garbage including %^&$ punctuation:] + * + * This is designed to be called only from S_handle_regex_sets; it could be + * easily adapted to be called from the spot at the beginning of regclass() + * that checks to see in a normal bracketed class if the surrounding [] + * have been omitted ([:word:] instead of [[:word:]]). But doing so would + * change long-standing behavior, so I (khw) didn't do that */ + char* p = RExC_parse + 1; + char first_char = *p; + + PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS; + + assert(*(p - 1) == '['); + + if (! POSIXCC(first_char)) { + return FALSE; + } + + p++; + while (p < RExC_end && isWORDCHAR(*p)) p++; + + if (p >= RExC_end) { + return FALSE; + } + + if (p - RExC_parse > 2 /* Got at least 1 word character */ + && (*p == first_char + || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')'))) + { + return TRUE; + } + + p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse); + + return (p + && p - RExC_parse > 2 /* [:] evaluates to colon; + [::] is a bad posix class. */ + && first_char == *(p - 1)); +} + +STATIC regnode * +S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, + I32 *flagp, U32 depth, + char * const oregcomp_parse) +{ + /* Handle the (?[...]) construct to do set operations */ + + U8 curchar; + UV start, end; /* End points of code point ranges */ + SV* result_string; + char *save_end, *save_parse; + SV* final; + STRLEN len; + regnode* node; + AV* stack; + const bool save_fold = FOLD; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; + + if (LOC) { + vFAIL("(?[...]) not valid in locale"); + } + RExC_uni_semantics = 1; + + /* This will return only an ANYOF regnode, or (unlikely) something smaller + * (such as EXACT). Thus we can skip most everything if just sizing. We + * call regclass to handle '[]' so as to not have to reinvent its parsing + * rules here (throwing away the size it computes each time). And, we exit + * upon an unescaped ']' that isn't one ending a regclass. To do both + * these things, we need to realize that something preceded by a backslash + * is escaped, so we have to keep track of backslashes */ + if (SIZE_ONLY) { + UV depth = 0; /* how many nested (?[...]) constructs */ + + Perl_ck_warner_d(aTHX_ + packWARN(WARN_EXPERIMENTAL__REGEX_SETS), + "The regex_sets feature is experimental" REPORT_LOCATION, + UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), + UTF8fARG(UTF, + RExC_end - RExC_start - (RExC_parse - RExC_precomp), + RExC_precomp + (RExC_parse - RExC_precomp))); + + while (RExC_parse < RExC_end) { + SV* current = NULL; + RExC_parse = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + switch (*RExC_parse) { + case '?': + if (RExC_parse[1] == '[') depth++, RExC_parse++; + /* FALLTHROUGH */ + default: + break; + case '\\': + /* Skip the next byte (which could cause us to end up in + * the middle of a UTF-8 character, but since none of those + * are confusable with anything we currently handle in this + * switch (invariants all), it's safe. We'll just hit the + * default: case next time and keep on incrementing until + * we find one of the invariants we do handle. */ + RExC_parse++; + break; + case '[': + { + /* If this looks like it is a [:posix:] class, leave the + * parse pointer at the '[' to fool regclass() into + * thinking it is part of a '[[:posix:]]'. That function + * will use strict checking to force a syntax error if it + * doesn't work out to a legitimate class */ + bool is_posix_class + = could_it_be_a_POSIX_class(pRExC_state); + if (! is_posix_class) { + RExC_parse++; + } + + /* regclass() can only return RESTART_UTF8 if multi-char + folds are allowed. */ + if (!regclass(pRExC_state, flagp,depth+1, + is_posix_class, /* parse the whole char + class only if not a + posix class */ + FALSE, /* don't allow multi-char folds */ + TRUE, /* silence non-portable warnings. */ + ¤t)) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", + (UV) *flagp); + + /* function call leaves parse pointing to the ']', except + * if we faked it */ + if (is_posix_class) { + RExC_parse--; + } + + SvREFCNT_dec(current); /* In case it returned something */ + break; + } + + case ']': + if (depth--) break; + RExC_parse++; + if (RExC_parse < RExC_end + && *RExC_parse == ')') + { + node = reganode(pRExC_state, ANYOF, 0); + RExC_size += ANYOF_SKIP; + nextchar(pRExC_state); + Set_Node_Length(node, + RExC_parse - oregcomp_parse + 1); /* MJD */ + return node; + } + goto no_close; + } + RExC_parse++; + } + + no_close: + FAIL("Syntax error in (?[...])"); + } + + /* Pass 2 only after this. Everything in this construct is a + * metacharacter. Operands begin with either a '\' (for an escape + * sequence), or a '[' for a bracketed character class. Any other + * character should be an operator, or parenthesis for grouping. Both + * types of operands are handled by calling regclass() to parse them. It + * is called with a parameter to indicate to return the computed inversion + * list. The parsing here is implemented via a stack. Each entry on the + * stack is a single character representing one of the operators, or the + * '('; or else a pointer to an operand inversion list. */ + +#define IS_OPERAND(a) (! SvIOK(a)) + + /* The stack starts empty. It is a syntax error if the first thing parsed + * is a binary operator; everything else is pushed on the stack. When an + * operand is parsed, the top of the stack is examined. If it is a binary + * operator, the item before it should be an operand, and both are replaced + * by the result of doing that operation on the new operand and the one on + * the stack. Thus a sequence of binary operands is reduced to a single + * one before the next one is parsed. + * + * A unary operator may immediately follow a binary in the input, for + * example + * [a] + ! [b] + * When an operand is parsed and the top of the stack is a unary operator, + * the operation is performed, and then the stack is rechecked to see if + * this new operand is part of a binary operation; if so, it is handled as + * above. + * + * A '(' is simply pushed on the stack; it is valid only if the stack is + * empty, or the top element of the stack is an operator or another '(' + * (for which the parenthesized expression will become an operand). By the + * time the corresponding ')' is parsed everything in between should have + * been parsed and evaluated to a single operand (or else is a syntax + * error), and is handled as a regular operand */ + + sv_2mortal((SV *)(stack = newAV())); + + while (RExC_parse < RExC_end) { + I32 top_index = av_tindex(stack); + SV** top_ptr; + SV* current = NULL; + + /* Skip white space */ + RExC_parse = regpatws(pRExC_state, RExC_parse, + TRUE /* means recognize comments */ ); + if (RExC_parse >= RExC_end) { + Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); + } + if ((curchar = UCHARAT(RExC_parse)) == ']') { + break; + } + + switch (curchar) { + + case '?': + if (av_tindex(stack) >= 0 /* This makes sure that we can + safely subtract 1 from + RExC_parse in the next clause. + If we have something on the + stack, we have parsed something + */ + && UCHARAT(RExC_parse - 1) == '(' + && RExC_parse < RExC_end) + { + /* If is a '(?', could be an embedded '(?flags:(?[...])'. + * This happens when we have some thing like + * + * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; + * ... + * qr/(?[ \p{Digit} & $thai_or_lao ])/; + * + * Here we would be handling the interpolated + * '$thai_or_lao'. We handle this by a recursive call to + * ourselves which returns the inversion list the + * interpolated expression evaluates to. We use the flags + * from the interpolated pattern. */ + U32 save_flags = RExC_flags; + const char * const save_parse = ++RExC_parse; + + parse_lparen_question_flags(pRExC_state); + + if (RExC_parse == save_parse /* Makes sure there was at + least one flag (or this + embedding wasn't compiled) + */ + || RExC_parse >= RExC_end - 4 + || UCHARAT(RExC_parse) != ':' + || UCHARAT(++RExC_parse) != '(' + || UCHARAT(++RExC_parse) != '?' + || UCHARAT(++RExC_parse) != '[') + { + + /* In combination with the above, this moves the + * pointer to the point just after the first erroneous + * character (or if there are no flags, to where they + * should have been) */ + if (RExC_parse >= RExC_end - 4) { + RExC_parse = RExC_end; + } + else if (RExC_parse != save_parse) { + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + } + vFAIL("Expecting '(?flags:(?[...'"); + } + RExC_parse++; + (void) handle_regex_sets(pRExC_state, ¤t, flagp, + depth+1, oregcomp_parse); + + /* Here, 'current' contains the embedded expression's + * inversion list, and RExC_parse points to the trailing + * ']'; the next character should be the ')' which will be + * paired with the '(' that has been put on the stack, so + * the whole embedded expression reduces to '(operand)' */ + RExC_parse++; + + RExC_flags = save_flags; + goto handle_operand; + } + /* FALLTHROUGH */ + + default: + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unexpected character"); + + case '\\': + /* regclass() can only return RESTART_UTF8 if multi-char + folds are allowed. */ + if (!regclass(pRExC_state, flagp,depth+1, + TRUE, /* means parse just the next thing */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. */ + ¤t)) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", + (UV) *flagp); + /* regclass() will return with parsing just the \ sequence, + * leaving the parse pointer at the next thing to parse */ + RExC_parse--; + goto handle_operand; + + case '[': /* Is a bracketed character class */ + { + bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state); + + if (! is_posix_class) { + RExC_parse++; + } + + /* regclass() can only return RESTART_UTF8 if multi-char + folds are allowed. */ + if(!regclass(pRExC_state, flagp,depth+1, + is_posix_class, /* parse the whole char class + only if not a posix class */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. */ + ¤t)) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", + (UV) *flagp); + /* function call leaves parse pointing to the ']', except if we + * faked it */ + if (is_posix_class) { + RExC_parse--; + } + + goto handle_operand; + } + + case '&': + case '|': + case '+': + case '-': + case '^': + if (top_index < 0 + || ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) + || ! IS_OPERAND(*top_ptr)) + { + RExC_parse++; + vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar); + } + av_push(stack, newSVuv(curchar)); + break; + + case '!': + av_push(stack, newSVuv(curchar)); + break; + + case '(': + if (top_index >= 0) { + top_ptr = av_fetch(stack, top_index, FALSE); + assert(top_ptr); + if (IS_OPERAND(*top_ptr)) { + RExC_parse++; + vFAIL("Unexpected '(' with no preceding operator"); + } + } + av_push(stack, newSVuv(curchar)); + break; + + case ')': + { + SV* lparen; + if (top_index < 1 + || ! (current = av_pop(stack)) + || ! IS_OPERAND(current) + || ! (lparen = av_pop(stack)) + || IS_OPERAND(lparen) + || SvUV(lparen) != '(') + { + SvREFCNT_dec(current); + RExC_parse++; + vFAIL("Unexpected ')'"); + } + top_index -= 2; + SvREFCNT_dec_NN(lparen); + + /* FALLTHROUGH */ + } + + handle_operand: + + /* Here, we have an operand to process, in 'current' */ + + if (top_index < 0) { /* Just push if stack is empty */ + av_push(stack, current); + } + else { + SV* top = av_pop(stack); + SV *prev = NULL; + char current_operator; + + if (IS_OPERAND(top)) { + SvREFCNT_dec_NN(top); + SvREFCNT_dec_NN(current); + vFAIL("Operand with no preceding operator"); + } + current_operator = (char) SvUV(top); + switch (current_operator) { + case '(': /* Push the '(' back on followed by the new + operand */ + av_push(stack, top); + av_push(stack, current); + SvREFCNT_inc(top); /* Counters the '_dec' done + just after the 'break', so + it doesn't get wrongly freed + */ + break; + + case '!': + _invlist_invert(current); + + /* Unlike binary operators, the top of the stack, + * now that this unary one has been popped off, may + * legally be an operator, and we now have operand + * for it. */ + top_index--; + SvREFCNT_dec_NN(top); + goto handle_operand; + + case '&': + prev = av_pop(stack); + _invlist_intersection(prev, + current, + ¤t); + av_push(stack, current); + break; + + case '|': + case '+': + prev = av_pop(stack); + _invlist_union(prev, current, ¤t); + av_push(stack, current); + break; + + case '-': + prev = av_pop(stack);; + _invlist_subtract(prev, current, ¤t); + av_push(stack, current); + break; + + case '^': /* The union minus the intersection */ + { + SV* i = NULL; + SV* u = NULL; + SV* element; + + prev = av_pop(stack); + _invlist_union(prev, current, &u); + _invlist_intersection(prev, current, &i); + /* _invlist_subtract will overwrite current + without freeing what it already contains */ + element = current; + _invlist_subtract(u, i, ¤t); + av_push(stack, current); + SvREFCNT_dec_NN(i); + SvREFCNT_dec_NN(u); + SvREFCNT_dec_NN(element); + break; + } + + default: + Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); + } + SvREFCNT_dec_NN(top); + SvREFCNT_dec(prev); + } + } + + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + } + + if (av_tindex(stack) < 0 /* Was empty */ + || ((final = av_pop(stack)) == NULL) + || ! IS_OPERAND(final) + || av_tindex(stack) >= 0) /* More left on stack */ + { + vFAIL("Incomplete expression within '(?[ ])'"); + } + + /* Here, 'final' is the resultant inversion list from evaluating the + * expression. Return it if so requested */ + if (return_invlist) { + *return_invlist = final; + return END; + } + + /* Otherwise generate a resultant node, based on 'final'. regclass() is + * expecting a string of ranges and individual code points */ + invlist_iterinit(final); + result_string = newSVpvs(""); + while (invlist_iternext(final, &start, &end)) { + if (start == end) { + Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start); + } + else { + Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}", + start, end); + } + } + + save_parse = RExC_parse; + RExC_parse = SvPV(result_string, len); + save_end = RExC_end; + RExC_end = RExC_parse + len; + + /* We turn off folding around the call, as the class we have constructed + * already has all folding taken into consideration, and we don't want + * regclass() to add to that */ + RExC_flags &= ~RXf_PMf_FOLD; + /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. + */ + node = regclass(pRExC_state, flagp,depth+1, + FALSE, /* means parse the whole char class */ + FALSE, /* don't allow multi-char folds */ + TRUE, /* silence non-portable warnings. The above may very + well have generated non-portable code points, but + they're valid on this machine */ + NULL); + if (!node) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, + PTR2UV(flagp)); + if (save_fold) { + RExC_flags |= RXf_PMf_FOLD; + } + RExC_parse = save_parse + 1; + RExC_end = save_end; + SvREFCNT_dec_NN(final); + SvREFCNT_dec_NN(result_string); + + nextchar(pRExC_state); + Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ + return node; +} +#undef IS_OPERAND + +STATIC void +S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) +{ + /* This hard-codes the Latin1/above-Latin1 folding rules, so that an + * innocent-looking character class, like /[ks]/i won't have to go out to + * disk to find the possible matches. + * + * This should be called only for a Latin1-range code points, cp, which is + * known to be involved in a simple fold with other code points above + * Latin1. It would give false results if /aa has been specified. + * Multi-char folds are outside the scope of this, and must be handled + * specially. + * + * XXX It would be better to generate these via regen, in case a new + * version of the Unicode standard adds new mappings, though that is not + * really likely, and may be caught by the default: case of the switch + * below. */ + + PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; + + assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp)); + + switch (cp) { + case 'k': + case 'K': + *invlist = + add_cp_to_invlist(*invlist, KELVIN_SIGN); + break; + case 's': + case 'S': + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); + break; + case MICRO_SIGN: + *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); + *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist = add_cp_to_invlist(*invlist, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); + break; + default: + /* Use deprecated warning to increase the chances of this being + * output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); + break; + } +} + +/* The names of properties whose definitions are not known at compile time are + * stored in this SV, after a constant heading. So if the length has been + * changed since initialization, then there is a run-time definition. */ +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ + (SvCUR(listsv) != initial_listsv_len) + +STATIC regnode * +S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, + const bool stop_at_1, /* Just parse the next thing, don't + look for a full character class */ + bool allow_multi_folds, + const bool silence_non_portable, /* Don't output warnings + about too large + characters */ + SV** ret_invlist) /* Return an inversion list, not a node */ +{ + /* parse a bracketed class specification. Most of these will produce an + * ANYOF node; but something like [a] will produce an EXACT node; [aA], an + * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex + * under /i with multi-character folds: it will be rewritten following the + * paradigm of this example, where the s are characters which + * fold to multiple character sequences: + * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i + * gets effectively rewritten as: + * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i + * reg() gets called (recursively) on the rewritten version, and this + * function will return what it constructs. (Actually the s + * aren't physically removed from the [abcdefghi], it's just that they are + * ignored in the recursion by means of a flag: + * .) + * + * ANYOF nodes contain a bit map for the first 256 characters, with the + * corresponding bit set if that character is in the list. For characters + * above 255, a range list or swash is used. There are extra bits for \w, + * etc. in locale ANYOFs, as what these match is not determinable at + * compile time + * + * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs + * to be restarted. This can only happen if ret_invlist is non-NULL. + */ + + UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; + IV range = 0; + UV value = OOB_UNICODE, save_value = OOB_UNICODE; + regnode *ret; + STRLEN numlen; + IV namedclass = OOB_NAMEDCLASS; + char *rangebegin = NULL; + bool need_class = 0; + SV *listsv = NULL; + STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more + than just initialized. */ + SV* properties = NULL; /* Code points that match \p{} \P{} */ + SV* posixes = NULL; /* Code points that match classes like [:word:], + extended beyond the Latin1 range. These have to + be kept separate from other code points for much + of this function because their handling is + different under /i, and for most classes under + /d as well */ + SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept + separate for a while from the non-complemented + versions because of complications with /d + matching */ + UV element_count = 0; /* Number of distinct elements in the class. + Optimizations may be possible if this is tiny */ + AV * multi_char_matches = NULL; /* Code points that fold to more than one + character; used under /i */ + UV n; + char * stop_ptr = RExC_end; /* where to stop parsing */ + const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white + space? */ + const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */ + + /* Unicode properties are stored in a swash; this holds the current one + * being parsed. If this swash is the only above-latin1 component of the + * character class, an optimization is to pass it directly on to the + * execution engine. Otherwise, it is set to NULL to indicate that there + * are other things in the class that have to be dealt with at execution + * time */ + SV* swash = NULL; /* Code points that match \p{} \P{} */ + + /* Set if a component of this character class is user-defined; just passed + * on to the engine */ + bool has_user_defined_property = FALSE; + + /* inversion list of code points this node matches only when the target + * string is in UTF-8. (Because is under /d) */ + SV* depends_list = NULL; + + /* Inversion list of code points this node matches regardless of things + * like locale, folding, utf8ness of the target string */ + SV* cp_list = NULL; + + /* Like cp_list, but code points on this list need to be checked for things + * that fold to/from them under /i */ + SV* cp_foldable_list = NULL; + + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + +#ifdef EBCDIC + /* In a range, counts how many 0-2 of the ends of it came from literals, + * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ + UV literal_endpoint = 0; +#endif + bool invert = FALSE; /* Is this class to be complemented */ + + bool warn_super = ALWAYS_WARN_SUPER; + + regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in + case we need to change the emitted regop to an EXACT. */ + const char * orig_parse = RExC_parse; + const SSize_t orig_size = RExC_size; + bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCLASS; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + DEBUG_PARSE("clas"); + + /* Assume we are going to generate an ANYOF node. */ + ret = reganode(pRExC_state, ANYOF, 0); + + if (SIZE_ONLY) { + RExC_size += ANYOF_SKIP; + listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ + } + else { + ANYOF_FLAGS(ret) = 0; + + RExC_emit += ANYOF_SKIP; + listsv = newSVpvs_flags("# comment\n", SVs_TEMP); + initial_listsv_len = SvCUR(listsv); + SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ + } + + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + + if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ + RExC_parse++; + invert = TRUE; + allow_multi_folds = FALSE; + RExC_naughty++; + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + } + + /* Check that they didn't say [:posix:] instead of [[:posix:]] */ + if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) { + const char *s = RExC_parse; + const char c = *s++; + + while (isWORDCHAR(*s)) + s++; + if (*s && c == *s && s[1] == ']') { + SAVEFREESV(RExC_rx_sv); + ckWARN3reg(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); + (void)ReREFCNT_inc(RExC_rx_sv); + } + } + + /* If the caller wants us to just parse a single element, accomplish this + * by faking the loop ending condition */ + if (stop_at_1 && RExC_end > RExC_parse) { + stop_ptr = RExC_parse + 1; + } + + /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */ + if (UCHARAT(RExC_parse) == ']') + goto charclassloop; + +parseit: + while (1) { + if (RExC_parse >= stop_ptr) { + break; + } + + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + + if (UCHARAT(RExC_parse) == ']') { + break; + } + + charclassloop: + + namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ + save_value = value; + save_prevvalue = prevvalue; + + if (!range) { + rangebegin = RExC_parse; + element_count++; + } + if (UTF) { + value = utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); + + if (value == '[' + && RExC_parse < RExC_end + && POSIXCC(UCHARAT(RExC_parse))) + { + namedclass = regpposixcc(pRExC_state, value, strict); + } + else if (value == '\\') { + if (UTF) { + value = utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); + + /* Some compilers cannot handle switching on 64-bit integer + * values, therefore value cannot be an UV. Yes, this will + * be a problem later if we want switch on Unicode. + * A similar issue a little bit later when switching on + * namedclass. --jhi */ + + /* If the \ is escaping white space when white space is being + * skipped, it means that that white space is wanted literally, and + * is already in 'value'. Otherwise, need to translate the escape + * into what it signifies. */ + if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) { + + case 'w': namedclass = ANYOF_WORDCHAR; break; + case 'W': namedclass = ANYOF_NWORDCHAR; break; + case 's': namedclass = ANYOF_SPACE; break; + case 'S': namedclass = ANYOF_NSPACE; break; + case 'd': namedclass = ANYOF_DIGIT; break; + case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; + case 'N': /* Handle \N{NAME} in class */ + { + /* We only pay attention to the first char of + multichar strings being returned. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. */ + if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, + TRUE, /* => charclass */ + strict)) + { + if (*flagp & RESTART_UTF8) + FAIL("panic: grok_bslash_N set RESTART_UTF8"); + goto parseit; + } + } + break; + case 'p': + case 'P': + { + char *e; + + /* We will handle any undefined properties ourselves */ + U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF + /* And we actually would prefer to get + * the straight inversion list of the + * swash, since we will be accessing it + * anyway, to save a little time */ + |_CORE_SWASH_INIT_ACCEPT_INVLIST; + + if (RExC_parse >= RExC_end) + vFAIL2("Empty \\%c{}", (U8)value); + if (*RExC_parse == '{') { + const U8 c = (U8)value; + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL2("Missing right brace on \\%c{}", c); + while (isSPACE(*RExC_parse)) + RExC_parse++; + if (e == RExC_parse) + vFAIL2("Empty \\%c{}", c); + n = e - RExC_parse; + while (isSPACE(*(RExC_parse + n - 1))) + n--; + } + else { + e = RExC_parse; + n = 1; + } + if (!SIZE_ONLY) { + SV* invlist; + char* name; + + if (UCHARAT(RExC_parse) == '^') { + RExC_parse++; + n--; + /* toggle. (The rhs xor gets the single bit that + * differs between P and p; the other xor inverts just + * that bit) */ + value ^= 'P' ^ 'p'; + + while (isSPACE(*RExC_parse)) { + RExC_parse++; + n--; + } + } + /* Try to get the definition of the property into + * . If /i is in effect, the effective property + * will have its name be <__NAME_i>. The design is + * discussed in commit + * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ + name = savepv(Perl_form(aTHX_ + "%s%.*s%s\n", + (FOLD) ? "__" : "", + (int)n, + RExC_parse, + (FOLD) ? "_i" : "" + )); + + /* Look up the property name, and get its swash and + * inversion list, if the property is found */ + if (swash) { + SvREFCNT_dec_NN(swash); + } + swash = _core_swash_init("utf8", name, &PL_sv_undef, + 1, /* binary */ + 0, /* not tr/// */ + NULL, /* No inversion list */ + &swash_init_flags + ); + if (! swash || ! (invlist = _get_swash_invlist(swash))) { + HV* curpkg = (IN_PERL_COMPILETIME) + ? PL_curstash + : CopSTASH(PL_curcop); + if (swash) { + SvREFCNT_dec_NN(swash); + swash = NULL; + } + + /* Here didn't find it. It could be a user-defined + * property that will be available at run-time. If we + * accept only compile-time properties, is an error; + * otherwise add it to the list for run-time look up */ + if (ret_invlist) { + RExC_parse = e + 1; + vFAIL2utf8f( + "Property '%"UTF8f"' is unknown", + UTF8fARG(UTF, n, name)); + } + + /* If the property name doesn't already have a package + * name, add the current one to it so that it can be + * referred to outside it. [perl #121777] */ + if (curpkg && ! instr(name, "::")) { + char* pkgname = HvNAME(curpkg); + if (strNE(pkgname, "main")) { + char* full_name = Perl_form(aTHX_ + "%s::%s", + pkgname, + name); + n = strlen(full_name); + Safefree(name); + name = savepvn(full_name, n); + } + } + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", + (value == 'p' ? '+' : '!'), + UTF8fARG(UTF, n, name)); + has_user_defined_property = TRUE; + + /* We don't know yet, so have to assume that the + * property could match something in the Latin1 range, + * hence something that isn't utf8. Note that this + * would cause things in to match + * inappropriately, except that any \p{}, including + * this one forces Unicode semantics, which means there + * is no */ + ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; + } + else { + + /* Here, did get the swash and its inversion list. If + * the swash is from a user-defined property, then this + * whole character class should be regarded as such */ + if (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) + { + has_user_defined_property = TRUE; + } + else if + /* We warn on matching an above-Unicode code point + * if the match would return true, except don't + * warn for \p{All}, which has exactly one element + * = 0 */ + (_invlist_contains_cp(invlist, 0x110000) + && (! (_invlist_len(invlist) == 1 + && *invlist_array(invlist) == 0))) + { + warn_super = TRUE; + } + + + /* Invert if asking for the complement */ + if (value == 'P') { + _invlist_union_complement_2nd(properties, + invlist, + &properties); + + /* The swash can't be used as-is, because we've + * inverted things; delay removing it to here after + * have copied its invlist above */ + SvREFCNT_dec_NN(swash); + swash = NULL; + } + else { + _invlist_union(properties, invlist, &properties); + } + } + Safefree(name); + } + RExC_parse = e + 1; + namedclass = ANYOF_UNIPROP; /* no official name, but it's + named */ + + /* \p means they want Unicode semantics */ + RExC_uni_semantics = 1; + } + break; + case 'n': value = '\n'; break; + case 'r': value = '\r'; break; + case 't': value = '\t'; break; + case 'f': value = '\f'; break; + case 'b': value = '\b'; break; + case 'e': value = ASCII_TO_NATIVE('\033');break; + case 'a': value = '\a'; break; + case 'o': + RExC_parse--; /* function expects to be pointed at the 'o' */ + { + const char* error_msg; + bool valid = grok_bslash_o(&RExC_parse, + &value, + &error_msg, + SIZE_ONLY, /* warnings in pass + 1 only */ + strict, + silence_non_portable, + UTF); + if (! valid) { + vFAIL(error_msg); + } + } + if (PL_encoding && value < 0x100) { + goto recode_encoding; + } + break; + case 'x': + RExC_parse--; /* function expects to be pointed at the 'x' */ + { + const char* error_msg; + bool valid = grok_bslash_x(&RExC_parse, + &value, + &error_msg, + TRUE, /* Output warnings */ + strict, + silence_non_portable, + UTF); + if (! valid) { + vFAIL(error_msg); + } + } + if (PL_encoding && value < 0x100) + goto recode_encoding; + break; + case 'c': + value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); + break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + { + /* Take 1-3 octal digits */ + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; + numlen = (strict) ? 4 : 3; + value = grok_oct(--RExC_parse, &numlen, &flags, NULL); + RExC_parse += numlen; + if (numlen != 3) { + if (strict) { + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Need exactly 3 octal digits"); + } + else if (! SIZE_ONLY /* like \08, \178 */ + && numlen < 3 + && RExC_parse < RExC_end + && isDIGIT(*RExC_parse) + && ckWARN(WARN_REGEXP)) + { + SAVEFREESV(RExC_rx_sv); + reg_warn_non_literal_string( + RExC_parse + 1, + form_short_octal_warning(RExC_parse, numlen)); + (void)ReREFCNT_inc(RExC_rx_sv); + } + } + if (PL_encoding && value < 0x100) + goto recode_encoding; + break; + } + recode_encoding: + if (! RExC_override_recoding) { + SV* enc = PL_encoding; + value = reg_recode((const char)(U8)value, &enc); + if (!enc) { + if (strict) { + vFAIL("Invalid escape in the specified encoding"); + } + else if (SIZE_ONLY) { + ckWARNreg(RExC_parse, + "Invalid escape in the specified encoding"); + } + } + break; + } + default: + /* Allow \_ to not give an error */ + if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') { + if (strict) { + vFAIL2("Unrecognized escape \\%c in character class", + (int)value); + } + else { + SAVEFREESV(RExC_rx_sv); + ckWARN2reg(RExC_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); + (void)ReREFCNT_inc(RExC_rx_sv); + } + } + break; + } /* End of switch on char following backslash */ + } /* end of handling backslash escape sequences */ +#ifdef EBCDIC + else + literal_endpoint++; +#endif + + /* Here, we have the current token in 'value' */ + + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + U8 classnum; + + /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a + * literal, as is the character that began the false range, i.e. + * the 'a' in the examples */ + if (range) { + if (!SIZE_ONLY) { + const int w = (RExC_parse >= rangebegin) + ? RExC_parse - rangebegin + : 0; + if (strict) { + vFAIL2utf8f( + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); + } + else { + SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ + ckWARN2reg(RExC_parse, + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); + (void)ReREFCNT_inc(RExC_rx_sv); + cp_list = add_cp_to_invlist(cp_list, '-'); + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, + prevvalue); + } + } + + range = 0; /* this was not a true range */ + element_count += 2; /* So counts for three values */ + } + + classnum = namedclass_to_classnum(namedclass); + + if (LOC && namedclass < ANYOF_POSIXL_MAX +#ifndef HAS_ISASCII + && classnum != _CC_ASCII +#endif + ) { + /* What the Posix classes (like \w, [:space:]) match in locale + * isn't knowable under locale until actual match time. Room + * must be reserved (one time per outer bracketed class) to + * store such classes. The space will contain a bit for each + * named class that is to be matched against. This isn't + * needed for \p{} and pseudo-classes, as they are not affected + * by locale, and hence are dealt with separately */ + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_POSIXL_ZERO(ret); + } + + /* Coverity thinks it is possible for this to be negative; both + * jhi and khw think it's not, but be safer */ + assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL) + || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0); + + /* See if it already matches the complement of this POSIX + * class */ + if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) + && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) + ? -1 + : 1))) + { + posixl_matches_all = TRUE; + break; /* No need to continue. Since it matches both + e.g., \w and \W, it matches everything, and the + bracketed class can be optimized into qr/./s */ + } + + /* Add this class to those that should be checked at runtime */ + ANYOF_POSIXL_SET(ret, namedclass); + + /* The above-Latin1 characters are not subject to locale rules. + * Just add them, in the second pass, to the + * unconditionally-matched list */ + if (! SIZE_ONLY) { + SV* scratch_list = NULL; + + /* Get the list of the above-Latin1 code points this + * matches */ + _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, + PL_XPosix_ptrs[classnum], + + /* Odd numbers are complements, like + * NDIGIT, NASCII, ... */ + namedclass % 2 != 0, + &scratch_list); + /* Checking if 'cp_list' is NULL first saves an extra + * clone. Its reference count will be decremented at the + * next union, etc, or if this is the only instance, at the + * end of the routine */ + if (! cp_list) { + cp_list = scratch_list; + } + else { + _invlist_union(cp_list, scratch_list, &cp_list); + SvREFCNT_dec_NN(scratch_list); + } + continue; /* Go get next character */ + } + } + else if (! SIZE_ONLY) { + + /* Here, not in pass1 (in that pass we skip calculating the + * contents of this class), and is /l, or is a POSIX class for + * which /l doesn't matter (or is a Unicode property, which is + * skipped here). */ + if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ + if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ + + /* Here, should be \h, \H, \v, or \V. None of /d, /i + * nor /l make a difference in what these match, + * therefore we just add what they match to cp_list. */ + if (classnum != _CC_VERTSPACE) { + assert( namedclass == ANYOF_HORIZWS + || namedclass == ANYOF_NHORIZWS); + + /* It turns out that \h is just a synonym for + * XPosixBlank */ + classnum = _CC_BLANK; + } + + _invlist_union_maybe_complement_2nd( + cp_list, + PL_XPosix_ptrs[classnum], + namedclass % 2 != 0, /* Complement if odd + (NHORIZWS, NVERTWS) + */ + &cp_list); + } + } + else { /* Garden variety class. If is NASCII, NDIGIT, ... + complement and use nposixes */ + SV** posixes_ptr = namedclass % 2 == 0 + ? &posixes + : &nposixes; + SV** source_ptr = &PL_XPosix_ptrs[classnum]; + _invlist_union_maybe_complement_2nd( + *posixes_ptr, + *source_ptr, + namedclass % 2 != 0, + posixes_ptr); + } + continue; /* Go get next character */ + } + } /* end of namedclass \blah */ + + /* Here, we have a single value. If 'range' is set, it is the ending + * of a range--check its validity. Later, we will handle each + * individual code point in the range. If 'range' isn't set, this + * could be the beginning of a range, so check for that by looking + * ahead to see if the next real character to be processed is the range + * indicator--the minus sign */ + + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + + if (range) { + if (prevvalue > value) /* b-a */ { + const int w = RExC_parse - rangebegin; + vFAIL2utf8f( + "Invalid [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); + range = 0; /* not a valid range */ + } + } + else { + prevvalue = value; /* save the beginning of the potential range */ + if (! stop_at_1 /* Can't be a range if parsing just one thing */ + && *RExC_parse == '-') + { + char* next_char_ptr = RExC_parse + 1; + if (skip_white) { /* Get the next real char after the '-' */ + next_char_ptr = regpatws(pRExC_state, + RExC_parse + 1, + FALSE); /* means don't recognize + comments */ + } + + /* If the '-' is at the end of the class (just before the ']', + * it is a literal minus; otherwise it is a range */ + if (next_char_ptr < RExC_end && *next_char_ptr != ']') { + RExC_parse = next_char_ptr; + + /* a bad range like \w-, [:word:]- ? */ + if (namedclass > OOB_NAMEDCLASS) { + if (strict || ckWARN(WARN_REGEXP)) { + const int w = + RExC_parse >= rangebegin ? + RExC_parse - rangebegin : 0; + if (strict) { + vFAIL4("False [] range \"%*.*s\"", + w, w, rangebegin); + } + else { + vWARN4(RExC_parse, + "False [] range \"%*.*s\"", + w, w, rangebegin); + } + } + if (!SIZE_ONLY) { + cp_list = add_cp_to_invlist(cp_list, '-'); + } + element_count++; + } else + range = 1; /* yeah, it's a range! */ + continue; /* but do it the next time */ + } + } + } + + /* Here, is the beginning of the range, if any; or + * if not */ + + /* non-Latin1 code point implies unicode semantics. Must be set in + * pass1 so is there for the whole of pass 2 */ + if (value > 255) { + RExC_uni_semantics = 1; + } + + /* Ready to process either the single value, or the completed range. + * For single-valued non-inverted ranges, we consider the possibility + * of multi-char folds. (We made a conscious decision to not do this + * for the other cases because it can often lead to non-intuitive + * results. For example, you have the peculiar case that: + * "s s" =~ /^[^\xDF]+$/i => Y + * "ss" =~ /^[^\xDF]+$/i => N + * + * See [perl #89750] */ + if (FOLD && allow_multi_folds && value == prevvalue) { + if (value == LATIN_SMALL_LETTER_SHARP_S + || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, + value))) + { + /* Here is indeed a multi-char fold. Get what it is */ + + U8 foldbuf[UTF8_MAXBYTES_CASE]; + STRLEN foldlen; + + UV folded = _to_uni_fold_flags( + value, + foldbuf, + &foldlen, + FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED + ? FOLD_FLAGS_NOMIX_ASCII + : 0) + ); + + /* Here, should be the first character of the + * multi-char fold of , with containing the + * whole thing. But, if this fold is not allowed (because of + * the flags), will be the same as , and should + * be processed like any other character, so skip the special + * handling */ + if (folded != value) { + + /* Skip if we are recursed, currently parsing the class + * again. Otherwise add this character to the list of + * multi-char folds. */ + if (! RExC_in_multi_char_class) { + AV** this_array_ptr; + AV* this_array; + STRLEN cp_count = utf8_length(foldbuf, + foldbuf + foldlen); + SV* multi_fold = sv_2mortal(newSVpvs("")); + + Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); + + + if (! multi_char_matches) { + multi_char_matches = newAV(); + } + + /* is actually an array of arrays. + * There will be one or two top-level elements: [2], + * and/or [3]. The [2] element is an array, each + * element thereof is a character which folds to TWO + * characters; [3] is for folds to THREE characters. + * (Unicode guarantees a maximum of 3 characters in any + * fold.) When we rewrite the character class below, + * we will do so such that the longest folds are + * written first, so that it prefers the longest + * matching strings first. This is done even if it + * turns out that any quantifier is non-greedy, out of + * programmer laziness. Tom Christiansen has agreed + * that this is ok. This makes the test for the + * ligature 'ffi' come before the test for 'ff' */ + if (av_exists(multi_char_matches, cp_count)) { + this_array_ptr = (AV**) av_fetch(multi_char_matches, + cp_count, FALSE); + this_array = *this_array_ptr; + } + else { + this_array = newAV(); + av_store(multi_char_matches, cp_count, + (SV*) this_array); + } + av_push(this_array, multi_fold); + } + + /* This element should not be processed further in this + * class */ + element_count--; + value = save_value; + prevvalue = save_prevvalue; + continue; + } + } + } + + /* Deal with this element of the class */ + if (! SIZE_ONLY) { +#ifndef EBCDIC + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); +#else + SV* this_range = _new_invlist(1); + _append_range_to_invlist(this_range, prevvalue, value); + + /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous. + * If this range was specified using something like 'i-j', we want + * to include only the 'i' and the 'j', and not anything in + * between, so exclude non-ASCII, non-alphabetics from it. + * However, if the range was specified with something like + * [\x89-\x91] or [\x89-j], all code points within it should be + * included. literal_endpoint==2 means both ends of the range used + * a literal character, not \x{foo} */ + if (literal_endpoint == 2 + && ((prevvalue >= 'a' && value <= 'z') + || (prevvalue >= 'A' && value <= 'Z'))) + { + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII], + &this_range); + + /* Since this above only contains ascii, the intersection of it + * with anything will still yield only ascii */ + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], + &this_range); + } + _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); + literal_endpoint = 0; +#endif + } + + range = 0; /* this range (if it was one) is done now */ + } /* End of loop through all the text within the brackets */ + + /* If anything in the class expands to more than one character, we have to + * deal with them by building up a substitute parse string, and recursively + * calling reg() on it, instead of proceeding */ + if (multi_char_matches) { + SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP); + I32 cp_count; + STRLEN len; + char *save_end = RExC_end; + char *save_parse = RExC_parse; + bool first_time = TRUE; /* First multi-char occurrence doesn't get + a "|" */ + I32 reg_flags; + + assert(! invert); +#if 0 /* Have decided not to deal with multi-char folds in inverted classes, + because too confusing */ + if (invert) { + sv_catpv(substitute_parse, "(?:"); + } +#endif + + /* Look at the longest folds first */ + for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { + + if (av_exists(multi_char_matches, cp_count)) { + AV** this_array_ptr; + SV* this_sequence; + + this_array_ptr = (AV**) av_fetch(multi_char_matches, + cp_count, FALSE); + while ((this_sequence = av_pop(*this_array_ptr)) != + &PL_sv_undef) + { + if (! first_time) { + sv_catpv(substitute_parse, "|"); + } + first_time = FALSE; + + sv_catpv(substitute_parse, SvPVX(this_sequence)); + } + } + } + + /* If the character class contains anything else besides these + * multi-character folds, have to include it in recursive parsing */ + if (element_count) { + sv_catpv(substitute_parse, "|["); + sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); + sv_catpv(substitute_parse, "]"); + } + + sv_catpv(substitute_parse, ")"); +#if 0 + if (invert) { + /* This is a way to get the parse to skip forward a whole named + * sequence instead of matching the 2nd character when it fails the + * first */ + sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)"); + } +#endif + + RExC_parse = SvPV(substitute_parse, len); + RExC_end = RExC_parse + len; + RExC_in_multi_char_class = 1; + RExC_emit = (regnode *)orig_emit; + + ret = reg(pRExC_state, 1, ®_flags, depth+1); + + *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8); + + RExC_parse = save_parse; + RExC_end = save_end; + RExC_in_multi_char_class = 0; + SvREFCNT_dec_NN(multi_char_matches); + return ret; + } + + /* Here, we've gone through the entire class and dealt with multi-char + * folds. We are now in a position that we can do some checks to see if we + * can optimize this ANYOF node into a simpler one, even in Pass 1. + * Currently we only do two checks: + * 1) is in the unlikely event that the user has specified both, eg. \w and + * \W under /l, then the class matches everything. (This optimization + * is done only to make the optimizer code run later work.) + * 2) if the character class contains only a single element (including a + * single range), we see if there is an equivalent node for it. + * Other checks are possible */ + if (! ret_invlist /* Can't optimize if returning the constructed + inversion list */ + && (UNLIKELY(posixl_matches_all) || element_count == 1)) + { + U8 op = END; + U8 arg = 0; + + if (UNLIKELY(posixl_matches_all)) { + op = SANY; + } + else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like + \w or [:digit:] or \p{foo} + */ + + /* All named classes are mapped into POSIXish nodes, with its FLAG + * argument giving which class it is */ + switch ((I32)namedclass) { + case ANYOF_UNIPROP: + break; + + /* These don't depend on the charset modifiers. They always + * match under /u rules */ + case ANYOF_NHORIZWS: + case ANYOF_HORIZWS: + namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS; + /* FALLTHROUGH */ + + case ANYOF_NVERTWS: + case ANYOF_VERTWS: + op = POSIXU; + goto join_posix; + + /* The actual POSIXish node for all the rest depends on the + * charset modifier. The ones in the first set depend only on + * ASCII or, if available on this platform, locale */ + case ANYOF_ASCII: + case ANYOF_NASCII: +#ifdef HAS_ISASCII + op = (LOC) ? POSIXL : POSIXA; +#else + op = POSIXA; +#endif + goto join_posix; + + case ANYOF_NCASED: + case ANYOF_LOWER: + case ANYOF_NLOWER: + case ANYOF_UPPER: + case ANYOF_NUPPER: + /* under /a could be alpha */ + if (FOLD) { + if (ASCII_RESTRICTED) { + namedclass = ANYOF_ALPHA + (namedclass % 2); + } + else if (! LOC) { + break; + } + } + /* FALLTHROUGH */ + + /* The rest have more possibilities depending on the charset. + * We take advantage of the enum ordering of the charset + * modifiers to get the exact node type, */ + default: + op = POSIXD + get_regex_charset(RExC_flags); + if (op > POSIXA) { /* /aa is same as /a */ + op = POSIXA; + } + + join_posix: + /* The odd numbered ones are the complements of the + * next-lower even number one */ + if (namedclass % 2 == 1) { + invert = ! invert; + namedclass--; + } + arg = namedclass_to_classnum(namedclass); + break; + } + } + else if (value == prevvalue) { + + /* Here, the class consists of just a single code point */ + + if (invert) { + if (! LOC && value == '\n') { + op = REG_ANY; /* Optimize [^\n] */ + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + else if (value < 256 || UTF) { + + /* Optimize a single value into an EXACTish node, but not if it + * would require converting the pattern to UTF-8. */ + op = compute_EXACTish(pRExC_state); + } + } /* Otherwise is a range */ + else if (! LOC) { /* locale could vary these */ + if (prevvalue == '0') { + if (value == '9') { + arg = _CC_DIGIT; + op = POSIXA; + } + } + else if (prevvalue == 'A') { + if (value == 'Z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; + op = POSIXA; + } + } + else if (prevvalue == 'a') { + if (value == 'z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; + op = POSIXA; + } + } + } + + /* Here, we have changed away from its initial value iff we found + * an optimization */ + if (op != END) { + + /* Throw away this ANYOF regnode, and emit the calculated one, + * which should correspond to the beginning, not current, state of + * the parse */ + const char * cur_parse = RExC_parse; + RExC_parse = (char *)orig_parse; + if ( SIZE_ONLY) { + if (! LOC) { + + /* To get locale nodes to not use the full ANYOF size would + * require moving the code above that writes the portions + * of it that aren't in other nodes to after this point. + * e.g. ANYOF_POSIXL_SET */ + RExC_size = orig_size; + } + } + else { + RExC_emit = (regnode *)orig_emit; + if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } + if (invert) { + op += NPOSIXD - POSIXD; + } + } + } + + ret = reg_node(pRExC_state, op); + + if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { + if (! SIZE_ONLY) { + FLAGS(ret) = arg; + } + *flagp |= HASWIDTH|SIMPLE; + } + else if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); + } + + RExC_parse = (char *) cur_parse; + + SvREFCNT_dec(posixes); + SvREFCNT_dec(nposixes); + SvREFCNT_dec(cp_list); + SvREFCNT_dec(cp_foldable_list); + return ret; + } + } + + if (SIZE_ONLY) + return ret; + /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/ + + /* If folding, we calculate all characters that could fold to or from the + * ones already on the list */ + if (cp_foldable_list) { + if (FOLD) { + UV start, end; /* End points of code point ranges */ + + SV* fold_intersection = NULL; + SV** use_list; + + /* Our calculated list will be for Unicode rules. For locale + * matching, we have to keep a separate list that is consulted at + * runtime only when the locale indicates Unicode rules. For + * non-locale, we just use to the general list */ + if (LOC) { + use_list = &only_utf8_locale_list; + } + else { + use_list = &cp_list; + } + + /* Only the characters in this class that participate in folds need + * be checked. Get the intersection of this class and all the + * possible characters that are foldable. This can quickly narrow + * down a large class */ + _invlist_intersection(PL_utf8_foldable, cp_foldable_list, + &fold_intersection); + + /* The folds for all the Latin1 characters are hard-coded into this + * program, but we have to go out to disk to get the others. */ + if (invlist_highest(cp_foldable_list) >= 256) { + + /* This is a hash that for a particular fold gives all + * characters that are involved in it */ + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + } + + /* Now look at the foldable characters in this class individually */ + invlist_iterinit(fold_intersection); + while (invlist_iternext(fold_intersection, &start, &end)) { + UV j; + + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + SV** listp; + + if (j < 256) { + + if (IS_IN_SOME_FOLD_L1(j)) { + + /* ASCII is always matched; non-ASCII is matched + * only under Unicode rules (which could happen + * under /l if the locale is a UTF-8 one */ + if (isASCII(j) || ! DEPENDS_SEMANTICS) { + *use_list = add_cp_to_invlist(*use_list, + PL_fold_latin1[j]); + } + else { + depends_list = + add_cp_to_invlist(depends_list, + PL_fold_latin1[j]); + } + } + + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + { + add_above_Latin1_folds(pRExC_state, + (U8) j, + use_list); + } + continue; + } + + /* Here is an above Latin1 character. We don't have the + * rules hard-coded for it. First, get its fold. This is + * the simple fold, as the multi-character folds have been + * handled earlier and separated out */ + _to_uni_fold_flags(j, foldbuf, &foldlen, + (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0); + + /* Single character fold of above Latin1. Add everything in + * its fold closure to the list that this node should match. + * The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j)))) + { + continue; + } + + /* Folds under /l which cross the 255/256 boundary + * are added to a separate list. (These are valid + * only when the locale is UTF-8.) */ + if (c < 256 && LOC) { + *use_list = add_cp_to_invlist(*use_list, c); + continue; + } + + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + /* Similarly folds involving non-ascii Latin1 + * characters under /d are added to their list */ + depends_list = add_cp_to_invlist(depends_list, + c); + } + } + } + } + } + SvREFCNT_dec_NN(fold_intersection); + } + + /* Now that we have finished adding all the folds, there is no reason + * to keep the foldable list separate */ + _invlist_union(cp_list, cp_foldable_list, &cp_list); + SvREFCNT_dec_NN(cp_foldable_list); + } + + /* And combine the result (if any) with any inversion list from posix + * classes. The lists are kept separate up to now because we don't want to + * fold the classes (folding of those is automatically handled by the swash + * fetching code) */ + if (posixes || nposixes) { + if (posixes && AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, nothing above ASCII matches these */ + _invlist_intersection(posixes, + PL_XPosix_ptrs[_CC_ASCII], + &posixes); + } + if (nposixes) { + if (DEPENDS_SEMANTICS) { + /* Under /d, everything in the upper half of the Latin1 range + * matches these complements */ + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; + } + else if (AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, everything above ASCII matches these + * complements */ + _invlist_union_complement_2nd(nposixes, + PL_XPosix_ptrs[_CC_ASCII], + &nposixes); + } + if (posixes) { + _invlist_union(posixes, nposixes, &posixes); + SvREFCNT_dec_NN(nposixes); + } + else { + posixes = nposixes; + } + } + if (! DEPENDS_SEMANTICS) { + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec_NN(posixes); + } + else { + cp_list = posixes; + } + } + else { + /* Under /d, we put into a separate list the Latin1 things that + * match only when the target string is utf8 */ + SV* nonascii_but_latin1_properties = NULL; + _invlist_intersection(posixes, PL_UpperLatin1, + &nonascii_but_latin1_properties); + _invlist_subtract(posixes, nonascii_but_latin1_properties, + &posixes); + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec_NN(posixes); + } + else { + cp_list = posixes; + } + + if (depends_list) { + _invlist_union(depends_list, nonascii_but_latin1_properties, + &depends_list); + SvREFCNT_dec_NN(nonascii_but_latin1_properties); + } + else { + depends_list = nonascii_but_latin1_properties; + } + } + } + + /* And combine the result (if any) with any inversion list from properties. + * The lists are kept separate up to now so that we can distinguish the two + * in regards to matching above-Unicode. A run-time warning is generated + * if a Unicode property is matched against a non-Unicode code point. But, + * we allow user-defined properties to match anything, without any warning, + * and we also suppress the warning if there is a portion of the character + * class that isn't a Unicode property, and which matches above Unicode, \W + * or [\x{110000}] for example. + * (Note that in this case, unlike the Posix one above, there is no + * , because having a Unicode property forces Unicode + * semantics */ + if (properties) { + if (cp_list) { + + /* If it matters to the final outcome, see if a non-property + * component of the class matches above Unicode. If so, the + * warning gets suppressed. This is true even if just a single + * such code point is specified, as though not strictly correct if + * another such code point is matched against, the fact that they + * are using above-Unicode code points indicates they should know + * the issues involved */ + if (warn_super) { + warn_super = ! (invert + ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); + } + + _invlist_union(properties, cp_list, &cp_list); + SvREFCNT_dec_NN(properties); + } + else { + cp_list = properties; + } + + if (warn_super) { + ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; + } + } + + /* Here, we have calculated what code points should be in the character + * class. + * + * Now we can see about various optimizations. Fold calculation (which we + * did above) needs to take place before inversion. Otherwise /[^k]/i + * would invert to include K, which under /i would match k, which it + * shouldn't. Therefore we can't invert folded locale now, as it won't be + * folded until runtime */ + + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching). We know to set the flag if we have a non-NULL list for UTF-8 + * locales, or the class matches at least one 0-255 range code point */ + if (LOC && FOLD) { + if (only_utf8_locale_list) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + else if (cp_list) { /* Look to see if there a 0-255 code point is in + the list */ + UV start, end; + invlist_iterinit(cp_list); + if (invlist_iternext(cp_list, &start, &end) && start < 256) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + invlist_iterfinish(cp_list); + } + } + + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known + * at compile time. Besides not inverting folded locale now, we can't + * invert if there are things such as \w, which aren't known until runtime + * */ + if (cp_list + && invert + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! depends_list + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + { + _invlist_invert(cp_list); + + /* Any swash can't be used as-is, because we've inverted things */ + if (swash) { + SvREFCNT_dec_NN(swash); + swash = NULL; + } + + /* Clear the invert flag since have just done it here */ + invert = FALSE; + } + + if (ret_invlist) { + *ret_invlist = cp_list; + SvREFCNT_dec(swash); + + /* Discard the generated node */ + if (SIZE_ONLY) { + RExC_size = orig_size; + } + else { + RExC_emit = orig_emit; + } + return orig_emit; + } + + /* Some character classes are equivalent to other nodes. Such nodes take + * up less room and generally fewer operations to execute than ANYOF nodes. + * Above, we checked for and optimized into some such equivalents for + * certain common classes that are easy to test. Getting to this point in + * the code means that the class didn't get optimized there. Since this + * code is only executed in Pass 2, it is too late to save space--it has + * been allocated in Pass 1, and currently isn't given back. But turning + * things into an EXACTish node can allow the optimizer to join it to any + * adjacent such nodes. And if the class is equivalent to things like /./, + * expensive run-time swashes can be avoided. Now that we have more + * complete information, we can find things necessarily missed by the + * earlier code. I (khw) am not sure how much to look for here. It would + * be easy, but perhaps too slow, to check any candidates against all the + * node types they could possibly match using _invlistEQ(). */ + + if (cp_list + && ! invert + && ! depends_list + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + + /* We don't optimize if we are supposed to make sure all non-Unicode + * code points raise a warning, as only ANYOF nodes have this check. + * */ + && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) + { + UV start, end; + U8 op = END; /* The optimzation node-type */ + const char * cur_parse= RExC_parse; + + invlist_iterinit(cp_list); + if (! invlist_iternext(cp_list, &start, &end)) { + + /* Here, the list is empty. This happens, for example, when a + * Unicode property is the only thing in the character class, and + * it doesn't match anything. (perluniprops.pod notes such + * properties) */ + op = OPFAIL; + *flagp |= HASWIDTH|SIMPLE; + } + else if (start == end) { /* The range is a single code point */ + if (! invlist_iternext(cp_list, &start, &end) + + /* Don't do this optimization if it would require changing + * the pattern to UTF-8 */ + && (start < 256 || UTF)) + { + /* Here, the list contains a single code point. Can optimize + * into an EXACTish node */ + + value = start; + + if (! FOLD) { + op = EXACT; + } + else if (LOC) { + + /* A locale node under folding with one code point can be + * an EXACTFL, as its fold won't be calculated until + * runtime */ + op = EXACTFL; + } + else { + + /* Here, we are generally folding, but there is only one + * code point to match. If we have to, we use an EXACT + * node, but it would be better for joining with adjacent + * nodes in the optimization pass if we used the same + * EXACTFish node that any such are likely to be. We can + * do this iff the code point doesn't participate in any + * folds. For example, an EXACTF of a colon is the same as + * an EXACT one, since nothing folds to or from a colon. */ + if (value < 256) { + if (IS_IN_SOME_FOLD_L1(value)) { + op = EXACT; + } + } + else { + if (_invlist_contains_cp(PL_utf8_foldable, value)) { + op = EXACT; + } + } + + /* If we haven't found the node type, above, it means we + * can use the prevailing one */ + if (op == END) { + op = compute_EXACTish(pRExC_state); + } + } + } + } + else if (start == 0) { + if (end == UV_MAX) { + op = SANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + else if (end == '\n' - 1 + && invlist_iternext(cp_list, &start, &end) + && start == '\n' + 1 && end == UV_MAX) + { + op = REG_ANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + invlist_iterfinish(cp_list); + + if (op != END) { + RExC_parse = (char *)orig_parse; + RExC_emit = (regnode *)orig_emit; + + ret = reg_node(pRExC_state, op); + + RExC_parse = (char *)cur_parse; + + if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); + } + + SvREFCNT_dec_NN(cp_list); + return ret; + } + } + + /* Here, contains all the code points we can determine at + * compile time that match under all conditions. Go through it, and + * for things that belong in the bitmap, put them there, and delete from + * . While we are at it, see if everything above 255 is in the + * list, and if so, set a flag to speed up execution */ + + populate_ANYOF_from_invlist(ret, &cp_list); + + if (invert) { + ANYOF_FLAGS(ret) |= ANYOF_INVERT; + } + + /* Here, the bitmap has been populated with all the Latin1 code points that + * always match. Can now add to the overall list those that match only + * when the target string is UTF-8 (). */ + if (depends_list) { + if (cp_list) { + _invlist_union(cp_list, depends_list, &cp_list); + SvREFCNT_dec_NN(depends_list); + } + else { + cp_list = depends_list; + } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; + } + + /* If there is a swash and more than one element, we can't use the swash in + * the optimization below. */ + if (swash && element_count > 1) { + SvREFCNT_dec_NN(swash); + swash = NULL; + } + + set_ANYOF_arg(pRExC_state, ret, cp_list, + (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv : NULL, + only_utf8_locale_list, + swash, has_user_defined_property); + + *flagp |= HASWIDTH|SIMPLE; + + if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + + return ret; +} + +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + +STATIC void +S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, + regnode* const node, + SV* const cp_list, + SV* const runtime_defns, + SV* const only_utf8_locale_list, + SV* const swash, + const bool has_user_defined_property) +{ + /* Sets the arg field of an ANYOF-type node 'node', using information about + * the node passed-in. If there is nothing outside the node's bitmap, the + * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to + * the count returned by add_data(), having allocated and stored an array, + * av, that that count references, as follows: + * av[0] stores the character class description in its textual form. + * This is used later (regexec.c:Perl_regclass_swash()) to + * initialize the appropriate swash, and is also useful for dumping + * the regnode. This is set to &PL_sv_undef if the textual + * description is not needed at run-time (as happens if the other + * elements completely define the class) + * av[1] if &PL_sv_undef, is a placeholder to later contain the swash + * computed from av[0]. But if no further computation need be done, + * the swash is stored here now (and av[0] is &PL_sv_undef). + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead + * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. + * (Otherwise everything needed is already in av[0] and av[1]) + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ + + UV n; + + PERL_ARGS_ASSERT_SET_ANYOF_ARG; + + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); + ARG_SET(node, ANYOF_NONBITMAP_EMPTY); + } + else { + AV * const av = newAV(); + SV *rv; + + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + av_store(av, 0, (runtime_defns) + ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); + if (swash) { + assert(cp_list); + av_store(av, 1, swash); + SvREFCNT_dec_NN(cp_list); + } + else { + av_store(av, 1, &PL_sv_undef); + if (cp_list) { + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); + } + } + + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + + rv = newRV_noinc(MUTABLE_SV(av)); + n = add_data(pRExC_state, STR_WITH_LEN("s")); + RExC_rxi->data->data[n] = (void*)rv; + ARG_SET(node, n); + } +} + + +/* reg_skipcomment() + + Absorbs an /x style # comment from the input stream, + returning a pointer to the first character beyond the comment, or if the + comment terminates the pattern without anything following it, this returns + one past the final character of the pattern (in other words, RExC_end) and + sets the REG_RUN_ON_COMMENT_SEEN flag. + + Note it's the callers responsibility to ensure that we are + actually in /x mode + +*/ + +PERL_STATIC_INLINE char* +S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) +{ + PERL_ARGS_ASSERT_REG_SKIPCOMMENT; + + assert(*p == '#'); + + while (p < RExC_end) { + if (*(++p) == '\n') { + return p+1; + } + } + + /* we ran off the end of the pattern without ending the comment, so we have + * to add an \n when wrapping */ + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; + return p; +} + +/* nextchar() + + Advances the parse position, and optionally absorbs + "whitespace" from the inputstream. + + Without /x "whitespace" means (?#...) style comments only, + with /x this means (?#...) and # comments and whitespace proper. + + Returns the RExC_parse point from BEFORE the scan occurs. + + This is the /x friendly way of saying RExC_parse++. +*/ + +STATIC char* +S_nextchar(pTHX_ RExC_state_t *pRExC_state) +{ + char* const retval = RExC_parse++; + + PERL_ARGS_ASSERT_NEXTCHAR; + + for (;;) { + if (RExC_end - RExC_parse >= 3 + && *RExC_parse == '(' + && RExC_parse[1] == '?' + && RExC_parse[2] == '#') + { + while (*RExC_parse != ')') { + if (RExC_parse == RExC_end) + FAIL("Sequence (?#... not terminated"); + RExC_parse++; + } + RExC_parse++; + continue; + } + if (RExC_flags & RXf_PMf_EXTENDED) { + char * p = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + if (p != RExC_parse) { + RExC_parse = p; + continue; + } + } + return retval; + } +} + +/* +- reg_node - emit a node +*/ +STATIC regnode * /* Location. */ +S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) +{ + regnode *ptr; + regnode * const ret = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NODE; + + if (SIZE_ONLY) { + SIZE_ALIGN(RExC_size); + RExC_size += 1; + return(ret); + } + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, (void*)RExC_emit, (void*)RExC_emit_bound); + + NODE_ALIGN_FILL(ret); + ptr = ret; + FILL_ADVANCE_NODE(ptr, op); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG( + ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", + "reg_node", __LINE__, + PL_reg_name[op], + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(RExC_emit - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); + } +#endif + RExC_emit = ptr; + return(ret); +} + +/* +- reganode - emit a node with an argument +*/ +STATIC regnode * /* Location. */ +S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) +{ + regnode *ptr; + regnode * const ret = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGANODE; + + if (SIZE_ONLY) { + SIZE_ALIGN(RExC_size); + RExC_size += 2; + /* + We can't do this: + + assert(2==regarglen[op]+1); + + Anything larger than this has to allocate the extra amount. + If we changed this to be: + + RExC_size += (1 + regarglen[op]); + + then it wouldn't matter. Its not clear what side effect + might come from that so its not done so far. + -- dmq + */ + return(ret); + } + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, (void*)RExC_emit, (void*)RExC_emit_bound); + + NODE_ALIGN_FILL(ret); + ptr = ret; + FILL_ADVANCE_NODE_ARG(ptr, op, arg); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + "reganode", + __LINE__, + PL_reg_name[op], + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? + "Overwriting end of array!\n" : "OK", + (UV)(RExC_emit - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Cur_Node_Offset; + } +#endif + RExC_emit = ptr; + return(ret); +} + +/* +- reguni - emit (if appropriate) a Unicode character +*/ +PERL_STATIC_INLINE STRLEN +S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) +{ + PERL_ARGS_ASSERT_REGUNI; + + return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); +} + +/* +- reginsert - insert an operator in front of already-emitted operand +* +* Means relocating the operand. +*/ +STATIC void +S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) +{ + regnode *src; + regnode *dst; + regnode *place; + const int offset = regarglen[(U8)op]; + const int size = NODE_STEP_REGNODE + offset; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGINSERT; + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(depth); +/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ + DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); + if (SIZE_ONLY) { + RExC_size += size; + return; + } + + src = RExC_emit; + RExC_emit += size; + dst = RExC_emit; + if (RExC_open_parens) { + int paren; + /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ + for ( paren=0 ; paren < RExC_npar ; paren++ ) { + if ( RExC_open_parens[paren] >= opnd ) { + /*DEBUG_PARSE_FMT("open"," - %d",size);*/ + RExC_open_parens[paren] += size; + } else { + /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ + } + if ( RExC_close_parens[paren] >= opnd ) { + /*DEBUG_PARSE_FMT("close"," - %d",size);*/ + RExC_close_parens[paren] += size; + } else { + /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ + } + } + } + + while (src > opnd) { + StructCopy(--src, --dst, regnode); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD 20010112 */ + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", + "reg_insert", + __LINE__, + PL_reg_name[op], + (UV)(dst - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(src - RExC_emit_start), + (UV)(dst - RExC_emit_start), + (UV)RExC_offsets[0])); + Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); + Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); + } +#endif + } + + + place = opnd; /* Op node, where operand used to be. */ +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + "reginsert", + __LINE__, + PL_reg_name[op], + (UV)(place - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(place - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Node_Offset(place, RExC_parse); + Set_Node_Length(place, 1); + } +#endif + src = NEXTOPER(place); + FILL_ADVANCE_NODE(place, op); + Zero(src, offset, regnode); +} + +/* +- regtail - set the next-pointer at the end of a node chain of p to val. +- SEE ALSO: regtail_study +*/ +/* TODO: All three parms should be const */ +STATIC void +S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) +{ + regnode *scan; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + if (SIZE_ONLY) + return; + + /* Find last node. */ + scan = p; + for (;;) { + regnode * const temp = regnext(scan); + DEBUG_PARSE_r({ + SV * const mysv=sv_newmortal(); + DEBUG_PARSE_MSG((scan==p ? "tail" : "")); + regprop(RExC_rx, mysv, scan, NULL); + PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", + SvPV_nolen_const(mysv), REG_NODE_NUM(scan), + (temp == NULL ? "->" : ""), + (temp == NULL ? PL_reg_name[OP(val)] : "") + ); + }); + if (temp == NULL) + break; + scan = temp; + } + + if (reg_off_by_arg[OP(scan)]) { + ARG_SET(scan, val - scan); + } + else { + NEXT_OFF(scan) = val - scan; + } +} + +#ifdef DEBUGGING +/* +- regtail_study - set the next-pointer at the end of a node chain of p to val. +- Look for optimizable sequences at the same time. +- currently only looks for EXACT chains. + +This is experimental code. The idea is to use this routine to perform +in place optimizations on branches and groups as they are constructed, +with the long term intention of removing optimization from study_chunk so +that it is purely analytical. + +Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used +to control which is which. + +*/ +/* TODO: All four parms should be const */ + +STATIC U8 +S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) +{ + regnode *scan; + U8 exact = PSEUDO; +#ifdef EXPERIMENTAL_INPLACESCAN + I32 min = 0; +#endif + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL_STUDY; + + + if (SIZE_ONLY) + return exact; + + /* Find last node. */ + + scan = p; + for (;;) { + regnode * const temp = regnext(scan); +#ifdef EXPERIMENTAL_INPLACESCAN + if (PL_regkind[OP(scan)] == EXACT) { + bool unfolded_multi_char; /* Unexamined in this routine */ + if (join_exact(pRExC_state, scan, &min, + &unfolded_multi_char, 1, val, depth+1)) + return EXACT; + } +#endif + if ( exact ) { + switch (OP(scan)) { + case EXACT: + case EXACTF: + case EXACTFA_NO_TRIE: + case EXACTFA: + case EXACTFU: + case EXACTFU_SS: + case EXACTFL: + if( exact == PSEUDO ) + exact= OP(scan); + else if ( exact != OP(scan) ) + exact= 0; + case NOTHING: + break; + default: + exact= 0; + } + } + DEBUG_PARSE_r({ + SV * const mysv=sv_newmortal(); + DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); + regprop(RExC_rx, mysv, scan, NULL); + PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", + SvPV_nolen_const(mysv), + REG_NODE_NUM(scan), + PL_reg_name[exact]); + }); + if (temp == NULL) + break; + scan = temp; + } + DEBUG_PARSE_r({ + SV * const mysv_val=sv_newmortal(); + DEBUG_PARSE_MSG(""); + regprop(RExC_rx, mysv_val, val, NULL); + PerlIO_printf(Perl_debug_log, + "~ attach to %s (%"IVdf") offset to %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(val), + (IV)(val - scan) + ); + }); + if (reg_off_by_arg[OP(scan)]) { + ARG_SET(scan, val - scan); + } + else { + NEXT_OFF(scan) = val - scan; + } + + return exact; +} +#endif + +/* + - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form + */ +#ifdef DEBUGGING + +static void +S_regdump_intflags(pTHX_ const char *lead, const U32 flags) +{ + int bit; + int set=0; + + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bitprogram, ri->program + 1, NULL, NULL, sv, 0, 0); + + /* Header fields of interest. */ + if (r->anchored_substr) { + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), + RE_SV_DUMPLEN(r->anchored_substr), 30); + PerlIO_printf(Perl_debug_log, + "anchored %s%s at %"IVdf" ", + s, RE_SV_TAIL(r->anchored_substr), + (IV)r->anchored_offset); + } else if (r->anchored_utf8) { + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), + RE_SV_DUMPLEN(r->anchored_utf8), 30); + PerlIO_printf(Perl_debug_log, + "anchored utf8 %s%s at %"IVdf" ", + s, RE_SV_TAIL(r->anchored_utf8), + (IV)r->anchored_offset); + } + if (r->float_substr) { + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), + RE_SV_DUMPLEN(r->float_substr), 30); + PerlIO_printf(Perl_debug_log, + "floating %s%s at %"IVdf"..%"UVuf" ", + s, RE_SV_TAIL(r->float_substr), + (IV)r->float_min_offset, (UV)r->float_max_offset); + } else if (r->float_utf8) { + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), + RE_SV_DUMPLEN(r->float_utf8), 30); + PerlIO_printf(Perl_debug_log, + "floating utf8 %s%s at %"IVdf"..%"UVuf" ", + s, RE_SV_TAIL(r->float_utf8), + (IV)r->float_min_offset, (UV)r->float_max_offset); + } + if (r->check_substr || r->check_utf8) + PerlIO_printf(Perl_debug_log, + (const char *) + (r->check_substr == r->float_substr + && r->check_utf8 == r->float_utf8 + ? "(checking floating" : "(checking anchored")); + if (r->intflags & PREGf_NOSCAN) + PerlIO_printf(Perl_debug_log, " noscan"); + if (r->extflags & RXf_CHECK_ALL) + PerlIO_printf(Perl_debug_log, " isall"); + if (r->check_substr || r->check_utf8) + PerlIO_printf(Perl_debug_log, ") "); + + if (ri->regstclass) { + regprop(r, sv, ri->regstclass, NULL); + PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); + } + if (r->intflags & PREGf_ANCH) { + PerlIO_printf(Perl_debug_log, "anchored"); + if (r->intflags & PREGf_ANCH_BOL) + PerlIO_printf(Perl_debug_log, "(BOL)"); + if (r->intflags & PREGf_ANCH_MBOL) + PerlIO_printf(Perl_debug_log, "(MBOL)"); + if (r->intflags & PREGf_ANCH_SBOL) + PerlIO_printf(Perl_debug_log, "(SBOL)"); + if (r->intflags & PREGf_ANCH_GPOS) + PerlIO_printf(Perl_debug_log, "(GPOS)"); + PerlIO_putc(Perl_debug_log, ' '); + } + if (r->intflags & PREGf_GPOS_SEEN) + PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); + if (r->intflags & PREGf_SKIP) + PerlIO_printf(Perl_debug_log, "plus "); + if (r->intflags & PREGf_IMPLICIT) + PerlIO_printf(Perl_debug_log, "implicit "); + PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen); + if (r->extflags & RXf_EVAL_SEEN) + PerlIO_printf(Perl_debug_log, "with eval "); + PerlIO_printf(Perl_debug_log, "\n"); + DEBUG_FLAGS_r({ + regdump_extflags("r->extflags: ",r->extflags); + regdump_intflags("r->intflags: ",r->intflags); + }); +#else + PERL_ARGS_ASSERT_REGDUMP; + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(r); +#endif /* DEBUGGING */ +} + +/* +- regprop - printable representation of opcode, with run time support +*/ + +void +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) +{ +#ifdef DEBUGGING + int k; + + /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ + static const char * const anyofs[] = { +#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \ + || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \ + || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \ + || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \ + || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \ + || _CC_VERTSPACE != 16 + #error Need to adjust order of anyofs[] +#endif + "\\w", + "\\W", + "\\d", + "\\D", + "[:alpha:]", + "[:^alpha:]", + "[:lower:]", + "[:^lower:]", + "[:upper:]", + "[:^upper:]", + "[:punct:]", + "[:^punct:]", + "[:print:]", + "[:^print:]", + "[:alnum:]", + "[:^alnum:]", + "[:graph:]", + "[:^graph:]", + "[:cased:]", + "[:^cased:]", + "\\s", + "\\S", + "[:blank:]", + "[:^blank:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:ascii:]", + "[:^ascii:]", + "\\v", + "\\V" + }; + RXi_GET_DECL(prog,progi); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPROP; + + sv_setpvs(sv, ""); + + if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ + /* It would be nice to FAIL() here, but this may be called from + regexec.c, and it would be hard to supply pRExC_state. */ + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(o), (int)REGNODE_MAX); + sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ + + k = PL_regkind[OP(o)]; + + if (k == EXACT) { + sv_catpvs(sv, " "); + /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) + * is a crude hack but it may be the best for now since + * we have no flag "this EXACTish node was UTF-8" + * --jhi */ + pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], + PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_ESCAPE_NONASCII | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT | + PERL_PV_PRETTY_NOCLEAR + ); + } else if (k == TRIE) { + /* print the details of the trie in dumpuntil instead, as + * progi->data isn't available here */ + const char op = OP(o); + const U32 n = ARG(o); + const reg_ac_data * const ac = IS_TRIE_AC(op) ? + (reg_ac_data *)progi->data->data[n] : + NULL; + const reg_trie_data * const trie + = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; + + Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); + DEBUG_TRIE_COMPILE_r( + Perl_sv_catpvf(aTHX_ sv, + "", + (UV)trie->startstate, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ); + ); + if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { + sv_catpvs(sv, "["); + (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) + ? ANYOF_BITMAP(o) + : TRIE_BITMAP(trie)); + sv_catpvs(sv, "]"); + } + + } else if (k == CURLY) { + if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ + Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); + } + else if (k == WHILEM && o->flags) /* Ordinal/of */ + Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); + else if (k == REF || k == OPEN || k == CLOSE + || k == GROUPP || OP(o)==ACCEPT) + { + Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ + if ( RXp_PAREN_NAMES(prog) ) { + if ( k != REF || (OP(o) < NREF)) { + AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); + SV **name= av_fetch(list, ARG(o), 0 ); + if (name) + Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); + } + else { + AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); + SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); + I32 *nums=(I32*)SvPVX(sv_dat); + SV **name= av_fetch(list, nums[0], 0 ); + I32 n; + if (name) { + for ( n=0; noffs[n].start; + if (prog->lastparen < n || ln == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } + } else if (k == GOSUB) + /* Paren and offset */ + Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); + else if (k == VERB) { + if (!o->flags) + Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); + } else if (k == LOGICAL) + /* 2: embedded, otherwise 1 */ + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); + else if (k == ANYOF) { + const U8 flags = ANYOF_FLAGS(o); + int do_sep = 0; + + + if (flags & ANYOF_LOCALE_FLAGS) + sv_catpvs(sv, "{loc}"); + if (flags & ANYOF_LOC_FOLD) + sv_catpvs(sv, "{i}"); + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + if (flags & ANYOF_INVERT) + sv_catpvs(sv, "^"); + + /* output what the standard cp 0-255 bitmap matches */ + do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); + + /* output any special charclass tests (used entirely under use + * locale) * */ + if (ANYOF_POSIXL_TEST_ANY_SET(o)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(o,i)) { + sv_catpv(sv, anyofs[i]); + do_sep = 1; + } + } + } + + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) + { + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + if (flags & ANYOF_INVERT) + /*make sure the invert info is in each */ + sv_catpvs(sv, "^"); + } + + if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { + sv_catpvs(sv, "{non-utf8-latin1-all}"); + } + + /* output information about the unicode matching */ + if (flags & ANYOF_ABOVE_LATIN1_ALL) + sv_catpvs(sv, "{unicode_all}"); + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { + SV *lv; /* Set if there is something outside the bit map. */ + bool byte_output = FALSE; /* If something in the bitmap has + been output */ + SV *only_utf8_locale; + + /* Get the stuff that wasn't in the bitmap */ + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); + if (lv && lv != &PL_sv_undef) { + char *s = savesvpv(lv); + char * const origs = s; + + while (*s && *s != '\n') + s++; + + if (*s == '\n') { + const char * const t = ++s; + + if (flags & ANYOF_NONBITMAP_NON_UTF8) { + sv_catpvs(sv, "{outside bitmap}"); + } + else { + sv_catpvs(sv, "{utf8}"); + } + + if (byte_output) { + sv_catpvs(sv, " "); + } + + while (*s) { + if (*s == '\n') { + + /* Truncate very long output */ + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } + *s = ' '; + } + else if (*s == '\t') { + *s = '-'; + } + s++; + } + if (s[-1] == ' ') + s[-1] = 0; + + sv_catpv(sv, t); + } + + out_dump: + + Safefree(origs); + SvREFCNT_dec_NN(lv); + } + + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } + } + invlist_iterfinish(only_utf8_locale); + } + } + } + + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + } + else if (k == POSIXD || k == NPOSIXD) { + U8 index = FLAGS(o) * 2; + if (index < C_ARRAY_LENGTH(anyofs)) { + if (*anyofs[index] != '[') { + sv_catpv(sv, "["); + } + sv_catpv(sv, anyofs[index]); + if (*anyofs[index] != '[') { + sv_catpv(sv, "]"); + } + } + else { + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + } + } + else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) + Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); +#else + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(sv); + PERL_UNUSED_ARG(o); + PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); +#endif /* DEBUGGING */ +} + + + +SV * +Perl_re_intuit_string(pTHX_ REGEXP * const r) +{ /* Assume that RE_INTUIT is set */ + struct regexp *const prog = ReANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_STRING; + PERL_UNUSED_CONTEXT; + + DEBUG_COMPILE_r( + { + const char * const s = SvPV_nolen_const(prog->check_substr + ? prog->check_substr : prog->check_utf8); + + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, + "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", + PL_colors[4], + prog->check_substr ? "" : "utf8 ", + PL_colors[5],PL_colors[0], + s, + PL_colors[1], + (strlen(s) > 60 ? "..." : "")); + } ); + + return prog->check_substr ? prog->check_substr : prog->check_utf8; +} + +/* + pregfree() + + handles refcounting and freeing the perl core regexp structure. When + it is necessary to actually free the structure the first thing it + does is call the 'free' method of the regexp_engine associated to + the regexp, allowing the handling of the void *pprivate; member + first. (This routine is not overridable by extensions, which is why + the extensions free is called first.) + + See regdupe and regdupe_internal if you change anything here. +*/ +#ifndef PERL_IN_XSUB_RE +void +Perl_pregfree(pTHX_ REGEXP *r) +{ + SvREFCNT_dec(r); +} + +void +Perl_pregfree2(pTHX_ REGEXP *rx) +{ + struct regexp *const r = ReANY(rx); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_PREGFREE2; + + if (r->mother_re) { + ReREFCNT_dec(r->mother_re); + } else { + CALLREGFREE_PVT(rx); /* free the private data */ + SvREFCNT_dec(RXp_PAREN_NAMES(r)); + Safefree(r->xpv_len_u.xpvlenu_pv); + } + if (r->substrs) { + SvREFCNT_dec(r->anchored_substr); + SvREFCNT_dec(r->anchored_utf8); + SvREFCNT_dec(r->float_substr); + SvREFCNT_dec(r->float_utf8); + Safefree(r->substrs); + } + RX_MATCH_COPY_FREE(rx); +#ifdef PERL_ANY_COW + SvREFCNT_dec(r->saved_copy); +#endif + Safefree(r->offs); + SvREFCNT_dec(r->qr_anoncv); + rx->sv_u.svu_rx = 0; +} + +/* reg_temp_copy() + + This is a hacky workaround to the structural issue of match results + being stored in the regexp structure which is in turn stored in + PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern + could be PL_curpm in multiple contexts, and could require multiple + result sets being associated with the pattern simultaneously, such + as when doing a recursive match with (??{$qr}) + + The solution is to make a lightweight copy of the regexp structure + when a qr// is returned from the code executed by (??{$qr}) this + lightweight copy doesn't actually own any of its data except for + the starp/end and the actual regexp structure itself. + +*/ + + +REGEXP * +Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) +{ + struct regexp *ret; + struct regexp *const r = ReANY(rx); + const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV; + + PERL_ARGS_ASSERT_REG_TEMP_COPY; + + if (!ret_x) + ret_x = (REGEXP*) newSV_type(SVt_REGEXP); + else { + SvOK_off((SV *)ret_x); + if (islv) { + /* For PVLVs, SvANY points to the xpvlv body while sv_u points + to the regexp. (For SVt_REGEXPs, sv_upgrade has already + made both spots point to the same regexp body.) */ + REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP); + assert(!SvPVX(ret_x)); + ret_x->sv_u.svu_rx = temp->sv_any; + temp->sv_any = NULL; + SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL; + SvREFCNT_dec_NN(temp); + /* SvCUR still resides in the xpvlv struct, so the regexp copy- + ing below will not set it. */ + SvCUR_set(ret_x, SvCUR(rx)); + } + } + /* This ensures that SvTHINKFIRST(sv) is true, and hence that + sv_force_normal(sv) is called. */ + SvFAKE_on(ret_x); + ret = ReANY(ret_x); + + SvFLAGS(ret_x) |= SvUTF8(rx); + /* We share the same string buffer as the original regexp, on which we + hold a reference count, incremented when mother_re is set below. + The string pointer is copied here, being part of the regexp struct. + */ + memcpy(&(ret->xpv_cur), &(r->xpv_cur), + sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); + if (r->offs) { + const I32 npar = r->nparens+1; + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + } + if (r->substrs) { + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + SvREFCNT_inc_void(ret->anchored_substr); + SvREFCNT_inc_void(ret->anchored_utf8); + SvREFCNT_inc_void(ret->float_substr); + SvREFCNT_inc_void(ret->float_utf8); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + } + RX_MATCH_COPIED_off(ret_x); +#ifdef PERL_ANY_COW + ret->saved_copy = NULL; +#endif + ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); + SvREFCNT_inc_void(ret->qr_anoncv); + + return ret_x; +} +#endif + +/* regfree_internal() + + Free the private data in a regexp. This is overloadable by + extensions. Perl takes care of the regexp structure in pregfree(), + this covers the *pprivate pointer which technically perl doesn't + know about, however of course we have to handle the + regexp_internal structure when no extension is in use. + + Note this is called before freeing anything in the regexp + structure. + */ + +void +Perl_regfree_internal(pTHX_ REGEXP * const rx) +{ + struct regexp *const r = ReANY(rx); + RXi_GET_DECL(r,ri); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGFREE_INTERNAL; + + DEBUG_COMPILE_r({ + if (!PL_colorset) + reginitcolors(); + { + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RX_UTF8(rx), + dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); + PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", + PL_colors[4],PL_colors[5],s); + } + }); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) + Safefree(ri->u.offsets); /* 20010421 MJD */ +#endif + if (ri->code_blocks) { + int n; + for (n = 0; n < ri->num_code_blocks; n++) + SvREFCNT_dec(ri->code_blocks[n].src_regex); + Safefree(ri->code_blocks); + } + + if (ri->data) { + int n = ri->data->count; + + while (--n >= 0) { + /* If you add a ->what type here, update the comment in regcomp.h */ + switch (ri->data->what[n]) { + case 'a': + case 'r': + case 's': + case 'S': + case 'u': + SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); + break; + case 'f': + Safefree(ri->data->data[n]); + break; + case 'l': + case 'L': + break; + case 'T': + { /* Aho Corasick add-on structure for a trie node. + Used in stclass optimization only */ + U32 refcount; + reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; +#ifdef USE_ITHREADS + dVAR; +#endif + OP_REFCNT_LOCK; + refcount = --aho->refcount; + OP_REFCNT_UNLOCK; + if ( !refcount ) { + PerlMemShared_free(aho->states); + PerlMemShared_free(aho->fail); + /* do this last!!!! */ + PerlMemShared_free(ri->data->data[n]); + /* we should only ever get called once, so + * assert as much, and also guard the free + * which /might/ happen twice. At the least + * it will make code anlyzers happy and it + * doesn't cost much. - Yves */ + assert(ri->regstclass); + if (ri->regstclass) { + PerlMemShared_free(ri->regstclass); + ri->regstclass = 0; + } + } + } + break; + case 't': + { + /* trie structure. */ + U32 refcount; + reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; +#ifdef USE_ITHREADS + dVAR; +#endif + OP_REFCNT_LOCK; + refcount = --trie->refcount; + OP_REFCNT_UNLOCK; + if ( !refcount ) { + PerlMemShared_free(trie->charmap); + PerlMemShared_free(trie->states); + PerlMemShared_free(trie->trans); + if (trie->bitmap) + PerlMemShared_free(trie->bitmap); + if (trie->jump) + PerlMemShared_free(trie->jump); + PerlMemShared_free(trie->wordinfo); + /* do this last!!!! */ + PerlMemShared_free(ri->data->data[n]); + } + } + break; + default: + Perl_croak(aTHX_ "panic: regfree data code '%c'", + ri->data->what[n]); + } + } + Safefree(ri->data->what); + Safefree(ri->data); + } + + Safefree(ri); +} + +#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t)) +#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t)) +#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) + +/* + re_dup - duplicate a regexp. + + This routine is expected to clone a given regexp structure. It is only + compiled under USE_ITHREADS. + + After all of the core data stored in struct regexp is duplicated + the regexp_engine.dupe method is used to copy any private data + stored in the *pprivate pointer. This allows extensions to handle + any duplication it needs to do. + + See pregfree() and regfree_internal() if you change anything here. +*/ +#if defined(USE_ITHREADS) +#ifndef PERL_IN_XSUB_RE +void +Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) +{ + dVAR; + I32 npar; + const struct regexp *r = ReANY(sstr); + struct regexp *ret = ReANY(dstr); + + PERL_ARGS_ASSERT_RE_DUP_GUTS; + + npar = r->nparens+1; + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + + if (ret->substrs) { + /* Do it this way to avoid reading from *r after the StructCopy(). + That way, if any of the sv_dup_inc()s dislodge *r from the L1 + cache, it doesn't matter. */ + const bool anchored = r->check_substr + ? r->check_substr == r->anchored_substr + : r->check_utf8 == r->anchored_utf8; + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); + ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); + ret->float_substr = sv_dup_inc(ret->float_substr, param); + ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + + if (ret->check_substr) { + if (anchored) { + assert(r->check_utf8 == r->anchored_utf8); + ret->check_substr = ret->anchored_substr; + ret->check_utf8 = ret->anchored_utf8; + } else { + assert(r->check_substr == r->float_substr); + assert(r->check_utf8 == r->float_utf8); + ret->check_substr = ret->float_substr; + ret->check_utf8 = ret->float_utf8; + } + } else if (ret->check_utf8) { + if (anchored) { + ret->check_utf8 = ret->anchored_utf8; + } else { + ret->check_utf8 = ret->float_utf8; + } + } + } + + RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); + ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); + + if (ret->pprivate) + RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); + + if (RX_MATCH_COPIED(dstr)) + ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); + else + ret->subbeg = NULL; +#ifdef PERL_ANY_COW + ret->saved_copy = NULL; +#endif + + /* Whether mother_re be set or no, we need to copy the string. We + cannot refrain from copying it when the storage points directly to + our mother regexp, because that's + 1: a buffer in a different thread + 2: something we no longer hold a reference on + so we need to copy it locally. */ + RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); + ret->mother_re = NULL; +} +#endif /* PERL_IN_XSUB_RE */ + +/* + regdupe_internal() + + This is the internal complement to regdupe() which is used to copy + the structure pointed to by the *pprivate pointer in the regexp. + This is the core version of the extension overridable cloning hook. + The regexp structure being duplicated will be copied by perl prior + to this and will be provided as the regexp *r argument, however + with the /old/ structures pprivate pointer value. Thus this routine + may override any copying normally done by perl. + + It returns a pointer to the new regexp_internal structure. +*/ + +void * +Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) +{ + dVAR; + struct regexp *const r = ReANY(rx); + regexp_internal *reti; + int len; + RXi_GET_DECL(r,ri); + + PERL_ARGS_ASSERT_REGDUPE_INTERNAL; + + len = ProgLen(ri); + + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), + char, regexp_internal); + Copy(ri->program, reti->program, len+1, regnode); + + reti->num_code_blocks = ri->num_code_blocks; + if (ri->code_blocks) { + int n; + Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block, + struct reg_code_block); + Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks, + struct reg_code_block); + for (n = 0; n < ri->num_code_blocks; n++) + reti->code_blocks[n].src_regex = (REGEXP*) + sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param); + } + else + reti->code_blocks = NULL; + + reti->regstclass = NULL; + + if (ri->data) { + struct reg_data *d; + const int count = ri->data->count; + int i; + + Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), + char, struct reg_data); + Newx(d->what, count, U8); + + d->count = count; + for (i = 0; i < count; i++) { + d->what[i] = ri->data->what[i]; + switch (d->what[i]) { + /* see also regcomp.h and regfree_internal() */ + case 'a': /* actually an AV, but the dup function is identical. */ + case 'r': + case 's': + case 'S': + case 'u': /* actually an HV, but the dup function is identical. */ + d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); + break; + case 'f': + /* This is cheating. */ + Newx(d->data[i], 1, regnode_ssc); + StructCopy(ri->data->data[i], d->data[i], regnode_ssc); + reti->regstclass = (regnode*)d->data[i]; + break; + case 'T': + /* Trie stclasses are readonly and can thus be shared + * without duplication. We free the stclass in pregfree + * when the corresponding reg_ac_data struct is freed. + */ + reti->regstclass= ri->regstclass; + /* FALLTHROUGH */ + case 't': + OP_REFCNT_LOCK; + ((reg_trie_data*)ri->data->data[i])->refcount++; + OP_REFCNT_UNLOCK; + /* FALLTHROUGH */ + case 'l': + case 'L': + d->data[i] = ri->data->data[i]; + break; + default: + Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", + ri->data->what[i]); + } + } + + reti->data = d; + } + else + reti->data = NULL; + + reti->name_list_idx = ri->name_list_idx; + +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) { + Newx(reti->u.offsets, 2*len+1, U32); + Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); + } +#else + SetProgLen(reti,len); +#endif + + return (void*)reti; +} + +#endif /* USE_ITHREADS */ + +#ifndef PERL_IN_XSUB_RE + +/* + - regnext - dig the "next" pointer out of a node + */ +regnode * +Perl_regnext(pTHX_ regnode *p) +{ + I32 offset; + + if (!p) + return(NULL); + + if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(p), (int)REGNODE_MAX); + } + + offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); + if (offset == 0) + return(NULL); + + return(p+offset); +} +#endif + +STATIC void +S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) +{ + va_list args; + STRLEN l1 = strlen(pat1); + STRLEN l2 = strlen(pat2); + char buf[512]; + SV *msv; + const char *message; + + PERL_ARGS_ASSERT_RE_CROAK2; + + if (l1 > 510) + l1 = 510; + if (l1 + l2 > 510) + l2 = 510 - l1; + Copy(pat1, buf, l1 , char); + Copy(pat2, buf + l1, l2 , char); + buf[l1 + l2] = '\n'; + buf[l1 + l2 + 1] = '\0'; + va_start(args, pat2); + msv = vmess(buf, &args); + va_end(args); + message = SvPV_const(msv,l1); + if (l1 > 512) + l1 = 512; + Copy(message, buf, l1 , char); + /* l1-1 to avoid \n */ + Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); +} + +/* XXX Here's a total kludge. But we need to re-enter for swash routines. */ + +#ifndef PERL_IN_XSUB_RE +void +Perl_save_re_context(pTHX) +{ + /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ + if (PL_curpm) { + const REGEXP * const rx = PM_GETRE(PL_curpm); + if (rx) { + U32 i; + for (i = 1; i <= RX_NPARENS(rx); i++) { + char digits[TYPE_CHARS(long)]; + const STRLEN len = my_snprintf(digits, sizeof(digits), + "%lu", (long)i); + GV *const *const gvp + = (GV**)hv_fetch(PL_defstash, digits, len, 0); + + if (gvp) { + GV * const gv = *gvp; + if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) + save_scalar(gv); + } + } + } + } +} +#endif + +#ifdef DEBUGGING + +STATIC void +S_put_byte(pTHX_ SV *sv, int c) +{ + PERL_ARGS_ASSERT_PUT_BYTE; + + if (!isPRINT(c)) { + switch (c) { + case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; + case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; + case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; + case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; + case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; + + default: + Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + break; + } + } + else { + const char string = c; + if (c == '-' || c == ']' || c == '\\' || c == '^') + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } +} + +STATIC void +S_put_range(pTHX_ SV *sv, UV start, UV end) +{ + + /* Appends to 'sv' a displayable version of the range of code points from + * 'start' to 'end'. It assumes that only ASCII printables are displayable + * as-is (though some of these will be escaped by put_byte()). For the + * time being, this subroutine only works for latin1 (< 256) code points */ + + assert(start <= end); + + PERL_ARGS_ASSERT_PUT_RANGE; + + while (start <= end) { + if (end - start < 3) { /* Individual chars in short ranges */ + for (; start <= end; start++) { + put_byte(sv, start); + } + break; + } + + /* For small ranges that include printable ASCII characters, it's more + * legible to print those characters rather than hex values. For + * larger ranges that include more than printables, it's probably + * clearer to just give the start and end points of the range in hex, + * and that's all we can do if there aren't any printables within the + * range + * + * On ASCII platforms the range of printables is contiguous. If the + * entire range is printable, we print each character as such. If the + * range is partially printable and partially not, it's less likely + * that the individual printables are meaningful, especially if all or + * almost all of them are in the range. But we err on the side of the + * individual printables being meaningful by using the hex only if the + * range contains all but 2 of the printables. + * + * On EBCDIC platforms, the printables are scattered around so that the + * maximum range length containing only them is about 10. Anything + * longer we treat as hex; otherwise we examine the range character by + * character to see */ +#ifdef EBCDIC + if (start < 256 && (((end < 255) ? end : 255) - start <= 10)) +#else + if ((isPRINT_A(start) && isPRINT_A(end)) + || (end >= 0x7F && (isPRINT_A(start) && start > 0x21)) + || ((end < 0x7D && isPRINT_A(end)) && start < 0x20)) +#endif + { + /* If the range beginning isn't an ASCII printable, we find the + * last such in the range, then split the output, so all the + * non-printables are in one subrange; then process the remaining + * portion as usual. If the entire range isn't printables, we + * don't split, but drop down to print as hex */ + if (! isPRINT_A(start)) { + UV temp_end = start + 1; + while (temp_end <= end && ! isPRINT_A(temp_end)) { + temp_end++; + } + if (temp_end <= end) { + put_range(sv, start, temp_end - 1); + start = temp_end; + continue; + } + } + + /* If the range beginning is a digit, output a subrange of just the + * digits, then process the remaining portion as usual */ + if (isDIGIT_A(start)) { + put_byte(sv, start); + sv_catpvs(sv, "-"); + while (start <= end && isDIGIT_A(start)) start++; + put_byte(sv, start - 1); + continue; + } + + /* Similarly for alphabetics. Because in both ASCII and EBCDIC, + * the code points for upper and lower A-Z and a-z aren't + * intermixed, the resulting subrange will consist solely of either + * upper- or lower- alphabetics */ + if (isALPHA_A(start)) { + put_byte(sv, start); + sv_catpvs(sv, "-"); + while (start <= end && isALPHA_A(start)) start++; + put_byte(sv, start - 1); + continue; + } + + /* We output any remaining printables as individual characters */ + if (isPUNCT_A(start) || isSPACE_A(start)) { + while (start <= end && (isPUNCT_A(start) || isSPACE_A(start))) { + put_byte(sv, start); + start++; + } + continue; + } + } + + /* Here is a control or non-ascii. Output the range or subrange as + * hex. */ + Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", + start, + (end < 256) ? end : 255); + break; + } +} + +STATIC bool +S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually + * output anything */ + + int i; + bool has_output_anything = FALSE; + + PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; + + for (i = 0; i < 256; i++) { + if (BITMAP_TEST((U8 *) bitmap,i)) { + + /* The character at index i should be output. Find the next + * character that should NOT be output */ + int j; + for (j = i + 1; j < 256; j++) { + if (! BITMAP_TEST((U8 *) bitmap, j)) { + break; + } + } + + /* Everything between them is a single range that should be output + * */ + put_range(sv, i, j - 1); + has_output_anything = TRUE; + i = j; + } + } + + return has_output_anything; +} + +#define CLEAR_OPTSTART \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ + " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ + } STMT_END + +#define DUMPUNTIL(b,e) \ + CLEAR_OPTSTART; \ + node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); + +STATIC const regnode * +S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, + const regnode *last, const regnode *plast, + SV* sv, I32 indent, U32 depth) +{ + U8 op = PSEUDO; /* Arbitrary non-END op. */ + const regnode *next; + const regnode *optstart= NULL; + + RXi_GET_DECL(r,ri); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMPUNTIL; + +#ifdef DEBUG_DUMPUNTIL + PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, + last ? last-start : 0,plast ? plast-start : 0); +#endif + + if (plast && plast < last) + last= plast; + + while (PL_regkind[op] != END && (!last || node < last)) { + assert(node); + /* While that wasn't END last time... */ + NODE_ALIGN(node); + op = OP(node); + if (op == CLOSE || op == WHILEM) + indent--; + next = regnext((regnode *)node); + + /* Where, what. */ + if (OP(node) == OPTIMIZED) { + if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) + optstart = node; + else + goto after_print; + } else + CLEAR_OPTSTART; + + regprop(r, sv, node, NULL); + PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), + (int)(2*indent + 1), "", SvPVX_const(sv)); + + if (OP(node) != OPTIMIZED) { + if (next == NULL) /* Next ptr. */ + PerlIO_printf(Perl_debug_log, " (0)"); + else if (PL_regkind[(U8)op] == BRANCH + && PL_regkind[OP(next)] != BRANCH ) + PerlIO_printf(Perl_debug_log, " (FAIL)"); + else + PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); + (void)PerlIO_putc(Perl_debug_log, '\n'); + } + + after_print: + if (PL_regkind[(U8)op] == BRANCHJ) { + assert(next); + { + const regnode *nnode = (OP(next) == LONGJMP + ? regnext((regnode *)next) + : next); + if (last && nnode > last) + nnode = last; + DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); + } + } + else if (PL_regkind[(U8)op] == BRANCH) { + assert(next); + DUMPUNTIL(NEXTOPER(node), next); + } + else if ( PL_regkind[(U8)op] == TRIE ) { + const regnode *this_trie = node; + const char op = OP(node); + const U32 n = ARG(node); + const reg_ac_data * const ac = op>=AHOCORASICK ? + (reg_ac_data *)ri->data->data[n] : + NULL; + const reg_trie_data * const trie = + (reg_trie_data*)ri->data->data[optrie]; +#ifdef DEBUGGING + AV *const trie_words + = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); +#endif + const regnode *nextbranch= NULL; + I32 word_idx; + sv_setpvs(sv, ""); + for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { + SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); + + PerlIO_printf(Perl_debug_log, "%*s%s ", + (int)(2*(indent+3)), "", + elem_ptr + ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), + SvCUR(*elem_ptr), 60, + PL_colors[0], PL_colors[1], + (SvUTF8(*elem_ptr) + ? PERL_PV_ESCAPE_UNI + : 0) + | PERL_PV_PRETTY_ELLIPSES + | PERL_PV_PRETTY_LTGT + ) + : "???" + ); + if (trie->jump) { + U16 dist= trie->jump[word_idx+1]; + PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", + (UV)((dist ? this_trie + dist : next) - start)); + if (dist) { + if (!nextbranch) + nextbranch= this_trie + trie->jump[0]; + DUMPUNTIL(this_trie + dist, nextbranch); + } + if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) + nextbranch= regnext((regnode *)nextbranch); + } else { + PerlIO_printf(Perl_debug_log, "\n"); + } + } + if (last && next > last) + node= last; + else + node= next; + } + else if ( op == CURLY ) { /* "next" might be very big: optimizer */ + DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, + NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); + } + else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { + assert(next); + DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); + } + else if ( op == PLUS || op == STAR) { + DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); + } + else if (PL_regkind[(U8)op] == ANYOF) { + /* arglen 1 + class block */ + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); + node = NEXTOPER(node); + } + else if (PL_regkind[(U8)op] == EXACT) { + /* Literal string, where present. */ + node += NODE_SZ_STR(node) - 1; + node = NEXTOPER(node); + } + else { + node = NEXTOPER(node); + node += regarglen[(U8)op]; + } + if (op == CURLYX || op == OPEN) + indent++; + } + CLEAR_OPTSTART; +#ifdef DEBUG_DUMPUNTIL + PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); +#endif + return node; +} + +#endif /* DEBUGGING */ + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * ex: set ts=8 sts=4 sw=4 et: + */ diff --git a/src/5021003/orig/regexec.c b/src/5021003/orig/regexec.c new file mode 100644 index 0000000..33fb5da --- /dev/null +++ b/src/5021003/orig/regexec.c @@ -0,0 +1,8211 @@ +/* regexec.c + */ + +/* + * One Ring to rule them all, One Ring to find them + & + * [p.v of _The Lord of the Rings_, opening poem] + * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] + * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"] + */ + +/* This file contains functions for executing a regular expression. See + * also regcomp.c which funnily enough, contains functions for compiling + * a regular expression. + * + * This file is also copied at build time to ext/re/re_exec.c, where + * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. + * This causes the main functions to be compiled under new names and with + * debugging support added, which makes "use re 'debug'" work. + */ + +/* NOTE: this is derived from Henry Spencer's regexp code, and should not + * confused with the original package (see point 3 below). Thanks, Henry! + */ + +/* Additional note: this code is very heavily munged from Henry's version + * in places. In some spots I've traded clarity for efficiency, so don't + * blame Henry for some of the lack of readability. + */ + +/* The names of the functions have been changed from regcomp and + * regexec to pregcomp and pregexec in order to avoid conflicts + * with the POSIX routines of the same names. +*/ + +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +/* + * pregcomp and pregexec -- regsub and regerror are not used in perl + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. + * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * + **** Alterations to Henry's code are... + **** + **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + **** by Larry Wall and others + **** + **** You may distribute under the terms of either the GNU General Public + **** License or the Artistic License, as specified in the README file. + * + * Beware that some of this code is subtly aware of the way operator + * precedence is structured in regular expressions. Serious changes in + * regular-expression syntax might require a total rethink. + */ +#include "EXTERN.h" +#define PERL_IN_REGEXEC_C +#include "perl.h" + +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +#else +# include "regcomp.h" +#endif + +#include "inline_invlist.c" +#include "unicode_constants.h" + +#ifdef DEBUGGING +/* At least one required character in the target string is expressible only in + * UTF-8. */ +static const char* const non_utf8_target_but_utf8_required + = "Can't match, because target string needs to be in UTF-8\n"; +#endif + +#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ + goto target; \ +} STMT_END + +#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) + +#ifndef STATIC +#define STATIC static +#endif + +/* Valid only for non-utf8 strings: avoids the reginclass + * call if there are no complications: i.e., if everything matchable is + * straight forward in the bitmap */ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \ + : ANYOF_BITMAP_TEST(p,*(c))) + +/* + * Forwards. + */ + +#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) +#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b) + +#define HOPc(pos,off) \ + (char *)(reginfo->is_utf8_target \ + ? reghop3((U8*)pos, off, \ + (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ + : (U8*)(pos + off)) + +#define HOPBACKc(pos, off) \ + (char*)(reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \ + : (pos - off >= reginfo->strbeg) \ + ? (U8*)pos - off \ + : NULL) + +#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) +#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) + +/* lim must be +ve. Returns NULL on overshoot */ +#define HOPMAYBE3(pos,off,lim) \ + (reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ + : ((U8*)pos + off <= lim) \ + ? (U8*)pos + off \ + : NULL) + +/* like HOP3, but limits the result to <= lim even for the non-utf8 case. + * off must be >=0; args should be vars rather than expressions */ +#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ + ? reghop3((U8*)(pos), off, (U8*)(lim)) \ + : (U8*)((pos + off) > lim ? lim : (pos + off))) + +#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ + ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ + : (U8*)(pos + off)) +#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) + +#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ +#define NEXTCHR_IS_EOS (nextchr < 0) + +#define SET_nextchr \ + nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) + +#define SET_locinput(p) \ + locinput = (p); \ + SET_nextchr + + +#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \ + if (!swash_ptr) { \ + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \ + swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \ + 1, 0, invlist, &flags); \ + assert(swash_ptr); \ + } \ + } STMT_END + +/* If in debug mode, we test that a known character properly matches */ +#ifdef DEBUGGING +# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ + property_name, \ + invlist, \ + utf8_char_in_property) \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \ + assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); +#else +# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ + property_name, \ + invlist, \ + utf8_char_in_property) \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) +#endif + +#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ + PL_utf8_swash_ptrs[_CC_WORDCHAR], \ + "", \ + PL_XPosix_ptrs[_CC_WORDCHAR], \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); + +#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ + STMT_START { \ + LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ + "_X_regular_begin", \ + NULL, \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ + LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ + "_X_extend", \ + NULL, \ + COMBINING_GRAVE_ACCENT_UTF8); \ + } STMT_END + +#define PLACEHOLDER /* Something for the preprocessor to grab onto */ +/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ + +/* for use after a quantifier and before an EXACT-like node -- japhy */ +/* it would be nice to rework regcomp.sym to generate this stuff. sigh + * + * NOTE that *nothing* that affects backtracking should be in here, specifically + * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a + * node that is in between two EXACT like nodes when ascertaining what the required + * "follow" character is. This should probably be moved to regex compile time + * although it may be done at run time beause of the REF possibility - more + * investigation required. -- demerphq +*/ +#define JUMPABLE(rn) ( \ + OP(rn) == OPEN || \ + (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ + OP(rn) == EVAL || \ + OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ + OP(rn) == PLUS || OP(rn) == MINMOD || \ + OP(rn) == KEEPS || \ + (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ +) +#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) + +#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF ) + +#if 0 +/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so + we don't need this definition. */ +#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) + +#else +/* ... so we use this as its faster. */ +#define IS_TEXT(rn) ( OP(rn)==EXACT ) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) +#define IS_TEXTF(rn) ( OP(rn)==EXACTF ) +#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) + +#endif + +/* + Search for mandatory following text node; for lookahead, the text must + follow but for lookbehind (rn->flags != 0) we skip to the next step. +*/ +#define FIND_NEXT_IMPT(rn) STMT_START { \ + while (JUMPABLE(rn)) { \ + const OPCODE type = OP(rn); \ + if (type == SUSPEND || PL_regkind[type] == CURLY) \ + rn = NEXTOPER(NEXTOPER(rn)); \ + else if (type == PLUS) \ + rn = NEXTOPER(rn); \ + else if (type == IFMATCH) \ + rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ + else rn += NEXT_OFF(rn); \ + } \ +} STMT_END + +/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode. + * These are for the pre-composed Hangul syllables, which are all in a + * contiguous block and arranged there in such a way so as to facilitate + * alorithmic determination of their characteristics. As such, they don't need + * a swash, but can be determined by simple arithmetic. Almost all are + * GCB=LVT, but every 28th one is a GCB=LV */ +#define SBASE 0xAC00 /* Start of block */ +#define SCount 11172 /* Length of block */ +#define TCount 28 + +#define SLAB_FIRST(s) (&(s)->states[0]) +#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) + +static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); +static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); +static regmatch_state * S_push_slab(pTHX); + +#define REGCP_PAREN_ELEMS 3 +#define REGCP_OTHER_ELEMS 3 +#define REGCP_FRAME_ELEMS 1 +/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and + * are needed for the regexp context stack bookkeeping. */ + +STATIC CHECKPOINT +S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) +{ + const int retval = PL_savestack_ix; + const int paren_elems_to_push = + (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS; + const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; + const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; + I32 p; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCPPUSH; + + if (paren_elems_to_push < 0) + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u", + (int)paren_elems_to_push, (int)maxopenparen, + (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS); + + if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) + Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf + " out of range (%lu-%ld)", + total_elems, + (unsigned long)maxopenparen, + (long)parenfloor); + + SSGROW(total_elems + REGCP_FRAME_ELEMS); + + DEBUG_BUFFERS_r( + if ((int)maxopenparen > (int)parenfloor) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { +/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ + SSPUSHIV(rex->offs[p].end); + SSPUSHIV(rex->offs[p].start); + SSPUSHINT(rex->offs[p].start_tmp); + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", + (UV)p, + (IV)rex->offs[p].start, + (IV)rex->offs[p].start_tmp, + (IV)rex->offs[p].end + )); + } +/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ + SSPUSHINT(maxopenparen); + SSPUSHINT(rex->lastparen); + SSPUSHINT(rex->lastcloseparen); + SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ + + return retval; +} + +/* These are needed since we do not localize EVAL nodes: */ +#define REGCP_SET(cp) \ + DEBUG_STATE_r( \ + PerlIO_printf(Perl_debug_log, \ + " Setting an EVAL scope, savestack=%"IVdf"\n", \ + (IV)PL_savestack_ix)); \ + cp = PL_savestack_ix + +#define REGCP_UNWIND(cp) \ + DEBUG_STATE_r( \ + if (cp != PL_savestack_ix) \ + PerlIO_printf(Perl_debug_log, \ + " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \ + (IV)(cp), (IV)PL_savestack_ix)); \ + regcpblow(cp) + +#define UNWIND_PAREN(lp, lcp) \ + for (n = rex->lastparen; n > lp; n--) \ + rex->offs[n].end = -1; \ + rex->lastparen = n; \ + rex->lastcloseparen = lcp; + + +STATIC void +S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) +{ + UV i; + U32 paren; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCPPOP; + + /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ + i = SSPOPUV; + assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ + i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ + rex->lastcloseparen = SSPOPINT; + rex->lastparen = SSPOPINT; + *maxopenparen_p = SSPOPINT; + + i -= REGCP_OTHER_ELEMS; + /* Now restore the parentheses context. */ + DEBUG_BUFFERS_r( + if (i || rex->lastparen + 1 <= rex->nparens) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + paren = *maxopenparen_p; + for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { + SSize_t tmps; + rex->offs[paren].start_tmp = SSPOPINT; + rex->offs[paren].start = SSPOPIV; + tmps = SSPOPIV; + if (paren <= rex->lastparen) + rex->offs[paren].end = tmps; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n", + (UV)paren, + (IV)rex->offs[paren].start, + (IV)rex->offs[paren].start_tmp, + (IV)rex->offs[paren].end, + (paren > rex->lastparen ? "(skipped)" : "")); + ); + paren--; + } +#if 1 + /* It would seem that the similar code in regtry() + * already takes care of this, and in fact it is in + * a better location to since this code can #if 0-ed out + * but the code in regtry() is needed or otherwise tests + * requiring null fields (pat.t#187 and split.t#{13,14} + * (as of patchlevel 7877) will fail. Then again, + * this code seems to be necessary or otherwise + * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ + * --jhi updated by dapm */ + for (i = rex->lastparen + 1; i <= rex->nparens; i++) { + if (i > *maxopenparen_p) + rex->offs[i].start = -1; + rex->offs[i].end = -1; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %s ..-1 undeffing\n", + (UV)i, + (i > *maxopenparen_p) ? "-1" : " " + )); + } +#endif +} + +/* restore the parens and associated vars at savestack position ix, + * but without popping the stack */ + +STATIC void +S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p) +{ + I32 tmpix = PL_savestack_ix; + PL_savestack_ix = ix; + regcppop(rex, maxopenparen_p); + PL_savestack_ix = tmpix; +} + +#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ + +STATIC bool +S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) +{ + /* Returns a boolean as to whether or not 'character' is a member of the + * Posix character class given by 'classnum' that should be equivalent to a + * value in the typedef '_char_class_number'. + * + * Ideally this could be replaced by a just an array of function pointers + * to the C library functions that implement the macros this calls. + * However, to compile, the precise function signatures are required, and + * these may vary from platform to to platform. To avoid having to figure + * out what those all are on each platform, I (khw) am using this method, + * which adds an extra layer of function call overhead (unless the C + * optimizer strips it away). But we don't particularly care about + * performance with locales anyway. */ + + switch ((_char_class_number) classnum) { + case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character); + case _CC_ENUM_ALPHA: return isALPHA_LC(character); + case _CC_ENUM_ASCII: return isASCII_LC(character); + case _CC_ENUM_BLANK: return isBLANK_LC(character); + case _CC_ENUM_CASED: return isLOWER_LC(character) + || isUPPER_LC(character); + case _CC_ENUM_CNTRL: return isCNTRL_LC(character); + case _CC_ENUM_DIGIT: return isDIGIT_LC(character); + case _CC_ENUM_GRAPH: return isGRAPH_LC(character); + case _CC_ENUM_LOWER: return isLOWER_LC(character); + case _CC_ENUM_PRINT: return isPRINT_LC(character); + case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character); + case _CC_ENUM_PUNCT: return isPUNCT_LC(character); + case _CC_ENUM_SPACE: return isSPACE_LC(character); + case _CC_ENUM_UPPER: return isUPPER_LC(character); + case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character); + case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character); + default: /* VERTSPACE should never occur in locales */ + Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum); + } + + assert(0); /* NOTREACHED */ + return FALSE; +} + +STATIC bool +S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) +{ + /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded + * 'character' is a member of the Posix character class given by 'classnum' + * that should be equivalent to a value in the typedef + * '_char_class_number'. + * + * This just calls isFOO_lc on the code point for the character if it is in + * the range 0-255. Outside that range, all characters avoid Unicode + * rules, ignoring any locale. So use the Unicode function if this class + * requires a swash, and use the Unicode macro otherwise. */ + + PERL_ARGS_ASSERT_ISFOO_UTF8_LC; + + if (UTF8_IS_INVARIANT(*character)) { + return isFOO_lc(classnum, *character); + } + else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { + return isFOO_lc(classnum, + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); + } + + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Initialize the swash unless done already */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + + return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) + character, + TRUE /* is UTF */ )); + } + + switch ((_char_class_number) classnum) { + case _CC_ENUM_SPACE: + case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character); + + case _CC_ENUM_BLANK: return is_HORIZWS_high(character); + case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character); + case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character); + default: break; + } + + return FALSE; /* Things like CNTRL are always below 256 */ +} + +/* + * pregexec and friends + */ + +#ifndef PERL_IN_XSUB_RE +/* + - pregexec - match a regexp against a string + */ +I32 +Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, + char *strbeg, SSize_t minend, SV *screamer, U32 nosave) +/* stringarg: the point in the string at which to begin matching */ +/* strend: pointer to null at end of string */ +/* strbeg: real beginning of string */ +/* minend: end of match must be >= minend bytes after stringarg. */ +/* screamer: SV being matched: only used for utf8 flag, pos() etc; string + * itself is accessed via the pointers above */ +/* nosave: For optimizations. */ +{ + PERL_ARGS_ASSERT_PREGEXEC; + + return + regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, + nosave ? 0 : REXEC_COPY_STR); +} +#endif + + + +/* re_intuit_start(): + * + * Based on some optimiser hints, try to find the earliest position in the + * string where the regex could match. + * + * rx: the regex to match against + * sv: the SV being matched: only used for utf8 flag; the string + * itself is accessed via the pointers below. Note that on + * something like an overloaded SV, SvPOK(sv) may be false + * and the string pointers may point to something unrelated to + * the SV itself. + * strbeg: real beginning of string + * strpos: the point in the string at which to begin matching + * strend: pointer to the byte following the last char of the string + * flags currently unused; set to 0 + * data: currently unused; set to NULL + * + * The basic idea of re_intuit_start() is to use some known information + * about the pattern, namely: + * + * a) the longest known anchored substring (i.e. one that's at a + * constant offset from the beginning of the pattern; but not + * necessarily at a fixed offset from the beginning of the + * string); + * b) the longest floating substring (i.e. one that's not at a constant + * offset from the beginning of the pattern); + * c) Whether the pattern is anchored to the string; either + * an absolute anchor: /^../, or anchored to \n: /^.../m, + * or anchored to pos(): /\G/; + * d) A start class: a real or synthetic character class which + * represents which characters are legal at the start of the pattern; + * + * to either quickly reject the match, or to find the earliest position + * within the string at which the pattern might match, thus avoiding + * running the full NFA engine at those earlier locations, only to + * eventually fail and retry further along. + * + * Returns NULL if the pattern can't match, or returns the address within + * the string which is the earliest place the match could occur. + * + * The longest of the anchored and floating substrings is called 'check' + * and is checked first. The other is called 'other' and is checked + * second. The 'other' substring may not be present. For example, + * + * /(abc|xyz)ABC\d{0,3}DEFG/ + * + * will have + * + * check substr (float) = "DEFG", offset 6..9 chars + * other substr (anchored) = "ABC", offset 3..3 chars + * stclass = [ax] + * + * Be aware that during the course of this function, sometimes 'anchored' + * refers to a substring being anchored relative to the start of the + * pattern, and sometimes to the pattern itself being anchored relative to + * the string. For example: + * + * /\dabc/: "abc" is anchored to the pattern; + * /^\dabc/: "abc" is anchored to the pattern and the string; + * /\d+abc/: "abc" is anchored to neither the pattern nor the string; + * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, + * but the pattern is anchored to the string. + */ + +char * +Perl_re_intuit_start(pTHX_ + REGEXP * const rx, + SV *sv, + const char * const strbeg, + char *strpos, + char *strend, + const U32 flags, + re_scream_pos_data *data) +{ + struct regexp *const prog = ReANY(rx); + SSize_t start_shift = prog->check_offset_min; + /* Should be nonnegative! */ + SSize_t end_shift = 0; + /* current lowest pos in string where the regex can start matching */ + char *rx_origin = strpos; + SV *check; + const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ + U8 other_ix = 1 - prog->substrs->check_ix; + bool ml_anch = 0; + char *other_last = strpos;/* latest pos 'other' substr already checked to */ + char *check_at = NULL; /* check substr found at this pos */ + const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; + RXi_GET_DECL(prog,progi); + regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ + regmatch_info *const reginfo = ®info_buf; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_START; + PERL_UNUSED_ARG(flags); + PERL_UNUSED_ARG(data); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: trying to determine minimum start position...\n")); + + /* for now, assume that all substr offsets are positive. If at some point + * in the future someone wants to do clever things with look-behind and + * -ve offsets, they'll need to fix up any code in this function + * which uses these offsets. See the thread beginning + * <20140113145929.GF27210@iabyn.com> + */ + assert(prog->substrs->data[0].min_offset >= 0); + assert(prog->substrs->data[0].max_offset >= 0); + assert(prog->substrs->data[1].min_offset >= 0); + assert(prog->substrs->data[1].max_offset >= 0); + assert(prog->substrs->data[2].min_offset >= 0); + assert(prog->substrs->data[2].max_offset >= 0); + + /* for now, assume that if both present, that the floating substring + * doesn't start before the anchored substring. + * If you break this assumption (e.g. doing better optimisations + * with lookahead/behind), then you'll need to audit the code in this + * function carefully first + */ + assert( + ! ( (prog->anchored_utf8 || prog->anchored_substr) + && (prog->float_utf8 || prog->float_substr)) + || (prog->float_min_offset >= prog->anchored_offset)); + + /* byte rather than char calculation for efficiency. It fails + * to quickly reject some cases that can't match, but will reject + * them later after doing full char arithmetic */ + if (prog->minlen > strend - strpos) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too short...\n")); + goto fail; + } + + reginfo->is_utf8_target = cBOOL(utf8_target); + reginfo->info_aux = NULL; + reginfo->strbeg = strbeg; + reginfo->strend = strend; + reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); + reginfo->intuit = 1; + /* not actually used within intuit, but zero for safety anyway */ + reginfo->poscache_maxiter = 0; + + if (utf8_target) { + if (!prog->check_utf8 && prog->check_substr) + to_utf8_substr(prog); + check = prog->check_utf8; + } else { + if (!prog->check_substr && prog->check_utf8) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); + } + } + check = prog->check_substr; + } + + /* dump the various substring data */ + DEBUG_OPTIMISE_MORE_r({ + int i; + for (i=0; i<=2; i++) { + SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr + : prog->substrs->data[i].substr); + if (!sv) + continue; + + PerlIO_printf(Perl_debug_log, + " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf + " useful=%"IVdf" utf8=%d [%s]\n", + i, + (IV)prog->substrs->data[i].min_offset, + (IV)prog->substrs->data[i].max_offset, + (IV)prog->substrs->data[i].end_shift, + BmUSEFUL(sv), + utf8_target ? 1 : 0, + SvPEEK(sv)); + } + }); + + if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ + + /* ml_anch: check after \n? + * + * A note about IMPLICIT: on an un-anchored pattern beginning + * with /.*.../, these flags will have been added by the + * compiler: + * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL + * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL + */ + ml_anch = (prog->intflags & PREGf_ANCH_MBOL) + && !(prog->intflags & PREGf_IMPLICIT); + + if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { + /* we are only allowed to match at BOS or \G */ + + /* trivially reject if there's a BOS anchor and we're not at BOS. + * + * Note that we don't try to do a similar quick reject for + * \G, since generally the caller will have calculated strpos + * based on pos() and gofs, so the string is already correctly + * anchored by definition; and handling the exceptions would + * be too fiddly (e.g. REXEC_IGNOREPOS). + */ + if ( strpos != strbeg + && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Not at start...\n")); + goto fail; + } + + /* in the presence of an anchor, the anchored (relative to the + * start of the regex) substr must also be anchored relative + * to strpos. So quickly reject if substr isn't found there. + * This works for \G too, because the caller will already have + * subtracted gofs from pos, and gofs is the offset from the + * \G to the start of the regex. For example, in /.abc\Gdef/, + * where substr="abcdef", pos()=3, gofs=4, offset_min=1: + * caller will have set strpos=pos()-4; we look for the substr + * at position pos()-4+1, which lines up with the "a" */ + + if (prog->check_offset_min == prog->check_offset_max + && !(prog->intflags & PREGf_CANY_SEEN)) + { + /* Substring at constant offset from beg-of-str... */ + SSize_t slen = SvCUR(check); + char *s = HOP3c(strpos, prog->check_offset_min, strend); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Looking for check substr at fixed offset %"IVdf"...\n", + (IV)prog->check_offset_min)); + + if (SvTAIL(check)) { + /* In this case, the regex is anchored at the end too. + * Unless it's a multiline match, the lengths must match + * exactly, give or take a \n. NB: slen >= 1 since + * the last char of check is \n */ + if (!multiline + && ( strend - s > slen + || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n'))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too long...\n")); + goto fail_finish; + } + /* Now should match s[0..slen-2] */ + slen--; + } + if (slen && (*SvPVX_const(check) != *s + || (slen > 1 && memNE(SvPVX_const(check), s, slen)))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String not equal...\n")); + goto fail_finish; + } + + check_at = s; + goto success_at_start; + } + } + } + + end_shift = prog->check_end_shift; + +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ + if (end_shift < 0) + Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", + (IV)end_shift, RX_PRECOMP(prog)); +#endif + + restart: + + /* This is the (re)entry point of the main loop in this function. + * The goal of this loop is to: + * 1) find the "check" substring in the region rx_origin..strend + * (adjusted by start_shift / end_shift). If not found, reject + * immediately. + * 2) If it exists, look for the "other" substr too if defined; for + * example, if the check substr maps to the anchored substr, then + * check the floating substr, and vice-versa. If not found, go + * back to (1) with rx_origin suitably incremented. + * 3) If we find an rx_origin position that doesn't contradict + * either of the substrings, then check the possible additional + * constraints on rx_origin of /^.../m or a known start class. + * If these fail, then depending on which constraints fail, jump + * back to here, or to various other re-entry points further along + * that skip some of the first steps. + * 4) If we pass all those tests, update the BmUSEFUL() count on the + * substring. If the start position was determined to be at the + * beginning of the string - so, not rejected, but not optimised, + * since we have to run regmatch from position 0 - decrement the + * BmUSEFUL() count. Otherwise increment it. + */ + + + /* first, look for the 'check' substring */ + + { + U8* start_point; + U8* end_point; + + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, + " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf + " Start shift: %"IVdf" End shift %"IVdf + " Real end Shift: %"IVdf"\n", + (IV)(rx_origin - strpos), + (IV)prog->check_offset_min, + (IV)start_shift, + (IV)end_shift, + (IV)prog->check_end_shift); + }); + + if (prog->intflags & PREGf_CANY_SEEN) { + start_point= (U8*)(rx_origin + start_shift); + end_point= (U8*)(strend - end_shift); + if (start_point > end_point) + goto fail_finish; + } else { + end_point = HOP3(strend, -end_shift, strbeg); + start_point = HOPMAYBE3(rx_origin, start_shift, end_point); + if (!start_point) + goto fail_finish; + } + + + /* If the regex is absolutely anchored to either the start of the + * string (BOL,SBOL) or to pos() (ANCH_GPOS), then + * check_offset_max represents an upper bound on the string where + * the substr could start. For the ANCH_GPOS case, we assume that + * the caller of intuit will have already set strpos to + * pos()-gofs, so in this case strpos + offset_max will still be + * an upper bound on the substr. + */ + if (!ml_anch + && prog->intflags & PREGf_ANCH + && prog->check_offset_max != SSize_t_MAX) + { + SSize_t len = SvCUR(check) - !!SvTAIL(check); + const char * const anchor = + (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); + + /* do a bytes rather than chars comparison. It's conservative; + * so it skips doing the HOP if the result can't possibly end + * up earlier than the old value of end_point. + */ + if ((char*)end_point - anchor > prog->check_offset_max) { + end_point = HOP3lim((U8*)anchor, + prog->check_offset_max, + end_point -len) + + len; + } + } + + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n", + (int)(end_point - start_point), + (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), + start_point); + }); + + check_at = fbm_instr( start_point, end_point, + check, multiline ? FBMrf_MULTILINE : 0); + + /* Update the count-of-usability, remove useless subpatterns, + unshift s. */ + + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(check), RE_SV_DUMPLEN(check), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s", + (check_at ? "Found" : "Did not find"), + (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) + ? "anchored" : "floating"), + quoted, + RE_SV_TAIL(check), + (check_at ? " at offset " : "...\n") ); + }); + + if (!check_at) + goto fail_finish; + /* Finish the diagnostic message */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) ); + + /* set rx_origin to the minimum position where the regex could start + * matching, given the constraint of the just-matched check substring. + * But don't set it lower than previously. + */ + + if (check_at - rx_origin > prog->check_offset_max) + rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); + } + + + /* now look for the 'other' substring if defined */ + + if (utf8_target ? prog->substrs->data[other_ix].utf8_substr + : prog->substrs->data[other_ix].substr) + { + /* Take into account the "other" substring. */ + char *last, *last1; + char *s; + SV* must; + struct reg_substr_datum *other; + + do_other_substr: + other = &prog->substrs->data[other_ix]; + + /* if "other" is anchored: + * we've previously found a floating substr starting at check_at. + * This means that the regex origin must lie somewhere + * between min (rx_origin): HOP3(check_at, -check_offset_max) + * and max: HOP3(check_at, -check_offset_min) + * (except that min will be >= strpos) + * So the fixed substr must lie somewhere between + * HOP3(min, anchored_offset) + * HOP3(max, anchored_offset) + SvCUR(substr) + */ + + /* if "other" is floating + * Calculate last1, the absolute latest point where the + * floating substr could start in the string, ignoring any + * constraints from the earlier fixed match. It is calculated + * as follows: + * + * strend - prog->minlen (in chars) is the absolute latest + * position within the string where the origin of the regex + * could appear. The latest start point for the floating + * substr is float_min_offset(*) on from the start of the + * regex. last1 simply combines thee two offsets. + * + * (*) You might think the latest start point should be + * float_max_offset from the regex origin, and technically + * you'd be correct. However, consider + * /a\d{2,4}bcd\w/ + * Here, float min, max are 3,5 and minlen is 7. + * This can match either + * /a\d\dbcd\w/ + * /a\d\d\dbcd\w/ + * /a\d\d\d\dbcd\w/ + * In the first case, the regex matches minlen chars; in the + * second, minlen+1, in the third, minlen+2. + * In the first case, the floating offset is 3 (which equals + * float_min), in the second, 4, and in the third, 5 (which + * equals float_max). In all cases, the floating string bcd + * can never start more than 4 chars from the end of the + * string, which equals minlen - float_min. As the substring + * starts to match more than float_min from the start of the + * regex, it makes the regex match more than minlen chars, + * and the two cancel each other out. So we can always use + * float_min - minlen, rather than float_max - minlen for the + * latest position in the string. + * + * Note that -minlen + float_min_offset is equivalent (AFAIKT) + * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift + */ + + assert(prog->minlen >= other->min_offset); + last1 = HOP3c(strend, + other->min_offset - prog->minlen, strbeg); + + if (other_ix) {/* i.e. if (other-is-float) */ + /* last is the latest point where the floating substr could + * start, *given* any constraints from the earlier fixed + * match. This constraint is that the floating string starts + * <= float_max_offset chars from the regex origin (rx_origin). + * If this value is less than last1, use it instead. + */ + assert(rx_origin <= last1); + last = + /* this condition handles the offset==infinity case, and + * is a short-cut otherwise. Although it's comparing a + * byte offset to a char length, it does so in a safe way, + * since 1 char always occupies 1 or more bytes, + * so if a string range is (last1 - rx_origin) bytes, + * it will be less than or equal to (last1 - rx_origin) + * chars; meaning it errs towards doing the accurate HOP3 + * rather than just using last1 as a short-cut */ + (last1 - rx_origin) < other->max_offset + ? last1 + : (char*)HOP3lim(rx_origin, other->max_offset, last1); + } + else { + assert(strpos + start_shift <= check_at); + last = HOP4c(check_at, other->min_offset - start_shift, + strbeg, strend); + } + + s = HOP3c(rx_origin, other->min_offset, strend); + if (s < other_last) /* These positions already checked */ + s = other_last; + + must = utf8_target ? other->utf8_substr : other->substr; + assert(SvPOK(must)); + s = fbm_instr( + (unsigned char*)s, + (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0), + must, + multiline ? FBMrf_MULTILINE : 0 + ); + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s", + s ? "Found" : "Contradicts", + other_ix ? "floating" : "anchored", + quoted, RE_SV_TAIL(must)); + }); + + + if (!s) { + /* last1 is latest possible substr location. If we didn't + * find it before there, we never will */ + if (last >= last1) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + + /* try to find the check substr again at a later + * position. Maybe next time we'll find the "other" substr + * in range too */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", trying %s at offset %ld...\n", + (other_ix ? "floating" : "anchored"), + (long)(HOP3c(check_at, 1, strend) - strpos))); + + other_last = HOP3c(last, 1, strend) /* highest failure */; + rx_origin = + other_ix /* i.e. if other-is-float */ + ? HOP3c(rx_origin, 1, strend) + : HOP4c(last, 1 - other->min_offset, strbeg, strend); + goto restart; + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + + if (other_ix) { /* if (other-is-float) */ + /* other_last is set to s, not s+1, since its possible for + * a floating substr to fail first time, then succeed + * second time at the same floating position; e.g.: + * "-AB--AABZ" =~ /\wAB\d*Z/ + * The first time round, anchored and float match at + * "-(AB)--AAB(Z)" then fail on the initial \w character + * class. Second time round, they match at "-AB--A(AB)(Z)". + */ + other_last = s; + } + else { + rx_origin = HOP3c(s, -other->min_offset, strbeg); + other_last = HOP3c(s, 1, strend); + } + } + } + else { + DEBUG_OPTIMISE_MORE_r( + PerlIO_printf(Perl_debug_log, + " Check-only match: offset min:%"IVdf" max:%"IVdf + " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf + " strend-strpos:%"IVdf"\n", + (IV)prog->check_offset_min, + (IV)prog->check_offset_max, + (IV)(check_at-strpos), + (IV)(rx_origin-strpos), + (IV)(rx_origin-check_at), + (IV)(strend-strpos) + ) + ); + } + + postprocess_substr_matches: + + /* handle the extra constraint of /^.../m if present */ + + if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { + char *s; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for /^/m anchor")); + + /* we have failed the constraint of a \n before rx_origin. + * Find the next \n, if any, even if it's beyond the current + * anchored and/or floating substrings. Whether we should be + * scanning ahead for the next \n or the next substr is debatable. + * On the one hand you'd expect rare substrings to appear less + * often than \n's. On the other hand, searching for \n means + * we're effectively flipping been check_substr and "\n" on each + * iteration as the current "rarest" string candidate, which + * means for example that we'll quickly reject the whole string if + * hasn't got a \n, rather than trying every substr position + * first + */ + + s = HOP3c(strend, - prog->minlen, strpos); + if (s <= rx_origin || + ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Did not find /%s^%s/m...\n", + PL_colors[0], PL_colors[1])); + goto fail_finish; + } + + /* earliest possible origin is 1 char after the \n. + * (since *rx_origin == '\n', it's safe to ++ here rather than + * HOP(rx_origin, 1)) */ + rx_origin++; + + if (prog->substrs->check_ix == 0 /* check is anchored */ + || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) + { + /* Position contradicts check-string; either because + * check was anchored (and thus has no wiggle room), + * or check was float and rx_origin is above the float range */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + goto restart; + } + + /* if we get here, the check substr must have been float, + * is in range, and we may or may not have had an anchored + * "other" substr which still contradicts */ + assert(prog->substrs->check_ix); /* check is float */ + + if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { + /* whoops, the anchored "other" substr exists, so we still + * contradict. On the other hand, the float "check" substr + * didn't contradict, so just retry the anchored "other" + * substr */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos), + (long)(rx_origin - strpos + prog->anchored_offset))); + goto do_other_substr; + } + + /* success: we don't contradict the found floating substring + * (and there's no anchored substr). */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " (multiline anchor test skipped)\n")); + } + + success_at_start: + + + /* if we have a starting character class, then test that extra constraint. + * (trie stclasses are too expensive to use here, we are better off to + * leave it to regmatch itself) */ + + if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { + const U8* const str = (U8*)STRING(progi->regstclass); + + /* XXX this value could be pre-computed */ + const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT + ? (reginfo->is_utf8_pat + ? utf8_distance(str + STR_LEN(progi->regstclass), str) + : STR_LEN(progi->regstclass)) + : 1); + char * endpos; + char *s; + /* latest pos that a matching float substr constrains rx start to */ + char *rx_max_float = NULL; + + /* if the current rx_origin is anchored, either by satisfying an + * anchored substring constraint, or a /^.../m constraint, then we + * can reject the current origin if the start class isn't found + * at the current position. If we have a float-only match, then + * rx_origin is constrained to a range; so look for the start class + * in that range. if neither, then look for the start class in the + * whole rest of the string */ + + /* XXX DAPM it's not clear what the minlen test is for, and why + * it's not used in the floating case. Nothing in the test suite + * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. + * Here are some old comments, which may or may not be correct: + * + * minlen == 0 is possible if regstclass is \b or \B, + * and the fixed substr is ''$. + * Since minlen is already taken into account, rx_origin+1 is + * before strend; accidentally, minlen >= 1 guaranties no false + * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : + * 0) below assumes that regstclass does not come from lookahead... + * If regstclass takes bytelength more than 1: If charlength==1, OK. + * This leaves EXACTF-ish only, which are dealt with in + * find_byclass(). + */ + + if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) + endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend); + else if (prog->float_substr || prog->float_utf8) { + rx_max_float = HOP3c(check_at, -start_shift, strbeg); + endpos= HOP3c(rx_max_float, cl_l, strend); + } + else + endpos= strend; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for class: start_shift: %"IVdf" check_at: %"IVdf + " rx_origin: %"IVdf" endpos: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), + (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); + + s = find_byclass(prog, progi->regstclass, rx_origin, endpos, + reginfo); + if (!s) { + if (endpos == strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); + goto fail; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " This position contradicts STCLASS...\n") ); + if ((prog->intflags & PREGf_ANCH) && !ml_anch + && !(prog->intflags & PREGf_IMPLICIT)) + goto fail; + + /* Contradict one of substrings */ + if (prog->anchored_substr || prog->anchored_utf8) { + if (prog->substrs->check_ix == 1) { /* check is float */ + /* Have both, check_string is floating */ + assert(rx_origin + start_shift <= check_at); + if (rx_origin + start_shift != check_at) { + /* not at latest position float substr could match: + * Recheck anchored substring, but not floating. + * The condition above is in bytes rather than + * chars for efficiency. It's conservative, in + * that it errs on the side of doing 'goto + * do_other_substr', where a more accurate + * char-based calculation will be done */ + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for anchored substr starting at offset %ld...\n", + (long)(other_last - strpos)) ); + goto do_other_substr; + } + } + } + else { + /* float-only */ + + if (ml_anch) { + /* In the presence of ml_anch, we might be able to + * find another \n without breaking the current float + * constraint. */ + + /* strictly speaking this should be HOP3c(..., 1, ...), + * but since we goto a block of code that's going to + * search for the next \n if any, its safe here */ + rx_origin++; + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for /%s^%s/m starting at offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos)) ); + goto postprocess_substr_matches; + } + + /* strictly speaking this can never be true; but might + * be if we ever allow intuit without substrings */ + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + goto fail; + + rx_origin = rx_max_float; + } + + /* at this point, any matching substrings have been + * contradicted. Start again... */ + + rx_origin = HOP3c(rx_origin, 1, strend); + + /* uses bytes rather than char calculations for efficiency. + * It's conservative: it errs on the side of doing 'goto restart', + * where there is code that does a proper char-based test */ + if (rx_origin + start_shift + end_shift > strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); + goto fail; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for %s substr starting at offset %ld...\n", + (prog->substrs->check_ix ? "floating" : "anchored"), + (long)(rx_origin + start_shift - strpos)) ); + goto restart; + } + + /* Success !!! */ + + if (rx_origin != s) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " By STCLASS: moving %ld --> %ld\n", + (long)(rx_origin - strpos), (long)(s - strpos)) + ); + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Does not contradict STCLASS...\n"); + ); + } + } + + /* Decide whether using the substrings helped */ + + if (rx_origin != strpos) { + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n")); + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + } + else { + /* The found rx_origin position does not prohibit matching at + * strpos, so calling intuit didn't gain us anything. Decrement + * the BmUSEFUL() count on the check substring, and if we reach + * zero, free it. */ + if (!(prog->intflags & PREGf_NAUGHTY) + && (utf8_target ? ( + prog->check_utf8 /* Could be deleted already */ + && --BmUSEFUL(prog->check_utf8) < 0 + && (prog->check_utf8 == prog->float_utf8) + ) : ( + prog->check_substr /* Could be deleted already */ + && --BmUSEFUL(prog->check_substr) < 0 + && (prog->check_substr == prog->float_substr) + ))) + { + /* If flags & SOMETHING - do not do it many times on the same match */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n")); + /* XXX Does the destruction order has to change with utf8_target? */ + SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); + SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); + prog->check_substr = prog->check_utf8 = NULL; /* disable */ + prog->float_substr = prog->float_utf8 = NULL; /* clear */ + check = NULL; /* abort */ + /* XXXX This is a remnant of the old implementation. It + looks wasteful, since now INTUIT can use many + other heuristics. */ + prog->extflags &= ~RXf_USE_INTUIT; + } + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) ); + + return rx_origin; + + fail_finish: /* Substring not found */ + if (prog->check_substr || prog->check_utf8) /* could be removed already */ + BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ + fail: + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", + PL_colors[4], PL_colors[5])); + return NULL; +} + + +#define DECL_TRIE_TYPE(scan) \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ + trie_type = ((scan->flags == EXACT) \ + ? (utf8_target ? trie_utf8 : trie_plain) \ + : (scan->flags == EXACTFA) \ + ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + +#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ +STMT_START { \ + STRLEN skiplen; \ + U8 flags = FOLD_FLAGS_FULL; \ + switch (trie_type) { \ + case trie_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALLTHROUGH */ \ + case trie_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ + len = UTF8SKIP(uc); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_latin_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALLTHROUGH */ \ + case trie_latin_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_utf8: \ + uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + break; \ + case trie_plain: \ + uvc = (UV)*uc; \ + len = 1; \ + } \ + if (uvc < 256) { \ + charid = trie->charmap[ uvc ]; \ + } \ + else { \ + charid = 0; \ + if (widecharmap) { \ + SV** const svpp = hv_fetch(widecharmap, \ + (char*)&uvc, sizeof(UV), 0); \ + if (svpp) \ + charid = (U16)SvIV(*svpp); \ + } \ + } \ +} STMT_END + +#define DUMP_EXEC_POS(li,s,doutf8) \ + dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ + startpos, doutf8) + +#define REXEC_FBC_EXACTISH_SCAN(COND) \ +STMT_START { \ + while (s <= e) { \ + if ( (COND) \ + && (ln == 1 || folder(s, pat_string, ln)) \ + && (reginfo->intuit || regtry(reginfo, &s)) )\ + goto got_it; \ + s++; \ + } \ +} STMT_END + +#define REXEC_FBC_UTF8_SCAN(CODE) \ +STMT_START { \ + while (s < strend) { \ + CODE \ + s += UTF8SKIP(s); \ + } \ +} STMT_END + +#define REXEC_FBC_SCAN(CODE) \ +STMT_START { \ + while (s < strend) { \ + CODE \ + s++; \ + } \ +} STMT_END + +#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \ +REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \ + if (COND) { \ + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \ + goto got_it; \ + else \ + tmp = doevery; \ + } \ + else \ + tmp = 1; \ +) + +#define REXEC_FBC_CLASS_SCAN(COND) \ +REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ + if (COND) { \ + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \ + goto got_it; \ + else \ + tmp = doevery; \ + } \ + else \ + tmp = 1; \ +) + +#define REXEC_FBC_CSCAN(CONDUTF8,COND) \ + if (utf8_target) { \ + REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \ + } \ + else { \ + REXEC_FBC_CLASS_SCAN(COND); \ + } + +/* The three macros below are slightly different versions of the same logic. + * + * The first is for /a and /aa when the target string is UTF-8. This can only + * match ascii, but it must advance based on UTF-8. The other two handle the + * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking + * for the boundary (or non-boundary) between a word and non-word character. + * The utf8 and non-utf8 cases have the same logic, but the details must be + * different. Find the "wordness" of the character just prior to this one, and + * compare it with the wordness of this one. If they differ, we have a + * boundary. At the beginning of the string, pretend that the previous + * character was a new-line. + * + * All these macros uncleanly have side-effects with each other and outside + * variables. So far it's been too much trouble to clean-up + * + * TEST_NON_UTF8 is the macro or function to call to test if its byte input is + * a word character or not. + * IF_SUCCESS is code to do if it finds that we are at a boundary between + * word/non-word + * IF_FAIL is code to do if we aren't at a boundary between word/non-word + * + * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we + * are looking for a boundary or for a non-boundary. If we are looking for a + * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and + * see if this tentative match actually works, and if so, to quit the loop + * here. And vice-versa if we are looking for a non-boundary. + * + * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and + * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of + * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be + * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal + * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that + * complement. But in that branch we complement tmp, meaning that at the + * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s), + * which means at the top of the loop in the next iteration, it is + * TEST_NON_UTF8(s-1) */ +#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + tmp = !tmp; \ + IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + +/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and + * TEST_UTF8 is a macro that for the same input code points returns identically + * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */ +#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \ + if (s == reginfo->strbeg) { \ + tmp = '\n'; \ + } \ + else { /* Back-up to the start of the previous character */ \ + U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ + tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ + 0, UTF8_ALLOW_DEFAULT); \ + } \ + tmp = TEST_UV(tmp); \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ + REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ + if (tmp == ! (TEST_UTF8((U8 *) s))) { \ + tmp = !tmp; \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); + +/* Like the above two macros. UTF8_CODE is the complete code for handling + * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc + * macros below */ +#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + if (utf8_target) { \ + UTF8_CODE \ + } \ + else { /* Not utf8 */ \ + tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_SCAN( /* advances s while s < strend */ \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + IF_SUCCESS; \ + tmp = !tmp; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + } \ + /* Here, things have been set up by the previous code so that tmp is the \ + * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \ + * utf8ness of the target). We also have to check if this matches against \ + * the EOS, which we treat as a \n (which is the same value in both UTF-8 \ + * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \ + * string */ \ + if (tmp == ! TEST_NON_UTF8('\n')) { \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } + +/* This is the macro to use when we want to see if something that looks like it + * could match, actually does, and if so exits the loop */ +#define REXEC_FBC_TRYIT \ + if ((reginfo->intuit || regtry(reginfo, &s))) \ + goto got_it + +/* The only difference between the BOUND and NBOUND cases is that + * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in + * NBOUND. This is accomplished by passing it as either the if or else clause, + * with the other one being empty (PLACEHOLDER is defined as empty). + * + * The TEST_FOO parameters are for operating on different forms of input, but + * all should be ones that return identically for the same underlying code + * points */ +#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ + TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ + TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ + TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + +#define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ + TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + + +/* We know what class REx starts with. Try to find this position... */ +/* if reginfo->intuit, its a dryrun */ +/* annoyingly all the vars in this routine have different names from their counterparts + in regmatch. /grrr */ +STATIC char * +S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, + const char *strend, regmatch_info *reginfo) +{ + dVAR; + const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; + char *pat_string; /* The pattern's exactish string */ + char *pat_end; /* ptr to end char of pat_string */ + re_fold_t folder; /* Function for computing non-utf8 folds */ + const U8 *fold_array; /* array for folding ords < 256 */ + STRLEN ln; + STRLEN lnc; + U8 c1; + U8 c2; + char *e; + I32 tmp = 1; /* Scratch variable? */ + const bool utf8_target = reginfo->is_utf8_target; + UV utf8_fold_flags = 0; + const bool is_utf8_pat = reginfo->is_utf8_pat; + bool to_complement = FALSE; /* Invert the result? Taking the xor of this + with a result inverts that result, as 0^1 = + 1 and 1^1 = 0 */ + _char_class_number classnum; + + RXi_GET_DECL(prog,progi); + + PERL_ARGS_ASSERT_FIND_BYCLASS; + + /* We know what class it must start with. */ + switch (OP(c)) { + case ANYOF: + if (utf8_target) { + REXEC_FBC_UTF8_CLASS_SCAN( + reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); + } + else { + REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); + } + break; + case CANY: + REXEC_FBC_SCAN( + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) + goto got_it; + else + tmp = doevery; + ); + break; + + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + if (is_utf8_pat || utf8_target) { + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf_utf8; + } + fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */ + folder = foldEQ_latin1; /* /a, except the sharp s one which */ + goto do_exactf_non_utf8; /* isn't dealt with by these */ + + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + if (utf8_target) { + utf8_fold_flags = 0; + goto do_exactf_utf8; + } + fold_array = PL_fold; + folder = foldEQ; + goto do_exactf_non_utf8; + + case EXACTFL: + if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_exactf_utf8; + } + fold_array = PL_fold_locale; + folder = foldEQ_locale; + goto do_exactf_non_utf8; + + case EXACTFU_SS: + if (is_utf8_pat) { + utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; + } + goto do_exactf_utf8; + + case EXACTFU: + if (is_utf8_pat || utf8_target) { + utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; + goto do_exactf_utf8; + } + + /* Any 'ss' in the pattern should have been replaced by regcomp, + * so we don't have to worry here about this single special case + * in the Latin1 range */ + fold_array = PL_fold_latin1; + folder = foldEQ_latin1; + + /* FALLTHROUGH */ + + do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there + are no glitches with fold-length differences + between the target string and pattern */ + + /* The idea in the non-utf8 EXACTF* cases is to first find the + * first character of the EXACTF* node and then, if necessary, + * case-insensitively compare the full text of the node. c1 is the + * first character. c2 is its fold. This logic will not work for + * Unicode semantics and the german sharp ss, which hence should + * not be compiled into a node that gets here. */ + pat_string = STRING(c); + ln = STR_LEN(c); /* length to match in octets/bytes */ + + /* We know that we have to match at least 'ln' bytes (which is the + * same as characters, since not utf8). If we have to match 3 + * characters, and there are only 2 availabe, we know without + * trying that it will fail; so don't start a match past the + * required minimum number from the far end */ + e = HOP3c(strend, -((SSize_t)ln), s); + + if (reginfo->intuit && e < s) { + e = s; /* Due to minlen logic of intuit() */ + } + + c1 = *pat_string; + c2 = fold_array[c1]; + if (c1 == c2) { /* If char and fold are the same */ + REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1); + } + else { + REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2); + } + break; + + do_exactf_utf8: + { + unsigned expansion; + + /* If one of the operands is in utf8, we can't use the simpler folding + * above, due to the fact that many different characters can have the + * same fold, or portion of a fold, or different- length fold */ + pat_string = STRING(c); + ln = STR_LEN(c); /* length to match in octets/bytes */ + pat_end = pat_string + ln; + lnc = is_utf8_pat /* length to match in characters */ + ? utf8_length((U8 *) pat_string, (U8 *) pat_end) + : ln; + + /* We have 'lnc' characters to match in the pattern, but because of + * multi-character folding, each character in the target can match + * up to 3 characters (Unicode guarantees it will never exceed + * this) if it is utf8-encoded; and up to 2 if not (based on the + * fact that the Latin 1 folds are already determined, and the + * only multi-char fold in that range is the sharp-s folding to + * 'ss'. Thus, a pattern character can match as little as 1/3 of a + * string character. Adjust lnc accordingly, rounding up, so that + * if we need to match at least 4+1/3 chars, that really is 5. */ + expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; + lnc = (lnc + expansion - 1) / expansion; + + /* As in the non-UTF8 case, if we have to match 3 characters, and + * only 2 are left, it's guaranteed to fail, so don't start a + * match that would require us to go beyond the end of the string + */ + e = HOP3c(strend, -((SSize_t)lnc), s); + + if (reginfo->intuit && e < s) { + e = s; /* Due to minlen logic of intuit() */ + } + + /* XXX Note that we could recalculate e to stop the loop earlier, + * as the worst case expansion above will rarely be met, and as we + * go along we would usually find that e moves further to the left. + * This would happen only after we reached the point in the loop + * where if there were no expansion we should fail. Unclear if + * worth the expense */ + + while (s <= e) { + char *my_strend= (char *)strend; + if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, + pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags) + && (reginfo->intuit || regtry(reginfo, &s)) ) + { + goto got_it; + } + s += (utf8_target) ? UTF8SKIP(s) : 1; + } + break; + } + + case BOUNDL: + FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); + break; + case NBOUNDL: + FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); + break; + case BOUND: + FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case BOUNDA: + FBC_BOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A); + break; + case NBOUND: + FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case NBOUNDA: + FBC_NBOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A); + break; + case BOUNDU: + FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case NBOUNDU: + FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case LNBREAK: + REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend), + is_LNBREAK_latin1_safe(s, strend) + ); + break; + + /* The argument to all the POSIX node types is the class number to pass to + * _generic_isCC() to build a mask for searching in PL_charclass[] */ + + case NPOSIXL: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: + REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), + to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); + break; + + case NPOSIXD: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXD: + if (utf8_target) { + goto posix_utf8; + } + goto posixa; + + case NPOSIXA: + if (utf8_target) { + /* The complement of something that matches only ASCII matches all + * non-ASCII, plus everything in ASCII that isn't in the class. */ + REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s) + || ! _generic_isCC_A(*s, FLAGS(c))); + break; + } + + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXA: + posixa: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character. */ + REXEC_FBC_CLASS_SCAN( + to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c)))); + break; + + case NPOSIXU: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: + if (! utf8_target) { + REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s, + FLAGS(c)))); + } + else { + + posix_utf8: + classnum = (_char_class_number) FLAGS(c); + if (classnum < _FIRST_NON_SWASH_CC) { + while (s < strend) { + + /* We avoid loading in the swash as long as possible, but + * should we have to, we jump to a separate loop. This + * extra 'if' statement is what keeps this code from being + * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */ + if (UTF8_IS_ABOVE_LATIN1(*s)) { + goto found_above_latin1; + } + if ((UTF8_IS_INVARIANT(*s) + && to_complement ^ cBOOL(_generic_isCC((U8) *s, + classnum))) + || (UTF8_IS_DOWNGRADEABLE_START(*s) + && to_complement ^ cBOOL( + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), + classnum)))) + { + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) + goto got_it; + else { + tmp = doevery; + } + } + else { + tmp = 1; + } + s += UTF8SKIP(s); + } + } + else switch (classnum) { /* These classes are implemented as + macros */ + case _CC_ENUM_SPACE: /* XXX would require separate code if we + revert the change of \v matching this */ + /* FALLTHROUGH */ + + case _CC_ENUM_PSXSPC: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isSPACE_utf8(s))); + break; + + case _CC_ENUM_BLANK: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isBLANK_utf8(s))); + break; + + case _CC_ENUM_XDIGIT: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isXDIGIT_utf8(s))); + break; + + case _CC_ENUM_VERTSPACE: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isVERTWS_utf8(s))); + break; + + case _CC_ENUM_CNTRL: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isCNTRL_utf8(s))); + break; + + default: + Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum); + assert(0); /* NOTREACHED */ + } + } + break; + + found_above_latin1: /* Here we have to load a swash to get the result + for the current code point */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + + /* This is a copy of the loop above for swash classes, though using the + * FBC macro instead of being expanded out. Since we've loaded the + * swash, we don't have to check for that each time through the loop */ + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(_generic_utf8( + classnum, + s, + swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) s, TRUE)))); + break; + + case AHOCORASICKC: + case AHOCORASICK: + { + DECL_TRIE_TYPE(c); + /* what trie are we using right now */ + reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ]; + reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ]; + HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); + + const char *last_start = strend - trie->minlen; +#ifdef DEBUGGING + const char *real_start = s; +#endif + STRLEN maxlen = trie->maxlen; + SV *sv_points; + U8 **points; /* map of where we were in the input string + when reading a given char. For ASCII this + is unnecessary overhead as the relationship + is always 1:1, but for Unicode, especially + case folded Unicode this is not true. */ + U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + U8 *bitmap=NULL; + + + GET_RE_DEBUG_FLAGS_DECL; + + /* We can't just allocate points here. We need to wrap it in + * an SV so it gets freed properly if there is a croak while + * running the match */ + ENTER; + SAVETMPS; + sv_points=newSV(maxlen * sizeof(U8 *)); + SvCUR_set(sv_points, + maxlen * sizeof(U8 *)); + SvPOK_on(sv_points); + sv_2mortal(sv_points); + points=(U8**)SvPV_nolen(sv_points ); + if ( trie_type != trie_utf8_fold + && (trie->bitmap || OP(c)==AHOCORASICKC) ) + { + if (trie->bitmap) + bitmap=(U8*)trie->bitmap; + else + bitmap=(U8*)ANYOF_BITMAP(c); + } + /* this is the Aho-Corasick algorithm modified a touch + to include special handling for long "unknown char" sequences. + The basic idea being that we use AC as long as we are dealing + with a possible matching char, when we encounter an unknown char + (and we have not encountered an accepting state) we scan forward + until we find a legal starting char. + AC matching is basically that of trie matching, except that when + we encounter a failing transition, we fall back to the current + states "fail state", and try the current char again, a process + we repeat until we reach the root state, state 1, or a legal + transition. If we fail on the root state then we can either + terminate if we have reached an accepting state previously, or + restart the entire process from the beginning if we have not. + + */ + while (s <= last_start) { + const U32 uniflags = UTF8_ALLOW_DEFAULT; + U8 *uc = (U8*)s; + U16 charid = 0; + U32 base = 1; + U32 state = 1; + UV uvc = 0; + STRLEN len = 0; + STRLEN foldlen = 0; + U8 *uscan = (U8*)NULL; + U8 *leftmost = NULL; +#ifdef DEBUGGING + U32 accepted_word= 0; +#endif + U32 pointpos = 0; + + while ( state && uc <= (U8*)strend ) { + int failed=0; + U32 word = aho->states[ state ].wordnum; + + if( state==1 ) { + if ( bitmap ) { + DEBUG_TRIE_EXECUTE_r( + if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + dump_exec_pos( (char *)uc, c, strend, real_start, + (char *)uc, utf8_target ); + PerlIO_printf( Perl_debug_log, + " Scanning for legal start char...\n"); + } + ); + if (utf8_target) { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc += UTF8SKIP(uc); + } + } else { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc++; + } + } + s= (char *)uc; + } + if (uc >(U8*)last_start) break; + } + + if ( word ) { + U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ]; + if (!leftmost || lpos < leftmost) { + DEBUG_r(accepted_word=word); + leftmost= lpos; + } + if (base==0) break; + + } + points[pointpos++ % maxlen]= uc; + if (foldlen || uc < (U8*)strend) { + REXEC_TRIE_READ_CHAR(trie_type, trie, + widecharmap, uc, + uscan, len, uvc, charid, foldlen, + foldbuf, uniflags); + DEBUG_TRIE_EXECUTE_r({ + dump_exec_pos( (char *)uc, c, strend, + real_start, s, utf8_target); + PerlIO_printf(Perl_debug_log, + " Charid:%3u CP:%4"UVxf" ", + charid, uvc); + }); + } + else { + len = 0; + charid = 0; + } + + + do { +#ifdef DEBUGGING + word = aho->states[ state ].wordnum; +#endif + base = aho->states[ state ].trans.base; + + DEBUG_TRIE_EXECUTE_r({ + if (failed) + dump_exec_pos( (char *)uc, c, strend, real_start, + s, utf8_target ); + PerlIO_printf( Perl_debug_log, + "%sState: %4"UVxf", word=%"UVxf, + failed ? " Fail transition to " : "", + (UV)state, (UV)word); + }); + if ( base ) { + U32 tmp; + I32 offset; + if (charid && + ( ((offset = base + charid + - 1 - trie->uniquecharcount)) >= 0) + && ((U32)offset < trie->lasttrans) + && trie->trans[offset].check == state + && (tmp=trie->trans[offset].next)) + { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - legal\n")); + state = tmp; + break; + } + else { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - fail\n")); + failed = 1; + state = aho->fail[state]; + } + } + else { + /* we must be accepting here */ + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - accepting\n")); + failed = 1; + break; + } + } while(state); + uc += len; + if (failed) { + if (leftmost) + break; + if (!state) state = 1; + } + } + if ( aho->states[ state ].wordnum ) { + U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ]; + if (!leftmost || lpos < leftmost) { + DEBUG_r(accepted_word=aho->states[ state ].wordnum); + leftmost = lpos; + } + } + if (leftmost) { + s = (char*)leftmost; + DEBUG_TRIE_EXECUTE_r({ + PerlIO_printf( + Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n", + (UV)accepted_word, (IV)(s - real_start) + ); + }); + if (reginfo->intuit || regtry(reginfo, &s)) { + FREETMPS; + LEAVE; + goto got_it; + } + s = HOPc(s,1); + DEBUG_TRIE_EXECUTE_r({ + PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n"); + }); + } else { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log,"No match.\n")); + break; + } + } + FREETMPS; + LEAVE; + } + break; + default: + Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); + } + return 0; + got_it: + return s; +} + +/* set RX_SAVED_COPY, RX_SUBBEG etc. + * flags have same meanings as with regexec_flags() */ + +static void +S_reg_set_capture_string(pTHX_ REGEXP * const rx, + char *strbeg, + char *strend, + SV *sv, + U32 flags, + bool utf8_target) +{ + struct regexp *const prog = ReANY(rx); + + if (flags & REXEC_COPY_STR) { +#ifdef PERL_ANY_COW + if (SvCANCOW(sv)) { + if (DEBUG_C_TEST) { + PerlIO_printf(Perl_debug_log, + "Copy on write: regexp capture, type %d\n", + (int) SvTYPE(sv)); + } + /* Create a new COW SV to share the match string and store + * in saved_copy, unless the current COW SV in saved_copy + * is valid and suitable for our purpose */ + if (( prog->saved_copy + && SvIsCOW(prog->saved_copy) + && SvPOKp(prog->saved_copy) + && SvIsCOW(sv) + && SvPOKp(sv) + && SvPVX(sv) == SvPVX(prog->saved_copy))) + { + /* just reuse saved_copy SV */ + if (RXp_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RXp_MATCH_COPIED_off(prog); + } + } + else { + /* create new COW SV to share string */ + RX_MATCH_COPY_FREE(rx); + prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); + } + prog->subbeg = (char *)SvPVX_const(prog->saved_copy); + assert (SvPOKp(prog->saved_copy)); + prog->sublen = strend - strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + } else +#endif + { + SSize_t min = 0; + SSize_t max = strend - strbeg; + SSize_t sublen; + + if ( (flags & REXEC_COPY_SKIP_POST) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_RIGHT) + ) { /* don't copy $' part of string */ + U32 n = 0; + max = -1; + /* calculate the right-most part of the string covered + * by a capture. Due to look-ahead, this may be to + * the right of $&, so we have to scan all captures */ + while (n <= prog->lastparen) { + if (prog->offs[n].end > max) + max = prog->offs[n].end; + n++; + } + if (max == -1) + max = (PL_sawampersand & SAWAMPERSAND_LEFT) + ? prog->offs[0].start + : 0; + assert(max >= 0 && max <= strend - strbeg); + } + + if ( (flags & REXEC_COPY_SKIP_PRE) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_LEFT) + ) { /* don't copy $` part of string */ + U32 n = 0; + min = max; + /* calculate the left-most part of the string covered + * by a capture. Due to look-behind, this may be to + * the left of $&, so we have to scan all captures */ + while (min && n <= prog->lastparen) { + if ( prog->offs[n].start != -1 + && prog->offs[n].start < min) + { + min = prog->offs[n].start; + } + n++; + } + if ((PL_sawampersand & SAWAMPERSAND_RIGHT) + && min > prog->offs[0].end + ) + min = prog->offs[0].end; + + } + + assert(min >= 0 && min <= max && min <= strend - strbeg); + sublen = max - min; + + if (RX_MATCH_COPIED(rx)) { + if (sublen > prog->sublen) + prog->subbeg = + (char*)saferealloc(prog->subbeg, sublen+1); + } + else + prog->subbeg = (char*)safemalloc(sublen+1); + Copy(strbeg + min, prog->subbeg, sublen, char); + prog->subbeg[sublen] = '\0'; + prog->suboffset = min; + prog->sublen = sublen; + RX_MATCH_COPIED_on(rx); + } + prog->subcoffset = prog->suboffset; + if (prog->suboffset && utf8_target) { + /* Convert byte offset to chars. + * XXX ideally should only compute this if @-/@+ + * has been seen, a la PL_sawampersand ??? */ + + /* If there's a direct correspondence between the + * string which we're matching and the original SV, + * then we can use the utf8 len cache associated with + * the SV. In particular, it means that under //g, + * sv_pos_b2u() will use the previously cached + * position to speed up working out the new length of + * subcoffset, rather than counting from the start of + * the string each time. This stops + * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; + * from going quadratic */ + if (SvPOKp(sv) && SvPVX(sv) == strbeg) + prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, + SV_GMAGIC|SV_CONST_RETURN); + else + prog->subcoffset = utf8_length((U8*)strbeg, + (U8*)(strbeg+prog->suboffset)); + } + } + else { + RX_MATCH_COPY_FREE(rx); + prog->subbeg = strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + prog->sublen = strend - strbeg; + } +} + + + + +/* + - regexec_flags - match a regexp against a string + */ +I32 +Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, + char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) +/* stringarg: the point in the string at which to begin matching */ +/* strend: pointer to null at end of string */ +/* strbeg: real beginning of string */ +/* minend: end of match must be >= minend bytes after stringarg. */ +/* sv: SV being matched: only used for utf8 flag, pos() etc; string + * itself is accessed via the pointers above */ +/* data: May be used for some additional optimizations. + Currently unused. */ +/* flags: For optimizations. See REXEC_* in regexp.h */ + +{ + struct regexp *const prog = ReANY(rx); + char *s; + regnode *c; + char *startpos; + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ + const bool utf8_target = cBOOL(DO_UTF8(sv)); + I32 multiline; + RXi_GET_DECL(prog,progi); + regmatch_info reginfo_buf; /* create some info to pass to regtry etc */ + regmatch_info *const reginfo = ®info_buf; + regexp_paren_pair *swap = NULL; + I32 oldsave; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGEXEC_FLAGS; + PERL_UNUSED_ARG(data); + + /* Be paranoid... */ + if (prog == NULL || stringarg == NULL) { + Perl_croak(aTHX_ "NULL regexp parameter"); + } + + DEBUG_EXECUTE_r( + debug_start_match(rx, utf8_target, stringarg, strend, + "Matching"); + ); + + startpos = stringarg; + + if (prog->intflags & PREGf_GPOS_SEEN) { + MAGIC *mg; + + /* set reginfo->ganch, the position where \G can match */ + + reginfo->ganch = + (flags & REXEC_IGNOREPOS) + ? stringarg /* use start pos rather than pos() */ + : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) + /* Defined pos(): */ + ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) + : strbeg; /* pos() not defined; use start of string */ + + DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, + "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg))); + + /* in the presence of \G, we may need to start looking earlier in + * the string than the suggested start point of stringarg: + * if prog->gofs is set, then that's a known, fixed minimum + * offset, such as + * /..\G/: gofs = 2 + * /ab|c\G/: gofs = 1 + * or if the minimum offset isn't known, then we have to go back + * to the start of the string, e.g. /w+\G/ + */ + + if (prog->intflags & PREGf_ANCH_GPOS) { + startpos = reginfo->ganch - prog->gofs; + if (startpos < + ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) + { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "fail: ganch-gofs before earliest possible start\n")); + return 0; + } + } + else if (prog->gofs) { + if (startpos - prog->gofs < strbeg) + startpos = strbeg; + else + startpos -= prog->gofs; + } + else if (prog->intflags & PREGf_GPOS_FLOAT) + startpos = strbeg; + } + + minlen = prog->minlen; + if ((startpos + minlen) > strend || startpos < strbeg) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "Regex match can't succeed, so not even tried\n")); + return 0; + } + + /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), + * which will call destuctors to reset PL_regmatch_state, free higher + * PL_regmatch_slabs, and clean up regmatch_info_aux and + * regmatch_info_aux_eval */ + + oldsave = PL_savestack_ix; + + s = startpos; + + if ((prog->extflags & RXf_USE_INTUIT) + && !(flags & REXEC_CHECKED)) + { + s = re_intuit_start(rx, sv, strbeg, startpos, strend, + flags, NULL); + if (!s) + return 0; + + if (prog->extflags & RXf_CHECK_ALL) { + /* we can match based purely on the result of INTUIT. + * Set up captures etc just for $& and $-[0] + * (an intuit-only match wont have $1,$2,..) */ + assert(!prog->nparens); + + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (s < stringarg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + + /* match via INTUIT shouldn't have any captures. + * Let @-, @+, $^N know */ + prog->lastparen = prog->lastcloseparen = 0; + RX_MATCH_UTF8_set(rx, utf8_target); + prog->offs[0].start = s - strbeg; + prog->offs[0].end = utf8_target + ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + : s - strbeg + prog->minlenret; + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, strend, + sv, flags, utf8_target); + + return 1; + } + } + + multiline = prog->extflags & RXf_PMf_MULTILINE; + + if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "String too short [regexec_flags]...\n")); + goto phooey; + } + + /* Check validity of program. */ + if (UCHARAT(progi->program) != REG_MAGIC) { + Perl_croak(aTHX_ "corrupted regexp program"); + } + + RX_MATCH_TAINTED_off(rx); + + reginfo->prog = rx; /* Yes, sorry that this is confusing. */ + reginfo->intuit = 0; + reginfo->is_utf8_target = cBOOL(utf8_target); + reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); + reginfo->warned = FALSE; + reginfo->strbeg = strbeg; + reginfo->sv = sv; + reginfo->poscache_maxiter = 0; /* not yet started a countdown */ + reginfo->strend = strend; + /* see how far we have to get to not match where we matched before */ + reginfo->till = stringarg + minend; + + if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { + /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after + S_cleanup_regmatch_info_aux has executed (registered by + SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies + magic belonging to this SV. + Not newSVsv, either, as it does not COW. + */ + assert(!IS_PADGV(sv)); + reginfo->sv = newSV(0); + SvSetSV_nosteal(reginfo->sv, sv); + SAVEFREESV(reginfo->sv); + } + + /* reserve next 2 or 3 slots in PL_regmatch_state: + * slot N+0: may currently be in use: skip it + * slot N+1: use for regmatch_info_aux struct + * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s + * slot N+3: ready for use by regmatch() + */ + + { + regmatch_state *old_regmatch_state; + regmatch_slab *old_regmatch_slab; + int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1; + + /* on first ever match, allocate first slab */ + if (!PL_regmatch_slab) { + Newx(PL_regmatch_slab, 1, regmatch_slab); + PL_regmatch_slab->prev = NULL; + PL_regmatch_slab->next = NULL; + PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); + } + + old_regmatch_state = PL_regmatch_state; + old_regmatch_slab = PL_regmatch_slab; + + for (i=0; i <= max; i++) { + if (i == 1) + reginfo->info_aux = &(PL_regmatch_state->u.info_aux); + else if (i ==2) + reginfo->info_aux_eval = + reginfo->info_aux->info_aux_eval = + &(PL_regmatch_state->u.info_aux_eval); + + if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) + PL_regmatch_state = S_push_slab(aTHX); + } + + /* note initial PL_regmatch_state position; at end of match we'll + * pop back to there and free any higher slabs */ + + reginfo->info_aux->old_regmatch_state = old_regmatch_state; + reginfo->info_aux->old_regmatch_slab = old_regmatch_slab; + reginfo->info_aux->poscache = NULL; + + SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux); + + if ((prog->extflags & RXf_EVAL_SEEN)) + S_setup_eval_state(aTHX_ reginfo); + else + reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL; + } + + /* If there is a "must appear" string, look for it. */ + + if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { + /* We have to be careful. If the previous successful match + was from this regex we don't want a subsequent partially + successful match to clobber the old results. + So when we detect this possibility we add a swap buffer + to the re, and switch the buffer each match. If we fail, + we switch it back; otherwise we leave it swapped. + */ + swap = prog->offs; + /* do we need a save destructor here for eval dies? */ + Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap), + PTR2UV(prog->offs) + )); + } + + /* Simplest case: anchored match need be tried only once. */ + /* [unless only anchor is BOL and multiline is set] */ + if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { + if (s == startpos && regtry(reginfo, &s)) + goto got_it; + else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */ + { + char *end; + + if (minlen) + dontbother = minlen - 1; + end = HOP3c(strend, -dontbother, strbeg) - 1; + /* for multiline we only have to try after newlines */ + if (prog->check_substr || prog->check_utf8) { + /* because of the goto we can not easily reuse the macros for bifurcating the + unicode/non-unicode match modes here like we do elsewhere - demerphq */ + if (utf8_target) { + if (s == startpos) + goto after_try_utf8; + while (1) { + if (regtry(reginfo, &s)) { + goto got_it; + } + after_try_utf8: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, strbeg, + s + UTF8SKIP(s), strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s += UTF8SKIP(s); + } + } + } /* end search for check string in unicode */ + else { + if (s == startpos) { + goto after_try_latin; + } + while (1) { + if (regtry(reginfo, &s)) { + goto got_it; + } + after_try_latin: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, strbeg, + s + 1, strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s++; + } + } + } /* end search for check string in latin*/ + } /* end search for check string */ + else { /* search for newline */ + if (s > startpos) { + /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/ + s--; + } + /* We can use a more efficient search as newlines are the same in unicode as they are in latin */ + while (s <= end) { /* note it could be possible to match at the end of the string */ + if (*s++ == '\n') { /* don't need PL_utf8skip here */ + if (regtry(reginfo, &s)) + goto got_it; + } + } + } /* end search for newline */ + } /* end anchored/multiline check string search */ + goto phooey; + } else if (prog->intflags & PREGf_ANCH_GPOS) + { + /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ + assert(prog->intflags & PREGf_GPOS_SEEN); + /* For anchored \G, the only position it can match from is + * (ganch-gofs); we already set startpos to this above; if intuit + * moved us on from there, we can't possibly succeed */ + assert(startpos == reginfo->ganch - prog->gofs); + if (s == startpos && regtry(reginfo, &s)) + goto got_it; + goto phooey; + } + + /* Messy cases: unanchored match. */ + if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { + /* we have /x+whatever/ */ + /* it must be a one character string (XXXX Except is_utf8_pat?) */ + char ch; +#ifdef DEBUGGING + int did_match = 0; +#endif + if (utf8_target) { + if (! prog->anchored_utf8) { + to_utf8_substr(prog); + } + ch = SvPVX_const(prog->anchored_utf8)[0]; + REXEC_FBC_SCAN( + if (*s == ch) { + DEBUG_EXECUTE_r( did_match = 1 ); + if (regtry(reginfo, &s)) goto got_it; + s += UTF8SKIP(s); + while (s < strend && *s == ch) + s += UTF8SKIP(s); + } + ); + + } + else { + if (! prog->anchored_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + ch = SvPVX_const(prog->anchored_substr)[0]; + REXEC_FBC_SCAN( + if (*s == ch) { + DEBUG_EXECUTE_r( did_match = 1 ); + if (regtry(reginfo, &s)) goto got_it; + s++; + while (s < strend && *s == ch) + s++; + } + ); + } + DEBUG_EXECUTE_r(if (!did_match) + PerlIO_printf(Perl_debug_log, + "Did not find anchored character...\n") + ); + } + else if (prog->anchored_substr != NULL + || prog->anchored_utf8 != NULL + || ((prog->float_substr != NULL || prog->float_utf8 != NULL) + && prog->float_max_offset < strend - s)) { + SV *must; + SSize_t back_max; + SSize_t back_min; + char *last; + char *last1; /* Last position checked before */ +#ifdef DEBUGGING + int did_match = 0; +#endif + if (prog->anchored_substr || prog->anchored_utf8) { + if (utf8_target) { + if (! prog->anchored_utf8) { + to_utf8_substr(prog); + } + must = prog->anchored_utf8; + } + else { + if (! prog->anchored_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + must = prog->anchored_substr; + } + back_max = back_min = prog->anchored_offset; + } else { + if (utf8_target) { + if (! prog->float_utf8) { + to_utf8_substr(prog); + } + must = prog->float_utf8; + } + else { + if (! prog->float_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + must = prog->float_substr; + } + back_max = prog->float_max_offset; + back_min = prog->float_min_offset; + } + + if (back_min<0) { + last = strend; + } else { + last = HOP3c(strend, /* Cannot start after this */ + -(SSize_t)(CHR_SVLEN(must) + - (SvTAIL(must) != 0) + back_min), strbeg); + } + if (s > reginfo->strbeg) + last1 = HOPc(s, -1); + else + last1 = s - 1; /* bogus */ + + /* XXXX check_substr already used to find "s", can optimize if + check_substr==must. */ + dontbother = 0; + strend = HOPc(strend, -dontbother); + while ( (s <= last) && + (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), + (unsigned char*)strend, must, + multiline ? FBMrf_MULTILINE : 0)) ) { + DEBUG_EXECUTE_r( did_match = 1 ); + if (HOPc(s, -back_max) > last1) { + last1 = HOPc(s, -back_min); + s = HOPc(s, -back_max); + } + else { + char * const t = (last1 >= reginfo->strbeg) + ? HOPc(last1, 1) : last1 + 1; + + last1 = HOPc(s, -back_min); + s = t; + } + if (utf8_target) { + while (s <= last1) { + if (regtry(reginfo, &s)) + goto got_it; + if (s >= last1) { + s++; /* to break out of outer loop */ + break; + } + s += UTF8SKIP(s); + } + } + else { + while (s <= last1) { + if (regtry(reginfo, &s)) + goto got_it; + s++; + } + } + } + DEBUG_EXECUTE_r(if (!did_match) { + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n", + ((must == prog->anchored_substr || must == prog->anchored_utf8) + ? "anchored" : "floating"), + quoted, RE_SV_TAIL(must)); + }); + goto phooey; + } + else if ( (c = progi->regstclass) ) { + if (minlen) { + const OPCODE op = OP(progi->regstclass); + /* don't bother with what can't match */ + if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE) + strend = HOPc(strend, -(minlen - 1)); + } + DEBUG_EXECUTE_r({ + SV * const prop = sv_newmortal(); + regprop(prog, prop, c, reginfo); + { + RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), + s,strend-s,60); + PerlIO_printf(Perl_debug_log, + "Matching stclass %.*s against %s (%d bytes)\n", + (int)SvCUR(prop), SvPVX_const(prop), + quoted, (int)(strend - s)); + } + }); + if (find_byclass(prog, c, s, strend, reginfo)) + goto got_it; + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n")); + } + else { + dontbother = 0; + if (prog->float_substr != NULL || prog->float_utf8 != NULL) { + /* Trim the end. */ + char *last= NULL; + SV* float_real; + STRLEN len; + const char *little; + + if (utf8_target) { + if (! prog->float_utf8) { + to_utf8_substr(prog); + } + float_real = prog->float_utf8; + } + else { + if (! prog->float_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + float_real = prog->float_substr; + } + + little = SvPV_const(float_real, len); + if (SvTAIL(float_real)) { + /* This means that float_real contains an artificial \n on + * the end due to the presence of something like this: + * /foo$/ where we can match both "foo" and "foo\n" at the + * end of the string. So we have to compare the end of the + * string first against the float_real without the \n and + * then against the full float_real with the string. We + * have to watch out for cases where the string might be + * smaller than the float_real or the float_real without + * the \n. */ + char *checkpos= strend - len; + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%sChecking for float_real.%s\n", + PL_colors[4], PL_colors[5])); + if (checkpos + 1 < strbeg) { + /* can't match, even if we remove the trailing \n + * string is too short to match */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString shorter than required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } else if (memEQ(checkpos + 1, little, len - 1)) { + /* can match, the end of the string matches without the + * "\n" */ + last = checkpos + 1; + } else if (checkpos < strbeg) { + /* cant match, string is too short when the "\n" is + * included */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString does not contain required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } else if (!multiline) { + /* non multiline match, so compare with the "\n" at the + * end of the string */ + if (memEQ(checkpos, little, len)) { + last= checkpos; + } else { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString does not contain required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } + } else { + /* multiline match, so we have to search for a place + * where the full string is located */ + goto find_last; + } + } else { + find_last: + if (len) + last = rninstr(s, strend, little, little + len); + else + last = strend; /* matching "$" */ + } + if (!last) { + /* at one point this block contained a comment which was + * probably incorrect, which said that this was a "should not + * happen" case. Even if it was true when it was written I am + * pretty sure it is not anymore, so I have removed the comment + * and replaced it with this one. Yves */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "String does not contain required substring, cannot match.\n" + )); + goto phooey; + } + dontbother = strend - last + prog->float_min_offset; + } + if (minlen && (dontbother < minlen)) + dontbother = minlen - 1; + strend -= dontbother; /* this one's always in bytes! */ + /* We don't know much -- general case. */ + if (utf8_target) { + for (;;) { + if (regtry(reginfo, &s)) + goto got_it; + if (s >= strend) + break; + s += UTF8SKIP(s); + }; + } + else { + do { + if (regtry(reginfo, &s)) + goto got_it; + } while (s++ < strend); + } + } + + /* Failure. */ + goto phooey; + +got_it: + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (prog->offs[0].start < stringarg - strbeg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + + DEBUG_BUFFERS_r( + if (swap) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap) + ); + ); + Safefree(swap); + + /* clean up; this will trigger destructors that will free all slabs + * above the current one, and cleanup the regmatch_info_aux + * and regmatch_info_aux_eval sructs */ + + LEAVE_SCOPE(oldsave); + + if (RXp_PAREN_NAMES(prog)) + (void)hv_iterinit(RXp_PAREN_NAMES(prog)); + + RX_MATCH_UTF8_set(rx, utf8_target); + + /* make sure $`, $&, $', and $digit will work later */ + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, reginfo->strend, + sv, flags, utf8_target); + + return 1; + +phooey: + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", + PL_colors[4], PL_colors[5])); + + /* clean up; this will trigger destructors that will free all slabs + * above the current one, and cleanup the regmatch_info_aux + * and regmatch_info_aux_eval sructs */ + + LEAVE_SCOPE(oldsave); + + if (swap) { + /* we failed :-( roll it back */ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(prog->offs), + PTR2UV(swap) + )); + Safefree(prog->offs); + prog->offs = swap; + } + return 0; +} + + +/* Set which rex is pointed to by PL_reg_curpm, handling ref counting. + * Do inc before dec, in case old and new rex are the same */ +#define SET_reg_curpm(Re2) \ + if (reginfo->info_aux_eval) { \ + (void)ReREFCNT_inc(Re2); \ + ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ + PM_SETRE((PL_reg_curpm), (Re2)); \ + } + + +/* + - regtry - try match at specific point + */ +STATIC I32 /* 0 failure, 1 success */ +S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) +{ + CHECKPOINT lastcp; + REGEXP *const rx = reginfo->prog; + regexp *const prog = ReANY(rx); + SSize_t result; + RXi_GET_DECL(prog,progi); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTRY; + + reginfo->cutpoint=NULL; + + prog->offs[0].start = *startposp - reginfo->strbeg; + prog->lastparen = 0; + prog->lastcloseparen = 0; + + /* XXXX What this code is doing here?!!! There should be no need + to do this again and again, prog->lastparen should take care of + this! --ilya*/ + + /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. + * Actually, the code in regcppop() (which Ilya may be meaning by + * prog->lastparen), is not needed at all by the test suite + * (op/regexp, op/pat, op/split), but that code is needed otherwise + * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ + * Meanwhile, this code *is* needed for the + * above-mentioned test suite tests to succeed. The common theme + * on those tests seems to be returning null fields from matches. + * --jhi updated by dapm */ +#if 1 + if (prog->nparens) { + regexp_paren_pair *pp = prog->offs; + I32 i; + for (i = prog->nparens; i > (I32)prog->lastparen; i--) { + ++pp; + pp->start = -1; + pp->end = -1; + } + } +#endif + REGCP_SET(lastcp); + result = regmatch(reginfo, *startposp, progi->program + 1); + if (result != -1) { + prog->offs[0].end = result; + return 1; + } + if (reginfo->cutpoint) + *startposp= reginfo->cutpoint; + REGCP_UNWIND(lastcp); + return 0; +} + + +#define sayYES goto yes +#define sayNO goto no +#define sayNO_SILENT goto no_silent + +/* we dont use STMT_START/END here because it leads to + "unreachable code" warnings, which are bogus, but distracting. */ +#define CACHEsayNO \ + if (ST.cache_mask) \ + reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \ + sayNO + +/* this is used to determine how far from the left messages like + 'failed...' are printed. It should be set such that messages + are inline with the regop output that created them. +*/ +#define REPORT_CODE_OFF 32 + + +#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */ +#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */ +#define CHRTEST_NOT_A_CP_1 -999 +#define CHRTEST_NOT_A_CP_2 -998 + +/* grab a new slab and return the first slot in it */ + +STATIC regmatch_state * +S_push_slab(pTHX) +{ +#if PERL_VERSION < 9 && !defined(PERL_CORE) + dMY_CXT; +#endif + regmatch_slab *s = PL_regmatch_slab->next; + if (!s) { + Newx(s, 1, regmatch_slab); + s->prev = PL_regmatch_slab; + s->next = NULL; + PL_regmatch_slab->next = s; + } + PL_regmatch_slab = s; + return SLAB_FIRST(s); +} + + +/* push a new state then goto it */ + +#define PUSH_STATE_GOTO(state, node, input) \ + pushinput = input; \ + scan = node; \ + st->resume_state = state; \ + goto push_state; + +/* push a new state with success backtracking, then goto it */ + +#define PUSH_YES_STATE_GOTO(state, node, input) \ + pushinput = input; \ + scan = node; \ + st->resume_state = state; \ + goto push_yes_state; + + + + +/* + +regmatch() - main matching routine + +This is basically one big switch statement in a loop. We execute an op, +set 'next' to point the next op, and continue. If we come to a point which +we may need to backtrack to on failure such as (A|B|C), we push a +backtrack state onto the backtrack stack. On failure, we pop the top +state, and re-enter the loop at the state indicated. If there are no more +states to pop, we return failure. + +Sometimes we also need to backtrack on success; for example /A+/, where +after successfully matching one A, we need to go back and try to +match another one; similarly for lookahead assertions: if the assertion +completes successfully, we backtrack to the state just before the assertion +and then carry on. In these cases, the pushed state is marked as +'backtrack on success too'. This marking is in fact done by a chain of +pointers, each pointing to the previous 'yes' state. On success, we pop to +the nearest yes state, discarding any intermediate failure-only states. +Sometimes a yes state is pushed just to force some cleanup code to be +called at the end of a successful match or submatch; e.g. (??{$re}) uses +it to free the inner regex. + +Note that failure backtracking rewinds the cursor position, while +success backtracking leaves it alone. + +A pattern is complete when the END op is executed, while a subpattern +such as (?=foo) is complete when the SUCCESS op is executed. Both of these +ops trigger the "pop to last yes state if any, otherwise return true" +behaviour. + +A common convention in this function is to use A and B to refer to the two +subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is +the subpattern to be matched possibly multiple times, while B is the entire +rest of the pattern. Variable and state names reflect this convention. + +The states in the main switch are the union of ops and failure/success of +substates associated with with that op. For example, IFMATCH is the op +that does lookahead assertions /(?=A)B/ and so the IFMATCH state means +'execute IFMATCH'; while IFMATCH_A is a state saying that we have just +successfully matched A and IFMATCH_A_fail is a state saying that we have +just failed to match A. Resume states always come in pairs. The backtrack +state we push is marked as 'IFMATCH_A', but when that is popped, we resume +at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking +on success or failure. + +The struct that holds a backtracking state is actually a big union, with +one variant for each major type of op. The variable st points to the +top-most backtrack struct. To make the code clearer, within each +block of code we #define ST to alias the relevant union. + +Here's a concrete example of a (vastly oversimplified) IFMATCH +implementation: + + switch (state) { + .... + +#define ST st->u.ifmatch + + case IFMATCH: // we are executing the IFMATCH op, (?=A)B + ST.foo = ...; // some state we wish to save + ... + // push a yes backtrack state with a resume value of + // IFMATCH_A/IFMATCH_A_fail, then continue execution at the + // first node of A: + PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput); + // NOTREACHED + + case IFMATCH_A: // we have successfully executed A; now continue with B + next = B; + bar = ST.foo; // do something with the preserved value + break; + + case IFMATCH_A_fail: // A failed, so the assertion failed + ...; // do some housekeeping, then ... + sayNO; // propagate the failure + +#undef ST + + ... + } + +For any old-timers reading this who are familiar with the old recursive +approach, the code above is equivalent to: + + case IFMATCH: // we are executing the IFMATCH op, (?=A)B + { + int foo = ... + ... + if (regmatch(A)) { + next = B; + bar = foo; + break; + } + ...; // do some housekeeping, then ... + sayNO; // propagate the failure + } + +The topmost backtrack state, pointed to by st, is usually free. If you +want to claim it, populate any ST.foo fields in it with values you wish to +save, then do one of + + PUSH_STATE_GOTO(resume_state, node, newinput); + PUSH_YES_STATE_GOTO(resume_state, node, newinput); + +which sets that backtrack state's resume value to 'resume_state', pushes a +new free entry to the top of the backtrack stack, then goes to 'node'. +On backtracking, the free slot is popped, and the saved state becomes the +new free state. An ST.foo field in this new top state can be temporarily +accessed to retrieve values, but once the main loop is re-entered, it +becomes available for reuse. + +Note that the depth of the backtrack stack constantly increases during the +left-to-right execution of the pattern, rather than going up and down with +the pattern nesting. For example the stack is at its maximum at Z at the +end of the pattern, rather than at X in the following: + + /(((X)+)+)+....(Y)+....Z/ + +The only exceptions to this are lookahead/behind assertions and the cut, +(?>A), which pop all the backtrack states associated with A before +continuing. + +Backtrack state structs are allocated in slabs of about 4K in size. +PL_regmatch_state and st always point to the currently active state, +and PL_regmatch_slab points to the slab currently containing +PL_regmatch_state. The first time regmatch() is called, the first slab is +allocated, and is never freed until interpreter destruction. When the slab +is full, a new one is allocated and chained to the end. At exit from +regmatch(), slabs allocated since entry are freed. + +*/ + + +#define DEBUG_STATE_pp(pp) \ + DEBUG_STATE_r({ \ + DUMP_EXEC_POS(locinput, scan, utf8_target); \ + PerlIO_printf(Perl_debug_log, \ + " %*s"pp" %s%s%s%s%s\n", \ + depth*2, "", \ + PL_reg_name[st->resume_state], \ + ((st==yes_state||st==mark_state) ? "[" : ""), \ + ((st==yes_state) ? "Y" : ""), \ + ((st==mark_state) ? "M" : ""), \ + ((st==yes_state||st==mark_state) ? "]" : "") \ + ); \ + }); + + +#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1) + +#ifdef DEBUGGING + +STATIC void +S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, + const char *start, const char *end, const char *blurb) +{ + const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; + + PERL_ARGS_ASSERT_DEBUG_START_MATCH; + + if (!PL_colorset) + reginitcolors(); + { + RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), + RX_PRECOMP_const(prog), RX_PRELEN(prog), 60); + + RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), + start, end - start, 60); + + PerlIO_printf(Perl_debug_log, + "%s%s REx%s %s against %s\n", + PL_colors[4], blurb, PL_colors[5], s0, s1); + + if (utf8_target||utf8_pat) + PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n", + utf8_pat ? "pattern" : "", + utf8_pat && utf8_target ? " and " : "", + utf8_target ? "string" : "" + ); + } +} + +STATIC void +S_dump_exec_pos(pTHX_ const char *locinput, + const regnode *scan, + const char *loc_regeol, + const char *loc_bostr, + const char *loc_reg_starttry, + const bool utf8_target) +{ + const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; + const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ + int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); + /* The part of the string before starttry has one color + (pref0_len chars), between starttry and current + position another one (pref_len - pref0_len chars), + after the current position the third one. + We assume that pref0_len <= pref_len, otherwise we + decrease pref0_len. */ + int pref_len = (locinput - loc_bostr) > (5 + taill) - l + ? (5 + taill) - l : locinput - loc_bostr; + int pref0_len; + + PERL_ARGS_ASSERT_DUMP_EXEC_POS; + + while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) + pref_len++; + pref0_len = pref_len - (locinput - loc_reg_starttry); + if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) + l = ( loc_regeol - locinput > (5 + taill) - pref_len + ? (5 + taill) - pref_len : loc_regeol - locinput); + while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) + l--; + if (pref0_len < 0) + pref0_len = 0; + if (pref0_len > pref_len) + pref0_len = pref_len; + { + const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0; + + RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), + (locinput - pref_len),pref0_len, 60, 4, 5); + + RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), + (locinput - pref_len + pref0_len), + pref_len - pref0_len, 60, 2, 3); + + RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), + locinput, loc_regeol - locinput, 10, 0, 1); + + const STRLEN tlen=len0+len1+len2; + PerlIO_printf(Perl_debug_log, + "%4"IVdf" <%.*s%.*s%s%.*s>%*s|", + (IV)(locinput - loc_bostr), + len0, s0, + len1, s1, + (docolor ? "" : "> <"), + len2, s2, + (int)(tlen > 19 ? 0 : 19 - tlen), + ""); + } +} + +#endif + +/* reg_check_named_buff_matched() + * Checks to see if a named buffer has matched. The data array of + * buffer numbers corresponding to the buffer is expected to reside + * in the regexp->data->data array in the slot stored in the ARG() of + * node involved. Note that this routine doesn't actually care about the + * name, that information is not preserved from compilation to execution. + * Returns the index of the leftmost defined buffer with the given name + * or 0 if non of the buffers matched. + */ +STATIC I32 +S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan) +{ + I32 n; + RXi_GET_DECL(rex,rexi); + SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + I32 *nums=(I32*)SvPVX(sv_dat); + + PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; + + for ( n=0; nlastparen >= nums[n] && + rex->offs[nums[n]].end != -1) + { + return nums[n]; + } + } + return 0; +} + + +static bool +S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, + U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo) +{ + /* This function determines if there are one or two characters that match + * the first character of the passed-in EXACTish node , and if + * so, returns them in the passed-in pointers. + * + * If it determines that no possible character in the target string can + * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if + * the first character in requires UTF-8 to represent, and the + * target string isn't in UTF-8.) + * + * If there are more than two characters that could match the beginning of + * , or if more context is required to determine a match or not, + * it sets both * and * to CHRTEST_VOID. + * + * The motiviation behind this function is to allow the caller to set up + * tight loops for matching. If is of type EXACT, there is + * only one possible character that can match its first character, and so + * the situation is quite simple. But things get much more complicated if + * folding is involved. It may be that the first character of an EXACTFish + * node doesn't participate in any possible fold, e.g., punctuation, so it + * can be matched only by itself. The vast majority of characters that are + * in folds match just two things, their lower and upper-case equivalents. + * But not all are like that; some have multiple possible matches, or match + * sequences of more than one character. This function sorts all that out. + * + * Consider the patterns A*B or A*?B where A and B are arbitrary. In a + * loop of trying to match A*, we know we can't exit where the thing + * following it isn't a B. And something can't be a B unless it is the + * beginning of B. By putting a quick test for that beginning in a tight + * loop, we can rule out things that can't possibly be B without having to + * break out of the loop, thus avoiding work. Similarly, if A is a single + * character, we can make a tight loop matching A*, using the outputs of + * this function. + * + * If the target string to match isn't in UTF-8, and there aren't + * complications which require CHRTEST_VOID, * and * are set to + * the one or two possible octets (which are characters in this situation) + * that can match. In all cases, if there is only one character that can + * match, * and * will be identical. + * + * If the target string is in UTF-8, the buffers pointed to by + * and will contain the one or two UTF-8 sequences of bytes that + * can match the beginning of . They should be declared with at + * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is + * undefined what these contain.) If one or both of the buffers are + * invariant under UTF-8, *, and * will also be set to the + * corresponding invariant. If variant, the corresponding * and/or + * * will be set to a negative number(s) that shouldn't match any code + * point (unless inappropriately coerced to unsigned). * will equal + * * if and only if and are the same. */ + + const bool utf8_target = reginfo->is_utf8_target; + + UV c1 = (UV)CHRTEST_NOT_A_CP_1; + UV c2 = (UV)CHRTEST_NOT_A_CP_2; + bool use_chrtest_void = FALSE; + const bool is_utf8_pat = reginfo->is_utf8_pat; + + /* Used when we have both utf8 input and utf8 output, to avoid converting + * to/from code points */ + bool utf8_has_been_setup = FALSE; + + dVAR; + + U8 *pat = (U8*)STRING(text_node); + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + + if (OP(text_node) == EXACT) { + + /* In an exact node, only one thing can be matched, that first + * character. If both the pat and the target are UTF-8, we can just + * copy the input to the output, avoiding finding the code point of + * that character */ + if (!is_utf8_pat) { + c2 = c1 = *pat; + } + else if (utf8_target) { + Copy(pat, c1_utf8, UTF8SKIP(pat), U8); + Copy(pat, c2_utf8, UTF8SKIP(pat), U8); + utf8_has_been_setup = TRUE; + } + else { + c2 = c1 = valid_utf8_to_uvchr(pat, NULL); + } + } + else { /* an EXACTFish node */ + U8 *pat_end = pat + STR_LEN(text_node); + + /* An EXACTFL node has at least some characters unfolded, because what + * they match is not known until now. So, now is the time to fold + * the first few of them, as many as are needed to determine 'c1' and + * 'c2' later in the routine. If the pattern isn't UTF-8, we only need + * to fold if in a UTF-8 locale, and then only the Sharp S; everything + * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we + * need to fold as many characters as a single character can fold to, + * so that later we can check if the first ones are such a multi-char + * fold. But, in such a pattern only locale-problematic characters + * aren't folded, so we can skip this completely if the first character + * in the node isn't one of the tricky ones */ + if (OP(text_node) == EXACTFL) { + + if (! is_utf8_pat) { + if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) + { + folded[0] = folded[1] = 's'; + pat = folded; + pat_end = folded + 2; + } + } + else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { + U8 *s = pat; + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD_LC(*s); + s++; + } + else { + STRLEN len; + _to_utf8_fold_flags(s, + d, + &len, + FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); + d += len; + s += UTF8SKIP(s); + } + } + + pat = folded; + pat_end = d; + } + } + + if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) + || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) + { + /* Multi-character folds require more context to sort out. Also + * PL_utf8_foldclosures used below doesn't handle them, so have to + * be handled outside this routine */ + use_chrtest_void = TRUE; + } + else { /* an EXACTFish node which doesn't begin with a multi-char fold */ + c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; + if (c1 > 255) { + /* Load the folds hash, if not already done */ + SV** listp; + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + + /* The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((! (listp = hv_fetch(PL_utf8_foldclosures, + (char *) pat, + UTF8SKIP(pat), + FALSE)))) + { + /* Not found in the hash, therefore there are no folds + * containing it, so there is only a single character that + * could match */ + c2 = c1; + } + else { /* Does participate in folds */ + AV* list = (AV*) *listp; + if (av_tindex(list) != 1) { + + /* If there aren't exactly two folds to this, it is + * outside the scope of this function */ + use_chrtest_void = TRUE; + } + else { /* There are two. Get them */ + SV** c_p = av_fetch(list, 0, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c1 = SvUV(*c_p); + + c_p = av_fetch(list, 1, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c2 = SvUV(*c_p); + + /* Folds that cross the 255/256 boundary are forbidden + * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and + * one is ASCIII. Since the pattern character is above + * 255, and its only other match is below 256, the only + * legal match will be to itself. We have thrown away + * the original, so have to compute which is the one + * above 255. */ + if ((c1 < 256) != (c2 < 256)) { + if ((OP(text_node) == EXACTFL + && ! IN_UTF8_CTYPE_LOCALE) + || ((OP(text_node) == EXACTFA + || OP(text_node) == EXACTFA_NO_TRIE) + && (isASCII(c1) || isASCII(c2)))) + { + if (c1 < 256) { + c1 = c2; + } + else { + c2 = c1; + } + } + } + } + } + } + else /* Here, c1 is <= 255 */ + if (utf8_target + && HAS_NONLATIN1_FOLD_CLOSURE(c1) + && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) + && ((OP(text_node) != EXACTFA + && OP(text_node) != EXACTFA_NO_TRIE) + || ! isASCII(c1))) + { + /* Here, there could be something above Latin1 in the target + * which folds to this character in the pattern. All such + * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more + * than two characters involved in their folds, so are outside + * the scope of this function */ + if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + } + else { + use_chrtest_void = TRUE; + } + } + else { /* Here nothing above Latin1 can fold to the pattern + character */ + switch (OP(text_node)) { + + case EXACTFL: /* /l rules */ + c2 = PL_fold_locale[c1]; + break; + + case EXACTF: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + if (! utf8_target) { /* /d rules */ + c2 = PL_fold[c1]; + break; + } + /* FALLTHROUGH */ + /* /u rules for all these. This happens to work for + * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA_NO_TRIE: /* This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: + c2 = PL_fold_latin1[c1]; + break; + + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); + assert(0); /* NOTREACHED */ + } + } + } + } + + /* Here have figured things out. Set up the returns */ + if (use_chrtest_void) { + *c2p = *c1p = CHRTEST_VOID; + } + else if (utf8_target) { + if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */ + uvchr_to_utf8(c1_utf8, c1); + uvchr_to_utf8(c2_utf8, c2); + } + + /* Invariants are stored in both the utf8 and byte outputs; Use + * negative numbers otherwise for the byte ones. Make sure that the + * byte ones are the same iff the utf8 ones are the same */ + *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1; + *c2p = (UTF8_IS_INVARIANT(*c2_utf8)) + ? *c2_utf8 + : (c1 == c2) + ? CHRTEST_NOT_A_CP_1 + : CHRTEST_NOT_A_CP_2; + } + else if (c1 > 255) { + if (c2 > 255) { /* both possibilities are above what a non-utf8 string + can represent */ + return FALSE; + } + + *c1p = *c2p = c2; /* c2 is the only representable value */ + } + else { /* c1 is representable; see about c2 */ + *c1p = c1; + *c2p = (c2 < 256) ? c2 : c1; + } + + return TRUE; +} + +/* returns -1 on failure, $+[0] on success */ +STATIC SSize_t +S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) +{ +#if PERL_VERSION < 9 && !defined(PERL_CORE) + dMY_CXT; +#endif + dVAR; + const bool utf8_target = reginfo->is_utf8_target; + const U32 uniflags = UTF8_ALLOW_DEFAULT; + REGEXP *rex_sv = reginfo->prog; + regexp *rex = ReANY(rex_sv); + RXi_GET_DECL(rex,rexi); + /* the current state. This is a cached copy of PL_regmatch_state */ + regmatch_state *st; + /* cache heavy used fields of st in registers */ + regnode *scan; + regnode *next; + U32 n = 0; /* general value; init to avoid compiler warning */ + SSize_t ln = 0; /* len or last; init to avoid compiler warning */ + char *locinput = startpos; + char *pushinput; /* where to continue after a PUSH */ + I32 nextchr; /* is always set to UCHARAT(locinput) */ + + bool result = 0; /* return value of S_regmatch */ + int depth = 0; /* depth of backtrack stack */ + U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ + const U32 max_nochange_depth = + (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ? + 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; + regmatch_state *yes_state = NULL; /* state to pop to on success of + subpattern */ + /* mark_state piggy backs on the yes_state logic so that when we unwind + the stack on success we can update the mark_state as we go */ + regmatch_state *mark_state = NULL; /* last mark state we have seen */ + regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ + struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ + U32 state_num; + bool no_final = 0; /* prevent failure from backtracking? */ + bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ + char *startpoint = locinput; + SV *popmark = NULL; /* are we looking for a mark? */ + SV *sv_commit = NULL; /* last mark name seen in failure */ + SV *sv_yes_mark = NULL; /* last mark name we have seen + during a successful match */ + U32 lastopen = 0; /* last open we saw */ + bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; + SV* const oreplsv = GvSVn(PL_replgv); + /* these three flags are set by various ops to signal information to + * the very next op. They have a useful lifetime of exactly one loop + * iteration, and are not preserved or restored by state pushes/pops + */ + bool sw = 0; /* the condition value in (?(cond)a|b) */ + bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ + int logical = 0; /* the following EVAL is: + 0: (?{...}) + 1: (?(?{...})X|Y) + 2: (??{...}) + or the following IFMATCH/UNLESSM is: + false: plain (?=foo) + true: used as a condition: (?(?=foo)) + */ + PAD* last_pad = NULL; + dMULTICALL; + I32 gimme = G_SCALAR; + CV *caller_cv = NULL; /* who called us */ + CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ + CHECKPOINT runops_cp; /* savestack position before executing EVAL */ + U32 maxopenparen = 0; /* max '(' index seen so far */ + int to_complement; /* Invert the result? */ + _char_class_number classnum; + bool is_utf8_pat = reginfo->is_utf8_pat; + +#ifdef DEBUGGING + GET_RE_DEBUG_FLAGS_DECL; +#endif + + /* protect against undef(*^R) */ + SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); + + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ + multicall_oldcatch = 0; + multicall_cv = NULL; + cx = NULL; + PERL_UNUSED_VAR(multicall_cop); + PERL_UNUSED_VAR(newsp); + + + PERL_ARGS_ASSERT_REGMATCH; + + DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ + PerlIO_printf(Perl_debug_log,"regmatch start\n"); + })); + + st = PL_regmatch_state; + + /* Note that nextchr is a byte even in UTF */ + SET_nextchr; + scan = prog; + while (scan != NULL) { + + DEBUG_EXECUTE_r( { + SV * const prop = sv_newmortal(); + regnode *rnext=regnext(scan); + DUMP_EXEC_POS( locinput, scan, utf8_target ); + regprop(rex, prop, scan, reginfo); + + PerlIO_printf(Perl_debug_log, + "%3"IVdf":%*s%s(%"IVdf")\n", + (IV)(scan - rexi->program), depth*2, "", + SvPVX_const(prop), + (PL_regkind[OP(scan)] == END || !rnext) ? + 0 : (IV)(rnext - rexi->program)); + }); + + next = scan + NEXT_OFF(scan); + if (next == scan) + next = NULL; + state_num = OP(scan); + + reenter_switch: + to_complement = 0; + + SET_nextchr; + assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); + + switch (state_num) { + case BOL: /* /^../ */ + case SBOL: /* /^../s */ + if (locinput == reginfo->strbeg) + break; + sayNO; + + case MBOL: /* /^../m */ + if (locinput == reginfo->strbeg || + (!NEXTCHR_IS_EOS && locinput[-1] == '\n')) + { + break; + } + sayNO; + + case GPOS: /* \G */ + if (locinput == reginfo->ganch) + break; + sayNO; + + case KEEPS: /* \K */ + /* update the startpoint */ + st->u.keeper.val = rex->offs[0].start; + rex->offs[0].start = locinput - reginfo->strbeg; + PUSH_STATE_GOTO(KEEPS_next, next, locinput); + /* NOTREACHED */ + assert(0); + + case KEEPS_next_fail: + /* rollback the start point change */ + rex->offs[0].start = st->u.keeper.val; + sayNO_SILENT; + /* NOTREACHED */ + assert(0); + + case MEOL: /* /..$/m */ + if (!NEXTCHR_IS_EOS && nextchr != '\n') + sayNO; + break; + + case EOL: /* /..$/ */ + /* FALLTHROUGH */ + case SEOL: /* /..$/s */ + if (!NEXTCHR_IS_EOS && nextchr != '\n') + sayNO; + if (reginfo->strend - locinput > 1) + sayNO; + break; + + case EOS: /* \z */ + if (!NEXTCHR_IS_EOS) + sayNO; + break; + + case SANY: /* /./s */ + if (NEXTCHR_IS_EOS) + sayNO; + goto increment_locinput; + + case CANY: /* \C */ + if (NEXTCHR_IS_EOS) + sayNO; + locinput++; + break; + + case REG_ANY: /* /./ */ + if ((NEXTCHR_IS_EOS) || nextchr == '\n') + sayNO; + goto increment_locinput; + + +#undef ST +#define ST st->u.trie + case TRIEC: /* (ab|cd) with known charclass */ + /* In this case the charclass data is available inline so + we can fail fast without a lot of extra overhead. + */ + if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + /* NOTREACHED */ + assert(0); + } + /* FALLTHROUGH */ + case TRIE: /* (ab|cd) */ + /* the basic plan of execution of the trie is: + * At the beginning, run though all the states, and + * find the longest-matching word. Also remember the position + * of the shortest matching word. For example, this pattern: + * 1 2 3 4 5 + * ab|a|x|abcd|abc + * when matched against the string "abcde", will generate + * accept states for all words except 3, with the longest + * matching word being 4, and the shortest being 2 (with + * the position being after char 1 of the string). + * + * Then for each matching word, in word order (i.e. 1,2,4,5), + * we run the remainder of the pattern; on each try setting + * the current position to the character following the word, + * returning to try the next word on failure. + * + * We avoid having to build a list of words at runtime by + * using a compile-time structure, wordinfo[].prev, which + * gives, for each word, the previous accepting word (if any). + * In the case above it would contain the mappings 1->2, 2->0, + * 3->0, 4->5, 5->1. We can use this table to generate, from + * the longest word (4 above), a list of all words, by + * following the list of prev pointers; this gives us the + * unordered list 4,5,1,2. Then given the current word we have + * just tried, we can go through the list and find the + * next-biggest word to try (so if we just failed on word 2, + * the next in the list is 4). + * + * Since at runtime we don't record the matching position in + * the string for each word, we have to work that out for + * each word we're about to process. The wordinfo table holds + * the character length of each word; given that we recorded + * at the start: the position of the shortest word and its + * length in chars, we just need to move the pointer the + * difference between the two char lengths. Depending on + * Unicode status and folding, that's cheap or expensive. + * + * This algorithm is optimised for the case where are only a + * small number of accept states, i.e. 0,1, or maybe 2. + * With lots of accepts states, and having to try all of them, + * it becomes quadratic on number of accept states to find all + * the next words. + */ + + { + /* what type of TRIE am I? (utf8 makes this contextual) */ + DECL_TRIE_TYPE(scan); + + /* what trie are we using right now */ + reg_trie_data * const trie + = (reg_trie_data*)rexi->data->data[ ARG( scan ) ]; + HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); + U32 state = trie->startstate; + + if ( trie->bitmap + && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr))) + { + if (trie->states[ state ].wordnum) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %smatched empty string...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + if (!trie->jump) + break; + } else { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + } + } + + { + U8 *uc = ( U8* )locinput; + + STRLEN len = 0; + STRLEN foldlen = 0; + U8 *uscan = (U8*)NULL; + U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + U32 charcount = 0; /* how many input chars we have matched */ + U32 accepted = 0; /* have we seen any accepting states? */ + + ST.jump = trie->jump; + ST.me = scan; + ST.firstpos = NULL; + ST.longfold = FALSE; /* char longer if folded => it's harder */ + ST.nextword = 0; + + /* fully traverse the TRIE; note the position of the + shortest accept state and the wordnum of the longest + accept state */ + + while ( state && uc <= (U8*)(reginfo->strend) ) { + U32 base = trie->states[ state ].trans.base; + UV uvc = 0; + U16 charid = 0; + U16 wordnum; + wordnum = trie->states[ state ].wordnum; + + if (wordnum) { /* it's an accept state */ + if (!accepted) { + accepted = 1; + /* record first match position */ + if (ST.longfold) { + ST.firstpos = (U8*)locinput; + ST.firstchars = 0; + } + else { + ST.firstpos = uc; + ST.firstchars = charcount; + } + } + if (!ST.nextword || wordnum < ST.nextword) + ST.nextword = wordnum; + ST.topword = wordnum; + } + + DEBUG_TRIE_EXECUTE_r({ + DUMP_EXEC_POS( (char *)uc, scan, utf8_target ); + PerlIO_printf( Perl_debug_log, + "%*s %sState: %4"UVxf" Accepted: %c ", + 2+depth * 2, "", PL_colors[4], + (UV)state, (accepted ? 'Y' : 'N')); + }); + + /* read a char and goto next state */ + if ( base && (foldlen || uc < (U8*)(reginfo->strend))) { + I32 offset; + REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, + uscan, len, uvc, charid, foldlen, + foldbuf, uniflags); + charcount++; + if (foldlen>0) + ST.longfold = TRUE; + if (charid && + ( ((offset = + base + charid - 1 - trie->uniquecharcount)) >= 0) + + && ((U32)offset < trie->lasttrans) + && trie->trans[offset].check == state) + { + state = trie->trans[offset].next; + } + else { + state = 0; + } + uc += len; + + } + else { + state = 0; + } + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log, + "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n", + charid, uvc, (UV)state, PL_colors[5] ); + ); + } + if (!accepted) + sayNO; + + /* calculate total number of accept states */ + { + U16 w = ST.topword; + accepted = 0; + while (w) { + w = trie->wordinfo[w].prev; + accepted++; + } + ST.accepted = accepted; + } + + DEBUG_EXECUTE_r( + PerlIO_printf( Perl_debug_log, + "%*s %sgot %"IVdf" possible matches%s\n", + REPORT_CODE_OFF + depth * 2, "", + PL_colors[4], (IV)ST.accepted, PL_colors[5] ); + ); + goto trie_first_try; /* jump into the fail handler */ + }} + /* NOTREACHED */ + assert(0); + + case TRIE_next_fail: /* we failed - try next alternative */ + { + U8 *uc; + if ( ST.jump) { + REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + if (!--ST.accepted) { + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sTRIE failed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + PL_colors[5] ); + }); + sayNO_SILENT; + } + { + /* Find next-highest word to process. Note that this code + * is O(N^2) per trie run (O(N) per branch), so keep tight */ + U16 min = 0; + U16 word; + U16 const nextword = ST.nextword; + reg_trie_wordinfo * const wordinfo + = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo; + for (word=ST.topword; word; word=wordinfo[word].prev) { + if (word > nextword && (!min || word < min)) + min = word; + } + ST.nextword = min; + } + + trie_first_try: + if (do_cutgroup) { + do_cutgroup = 0; + no_final = 0; + } + + if ( ST.jump) { + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + REGCP_SET(ST.cp); + } + + /* find start char of end of current word */ + { + U32 chars; /* how many chars to skip */ + reg_trie_data * const trie + = (reg_trie_data*)rexi->data->data[ARG(ST.me)]; + + assert((trie->wordinfo[ST.nextword].len - trie->prefixlen) + >= ST.firstchars); + chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen) + - ST.firstchars; + uc = ST.firstpos; + + if (ST.longfold) { + /* the hard option - fold each char in turn and find + * its folded length (which may be different */ + U8 foldbuf[UTF8_MAXBYTES_CASE + 1]; + STRLEN foldlen; + STRLEN len; + UV uvc; + U8 *uscan; + + while (chars) { + if (utf8_target) { + uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, + uniflags); + uc += len; + } + else { + uvc = *uc; + uc++; + } + uvc = to_uni_fold(uvc, foldbuf, &foldlen); + uscan = foldbuf; + while (foldlen) { + if (!--chars) + break; + uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, + uniflags); + uscan += len; + foldlen -= len; + } + } + } + else { + if (utf8_target) + while (chars--) + uc += UTF8SKIP(uc); + else + uc += chars; + } + } + + scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) + ? ST.jump[ST.nextword] + : NEXT_OFF(ST.me)); + + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sTRIE matched word #%d, continuing%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + ST.nextword, + PL_colors[5] + ); + }); + + if (ST.accepted > 1 || has_cutgroup) { + PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc); + /* NOTREACHED */ + assert(0); + } + /* only one choice left - just continue */ + DEBUG_EXECUTE_r({ + AV *const trie_words + = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]); + SV ** const tmp = av_fetch( trie_words, + ST.nextword-1, 0 ); + SV *sv= tmp ? sv_newmortal() : NULL; + + PerlIO_printf( Perl_debug_log, + "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], + ST.nextword, + tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII + ) + : "not compiled under -Dr", + PL_colors[5] ); + }); + + locinput = (char*)uc; + continue; /* execute rest of RE */ + /* NOTREACHED */ + assert(0); + } +#undef ST + + case EXACT: { /* /abc/ */ + char *s = STRING(scan); + ln = STR_LEN(scan); + if (utf8_target != is_utf8_pat) { + /* The target and the pattern have differing utf8ness. */ + char *l = locinput; + const char * const e = s + ln; + + if (utf8_target) { + /* The target is utf8, the pattern is not utf8. + * Above-Latin1 code points can't match the pattern; + * invariants match exactly, and the other Latin1 ones need + * to be downgraded to a single byte in order to do the + * comparison. (If we could be confident that the target + * is not malformed, this could be refactored to have fewer + * tests by just assuming that if the first bytes match, it + * is an invariant, but there are tests in the test suite + * dealing with (??{...}) which violate this) */ + while (s < e) { + if (l >= reginfo->strend + || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) + { + sayNO; + } + if (UTF8_IS_INVARIANT(*(U8*)l)) { + if (*l != *s) { + sayNO; + } + l++; + } + else { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { + sayNO; + } + l += 2; + } + s++; + } + } + else { + /* The target is not utf8, the pattern is utf8. */ + while (s < e) { + if (l >= reginfo->strend + || UTF8_IS_ABOVE_LATIN1(* (U8*) s)) + { + sayNO; + } + if (UTF8_IS_INVARIANT(*(U8*)s)) { + if (*s != *l) { + sayNO; + } + s++; + } + else { + if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { + sayNO; + } + s += 2; + } + l++; + } + } + locinput = l; + } + else { + /* The target and the pattern have the same utf8ness. */ + /* Inline the first character, for speed. */ + if (reginfo->strend - locinput < ln + || UCHARAT(s) != nextchr + || (ln > 1 && memNE(s, locinput, ln))) + { + sayNO; + } + locinput += ln; + } + break; + } + + case EXACTFL: { /* /abc/il */ + re_fold_t folder; + const U8 * fold_array; + const char * s; + U32 fold_utf8_flags; + + folder = foldEQ_locale; + fold_array = PL_fold_locale; + fold_utf8_flags = FOLDEQ_LOCALE; + goto do_exactf; + + case EXACTFU_SS: /* /\x{df}/iu */ + case EXACTFU: /* /abc/iu */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; + goto do_exactf; + + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: /* /abc/iaa */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf; + + case EXACTF: /* /abc/i This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + folder = foldEQ; + fold_array = PL_fold; + fold_utf8_flags = 0; + + do_exactf: + s = STRING(scan); + ln = STR_LEN(scan); + + if (utf8_target + || is_utf8_pat + || state_num == EXACTFU_SS + || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) + { + /* Either target or the pattern are utf8, or has the issue where + * the fold lengths may differ. */ + const char * const l = locinput; + char *e = reginfo->strend; + + if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat, + l, &e, 0, utf8_target, fold_utf8_flags)) + { + sayNO; + } + locinput = e; + break; + } + + /* Neither the target nor the pattern are utf8 */ + if (UCHARAT(s) != nextchr + && !NEXTCHR_IS_EOS + && UCHARAT(s) != fold_array[nextchr]) + { + sayNO; + } + if (reginfo->strend - locinput < ln) + sayNO; + if (ln > 1 && ! folder(s, locinput, ln)) + sayNO; + locinput += ln; + break; + } + + /* XXX Could improve efficiency by separating these all out using a + * macro or in-line function. At that point regcomp.c would no longer + * have to set the FLAGS fields of these */ + case BOUNDL: /* /\b/l */ + case NBOUNDL: /* /\B/l */ + case BOUND: /* /\b/ */ + case BOUNDU: /* /\b/u */ + case BOUNDA: /* /\b/a */ + case NBOUND: /* /\B/ */ + case NBOUNDU: /* /\B/u */ + case NBOUNDA: /* /\B/a */ + /* was last char in word? */ + if (utf8_target + && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET + && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET) + { + if (locinput == reginfo->strbeg) + ln = '\n'; + else { + const U8 * const r = + reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); + + ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, + 0, uniflags); + } + if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { + ln = isWORDCHAR_uni(ln); + if (NEXTCHR_IS_EOS) + n = 0; + else { + LOAD_UTF8_CHARCLASS_ALNUM(); + n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput, + utf8_target); + } + } + else { + ln = isWORDCHAR_LC_uvchr(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); + } + } + else { + + /* Here the string isn't utf8, or is utf8 and only ascii + * characters are to match \w. In the latter case looking at + * the byte just prior to the current one may be just the final + * byte of a multi-byte character. This is ok. There are two + * cases: + * 1) it is a single byte character, and then the test is doing + * just what it's supposed to. + * 2) it is a multi-byte character, in which case the final + * byte is never mistakable for ASCII, and so the test + * will say it is not a word character, which is the + * correct answer. */ + ln = (locinput != reginfo->strbeg) ? + UCHARAT(locinput - 1) : '\n'; + switch (FLAGS(scan)) { + case REGEX_UNICODE_CHARSET: + ln = isWORDCHAR_L1(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr); + break; + case REGEX_LOCALE_CHARSET: + ln = isWORDCHAR_LC(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr); + break; + case REGEX_DEPENDS_CHARSET: + ln = isWORDCHAR(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr); + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + ln = isWORDCHAR_A(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr); + break; + default: + Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan)); + } + } + /* Note requires that all BOUNDs be lower than all NBOUNDs in + * regcomp.sym */ + if (((!ln) == (!n)) == (OP(scan) < NBOUND)) + sayNO; + break; + + case ANYOF: /* /[abc]/ */ + if (NEXTCHR_IS_EOS) + sayNO; + if (utf8_target) { + if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + utf8_target)) + sayNO; + locinput += UTF8SKIP(locinput); + } + else { + if (!REGINCLASS(rex, scan, (U8*)locinput)) + sayNO; + locinput++; + } + break; + + /* The argument (FLAGS) to all the POSIX node types is the class number + * */ + + case NPOSIXL: /* \W or [:^punct:] etc. under /l */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: /* \w or [:punct:] etc. under /l */ + if (NEXTCHR_IS_EOS) + sayNO; + + /* Use isFOO_lc() for characters within Latin1. (Note that + * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else + * wouldn't be invariant) */ + if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { + if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) { + sayNO; + } + } + else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { + if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, + *(locinput + 1)))))) + { + sayNO; + } + } + else { /* Here, must be an above Latin-1 code point */ + goto utf8_posix_not_eos; + } + + /* Here, must be utf8 */ + locinput += UTF8SKIP(locinput); + break; + + case NPOSIXD: /* \W or [:^punct:] etc. under /d */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXD: /* \w or [:punct:] etc. under /d */ + if (utf8_target) { + goto utf8_posix; + } + goto posixa; + + case NPOSIXA: /* \W or [:^punct:] etc. under /a */ + + if (NEXTCHR_IS_EOS) { + sayNO; + } + + /* All UTF-8 variants match */ + if (! UTF8_IS_INVARIANT(nextchr)) { + goto increment_locinput; + } + + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXA: /* \w or [:punct:] etc. under /a */ + + posixa: + /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in + * UTF-8, and also from NPOSIXA even in UTF-8 when the current + * character is a single byte */ + + if (NEXTCHR_IS_EOS + || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr, + FLAGS(scan))))) + { + sayNO; + } + + /* Here we are either not in utf8, or we matched a utf8-invariant, + * so the next char is the next byte */ + locinput++; + break; + + case NPOSIXU: /* \W or [:^punct:] etc. under /u */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: /* \w or [:punct:] etc. under /u */ + utf8_posix: + if (NEXTCHR_IS_EOS) { + sayNO; + } + utf8_posix_not_eos: + + /* Use _generic_isCC() for characters within Latin1. (Note that + * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else + * wouldn't be invariant) */ + if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { + if (! (to_complement ^ cBOOL(_generic_isCC(nextchr, + FLAGS(scan))))) + { + sayNO; + } + locinput++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { + if (! (to_complement + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, + *(locinput + 1)), + FLAGS(scan))))) + { + sayNO; + } + locinput += 2; + } + else { /* Handle above Latin-1 code points */ + classnum = (_char_class_number) FLAGS(scan); + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Here, uses a swash to find such code points. Load if if + * not done already */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] + = _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + if (! (to_complement + ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) locinput, TRUE)))) + { + sayNO; + } + } + else { /* Here, uses macros to find above Latin-1 code points */ + switch (classnum) { + case _CC_ENUM_SPACE: /* XXX would require separate + code if we revert the change + of \v matching this */ + case _CC_ENUM_PSXSPC: + if (! (to_complement + ^ cBOOL(is_XPERLSPACE_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_BLANK: + if (! (to_complement + ^ cBOOL(is_HORIZWS_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_XDIGIT: + if (! (to_complement + ^ cBOOL(is_XDIGIT_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_VERTSPACE: + if (! (to_complement + ^ cBOOL(is_VERTWS_high(locinput)))) + { + sayNO; + } + break; + default: /* The rest, e.g. [:cntrl:], can't match + above Latin1 */ + if (! to_complement) { + sayNO; + } + break; + } + } + locinput += UTF8SKIP(locinput); + } + break; + + case CLUMP: /* Match \X: logical Unicode character. This is defined as + a Unicode extended Grapheme Cluster */ + /* From http://www.unicode.org/reports/tr29 (5.2 version). An + extended Grapheme Cluster is: + + CR LF + | Prepend* Begin Extend* + | . + + Begin is: ( Special_Begin | ! Control ) + Special_Begin is: ( Regional-Indicator+ | Hangul-syllable ) + Extend is: ( Grapheme_Extend | Spacing_Mark ) + Control is: [ GCB_Control | CR | LF ] + Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) )) + + If we create a 'Regular_Begin' = Begin - Special_Begin, then + we can rewrite + + Begin is ( Regular_Begin + Special Begin ) + + It turns out that 98.4% of all Unicode code points match + Regular_Begin. Doing it this way eliminates a table match in + the previous implementation for almost all Unicode code points. + + There is a subtlety with Prepend* which showed up in testing. + Note that the Begin, and only the Begin is required in: + | Prepend* Begin Extend* + Also, Begin contains '! Control'. A Prepend must be a + '! Control', which means it must also be a Begin. What it + comes down to is that if we match Prepend* and then find no + suitable Begin afterwards, that if we backtrack the last + Prepend, that one will be a suitable Begin. + */ + + if (NEXTCHR_IS_EOS) + sayNO; + if (! utf8_target) { + + /* Match either CR LF or '.', as all the other possibilities + * require utf8 */ + locinput++; /* Match the . or CR */ + if (nextchr == '\r' /* And if it was CR, and the next is LF, + match the LF */ + && locinput < reginfo->strend + && UCHARAT(locinput) == '\n') + { + locinput++; + } + } + else { + + /* Utf8: See if is ( CR LF ); already know that locinput < + * reginfo->strend, so locinput+1 is in bounds */ + if ( nextchr == '\r' && locinput+1 < reginfo->strend + && UCHARAT(locinput + 1) == '\n') + { + locinput += 2; + } + else { + STRLEN len; + + /* In case have to backtrack to beginning, then match '.' */ + char *starting = locinput; + + /* In case have to backtrack the last prepend */ + char *previous_prepend = NULL; + + LOAD_UTF8_CHARCLASS_GCB(); + + /* Match (prepend)* */ + while (locinput < reginfo->strend + && (len = is_GCB_Prepend_utf8(locinput))) + { + previous_prepend = locinput; + locinput += len; + } + + /* As noted above, if we matched a prepend character, but + * the next thing won't match, back off the last prepend we + * matched, as it is guaranteed to match the begin */ + if (previous_prepend + && (locinput >= reginfo->strend + || (! swash_fetch(PL_utf8_X_regular_begin, + (U8*)locinput, utf8_target) + && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput))) + ) + { + locinput = previous_prepend; + } + + /* Note that here we know reginfo->strend > locinput, as we + * tested that upon input to this switch case, and if we + * moved locinput forward, we tested the result just above + * and it either passed, or we backed off so that it will + * now pass */ + if (swash_fetch(PL_utf8_X_regular_begin, + (U8*)locinput, utf8_target)) { + locinput += UTF8SKIP(locinput); + } + else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) { + + /* Here did not match the required 'Begin' in the + * second term. So just match the very first + * character, the '.' of the final term of the regex */ + locinput = starting + UTF8SKIP(starting); + goto exit_utf8; + } else { + + /* Here is a special begin. It can be composed of + * several individual characters. One possibility is + * RI+ */ + if ((len = is_GCB_RI_utf8(locinput))) { + locinput += len; + while (locinput < reginfo->strend + && (len = is_GCB_RI_utf8(locinput))) + { + locinput += len; + } + } else if ((len = is_GCB_T_utf8(locinput))) { + /* Another possibility is T+ */ + locinput += len; + while (locinput < reginfo->strend + && (len = is_GCB_T_utf8(locinput))) + { + locinput += len; + } + } else { + + /* Here, neither RI+ nor T+; must be some other + * Hangul. That means it is one of the others: L, + * LV, LVT or V, and matches: + * L* (L | LVT T* | V * V* T* | LV V* T*) */ + + /* Match L* */ + while (locinput < reginfo->strend + && (len = is_GCB_L_utf8(locinput))) + { + locinput += len; + } + + /* Here, have exhausted L*. If the next character + * is not an LV, LVT nor V, it means we had to have + * at least one L, so matches L+ in the original + * equation, we have a complete hangul syllable. + * Are done. */ + + if (locinput < reginfo->strend + && is_GCB_LV_LVT_V_utf8(locinput)) + { + /* Otherwise keep going. Must be LV, LVT or V. + * See if LVT, by first ruling out V, then LV */ + if (! is_GCB_V_utf8(locinput) + /* All but every TCount one is LV */ + && (valid_utf8_to_uvchr((U8 *) locinput, + NULL) + - SBASE) + % TCount != 0) + { + locinput += UTF8SKIP(locinput); + } else { + + /* Must be V or LV. Take it, then match + * V* */ + locinput += UTF8SKIP(locinput); + while (locinput < reginfo->strend + && (len = is_GCB_V_utf8(locinput))) + { + locinput += len; + } + } + + /* And any of LV, LVT, or V can be followed + * by T* */ + while (locinput < reginfo->strend + && (len = is_GCB_T_utf8(locinput))) + { + locinput += len; + } + } + } + } + + /* Match any extender */ + while (locinput < reginfo->strend + && swash_fetch(PL_utf8_X_extend, + (U8*)locinput, utf8_target)) + { + locinput += UTF8SKIP(locinput); + } + } + exit_utf8: + if (locinput > reginfo->strend) sayNO; + } + break; + + case NREFFL: /* /\g{name}/il */ + { /* The capture buffer cases. The ones beginning with N for the + named buffers just convert to the equivalent numbered and + pretend they were called as the corresponding numbered buffer + op. */ + /* don't initialize these in the declaration, it makes C++ + unhappy */ + const char *s; + char type; + re_fold_t folder; + const U8 *fold_array; + UV utf8_fold_flags; + + folder = foldEQ_locale; + fold_array = PL_fold_locale; + type = REFFL; + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_nref; + + case NREFFA: /* /\g{name}/iaa */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + type = REFFA; + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_nref; + + case NREFFU: /* /\g{name}/iu */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + type = REFFU; + utf8_fold_flags = 0; + goto do_nref; + + case NREFF: /* /\g{name}/i */ + folder = foldEQ; + fold_array = PL_fold; + type = REFF; + utf8_fold_flags = 0; + goto do_nref; + + case NREF: /* /\g{name}/ */ + type = REF; + folder = NULL; + fold_array = NULL; + utf8_fold_flags = 0; + do_nref: + + /* For the named back references, find the corresponding buffer + * number */ + n = reg_check_named_buff_matched(rex,scan); + + if ( ! n ) { + sayNO; + } + goto do_nref_ref_common; + + case REFFL: /* /\1/il */ + folder = foldEQ_locale; + fold_array = PL_fold_locale; + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_ref; + + case REFFA: /* /\1/iaa */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_ref; + + case REFFU: /* /\1/iu */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + utf8_fold_flags = 0; + goto do_ref; + + case REFF: /* /\1/i */ + folder = foldEQ; + fold_array = PL_fold; + utf8_fold_flags = 0; + goto do_ref; + + case REF: /* /\1/ */ + folder = NULL; + fold_array = NULL; + utf8_fold_flags = 0; + + do_ref: + type = OP(scan); + n = ARG(scan); /* which paren pair */ + + do_nref_ref_common: + ln = rex->offs[n].start; + reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ + if (rex->lastparen < n || ln == -1) + sayNO; /* Do not match unless seen CLOSEn. */ + if (ln == rex->offs[n].end) + break; + + s = reginfo->strbeg + ln; + if (type != REF /* REF can do byte comparison */ + && (utf8_target || type == REFFU || type == REFFL)) + { + char * limit = reginfo->strend; + + /* This call case insensitively compares the entire buffer + * at s, with the current input starting at locinput, but + * not going off the end given by reginfo->strend, and + * returns in upon success, how much of the + * current input was matched */ + if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target, + locinput, &limit, 0, utf8_target, utf8_fold_flags)) + { + sayNO; + } + locinput = limit; + break; + } + + /* Not utf8: Inline the first character, for speed. */ + if (!NEXTCHR_IS_EOS && + UCHARAT(s) != nextchr && + (type == REF || + UCHARAT(s) != fold_array[nextchr])) + sayNO; + ln = rex->offs[n].end - ln; + if (locinput + ln > reginfo->strend) + sayNO; + if (ln > 1 && (type == REF + ? memNE(s, locinput, ln) + : ! folder(s, locinput, ln))) + sayNO; + locinput += ln; + break; + } + + case NOTHING: /* null op; e.g. the 'nothing' following + * the '*' in m{(a+|b)*}' */ + break; + case TAIL: /* placeholder while compiling (A|B|C) */ + break; + + case BACK: /* ??? doesn't appear to be used ??? */ + break; + +#undef ST +#define ST st->u.eval + { + SV *ret; + REGEXP *re_sv; + regexp *re; + regexp_internal *rei; + regnode *startpoint; + + case GOSTART: /* (?R) */ + case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */ + if (cur_eval && cur_eval->locinput==locinput) { + if (cur_eval->u.eval.close_paren == (U32)ARG(scan)) + Perl_croak(aTHX_ "Infinite recursion in regex"); + if ( ++nochange_depth > max_nochange_depth ) + Perl_croak(aTHX_ + "Pattern subroutine nesting without pos change" + " exceeded limit in regex"); + } else { + nochange_depth = 0; + } + re_sv = rex_sv; + re = rex; + rei = rexi; + if (OP(scan)==GOSUB) { + startpoint = scan + ARG2L(scan); + ST.close_paren = ARG(scan); + } else { + startpoint = rei->program+1; + ST.close_paren = 0; + } + + /* Save all the positions seen so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + + /* and then jump to the code we share with EVAL */ + goto eval_recurse_doit; + + /* NOTREACHED */ + assert(0); + + case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ + if (cur_eval && cur_eval->locinput==locinput) { + if ( ++nochange_depth > max_nochange_depth ) + Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); + } else { + nochange_depth = 0; + } + { + /* execute the code in the {...} */ + + dSP; + IV before; + OP * const oop = PL_op; + COP * const ocurcop = PL_curcop; + OP *nop; + CV *newcv; + + /* save *all* paren positions */ + regcppush(rex, 0, maxopenparen); + REGCP_SET(runops_cp); + + if (!caller_cv) + caller_cv = find_runcv(NULL); + + n = ARG(scan); + + if (rexi->data->what[n] == 'r') { /* code from an external qr */ + newcv = (ReANY( + (REGEXP*)(rexi->data->data[n]) + ))->qr_anoncv + ; + nop = (OP*)rexi->data->data[n+1]; + } + else if (rexi->data->what[n] == 'l') { /* literal code */ + newcv = caller_cv; + nop = (OP*)rexi->data->data[n]; + assert(CvDEPTH(newcv)); + } + else { + /* literal with own CV */ + assert(rexi->data->what[n] == 'L'); + newcv = rex->qr_anoncv; + nop = (OP*)rexi->data->data[n]; + } + + /* normally if we're about to execute code from the same + * CV that we used previously, we just use the existing + * CX stack entry. However, its possible that in the + * meantime we may have backtracked, popped from the save + * stack, and undone the SAVECOMPPAD(s) associated with + * PUSH_MULTICALL; in which case PL_comppad no longer + * points to newcv's pad. */ + if (newcv != last_pushed_cv || PL_comppad != last_pad) + { + U8 flags = (CXp_SUB_RE | + ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0)); + if (last_pushed_cv) { + CHANGE_MULTICALL_FLAGS(newcv, flags); + } + else { + PUSH_MULTICALL_FLAGS(newcv, flags); + } + last_pushed_cv = newcv; + } + else { + /* these assignments are just to silence compiler + * warnings */ + multicall_cop = NULL; + newsp = NULL; + } + last_pad = PL_comppad; + + /* the initial nextstate you would normally execute + * at the start of an eval (which would cause error + * messages to come from the eval), may be optimised + * away from the execution path in the regex code blocks; + * so manually set PL_curcop to it initially */ + { + OP *o = cUNOPx(nop)->op_first; + assert(o->op_type == OP_NULL); + if (o->op_targ == OP_SCOPE) { + o = cUNOPo->op_first; + } + else { + assert(o->op_targ == OP_LEAVE); + o = cUNOPo->op_first; + assert(o->op_type == OP_ENTER); + o = OP_SIBLING(o); + } + + if (o->op_type != OP_STUB) { + assert( o->op_type == OP_NEXTSTATE + || o->op_type == OP_DBSTATE + || (o->op_type == OP_NULL + && ( o->op_targ == OP_NEXTSTATE + || o->op_targ == OP_DBSTATE + ) + ) + ); + PL_curcop = (COP*)o; + } + } + nop = nop->op_next; + + DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, + " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) ); + + rex->offs[0].end = locinput - reginfo->strbeg; + if (reginfo->info_aux_eval->pos_magic) + MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, + reginfo->sv, reginfo->strbeg, + locinput - reginfo->strbeg); + + if (sv_yes_mark) { + SV *sv_mrk = get_sv("REGMARK", 1); + sv_setsv(sv_mrk, sv_yes_mark); + } + + /* we don't use MULTICALL here as we want to call the + * first op of the block of interest, rather than the + * first op of the sub */ + before = (IV)(SP-PL_stack_base); + PL_op = nop; + CALLRUNOPS(aTHX); /* Scalar context. */ + SPAGAIN; + if ((IV)(SP-PL_stack_base) == before) + ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ + else { + ret = POPs; + PUTBACK; + } + + /* before restoring everything, evaluate the returned + * value, so that 'uninit' warnings don't use the wrong + * PL_op or pad. Also need to process any magic vars + * (e.g. $1) *before* parentheses are restored */ + + PL_op = NULL; + + re_sv = NULL; + if (logical == 0) /* (?{})/ */ + sv_setsv(save_scalar(PL_replgv), ret); /* $^R */ + else if (logical == 1) { /* /(?(?{...})X|Y)/ */ + sw = cBOOL(SvTRUE(ret)); + logical = 0; + } + else { /* /(??{}) */ + /* if its overloaded, let the regex compiler handle + * it; otherwise extract regex, or stringify */ + if (SvGMAGICAL(ret)) + ret = sv_mortalcopy(ret); + if (!SvAMAGIC(ret)) { + SV *sv = ret; + if (SvROK(sv)) + sv = SvRV(sv); + if (SvTYPE(sv) == SVt_REGEXP) + re_sv = (REGEXP*) sv; + else if (SvSMAGICAL(ret)) { + MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); + if (mg) + re_sv = (REGEXP *) mg->mg_obj; + } + + /* force any undef warnings here */ + if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { + ret = sv_mortalcopy(ret); + (void) SvPV_force_nolen(ret); + } + } + + } + + /* *** Note that at this point we don't restore + * PL_comppad, (or pop the CxSUB) on the assumption it may + * be used again soon. This is safe as long as nothing + * in the regexp code uses the pad ! */ + PL_op = oop; + PL_curcop = ocurcop; + S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); + PL_curpm = PL_reg_curpm; + + if (logical != 2) + break; + } + + /* only /(??{})/ from now on */ + logical = 0; + { + /* extract RE object from returned value; compiling if + * necessary */ + + if (re_sv) { + re_sv = reg_temp_copy(NULL, re_sv); + } + else { + U32 pm_flags = 0; + + if (SvUTF8(ret) && IN_BYTES) { + /* In use 'bytes': make a copy of the octet + * sequence, but without the flag on */ + STRLEN len; + const char *const p = SvPV(ret, len); + ret = newSVpvn_flags(p, len, SVs_TEMP); + } + if (rex->intflags & PREGf_USE_RE_EVAL) + pm_flags |= PMf_USE_RE_EVAL; + + /* if we got here, it should be an engine which + * supports compiling code blocks and stuff */ + assert(rex->engine && rex->engine->op_comp); + assert(!(scan->flags & ~RXf_PMf_COMPILETIME)); + re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, + rex->engine, NULL, NULL, + /* copy /msix etc to inner pattern */ + scan->flags, + pm_flags); + + if (!(SvFLAGS(ret) + & (SVs_TEMP | SVs_GMG | SVf_ROK)) + && (!SvPADTMP(ret) || SvREADONLY(ret))) { + /* This isn't a first class regexp. Instead, it's + caching a regexp onto an existing, Perl visible + scalar. */ + sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); + } + } + SAVEFREESV(re_sv); + re = ReANY(re_sv); + } + RXp_MATCH_COPIED_off(re); + re->subbeg = rex->subbeg; + re->sublen = rex->sublen; + re->suboffset = rex->suboffset; + re->subcoffset = rex->subcoffset; + re->lastparen = 0; + re->lastcloseparen = 0; + rei = RXi_GET(re); + DEBUG_EXECUTE_r( + debug_start_match(re_sv, utf8_target, locinput, + reginfo->strend, "Matching embedded"); + ); + startpoint = rei->program + 1; + ST.close_paren = 0; /* only used for GOSUB */ + /* Save all the seen positions so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + /* and set maxopenparen to 0, since we are starting a "fresh" match */ + maxopenparen = 0; + /* run the pattern returned from (??{...}) */ + + eval_recurse_doit: /* Share code with GOSUB below this line + * At this point we expect the stack context to be + * set up correctly */ + + /* invalidate the S-L poscache. We're now executing a + * different set of WHILEM ops (and their associated + * indexes) against the same string, so the bits in the + * cache are meaningless. Setting maxiter to zero forces + * the cache to be invalidated and zeroed before reuse. + * XXX This is too dramatic a measure. Ideally we should + * save the old cache and restore when running the outer + * pattern again */ + reginfo->poscache_maxiter = 0; + + /* the new regexp might have a different is_utf8_pat than we do */ + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); + + ST.prev_rex = rex_sv; + ST.prev_curlyx = cur_curlyx; + rex_sv = re_sv; + SET_reg_curpm(rex_sv); + rex = re; + rexi = rei; + cur_curlyx = NULL; + ST.B = next; + ST.prev_eval = cur_eval; + cur_eval = st; + /* now continue from first node in postoned RE */ + PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput); + /* NOTREACHED */ + assert(0); + } + + case EVAL_AB: /* cleanup after a successful (??{A})B */ + /* note: this is called twice; first after popping B, then A */ + rex_sv = ST.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); + SET_reg_curpm(rex_sv); + rex = ReANY(rex_sv); + rexi = RXi_GET(rex); + { + /* preserve $^R across LEAVE's. See Bug 121070. */ + SV *save_sv= GvSV(PL_replgv); + SvREFCNT_inc(save_sv); + regcpblow(ST.cp); /* LEAVE in disguise */ + sv_setsv(GvSV(PL_replgv), save_sv); + SvREFCNT_dec(save_sv); + } + cur_eval = ST.prev_eval; + cur_curlyx = ST.prev_curlyx; + + /* Invalidate cache. See "invalidate" comment above. */ + reginfo->poscache_maxiter = 0; + if ( nochange_depth ) + nochange_depth--; + sayYES; + + + case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ + /* note: this is called twice; first after popping B, then A */ + rex_sv = ST.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); + SET_reg_curpm(rex_sv); + rex = ReANY(rex_sv); + rexi = RXi_GET(rex); + + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); + cur_eval = ST.prev_eval; + cur_curlyx = ST.prev_curlyx; + /* Invalidate cache. See "invalidate" comment above. */ + reginfo->poscache_maxiter = 0; + if ( nochange_depth ) + nochange_depth--; + sayNO_SILENT; +#undef ST + + case OPEN: /* ( */ + n = ARG(scan); /* which paren pair */ + rex->offs[n].start_tmp = locinput - reginfo->strbeg; + if (n > maxopenparen) + maxopenparen = n; + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n", + PTR2UV(rex), + PTR2UV(rex->offs), + (UV)n, + (IV)rex->offs[n].start_tmp, + (UV)maxopenparen + )); + lastopen = n; + break; + +/* XXX really need to log other places start/end are set too */ +#define CLOSE_CAPTURE \ + rex->offs[n].start = rex->offs[n].start_tmp; \ + rex->offs[n].end = locinput - reginfo->strbeg; \ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \ + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \ + PTR2UV(rex), \ + PTR2UV(rex->offs), \ + (UV)n, \ + (IV)rex->offs[n].start, \ + (IV)rex->offs[n].end \ + )) + + case CLOSE: /* ) */ + n = ARG(scan); /* which paren pair */ + CLOSE_CAPTURE; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; + if (cur_eval && cur_eval->u.eval.close_paren == n) { + goto fake_end; + } + break; + + case ACCEPT: /* (*ACCEPT) */ + if (ARG(scan)){ + regnode *cursor; + for (cursor=scan; + cursor && OP(cursor)!=END; + cursor=regnext(cursor)) + { + if ( OP(cursor)==CLOSE ){ + n = ARG(cursor); + if ( n <= lastopen ) { + CLOSE_CAPTURE; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; + if ( n == ARG(scan) || (cur_eval && + cur_eval->u.eval.close_paren == n)) + break; + } + } + } + } + goto fake_end; + /* NOTREACHED */ + + case GROUPP: /* (?(1)) */ + n = ARG(scan); /* which paren pair */ + sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); + break; + + case NGROUPP: /* (?()) */ + /* reg_check_named_buff_matched returns 0 for no match */ + sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); + break; + + case INSUBP: /* (?(R)) */ + n = ARG(scan); + sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n)); + break; + + case DEFINEP: /* (?(DEFINE)) */ + sw = 0; + break; + + case IFTHEN: /* (?(cond)A|B) */ + reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ + if (sw) + next = NEXTOPER(NEXTOPER(scan)); + else { + next = scan + ARG(scan); + if (OP(next) == IFTHEN) /* Fake one. */ + next = NEXTOPER(NEXTOPER(next)); + } + break; + + case LOGICAL: /* modifier for EVAL and IFMATCH */ + logical = scan->flags; + break; + +/******************************************************************* + +The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/ +pattern, where A and B are subpatterns. (For simple A, CURLYM or +STAR/PLUS/CURLY/CURLYN are used instead.) + +A*B is compiled as + +On entry to the subpattern, CURLYX is called. This pushes a CURLYX +state, which contains the current count, initialised to -1. It also sets +cur_curlyx to point to this state, with any previous value saved in the +state block. + +CURLYX then jumps straight to the WHILEM op, rather than executing A, +since the pattern may possibly match zero times (i.e. it's a while {} loop +rather than a do {} while loop). + +Each entry to WHILEM represents a successful match of A. The count in the +CURLYX block is incremented, another WHILEM state is pushed, and execution +passes to A or B depending on greediness and the current count. + +For example, if matching against the string a1a2a3b (where the aN are +substrings that match /A/), then the match progresses as follows: (the +pushed states are interspersed with the bits of strings matched so far): + + + + a1 + a1 a2 + a1 a2 a3 + a1 a2 a3 b + +(Contrast this with something like CURLYM, which maintains only a single +backtrack state: + + a1 + a1 a2 + a1 a2 a3 + a1 a2 a3 b +) + +Each WHILEM state block marks a point to backtrack to upon partial failure +of A or B, and also contains some minor state data related to that +iteration. The CURLYX block, pointed to by cur_curlyx, contains the +overall state, such as the count, and pointers to the A and B ops. + +This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx +must always point to the *current* CURLYX block, the rules are: + +When executing CURLYX, save the old cur_curlyx in the CURLYX state block, +and set cur_curlyx to point the new block. + +When popping the CURLYX block after a successful or unsuccessful match, +restore the previous cur_curlyx. + +When WHILEM is about to execute B, save the current cur_curlyx, and set it +to the outer one saved in the CURLYX block. + +When popping the WHILEM block after a successful or unsuccessful B match, +restore the previous cur_curlyx. + +Here's an example for the pattern (AI* BI)*BO +I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: + +cur_ +curlyx backtrack stack +------ --------------- +NULL +CO +CI ai +CO ai bi +NULL ai bi bo + +At this point the pattern succeeds, and we work back down the stack to +clean up, restoring as we go: + +CO ai bi +CI ai +CO +NULL + +*******************************************************************/ + +#define ST st->u.curlyx + + case CURLYX: /* start of /A*B/ (for complex A) */ + { + /* No need to save/restore up to this paren */ + I32 parenfloor = scan->flags; + + assert(next); /* keep Coverity happy */ + if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ + next += ARG(next); + + /* XXXX Probably it is better to teach regpush to support + parenfloor > maxopenparen ... */ + if (parenfloor > (I32)rex->lastparen) + parenfloor = rex->lastparen; /* Pessimization... */ + + ST.prev_curlyx= cur_curlyx; + cur_curlyx = st; + ST.cp = PL_savestack_ix; + + /* these fields contain the state of the current curly. + * they are accessed by subsequent WHILEMs */ + ST.parenfloor = parenfloor; + ST.me = scan; + ST.B = next; + ST.minmod = minmod; + minmod = 0; + ST.count = -1; /* this will be updated by WHILEM */ + ST.lastloc = NULL; /* this will be updated by WHILEM */ + + PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput); + /* NOTREACHED */ + assert(0); + } + + case CURLYX_end: /* just finished matching all of A*B */ + cur_curlyx = ST.prev_curlyx; + sayYES; + /* NOTREACHED */ + assert(0); + + case CURLYX_end_fail: /* just failed to match all of A*B */ + regcpblow(ST.cp); + cur_curlyx = ST.prev_curlyx; + sayNO; + /* NOTREACHED */ + assert(0); + + +#undef ST +#define ST st->u.whilem + + case WHILEM: /* just matched an A in /A*B/ (for complex A) */ + { + /* see the discussion above about CURLYX/WHILEM */ + I32 n; + int min, max; + regnode *A; + + assert(cur_curlyx); /* keep Coverity happy */ + + min = ARG1(cur_curlyx->u.curlyx.me); + max = ARG2(cur_curlyx->u.curlyx.me); + A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS; + n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ + ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; + ST.cache_offset = 0; + ST.cache_mask = 0; + + + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%*s whilem: matched %ld out of %d..%d\n", + REPORT_CODE_OFF+depth*2, "", (long)n, min, max) + ); + + /* First just match a string of min A's. */ + + if (n < min) { + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, + maxopenparen); + cur_curlyx->u.curlyx.lastloc = locinput; + REGCP_SET(ST.lastcp); + + PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput); + /* NOTREACHED */ + assert(0); + } + + /* If degenerate A matches "", assume A done. */ + + if (locinput == cur_curlyx->u.curlyx.lastloc) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%*s whilem: empty match detected, trying continuation...\n", + REPORT_CODE_OFF+depth*2, "") + ); + goto do_whilem_B_max; + } + + /* super-linear cache processing. + * + * The idea here is that for certain types of CURLYX/WHILEM - + * principally those whose upper bound is infinity (and + * excluding regexes that have things like \1 and other very + * non-regular expresssiony things), then if a pattern like + * /....A*.../ fails and we backtrack to the WHILEM, then we + * make a note that this particular WHILEM op was at string + * position 47 (say) when the rest of pattern failed. Then, if + * we ever find ourselves back at that WHILEM, and at string + * position 47 again, we can just fail immediately rather than + * running the rest of the pattern again. + * + * This is very handy when patterns start to go + * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up + * with a combinatorial explosion of backtracking. + * + * The cache is implemented as a bit array, with one bit per + * string byte position per WHILEM op (up to 16) - so its + * between 0.25 and 2x the string size. + * + * To avoid allocating a poscache buffer every time, we do an + * initially countdown; only after we have executed a WHILEM + * op (string-length x #WHILEMs) times do we allocate the + * cache. + * + * The top 4 bits of scan->flags byte say how many different + * relevant CURLLYX/WHILEM op pairs there are, while the + * bottom 4-bits is the identifying index number of this + * WHILEM. + */ + + if (scan->flags) { + + if (!reginfo->poscache_maxiter) { + /* start the countdown: Postpone detection until we + * know the match is not *that* much linear. */ + reginfo->poscache_maxiter + = (reginfo->strend - reginfo->strbeg + 1) + * (scan->flags>>4); + /* possible overflow for long strings and many CURLYX's */ + if (reginfo->poscache_maxiter < 0) + reginfo->poscache_maxiter = I32_MAX; + reginfo->poscache_iter = reginfo->poscache_maxiter; + } + + if (reginfo->poscache_iter-- == 0) { + /* initialise cache */ + const SSize_t size = (reginfo->poscache_maxiter + 7)/8; + regmatch_info_aux *const aux = reginfo->info_aux; + if (aux->poscache) { + if ((SSize_t)reginfo->poscache_size < size) { + Renew(aux->poscache, size, char); + reginfo->poscache_size = size; + } + Zero(aux->poscache, size, char); + } + else { + reginfo->poscache_size = size; + Newxz(aux->poscache, size, char); + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%swhilem: Detected a super-linear match, switching on caching%s...\n", + PL_colors[4], PL_colors[5]) + ); + } + + if (reginfo->poscache_iter < 0) { + /* have we already failed at this position? */ + SSize_t offset, mask; + + reginfo->poscache_iter = -1; /* stop eventual underflow */ + offset = (scan->flags & 0xf) - 1 + + (locinput - reginfo->strbeg) + * (scan->flags>>4); + mask = 1 << (offset % 8); + offset /= 8; + if (reginfo->info_aux->poscache[offset] & mask) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%*s whilem: (cache) already tried at this position...\n", + REPORT_CODE_OFF+depth*2, "") + ); + sayNO; /* cache records failure */ + } + ST.cache_offset = offset; + ST.cache_mask = mask; + } + } + + /* Prefer B over A for minimal matching. */ + + if (cur_curlyx->u.curlyx.minmod) { + ST.save_curlyx = cur_curlyx; + cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; + ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor, + maxopenparen); + REGCP_SET(ST.lastcp); + PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, + locinput); + /* NOTREACHED */ + assert(0); + } + + /* Prefer A over B for maximal matching. */ + + if (n < max) { /* More greed allowed? */ + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, + maxopenparen); + cur_curlyx->u.curlyx.lastloc = locinput; + REGCP_SET(ST.lastcp); + PUSH_STATE_GOTO(WHILEM_A_max, A, locinput); + /* NOTREACHED */ + assert(0); + } + goto do_whilem_B_max; + } + /* NOTREACHED */ + assert(0); + + case WHILEM_B_min: /* just matched B in a minimal match */ + case WHILEM_B_max: /* just matched B in a maximal match */ + cur_curlyx = ST.save_curlyx; + sayYES; + /* NOTREACHED */ + assert(0); + + case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ + cur_curlyx = ST.save_curlyx; + cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; + cur_curlyx->u.curlyx.count--; + CACHEsayNO; + /* NOTREACHED */ + assert(0); + + case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ + /* FALLTHROUGH */ + case WHILEM_A_pre_fail: /* just failed to match even minimal A */ + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); + cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; + cur_curlyx->u.curlyx.count--; + CACHEsayNO; + /* NOTREACHED */ + assert(0); + + case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); /* Restore some previous $s? */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "%*s whilem: failed, trying continuation...\n", + REPORT_CODE_OFF+depth*2, "") + ); + do_whilem_B_max: + if (cur_curlyx->u.curlyx.count >= REG_INFTY + && ckWARN(WARN_REGEXP) + && !reginfo->warned) + { + reginfo->warned = TRUE; + Perl_warner(aTHX_ packWARN(WARN_REGEXP), + "Complex regular subexpression recursion limit (%d) " + "exceeded", + REG_INFTY - 1); + } + + /* now try B */ + ST.save_curlyx = cur_curlyx; + cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; + PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, + locinput); + /* NOTREACHED */ + assert(0); + + case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ + cur_curlyx = ST.save_curlyx; + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); + + if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) { + /* Maximum greed exceeded */ + if (cur_curlyx->u.curlyx.count >= REG_INFTY + && ckWARN(WARN_REGEXP) + && !reginfo->warned) + { + reginfo->warned = TRUE; + Perl_warner(aTHX_ packWARN(WARN_REGEXP), + "Complex regular subexpression recursion " + "limit (%d) exceeded", + REG_INFTY - 1); + } + cur_curlyx->u.curlyx.count--; + CACHEsayNO; + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "") + ); + /* Try grabbing another A and see if it helps. */ + cur_curlyx->u.curlyx.lastloc = locinput; + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, + maxopenparen); + REGCP_SET(ST.lastcp); + PUSH_STATE_GOTO(WHILEM_A_min, + /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, + locinput); + /* NOTREACHED */ + assert(0); + +#undef ST +#define ST st->u.branch + + case BRANCHJ: /* /(...|A|...)/ with long next pointer */ + next = scan + ARG(scan); + if (next == scan) + next = NULL; + scan = NEXTOPER(scan); + /* FALLTHROUGH */ + + case BRANCH: /* /(...|A|...)/ */ + scan = NEXTOPER(scan); /* scan now points to inner node */ + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + ST.next_branch = next; + REGCP_SET(ST.cp); + + /* Now go into the branch */ + if (has_cutgroup) { + PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput); + } else { + PUSH_STATE_GOTO(BRANCH_next, scan, locinput); + } + /* NOTREACHED */ + assert(0); + + case CUTGROUP: /* /(*THEN)/ */ + sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL : + MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + PUSH_STATE_GOTO(CUTGROUP_next, next, locinput); + /* NOTREACHED */ + assert(0); + + case CUTGROUP_next_fail: + do_cutgroup = 1; + no_final = 1; + if (st->u.mark.mark_name) + sv_commit = st->u.mark.mark_name; + sayNO; + /* NOTREACHED */ + assert(0); + + case BRANCH_next: + sayYES; + /* NOTREACHED */ + assert(0); + + case BRANCH_next_fail: /* that branch failed; try the next, if any */ + if (do_cutgroup) { + do_cutgroup = 0; + no_final = 0; + } + REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + scan = ST.next_branch; + /* no more branches? */ + if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sBRANCH failed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + PL_colors[5] ); + }); + sayNO_SILENT; + } + continue; /* execute next BRANCH[J] op */ + /* NOTREACHED */ + assert(0); + + case MINMOD: /* next op will be non-greedy, e.g. A*? */ + minmod = 1; + break; + +#undef ST +#define ST st->u.curlym + + case CURLYM: /* /A{m,n}B/ where A is fixed-length */ + + /* This is an optimisation of CURLYX that enables us to push + * only a single backtracking state, no matter how many matches + * there are in {m,n}. It relies on the pattern being constant + * length, with no parens to influence future backrefs + */ + + ST.me = scan; + scan = NEXTOPER(scan) + NODE_STEP_REGNODE; + + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + + /* if paren positive, emulate an OPEN/CLOSE around A */ + if (ST.me->flags) { + U32 paren = ST.me->flags; + if (paren > maxopenparen) + maxopenparen = paren; + scan += NEXT_OFF(scan); /* Skip former OPEN. */ + } + ST.A = scan; + ST.B = next; + ST.alen = 0; + ST.count = 0; + ST.minmod = minmod; + minmod = 0; + ST.c1 = CHRTEST_UNINIT; + REGCP_SET(ST.cp); + + if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */ + goto curlym_do_B; + + curlym_do_A: /* execute the A in /A{m,n}B/ */ + PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */ + /* NOTREACHED */ + assert(0); + + case CURLYM_A: /* we've just matched an A */ + ST.count++; + /* after first match, determine A's length: u.curlym.alen */ + if (ST.count == 1) { + if (reginfo->is_utf8_target) { + char *s = st->locinput; + while (s < locinput) { + ST.alen++; + s += UTF8SKIP(s); + } + } + else { + ST.alen = locinput - st->locinput; + } + if (ST.alen == 0) + ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me); + } + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n", + (int)(REPORT_CODE_OFF+(depth*2)), "", + (IV) ST.count, (IV)ST.alen) + ); + + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.me->flags) + goto fake_end; + + { + I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)); + if ( max == REG_INFTY || ST.count < max ) + goto curlym_do_A; /* try to match another A */ + } + goto curlym_do_B; /* try to match B */ + + case CURLYM_A_fail: /* just failed to match an A */ + REGCP_UNWIND(ST.cp); + + if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ + || (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.me->flags)) + sayNO; + + curlym_do_B: /* execute the B in /A{m,n}B/ */ + if (ST.c1 == CHRTEST_UNINIT) { + /* calculate c1 and c2 for possible match of 1st char + * following curly */ + ST.c1 = ST.c2 = CHRTEST_VOID; + assert(ST.B); + if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { + regnode *text_node = ST.B; + if (! HAS_TEXT(text_node)) + FIND_NEXT_IMPT(text_node); + /* this used to be + + (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT) + + But the former is redundant in light of the latter. + + if this changes back then the macro for + IS_TEXT and friends need to change. + */ + if (PL_regkind[OP(text_node)] == EXACT) { + if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ + text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, + reginfo)) + { + sayNO; + } + } + } + } + + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM trying tail with matches=%"IVdf"...\n", + (int)(REPORT_CODE_OFF+(depth*2)), + "", (IV)ST.count) + ); + if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) { + if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) { + if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) + && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + { + /* simulate B failing */ + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", + (int)(REPORT_CODE_OFF+(depth*2)),"", + valid_utf8_to_uvchr((U8 *) locinput, NULL), + valid_utf8_to_uvchr(ST.c1_utf8, NULL), + valid_utf8_to_uvchr(ST.c2_utf8, NULL)) + ); + state_num = CURLYM_B_fail; + goto reenter_switch; + } + } + else if (nextchr != ST.c1 && nextchr != ST.c2) { + /* simulate B failing */ + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", + (int)(REPORT_CODE_OFF+(depth*2)),"", + (int) nextchr, ST.c1, ST.c2) + ); + state_num = CURLYM_B_fail; + goto reenter_switch; + } + } + + if (ST.me->flags) { + /* emulate CLOSE: mark current A as captured */ + I32 paren = ST.me->flags; + if (ST.count) { + rex->offs[paren].start + = HOPc(locinput, -ST.alen) - reginfo->strbeg; + rex->offs[paren].end = locinput - reginfo->strbeg; + if ((U32)paren > rex->lastparen) + rex->lastparen = paren; + rex->lastcloseparen = paren; + } + else + rex->offs[paren].end = -1; + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.me->flags) + { + if (ST.count) + goto fake_end; + else + sayNO; + } + } + + PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */ + /* NOTREACHED */ + assert(0); + + case CURLYM_B_fail: /* just failed to match a B */ + REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + if (ST.minmod) { + I32 max = ARG2(ST.me); + if (max != REG_INFTY && ST.count == max) + sayNO; + goto curlym_do_A; /* try to match a further A */ + } + /* backtrack one A */ + if (ST.count == ARG1(ST.me) /* min */) + sayNO; + ST.count--; + SET_locinput(HOPc(locinput, -ST.alen)); + goto curlym_do_B; /* try to match B */ + +#undef ST +#define ST st->u.curly + +#define CURLY_SETPAREN(paren, success) \ + if (paren) { \ + if (success) { \ + rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \ + rex->offs[paren].end = locinput - reginfo->strbeg; \ + if (paren > rex->lastparen) \ + rex->lastparen = paren; \ + rex->lastcloseparen = paren; \ + } \ + else { \ + rex->offs[paren].end = -1; \ + rex->lastparen = ST.lastparen; \ + rex->lastcloseparen = ST.lastcloseparen; \ + } \ + } + + case STAR: /* /A*B/ where A is width 1 char */ + ST.paren = 0; + ST.min = 0; + ST.max = REG_INFTY; + scan = NEXTOPER(scan); + goto repeat; + + case PLUS: /* /A+B/ where A is width 1 char */ + ST.paren = 0; + ST.min = 1; + ST.max = REG_INFTY; + scan = NEXTOPER(scan); + goto repeat; + + case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ + ST.paren = scan->flags; /* Which paren to set */ + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + if (ST.paren > maxopenparen) + maxopenparen = ST.paren; + ST.min = ARG1(scan); /* min to match */ + ST.max = ARG2(scan); /* max to match */ + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + ST.min=1; + ST.max=1; + } + scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); + goto repeat; + + case CURLY: /* /A{m,n}B/ where A is width 1 char */ + ST.paren = 0; + ST.min = ARG1(scan); /* min to match */ + ST.max = ARG2(scan); /* max to match */ + scan = NEXTOPER(scan) + NODE_STEP_REGNODE; + repeat: + /* + * Lookahead to avoid useless match attempts + * when we know what character comes next. + * + * Used to only do .*x and .*?x, but now it allows + * for )'s, ('s and (?{ ... })'s to be in the way + * of the quantifier and the EXACT-like node. -- japhy + */ + + assert(ST.min <= ST.max); + if (! HAS_TEXT(next) && ! JUMPABLE(next)) { + ST.c1 = ST.c2 = CHRTEST_VOID; + } + else { + regnode *text_node = next; + + if (! HAS_TEXT(text_node)) + FIND_NEXT_IMPT(text_node); + + if (! HAS_TEXT(text_node)) + ST.c1 = ST.c2 = CHRTEST_VOID; + else { + if ( PL_regkind[OP(text_node)] != EXACT ) { + ST.c1 = ST.c2 = CHRTEST_VOID; + } + else { + + /* Currently we only get here when + + PL_rekind[OP(text_node)] == EXACT + + if this changes back then the macro for IS_TEXT and + friends need to change. */ + if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ + text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, + reginfo)) + { + sayNO; + } + } + } + } + + ST.A = scan; + ST.B = next; + if (minmod) { + char *li = locinput; + minmod = 0; + if (ST.min && + regrepeat(rex, &li, ST.A, reginfo, ST.min, depth) + < ST.min) + sayNO; + SET_locinput(li); + ST.count = ST.min; + REGCP_SET(ST.cp); + if (ST.c1 == CHRTEST_VOID) + goto curly_try_B_min; + + ST.oldloc = locinput; + + /* set ST.maxpos to the furthest point along the + * string that could possibly match */ + if (ST.max == REG_INFTY) { + ST.maxpos = reginfo->strend - 1; + if (utf8_target) + while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) + ST.maxpos--; + } + else if (utf8_target) { + int m = ST.max - ST.min; + for (ST.maxpos = locinput; + m >0 && ST.maxpos < reginfo->strend; m--) + ST.maxpos += UTF8SKIP(ST.maxpos); + } + else { + ST.maxpos = locinput + ST.max - ST.min; + if (ST.maxpos >= reginfo->strend) + ST.maxpos = reginfo->strend - 1; + } + goto curly_try_B_min_known; + + } + else { + /* avoid taking address of locinput, so it can remain + * a register var */ + char *li = locinput; + ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth); + if (ST.count < ST.min) + sayNO; + SET_locinput(li); + if ((ST.count > ST.min) + && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL)) + { + /* A{m,n} must come at the end of the string, there's + * no point in backing off ... */ + ST.min = ST.count; + /* ...except that $ and \Z can match before *and* after + newline at the end. Consider "\n\n" =~ /\n+\Z\n/. + We may back off by one in this case. */ + if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS) + ST.min--; + } + REGCP_SET(ST.cp); + goto curly_try_B_max; + } + /* NOTREACHED */ + assert(0); + + case CURLY_B_min_known_fail: + /* failed to find B in a non-greedy match where c1,c2 valid */ + + REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + /* Couldn't or didn't -- move forward. */ + ST.oldloc = locinput; + if (utf8_target) + locinput += UTF8SKIP(locinput); + else + locinput++; + ST.count++; + curly_try_B_min_known: + /* find the next place where 'B' could work, then call B */ + { + int n; + if (utf8_target) { + n = (ST.oldloc == locinput) ? 0 : 1; + if (ST.c1 == ST.c2) { + /* set n to utf8_distance(oldloc, locinput) */ + while (locinput <= ST.maxpos + && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))) + { + locinput += UTF8SKIP(locinput); + n++; + } + } + else { + /* set n to utf8_distance(oldloc, locinput) */ + while (locinput <= ST.maxpos + && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) + && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + { + locinput += UTF8SKIP(locinput); + n++; + } + } + } + else { /* Not utf8_target */ + if (ST.c1 == ST.c2) { + while (locinput <= ST.maxpos && + UCHARAT(locinput) != ST.c1) + locinput++; + } + else { + while (locinput <= ST.maxpos + && UCHARAT(locinput) != ST.c1 + && UCHARAT(locinput) != ST.c2) + locinput++; + } + n = locinput - ST.oldloc; + } + if (locinput > ST.maxpos) + sayNO; + if (n) { + /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is + * at b; check that everything between oldloc and + * locinput matches */ + char *li = ST.oldloc; + ST.count += n; + if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n) + sayNO; + assert(n == REG_INFTY || locinput == li); + } + CURLY_SETPAREN(ST.paren, ST.count); + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + goto fake_end; + } + PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput); + } + /* NOTREACHED */ + assert(0); + + case CURLY_B_min_fail: + /* failed to find B in a non-greedy match where c1,c2 invalid */ + + REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + /* failed -- move forward one */ + { + char *li = locinput; + if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) { + sayNO; + } + locinput = li; + } + { + ST.count++; + if (ST.count <= ST.max || (ST.max == REG_INFTY && + ST.count > 0)) /* count overflow ? */ + { + curly_try_B_min: + CURLY_SETPAREN(ST.paren, ST.count); + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + goto fake_end; + } + PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput); + } + } + sayNO; + /* NOTREACHED */ + assert(0); + + curly_try_B_max: + /* a successful greedy match: now try to match B */ + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + goto fake_end; + } + { + bool could_match = locinput < reginfo->strend; + + /* If it could work, try it. */ + if (ST.c1 != CHRTEST_VOID && could_match) { + if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) + { + could_match = memEQ(locinput, + ST.c1_utf8, + UTF8SKIP(locinput)) + || memEQ(locinput, + ST.c2_utf8, + UTF8SKIP(locinput)); + } + else { + could_match = UCHARAT(locinput) == ST.c1 + || UCHARAT(locinput) == ST.c2; + } + } + if (ST.c1 == CHRTEST_VOID || could_match) { + CURLY_SETPAREN(ST.paren, ST.count); + PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput); + /* NOTREACHED */ + assert(0); + } + } + /* FALLTHROUGH */ + + case CURLY_B_max_fail: + /* failed to find B in a greedy match */ + + REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + /* back up. */ + if (--ST.count < ST.min) + sayNO; + locinput = HOPc(locinput, -1); + goto curly_try_B_max; + +#undef ST + + case END: /* last op of main pattern */ + fake_end: + if (cur_eval) { + /* we've just finished A in /(??{A})B/; now continue with B */ + + st->u.eval.prev_rex = rex_sv; /* inner */ + + /* Save *all* the positions. */ + st->u.eval.cp = regcppush(rex, 0, maxopenparen); + rex_sv = cur_eval->u.eval.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); + SET_reg_curpm(rex_sv); + rex = ReANY(rex_sv); + rexi = RXi_GET(rex); + cur_curlyx = cur_eval->u.eval.prev_curlyx; + + REGCP_SET(st->u.eval.lastcp); + + /* Restore parens of the outer rex without popping the + * savestack */ + S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp, + &maxopenparen); + + st->u.eval.prev_eval = cur_eval; + cur_eval = cur_eval->u.eval.prev_eval; + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n", + REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval));); + if ( nochange_depth ) + nochange_depth--; + + PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B, + locinput); /* match B */ + } + + if (locinput < reginfo->till) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n", + PL_colors[4], + (long)(locinput - startpos), + (long)(reginfo->till - startpos), + PL_colors[5])); + + sayNO_SILENT; /* Cannot match: too short. */ + } + sayYES; /* Success! */ + + case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %ssubpattern success...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])); + sayYES; /* Success! */ + +#undef ST +#define ST st->u.ifmatch + + { + char *newstart; + + case SUSPEND: /* (?>A) */ + ST.wanted = 1; + newstart = locinput; + goto do_ifmatch; + + case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?flags) { + char * const s = HOPBACKc(locinput, scan->flags); + if (!s) { + /* trivial fail */ + if (logical) { + logical = 0; + sw = 1 - cBOOL(ST.wanted); + } + else if (ST.wanted) + sayNO; + next = scan + ARG(scan); + if (next == scan) + next = NULL; + break; + } + newstart = s; + } + else + newstart = locinput; + + do_ifmatch: + ST.me = scan; + ST.logical = logical; + logical = 0; /* XXX: reset state of logical once it has been saved into ST */ + + /* execute body of (?...A) */ + PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart); + /* NOTREACHED */ + assert(0); + } + + case IFMATCH_A_fail: /* body of (?...A) failed */ + ST.wanted = !ST.wanted; + /* FALLTHROUGH */ + + case IFMATCH_A: /* body of (?...A) succeeded */ + if (ST.logical) { + sw = cBOOL(ST.wanted); + } + else if (!ST.wanted) + sayNO; + + if (OP(ST.me) != SUSPEND) { + /* restore old position except for (?>...) */ + locinput = st->locinput; + } + scan = ST.me + ARG(ST.me); + if (scan == ST.me) + scan = NULL; + continue; /* execute B */ + +#undef ST + + case LONGJMP: /* alternative with many branches compiles to + * (BRANCHJ; EXACT ...; LONGJMP ) x N */ + next = scan + ARG(scan); + if (next == scan) + next = NULL; + break; + + case COMMIT: /* (*COMMIT) */ + reginfo->cutpoint = reginfo->strend; + /* FALLTHROUGH */ + + case PRUNE: /* (*PRUNE) */ + if (!scan->flags) + sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + PUSH_STATE_GOTO(COMMIT_next, next, locinput); + /* NOTREACHED */ + assert(0); + + case COMMIT_next_fail: + no_final = 1; + /* FALLTHROUGH */ + + case OPFAIL: /* (*FAIL) */ + sayNO; + /* NOTREACHED */ + assert(0); + +#define ST st->u.mark + case MARKPOINT: /* (*MARK:foo) */ + ST.prev_mark = mark_state; + ST.mark_name = sv_commit = sv_yes_mark + = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + mark_state = st; + ST.mark_loc = locinput; + PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput); + /* NOTREACHED */ + assert(0); + + case MARKPOINT_next: + mark_state = ST.prev_mark; + sayYES; + /* NOTREACHED */ + assert(0); + + case MARKPOINT_next_fail: + if (popmark && sv_eq(ST.mark_name,popmark)) + { + if (ST.mark_loc > startpoint) + reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); + popmark = NULL; /* we found our mark */ + sv_commit = ST.mark_name; + + DEBUG_EXECUTE_r({ + PerlIO_printf(Perl_debug_log, + "%*s %ssetting cutpoint to mark:%"SVf"...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], SVfARG(sv_commit), PL_colors[5]); + }); + } + mark_state = ST.prev_mark; + sv_yes_mark = mark_state ? + mark_state->u.mark.mark_name : NULL; + sayNO; + /* NOTREACHED */ + assert(0); + + case SKIP: /* (*SKIP) */ + if (scan->flags) { + /* (*SKIP) : if we fail we cut here*/ + ST.mark_name = NULL; + ST.mark_loc = locinput; + PUSH_STATE_GOTO(SKIP_next,next, locinput); + } else { + /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, + otherwise do nothing. Meaning we need to scan + */ + regmatch_state *cur = mark_state; + SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + + while (cur) { + if ( sv_eq( cur->u.mark.mark_name, + find ) ) + { + ST.mark_name = find; + PUSH_STATE_GOTO( SKIP_next, next, locinput); + } + cur = cur->u.mark.prev_mark; + } + } + /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ + break; + + case SKIP_next_fail: + if (ST.mark_name) { + /* (*CUT:NAME) - Set up to search for the name as we + collapse the stack*/ + popmark = ST.mark_name; + } else { + /* (*CUT) - No name, we cut here.*/ + if (ST.mark_loc > startpoint) + reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); + /* but we set sv_commit to latest mark_name if there + is one so they can test to see how things lead to this + cut */ + if (mark_state) + sv_commit=mark_state->u.mark.mark_name; + } + no_final = 1; + sayNO; + /* NOTREACHED */ + assert(0); +#undef ST + + case LNBREAK: /* \R */ + if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) { + locinput += n; + } else + sayNO; + break; + + default: + PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", + PTR2UV(scan), OP(scan)); + Perl_croak(aTHX_ "regexp memory corruption"); + + /* this is a point to jump to in order to increment + * locinput by one character */ + increment_locinput: + assert(!NEXTCHR_IS_EOS); + if (utf8_target) { + locinput += PL_utf8skip[nextchr]; + /* locinput is allowed to go 1 char off the end, but not 2+ */ + if (locinput > reginfo->strend) + sayNO; + } + else + locinput++; + break; + + } /* end switch */ + + /* switch break jumps here */ + scan = next; /* prepare to execute the next op and ... */ + continue; /* ... jump back to the top, reusing st */ + /* NOTREACHED */ + assert(0); + + push_yes_state: + /* push a state that backtracks on success */ + st->u.yes.prev_yes_state = yes_state; + yes_state = st; + /* FALLTHROUGH */ + push_state: + /* push a new regex state, then continue at scan */ + { + regmatch_state *newst; + + DEBUG_STACK_r({ + regmatch_state *cur = st; + regmatch_state *curyes = yes_state; + int curd = depth; + regmatch_slab *slab = PL_regmatch_slab; + for (;curd > -1;cur--,curd--) { + if (cur < SLAB_FIRST(slab)) { + slab = slab->prev; + cur = SLAB_LAST(slab); + } + PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n", + REPORT_CODE_OFF + 2 + depth * 2,"", + curd, PL_reg_name[cur->resume_state], + (curyes == cur) ? "yes" : "" + ); + if (curyes == cur) + curyes = cur->u.yes.prev_yes_state; + } + } else + DEBUG_STATE_pp("push") + ); + depth++; + st->locinput = locinput; + newst = st+1; + if (newst > SLAB_LAST(PL_regmatch_slab)) + newst = S_push_slab(aTHX); + PL_regmatch_state = newst; + + locinput = pushinput; + st = newst; + continue; + /* NOTREACHED */ + assert(0); + } + } + + /* + * We get here only if there's trouble -- normally "case END" is + * the terminating point. + */ + Perl_croak(aTHX_ "corrupted regexp pointers"); + /* NOTREACHED */ + sayNO; + +yes: + if (yes_state) { + /* we have successfully completed a subexpression, but we must now + * pop to the state marked by yes_state and continue from there */ + assert(st != yes_state); +#ifdef DEBUGGING + while (st != yes_state) { + st--; + if (st < SLAB_FIRST(PL_regmatch_slab)) { + PL_regmatch_slab = PL_regmatch_slab->prev; + st = SLAB_LAST(PL_regmatch_slab); + } + DEBUG_STATE_r({ + if (no_final) { + DEBUG_STATE_pp("pop (no final)"); + } else { + DEBUG_STATE_pp("pop (yes)"); + } + }); + depth--; + } +#else + while (yes_state < SLAB_FIRST(PL_regmatch_slab) + || yes_state > SLAB_LAST(PL_regmatch_slab)) + { + /* not in this slab, pop slab */ + depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1); + PL_regmatch_slab = PL_regmatch_slab->prev; + st = SLAB_LAST(PL_regmatch_slab); + } + depth -= (st - yes_state); +#endif + st = yes_state; + yes_state = st->u.yes.prev_yes_state; + PL_regmatch_state = st; + + if (no_final) + locinput= st->locinput; + state_num = st->resume_state + no_final; + goto reenter_switch; + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", + PL_colors[4], PL_colors[5])); + + if (reginfo->info_aux_eval) { + /* each successfully executed (?{...}) block does the equivalent of + * local $^R = do {...} + * When popping the save stack, all these locals would be undone; + * bypass this by setting the outermost saved $^R to the latest + * value */ + /* I dont know if this is needed or works properly now. + * see code related to PL_replgv elsewhere in this file. + * Yves + */ + if (oreplsv != GvSV(PL_replgv)) + sv_setsv(oreplsv, GvSV(PL_replgv)); + } + result = 1; + goto final_exit; + +no: + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], PL_colors[5]) + ); + +no_silent: + if (no_final) { + if (yes_state) { + goto yes; + } else { + goto final_exit; + } + } + if (depth) { + /* there's a previous state to backtrack to */ + st--; + if (st < SLAB_FIRST(PL_regmatch_slab)) { + PL_regmatch_slab = PL_regmatch_slab->prev; + st = SLAB_LAST(PL_regmatch_slab); + } + PL_regmatch_state = st; + locinput= st->locinput; + + DEBUG_STATE_pp("pop"); + depth--; + if (yes_state == st) + yes_state = st->u.yes.prev_yes_state; + + state_num = st->resume_state + 1; /* failure = success + 1 */ + goto reenter_switch; + } + result = 0; + + final_exit: + if (rex->intflags & PREGf_VERBARG_SEEN) { + SV *sv_err = get_sv("REGERROR", 1); + SV *sv_mrk = get_sv("REGMARK", 1); + if (result) { + sv_commit = &PL_sv_no; + if (!sv_yes_mark) + sv_yes_mark = &PL_sv_yes; + } else { + if (!sv_commit) + sv_commit = &PL_sv_yes; + sv_yes_mark = &PL_sv_no; + } + assert(sv_err); + assert(sv_mrk); + sv_setsv(sv_err, sv_commit); + sv_setsv(sv_mrk, sv_yes_mark); + } + + + if (last_pushed_cv) { + dSP; + POP_MULTICALL; + PERL_UNUSED_VAR(SP); + } + + assert(!result || locinput - reginfo->strbeg >= 0); + return result ? locinput - reginfo->strbeg : -1; +} + +/* + - regrepeat - repeatedly match something simple, report how many + * + * What 'simple' means is a node which can be the operand of a quantifier like + * '+', or {1,3} + * + * startposp - pointer a pointer to the start position. This is updated + * to point to the byte following the highest successful + * match. + * p - the regnode to be repeatedly matched against. + * reginfo - struct holding match state, such as strend + * max - maximum number of things to match. + * depth - (for debugging) backtracking depth. + */ +STATIC I32 +S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, + regmatch_info *const reginfo, I32 max, int depth) +{ + char *scan; /* Pointer to current position in target string */ + I32 c; + char *loceol = reginfo->strend; /* local version */ + I32 hardcount = 0; /* How many matches so far */ + bool utf8_target = reginfo->is_utf8_target; + int to_complement = 0; /* Invert the result? */ + UV utf8_flags; + _char_class_number classnum; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + PERL_ARGS_ASSERT_REGREPEAT; + + scan = *startposp; + if (max == REG_INFTY) + max = I32_MAX; + else if (! utf8_target && loceol - scan > max) + loceol = scan + max; + + /* Here, for the case of a non-UTF-8 target we have adjusted down + * to the maximum of how far we should go in it (leaving it set to the real + * end, if the maximum permissible would take us beyond that). This allows + * us to make the loop exit condition that we haven't gone past to + * also mean that we haven't exceeded the max permissible count, saving a + * test each time through the loop. But it assumes that the OP matches a + * single byte, which is true for most of the OPs below when applied to a + * non-UTF-8 target. Those relatively few OPs that don't have this + * characteristic will have to compensate. + * + * There is no adjustment for UTF-8 targets, as the number of bytes per + * character varies. OPs will have to test both that the count is less + * than the max permissible (using to keep track), and that we + * are still within the bounds of the string (using . A few OPs + * match a single byte no matter what the encoding. They can omit the max + * test if, for the UTF-8 case, they do the adjustment that was skipped + * above. + * + * Thus, the code above sets things up for the common case; and exceptional + * cases need extra work; the common case is to make sure doesn't + * go past , and for UTF-8 to also use to make sure the + * count doesn't exceed the maximum permissible */ + + switch (OP(p)) { + case REG_ANY: + if (utf8_target) { + while (scan < loceol && hardcount < max && *scan != '\n') { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && *scan != '\n') + scan++; + } + break; + case SANY: + if (utf8_target) { + while (scan < loceol && hardcount < max) { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else + scan = loceol; + break; + case CANY: /* Move forward bytes, unless goes off end */ + if (utf8_target && loceol - scan > max) { + + /* hadn't been adjusted in the UTF-8 case */ + scan += max; + } + else { + scan = loceol; + } + break; + case EXACT: + assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + + c = (U8)*STRING(p); + + /* Can use a simple loop if the pattern char to match on is invariant + * under UTF-8, or both target and pattern aren't UTF-8. Note that we + * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's + * true iff it doesn't matter if the argument is in UTF-8 or not */ + if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) { + if (utf8_target && loceol - scan > max) { + /* We didn't adjust because is UTF-8, but ok to do so, + * since here, to match at all, 1 char == 1 byte */ + loceol = scan + max; + } + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } + else if (reginfo->is_utf8_pat) { + if (utf8_target) { + STRLEN scan_char_len; + + /* When both target and pattern are UTF-8, we have to do + * string EQ */ + while (hardcount < max + && scan < loceol + && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p) + && memEQ(scan, STRING(p), scan_char_len)) + { + scan += scan_char_len; + hardcount++; + } + } + else if (! UTF8_IS_ABOVE_LATIN1(c)) { + + /* Target isn't utf8; convert the character in the UTF-8 + * pattern to non-UTF8, and do a simple loop */ + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } /* else pattern char is above Latin1, can't possibly match the + non-UTF-8 target */ + } + else { + + /* Here, the string must be utf8; pattern isn't, and is + * different in utf8 than not, so can't compare them directly. + * Outside the loop, find the two utf8 bytes that represent c, and + * then look for those in sequence in the utf8 string */ + U8 high = UTF8_TWO_BYTE_HI(c); + U8 low = UTF8_TWO_BYTE_LO(c); + + while (hardcount < max + && scan + 1 < loceol + && UCHARAT(scan) == high + && UCHARAT(scan + 1) == low) + { + scan += 2; + hardcount++; + } + } + break; + + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf; + + case EXACTFL: + utf8_flags = FOLDEQ_LOCALE; + goto do_exactf; + + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + utf8_flags = 0; + goto do_exactf; + + case EXACTFU_SS: + case EXACTFU: + utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; + + do_exactf: { + int c1, c2; + U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; + + assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + + if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8, + reginfo)) + { + if (c1 == CHRTEST_VOID) { + /* Use full Unicode fold matching */ + char *tmpeol = reginfo->strend; + STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1; + while (hardcount < max + && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, + STRING(p), NULL, pat_len, + reginfo->is_utf8_pat, utf8_flags)) + { + scan = tmpeol; + tmpeol = reginfo->strend; + hardcount++; + } + } + else if (utf8_target) { + if (c1 == c2) { + while (scan < loceol + && hardcount < max + && memEQ(scan, c1_utf8, UTF8SKIP(scan))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + while (scan < loceol + && hardcount < max + && (memEQ(scan, c1_utf8, UTF8SKIP(scan)) + || memEQ(scan, c2_utf8, UTF8SKIP(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + } + else if (c1 == c2) { + while (scan < loceol && UCHARAT(scan) == c1) { + scan++; + } + } + else { + while (scan < loceol && + (UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) + { + scan++; + } + } + } + break; + } + case ANYOF: + if (utf8_target) { + while (hardcount < max + && scan < loceol + && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && REGINCLASS(prog, p, (U8*)scan)) + scan++; + } + break; + + /* The argument (FLAGS) to all the POSIX node types is the class number */ + + case NPOSIXL: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: + if (! utf8_target) { + while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), + *scan))) + { + scan++; + } + } else { + while (hardcount < max && scan < loceol + && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p), + (U8 *) scan))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case POSIXD: + if (utf8_target) { + goto utf8_posix; + } + /* FALLTHROUGH */ + + case POSIXA: + if (utf8_target && loceol - scan > max) { + + /* We didn't adjust at the beginning of this routine + * because is UTF-8, but it is actually ok to do so, since here, to + * match, 1 char == 1 byte. */ + loceol = scan + max; + } + while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + break; + + case NPOSIXD: + if (utf8_target) { + to_complement = 1; + goto utf8_posix; + } + /* FALLTHROUGH */ + + case NPOSIXA: + if (! utf8_target) { + while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + } + else { + + /* The complement of something that matches only ASCII matches all + * non-ASCII, plus everything in ASCII that isn't in the class. */ + while (hardcount < max && scan < loceol + && (! isASCII_utf8(scan) + || ! _generic_isCC_A((U8) *scan, FLAGS(p)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case NPOSIXU: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: + if (! utf8_target) { + while (scan < loceol && to_complement + ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p)))) + { + scan++; + } + } + else { + utf8_posix: + classnum = (_char_class_number) FLAGS(p); + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Here, a swash is needed for above-Latin1 code points. + * Process as many Latin1 code points using the built-in rules. + * Go to another loop to finish processing upon encountering + * the first Latin1 code point. We could do that in this loop + * as well, but the other way saves having to test if the swash + * has been loaded every time through the loop: extra space to + * save a test. */ + while (hardcount < max && scan < loceol) { + if (UTF8_IS_INVARIANT(*scan)) { + if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan, + classnum)))) + { + break; + } + scan++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { + if (! (to_complement + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), + classnum)))) + { + break; + } + scan += 2; + } + else { + goto found_above_latin1; + } + + hardcount++; + } + } + else { + /* For these character classes, the knowledge of how to handle + * every code point is compiled in to Perl via a macro. This + * code is written for making the loops as tight as possible. + * It could be refactored to save space instead */ + switch (classnum) { + case _CC_ENUM_SPACE: /* XXX would require separate code + if we revert the change of \v + matching this */ + /* FALLTHROUGH */ + case _CC_ENUM_PSXSPC: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isSPACE_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_BLANK: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isBLANK_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_XDIGIT: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isXDIGIT_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_VERTSPACE: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isVERTWS_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_CNTRL: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isCNTRL_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + default: + Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum); + } + } + } + break; + + found_above_latin1: /* Continuation of POSIXU and NPOSIXU */ + + /* Load the swash if not already present */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = _core_swash_init( + "utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + + while (hardcount < max && scan < loceol + && to_complement ^ cBOOL(_generic_utf8( + classnum, + scan, + swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) scan, + TRUE)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + + case LNBREAK: + if (utf8_target) { + while (hardcount < max && scan < loceol && + (c=is_LNBREAK_utf8_safe(scan, loceol))) { + scan += c; + hardcount++; + } + } else { + /* LNBREAK can match one or two latin chars, which is ok, but we + * have to use hardcount in this situation, and throw away the + * adjustment to done before the switch statement */ + loceol = reginfo->strend; + while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) { + scan+=c; + hardcount++; + } + } + break; + + case BOUND: + case BOUNDA: + case BOUNDL: + case BOUNDU: + case EOS: + case GPOS: + case KEEPS: + case NBOUND: + case NBOUNDA: + case NBOUNDL: + case NBOUNDU: + case OPFAIL: + case SBOL: + case SEOL: + /* These are all 0 width, so match right here or not at all. */ + break; + + default: + Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]); + /* NOTREACHED */ + assert(0); + + } + + if (hardcount) + c = hardcount; + else + c = scan - *startposp; + *startposp = scan; + + DEBUG_r({ + GET_RE_DEBUG_FLAGS_DECL; + DEBUG_EXECUTE_r({ + SV * const prop = sv_newmortal(); + regprop(prog, prop, p, reginfo); + PerlIO_printf(Perl_debug_log, + "%*s %s can match %"IVdf" times out of %"IVdf"...\n", + REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); + }); + }); + + return(c); +} + + +#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) +/* +- regclass_swash - prepare the utf8 swash. Wraps the shared core version to +create a copy so that changes the caller makes won't change the shared one. +If is non-null, will return NULL in it, for back-compat. + */ +SV * +Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp) +{ + PERL_ARGS_ASSERT_REGCLASS_SWASH; + + if (altsvp) { + *altsvp = NULL; + } + + return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL)); +} + +SV * +Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, + const regnode* node, + bool doinit, + SV** listsvp, + SV** only_utf8_locale_ptr) +{ + /* For internal core use only. + * Returns the swash for the input 'node' in the regex 'prog'. + * If is 'true', will attempt to create the swash if not already + * done. + * If is non-null, will return the printable contents of the + * swash. This can be used to get debugging information even before the + * swash exists, by calling this function with 'doinit' set to false, in + * which case the components that will be used to eventually create the + * swash are returned (in a printable form). + * Tied intimately to how regcomp.c sets up the data structure */ + + SV *sw = NULL; + SV *si = NULL; /* Input swash initialization string */ + SV* invlist = NULL; + + RXi_GET_DECL(prog,progi); + const struct reg_data * const data = prog ? progi->data : NULL; + + PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; + + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + if (data && data->count) { + const U32 n = ARG(node); + + if (data->what[n] == 's') { + SV * const rv = MUTABLE_SV(data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + + si = *ary; /* ary[0] = the string to initialize the swash with */ + + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if + * that inversion list has any user-defined properties in it. */ + if (av_tindex(av) >= 2) { + if (only_utf8_locale_ptr + && ary[2] + && ary[2] != &PL_sv_undef) + { + *only_utf8_locale_ptr = ary[2]; + } + else { + assert(only_utf8_locale_ptr); + *only_utf8_locale_ptr = NULL; + } + + if (av_tindex(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } + } + else { + invlist = NULL; + } + } + + /* Element [1] is reserved for the set-up swash. If already there, + * return it; if not, create it and store it there */ + if (ary[1] && SvROK(ary[1])) { + sw = ary[1]; + } + else if (doinit && ((si && si != &PL_sv_undef) + || (invlist && invlist != &PL_sv_undef))) { + assert(si); + sw = _core_swash_init("utf8", /* the utf8 package */ + "", /* nameless */ + si, + 1, /* binary */ + 0, /* not from tr/// */ + invlist, + &swash_init_flags); + (void)av_store(av, 1, sw); + } + } + } + + /* If requested, return a printable version of what this swash matches */ + if (listsvp) { + SV* matches_string = newSVpvs(""); + + /* The swash should be used, if possible, to get the data, as it + * contains the resolved data. But this function can be called at + * compile-time, before everything gets resolved, in which case we + * return the currently best available information, which is the string + * that will eventually be used to do that resolving, 'si' */ + if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) + && (si && si != &PL_sv_undef)) + { + sv_catsv(matches_string, si); + } + + /* Add the inversion list to whatever we have. This may have come from + * the swash, or from an input parameter */ + if (invlist) { + sv_catsv(matches_string, _invlist_contents(invlist)); + } + *listsvp = matches_string; + } + + return sw; +} +#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ + +/* + - reginclass - determine if a character falls into a character class + + n is the ANYOF regnode + p is the target string + p_end points to one byte beyond the end of the target string + utf8_target tells whether p is in UTF-8. + + Returns true if matched; false otherwise. + + Note that this can be a synthetic start class, a combination of various + nodes, so things you think might be mutually exclusive, such as locale, + aren't. It can match both locale and non-locale + + */ + +STATIC bool +S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) +{ + dVAR; + const char flags = ANYOF_FLAGS(n); + bool match = FALSE; + UV c = *p; + + PERL_ARGS_ASSERT_REGINCLASS; + + /* If c is not already the code point, get it. Note that + * UTF8_IS_INVARIANT() works even if not in UTF-8 */ + if (! UTF8_IS_INVARIANT(c) && utf8_target) { + STRLEN c_len = 0; + c = utf8n_to_uvchr(p, p_end - p, &c_len, + (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) + | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); + /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for + * UTF8_ALLOW_FFFF */ + if (c_len == (STRLEN)-1) + Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); + } + + /* If this character is potentially in the bitmap, check it */ + if (c < 256) { + if (ANYOF_BITMAP_TEST(n, c)) + match = TRUE; + else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL + && ! utf8_target + && ! isASCII(c)) + { + match = TRUE; + } + else if (flags & ANYOF_LOCALE_FLAGS) { + if (flags & ANYOF_LOC_FOLD) { + if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { + match = TRUE; + } + } + if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { + + /* The data structure is arranged so bits 0, 2, 4, ... are set + * if the class includes the Posix character class given by + * bit/2; and 1, 3, 5, ... are set if the class includes the + * complemented Posix class given by int(bit/2). So we loop + * through the bits, each time changing whether we complement + * the result or not. Suppose for the sake of illustration + * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0 + * is set, it means there is a match for this ANYOF node if the + * character is in the class given by the expression (0 / 2 = 0 + * = \w). If it is in that class, isFOO_lc() will return 1, + * and since 'to_complement' is 0, the result will stay TRUE, + * and we exit the loop. Suppose instead that bit 0 is 0, but + * bit 1 is 1. That means there is a match if the character + * matches \W. We won't bother to call isFOO_lc() on bit 0, + * but will on bit 1. On the second iteration 'to_complement' + * will be 1, so the exclusive or will reverse things, so we + * are testing for \W. On the third iteration, 'to_complement' + * will be 0, and we would be testing for \s; the fourth + * iteration would test for \S, etc. + * + * Note that this code assumes that all the classes are closed + * under folding. For example, if a character matches \w, then + * its fold does too; and vice versa. This should be true for + * any well-behaved locale for all the currently defined Posix + * classes, except for :lower: and :upper:, which are handled + * by the pseudo-class :cased: which matches if either of the + * other two does. To get rid of this assumption, an outer + * loop could be used below to iterate over both the source + * character, and its fold (if different) */ + + int count = 0; + int to_complement = 0; + + while (count < ANYOF_MAX) { + if (ANYOF_POSIXL_TEST(n, count) + && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) + { + match = TRUE; + break; + } + count++; + to_complement ^= 1; + } + } + } + } + + + /* If the bitmap didn't (or couldn't) match, and something outside the + * bitmap could match, try that. */ + if (!match) { + if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { + match = TRUE; /* Everything above 255 matches */ + } + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && (flags & ANYOF_UTF8)) + || ((flags & ANYOF_LOC_FOLD) + && IN_UTF8_CTYPE_LOCALE + && ARG(n) != ANYOF_NONBITMAP_EMPTY)) + { + SV* only_utf8_locale = NULL; + SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale); + if (sw) { + U8 utf8_buffer[2]; + U8 * utf8_p; + if (utf8_target) { + utf8_p = (U8 *) p; + } else { /* Convert to utf8 */ + utf8_p = utf8_buffer; + append_utf8_from_native_byte(*p, &utf8_p); + utf8_p = utf8_buffer; + } + + if (swash_fetch(sw, utf8_p, TRUE)) { + match = TRUE; + } + } + if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { + match = _invlist_contains_cp(only_utf8_locale, c); + } + } + + if (UNICODE_IS_SUPER(c) + && (flags & ANYOF_WARN_SUPER) + && ckWARN_d(WARN_NON_UNICODE)) + { + Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), + "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c); + } + } + +#if ANYOF_INVERT != 1 + /* Depending on compiler optimization cBOOL takes time, so if don't have to + * use it, don't */ +# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, +#endif + + /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ + return (flags & ANYOF_INVERT) ^ match; +} + +STATIC U8 * +S_reghop3(U8 *s, SSize_t off, const U8* lim) +{ + /* return the position 'off' UTF-8 characters away from 's', forward if + * 'off' >= 0, backwards if negative. But don't go outside of position + * 'lim', which better be < s if off < 0 */ + + PERL_ARGS_ASSERT_REGHOP3; + + if (off >= 0) { + while (off-- && s < lim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + } + else { + while (off++ && s > lim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > lim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + } + return s; +} + +STATIC U8 * +S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) +{ + PERL_ARGS_ASSERT_REGHOP4; + + if (off >= 0) { + while (off-- && s < rlim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + } + else { + while (off++ && s > llim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > llim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + } + return s; +} + +/* like reghop3, but returns NULL on overrun, rather than returning last + * char pos */ + +STATIC U8 * +S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) +{ + PERL_ARGS_ASSERT_REGHOPMAYBE3; + + if (off >= 0) { + while (off-- && s < lim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + if (off >= 0) + return NULL; + } + else { + while (off++ && s > lim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > lim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + if (off <= 0) + return NULL; + } + return s; +} + + +/* when executing a regex that may have (?{}), extra stuff needs setting + up that will be visible to the called code, even before the current + match has finished. In particular: + + * $_ is localised to the SV currently being matched; + * pos($_) is created if necessary, ready to be updated on each call-out + to code; + * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm + isn't set until the current pattern is successfully finished), so that + $1 etc of the match-so-far can be seen; + * save the old values of subbeg etc of the current regex, and set then + to the current string (again, this is normally only done at the end + of execution) +*/ + +static void +S_setup_eval_state(pTHX_ regmatch_info *const reginfo) +{ + MAGIC *mg; + regexp *const rex = ReANY(reginfo->prog); + regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; + + eval_state->rex = rex; + + if (reginfo->sv) { + /* Make $_ available to executed code. */ + if (reginfo->sv != DEFSV) { + SAVE_DEFSV; + DEFSV_set(reginfo->sv); + } + + if (!(mg = mg_find_mglob(reginfo->sv))) { + /* prepare for quick setting of pos */ + mg = sv_magicext_mglob(reginfo->sv); + mg->mg_len = -1; + } + eval_state->pos_magic = mg; + eval_state->pos = mg->mg_len; + eval_state->pos_flags = mg->mg_flags; + } + else + eval_state->pos_magic = NULL; + + if (!PL_reg_curpm) { + /* PL_reg_curpm is a fake PMOP that we can attach the current + * regex to and point PL_curpm at, so that $1 et al are visible + * within a /(?{})/. It's just allocated once per interpreter the + * first time its needed */ + Newxz(PL_reg_curpm, 1, PMOP); +#ifdef USE_ITHREADS + { + SV* const repointer = &PL_sv_undef; + /* this regexp is also owned by the new PL_reg_curpm, which + will try to free it. */ + av_push(PL_regex_padav, repointer); + PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); + PL_regex_pad = AvARRAY(PL_regex_padav); + } +#endif + } + SET_reg_curpm(reginfo->prog); + eval_state->curpm = PL_curpm; + PL_curpm = PL_reg_curpm; + if (RXp_MATCH_COPIED(rex)) { + /* Here is a serious problem: we cannot rewrite subbeg, + since it may be needed if this match fails. Thus + $` inside (?{}) could fail... */ + eval_state->subbeg = rex->subbeg; + eval_state->sublen = rex->sublen; + eval_state->suboffset = rex->suboffset; + eval_state->subcoffset = rex->subcoffset; +#ifdef PERL_ANY_COW + eval_state->saved_copy = rex->saved_copy; +#endif + RXp_MATCH_COPIED_off(rex); + } + else + eval_state->subbeg = NULL; + rex->subbeg = (char *)reginfo->strbeg; + rex->suboffset = 0; + rex->subcoffset = 0; + rex->sublen = reginfo->strend - reginfo->strbeg; +} + + +/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */ + +static void +S_cleanup_regmatch_info_aux(pTHX_ void *arg) +{ + regmatch_info_aux *aux = (regmatch_info_aux *) arg; + regmatch_info_aux_eval *eval_state = aux->info_aux_eval; + regmatch_slab *s; + + Safefree(aux->poscache); + + if (eval_state) { + + /* undo the effects of S_setup_eval_state() */ + + if (eval_state->subbeg) { + regexp * const rex = eval_state->rex; + rex->subbeg = eval_state->subbeg; + rex->sublen = eval_state->sublen; + rex->suboffset = eval_state->suboffset; + rex->subcoffset = eval_state->subcoffset; +#ifdef PERL_ANY_COW + rex->saved_copy = eval_state->saved_copy; +#endif + RXp_MATCH_COPIED_on(rex); + } + if (eval_state->pos_magic) + { + eval_state->pos_magic->mg_len = eval_state->pos; + eval_state->pos_magic->mg_flags = + (eval_state->pos_magic->mg_flags & ~MGf_BYTES) + | (eval_state->pos_flags & MGf_BYTES); + } + + PL_curpm = eval_state->curpm; + } + + PL_regmatch_state = aux->old_regmatch_state; + PL_regmatch_slab = aux->old_regmatch_slab; + + /* free all slabs above current one - this must be the last action + * of this function, as aux and eval_state are allocated within + * slabs and may be freed here */ + + s = PL_regmatch_slab->next; + if (s) { + PL_regmatch_slab->next = NULL; + while (s) { + regmatch_slab * const osl = s; + s = s->next; + Safefree(osl); + } + } +} + + +STATIC void +S_to_utf8_substr(pTHX_ regexp *prog) +{ + /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile + * on the converted value */ + + int i = 1; + + PERL_ARGS_ASSERT_TO_UTF8_SUBSTR; + + do { + if (prog->substrs->data[i].substr + && !prog->substrs->data[i].utf8_substr) { + SV* const sv = newSVsv(prog->substrs->data[i].substr); + prog->substrs->data[i].utf8_substr = sv; + sv_utf8_upgrade(sv); + if (SvVALID(prog->substrs->data[i].substr)) { + if (SvTAIL(prog->substrs->data[i].substr)) { + /* Trim the trailing \n that fbm_compile added last + time. */ + SvCUR_set(sv, SvCUR(sv) - 1); + /* Whilst this makes the SV technically "invalid" (as its + buffer is no longer followed by "\0") when fbm_compile() + adds the "\n" back, a "\0" is restored. */ + fbm_compile(sv, FBMcf_TAIL); + } else + fbm_compile(sv, 0); + } + if (prog->substrs->data[i].substr == prog->check_substr) + prog->check_utf8 = sv; + } + } while (i--); +} + +STATIC bool +S_to_byte_substr(pTHX_ regexp *prog) +{ + /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile + * on the converted value; returns FALSE if can't be converted. */ + + int i = 1; + + PERL_ARGS_ASSERT_TO_BYTE_SUBSTR; + + do { + if (prog->substrs->data[i].utf8_substr + && !prog->substrs->data[i].substr) { + SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); + if (! sv_utf8_downgrade(sv, TRUE)) { + return FALSE; + } + if (SvVALID(prog->substrs->data[i].utf8_substr)) { + if (SvTAIL(prog->substrs->data[i].utf8_substr)) { + /* Trim the trailing \n that fbm_compile added last + time. */ + SvCUR_set(sv, SvCUR(sv) - 1); + fbm_compile(sv, FBMcf_TAIL); + } else + fbm_compile(sv, 0); + } + prog->substrs->data[i].substr = sv; + if (prog->substrs->data[i].utf8_substr == prog->check_utf8) + prog->check_substr = sv; + } + } while (i--); + + return TRUE; +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * ex: set ts=8 sts=4 sw=4 et: + */ diff --git a/src/5021003/regcomp.c b/src/5021003/regcomp.c new file mode 100644 index 0000000..14735db --- /dev/null +++ b/src/5021003/regcomp.c @@ -0,0 +1,16895 @@ +/* regcomp.c + */ + +/* + * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee + * + * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] + */ + +/* This file contains functions for compiling a regular expression. See + * also regexec.c which funnily enough, contains functions for executing + * a regular expression. + * + * This file is also copied at build time to ext/re/re_comp.c, where + * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. + * This causes the main functions to be compiled under new names and with + * debugging support added, which makes "use re 'debug'" work. + */ + +/* NOTE: this is derived from Henry Spencer's regexp code, and should not + * confused with the original package (see point 3 below). Thanks, Henry! + */ + +/* Additional note: this code is very heavily munged from Henry's version + * in places. In some spots I've traded clarity for efficiency, so don't + * blame Henry for some of the lack of readability. + */ + +/* The names of the functions have been changed from regcomp and + * regexec to pregcomp and pregexec in order to avoid conflicts + * with the POSIX routines of the same names. +*/ + +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +/* + * pregcomp and pregexec -- regsub and regerror are not used in perl + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. + * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * + * + **** Alterations to Henry's code are... + **** + **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + **** by Larry Wall and others + **** + **** You may distribute under the terms of either the GNU General Public + **** License or the Artistic License, as specified in the README file. + + * + * Beware that some of this code is subtly aware of the way operator + * precedence is structured in regular expressions. Serious changes in + * regular-expression syntax might require a total rethink. + */ +#include "EXTERN.h" +#define PERL_IN_REGCOMP_C +#include "perl.h" + +#ifndef PERL_IN_XSUB_RE +#include "re_defs.h" +#endif + +#define REG_COMP_C +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +EXTERN_C const struct regexp_engine my_reg_engine; +#else +# include "regcomp.h" +#endif + +#include "dquote_static.c" +#include "charclass_invlists.h" +#include "inline_invlist.c" +#include "unicode_constants.h" + +#define HAS_NONLATIN1_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) +#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) + +#ifndef STATIC +#define STATIC static +#endif + + +struct RExC_state_t { + U32 flags; /* RXf_* are we folding, multilining? */ + U32 pm_flags; /* PMf_* stuff from the calling PMOP */ + char *precomp; /* uncompiled string. */ + REGEXP *rx_sv; /* The SV that is the regexp. */ + regexp *rx; /* perl core regexp structure */ + regexp_internal *rxi; /* internal data for regexp object + pprivate field */ + char *start; /* Start of input for compile */ + char *end; /* End of input for compile */ + char *parse; /* Input-scan pointer. */ + SSize_t whilem_seen; /* number of WHILEM in this expr */ + regnode *emit_start; /* Start of emitted-code area */ + regnode *emit_bound; /* First regnode outside of the + allocated space */ + regnode *emit; /* Code-emit pointer; if = &emit_dummy, + implies compiling, so don't emit */ + regnode_ssc emit_dummy; /* placeholder for emit to point to; + large enough for the largest + non-EXACTish node, so can use it as + scratch in pass1 */ + I32 naughty; /* How bad is this pattern? */ + I32 sawback; /* Did we see \1, ...? */ + U32 seen; + SSize_t size; /* Code size. */ + I32 npar; /* Capture buffer count, (OPEN) plus + one. ("par" 0 is the whole + pattern)*/ + I32 nestroot; /* root parens we are in - used by + accept */ + I32 extralen; + I32 seen_zerolen; + regnode **open_parens; /* pointers to open parens */ + regnode **close_parens; /* pointers to close parens */ + regnode *opend; /* END node in program */ + I32 utf8; /* whether the pattern is utf8 or not */ + I32 orig_utf8; /* whether the pattern was originally in utf8 */ + /* XXX use this for future optimisation of case + * where pattern must be upgraded to utf8. */ + I32 uni_semantics; /* If a d charset modifier should use unicode + rules, even if the pattern is not in + utf8 */ + HV *paren_names; /* Paren names */ + + regnode **recurse; /* Recurse regops */ + I32 recurse_count; /* Number of recurse regops */ + U8 *study_chunk_recursed; /* bitmap of which parens we have moved + through */ + U32 study_chunk_recursed_bytes; /* bytes in bitmap */ + I32 in_lookbehind; + I32 contains_locale; + I32 contains_i; + I32 override_recoding; + I32 in_multi_char_class; + struct reg_code_block *code_blocks; /* positions of literal (?{}) + within pattern */ + int num_code_blocks; /* size of code_blocks[] */ + int code_index; /* next code_blocks[] slot */ + SSize_t maxlen; /* mininum possible number of chars in string to match */ +#ifdef ADD_TO_REGEXEC + char *starttry; /* -Dr: where regtry was called. */ +#define RExC_starttry (pRExC_state->starttry) +#endif + SV *runtime_code_qr; /* qr with the runtime code blocks */ +#ifdef DEBUGGING + const char *lastparse; + I32 lastnum; + AV *paren_name_list; /* idx -> name */ +#define RExC_lastparse (pRExC_state->lastparse) +#define RExC_lastnum (pRExC_state->lastnum) +#define RExC_paren_name_list (pRExC_state->paren_name_list) +#endif +}; + +#define RExC_flags (pRExC_state->flags) +#define RExC_pm_flags (pRExC_state->pm_flags) +#define RExC_precomp (pRExC_state->precomp) +#define RExC_rx_sv (pRExC_state->rx_sv) +#define RExC_rx (pRExC_state->rx) +#define RExC_rxi (pRExC_state->rxi) +#define RExC_start (pRExC_state->start) +#define RExC_end (pRExC_state->end) +#define RExC_parse (pRExC_state->parse) +#define RExC_whilem_seen (pRExC_state->whilem_seen) +#ifdef RE_TRACK_PATTERN_OFFSETS +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the + others */ +#endif +#define RExC_emit (pRExC_state->emit) +#define RExC_emit_dummy (pRExC_state->emit_dummy) +#define RExC_emit_start (pRExC_state->emit_start) +#define RExC_emit_bound (pRExC_state->emit_bound) +#define RExC_naughty (pRExC_state->naughty) +#define RExC_sawback (pRExC_state->sawback) +#define RExC_seen (pRExC_state->seen) +#define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) +#define RExC_npar (pRExC_state->npar) +#define RExC_nestroot (pRExC_state->nestroot) +#define RExC_extralen (pRExC_state->extralen) +#define RExC_seen_zerolen (pRExC_state->seen_zerolen) +#define RExC_utf8 (pRExC_state->utf8) +#define RExC_uni_semantics (pRExC_state->uni_semantics) +#define RExC_orig_utf8 (pRExC_state->orig_utf8) +#define RExC_open_parens (pRExC_state->open_parens) +#define RExC_close_parens (pRExC_state->close_parens) +#define RExC_opend (pRExC_state->opend) +#define RExC_paren_names (pRExC_state->paren_names) +#define RExC_recurse (pRExC_state->recurse) +#define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) +#define RExC_study_chunk_recursed_bytes \ + (pRExC_state->study_chunk_recursed_bytes) +#define RExC_in_lookbehind (pRExC_state->in_lookbehind) +#define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_contains_i (pRExC_state->contains_i) +#define RExC_override_recoding (pRExC_state->override_recoding) +#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) + + +#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') +#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ + ((*s) == '{' && regcurly(s))) + +/* + * Flags to be passed up and down. + */ +#define WORST 0 /* Worst case. */ +#define HASWIDTH 0x01 /* Known to match non-null strings. */ + +/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single + * character. (There needs to be a case: in the switch statement in regexec.c + * for any node marked SIMPLE.) Note that this is not the same thing as + * REGNODE_SIMPLE */ +#define SIMPLE 0x02 +#define SPSTART 0x04 /* Starts with * or + */ +#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ +#define TRYAGAIN 0x10 /* Weeded out a declaration. */ +#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */ + +#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) + +/* whether trie related optimizations are enabled */ +#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION +#define TRIE_STUDY_OPT +#define FULL_TRIE_STUDY +#define TRIE_STCLASS +#endif + + + +#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] +#define PBITVAL(paren) (1 << ((paren) & 7)) +#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren)) +#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) +#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) + +#define REQUIRE_UTF8 STMT_START { \ + if (!UTF) { \ + *flagp = RESTART_UTF8; \ + return NULL; \ + } \ + } STMT_END + +/* This converts the named class defined in regcomp.h to its equivalent class + * number defined in handy.h. */ +#define namedclass_to_classnum(class) ((int) ((class) / 2)) +#define classnum_to_namedclass(classnum) ((classnum) * 2) + +#define _invlist_union_complement_2nd(a, b, output) \ + _invlist_union_maybe_complement_2nd(a, b, TRUE, output) +#define _invlist_intersection_complement_2nd(a, b, output) \ + _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) + +/* About scan_data_t. + + During optimisation we recurse through the regexp program performing + various inplace (keyhole style) optimisations. In addition study_chunk + and scan_commit populate this data structure with information about + what strings MUST appear in the pattern. We look for the longest + string that must appear at a fixed location, and we look for the + longest string that may appear at a floating location. So for instance + in the pattern: + + /FOO[xX]A.*B[xX]BAR/ + + Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating + strings (because they follow a .* construct). study_chunk will identify + both FOO and BAR as being the longest fixed and floating strings respectively. + + The strings can be composites, for instance + + /(f)(o)(o)/ + + will result in a composite fixed substring 'foo'. + + For each string some basic information is maintained: + + - offset or min_offset + This is the position the string must appear at, or not before. + It also implicitly (when combined with minlenp) tells us how many + characters must match before the string we are searching for. + Likewise when combined with minlenp and the length of the string it + tells us how many characters must appear after the string we have + found. + + - max_offset + Only used for floating strings. This is the rightmost point that + the string can appear at. If set to SSize_t_MAX it indicates that the + string can occur infinitely far to the right. + + - minlenp + A pointer to the minimum number of characters of the pattern that the + string was found inside. This is important as in the case of positive + lookahead or positive lookbehind we can have multiple patterns + involved. Consider + + /(?=FOO).*F/ + + The minimum length of the pattern overall is 3, the minimum length + of the lookahead part is 3, but the minimum length of the part that + will actually match is 1. So 'FOO's minimum length is 3, but the + minimum length for the F is 1. This is important as the minimum length + is used to determine offsets in front of and behind the string being + looked for. Since strings can be composites this is the length of the + pattern at the time it was committed with a scan_commit. Note that + the length is calculated by study_chunk, so that the minimum lengths + are not known until the full pattern has been compiled, thus the + pointer to the value. + + - lookbehind + + In the case of lookbehind the string being searched for can be + offset past the start point of the final matching string. + If this value was just blithely removed from the min_offset it would + invalidate some of the calculations for how many chars must match + before or after (as they are derived from min_offset and minlen and + the length of the string being searched for). + When the final pattern is compiled and the data is moved from the + scan_data_t structure into the regexp structure the information + about lookbehind is factored in, with the information that would + have been lost precalculated in the end_shift field for the + associated string. + + The fields pos_min and pos_delta are used to store the minimum offset + and the delta to the maximum offset at the current point in the pattern. + +*/ + +typedef struct scan_data_t { + /*I32 len_min; unused */ + /*I32 len_delta; unused */ + SSize_t pos_min; + SSize_t pos_delta; + SV *last_found; + SSize_t last_end; /* min value, <0 unless valid. */ + SSize_t last_start_min; + SSize_t last_start_max; + SV **longest; /* Either &l_fixed, or &l_float. */ + SV *longest_fixed; /* longest fixed string found in pattern */ + SSize_t offset_fixed; /* offset where it starts */ + SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */ + I32 lookbehind_fixed; /* is the position of the string modfied by LB */ + SV *longest_float; /* longest floating string found in pattern */ + SSize_t offset_float_min; /* earliest point in string it can appear */ + SSize_t offset_float_max; /* latest point in string it can appear */ + SSize_t *minlen_float; /* pointer to the minlen relevant to the string */ + SSize_t lookbehind_float; /* is the pos of the string modified by LB */ + I32 flags; + I32 whilem_c; + SSize_t *last_closep; + regnode_ssc *start_class; +} scan_data_t; + +/* The below is perhaps overboard, but this allows us to save a test at the + * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' + * and 'a' differ by a single bit; the same with the upper and lower case of + * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; + * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and + * then inverts it to form a mask, with just a single 0, in the bit position + * where the upper- and lowercase differ. XXX There are about 40 other + * instances in the Perl core where this micro-optimization could be used. + * Should decide if maintenance cost is worse, before changing those + * + * Returns a boolean as to whether or not 'v' is either a lowercase or + * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a + * compile-time constant, the generated code is better than some optimizing + * compilers figure out, amounting to a mask and test. The results are + * meaningless if 'c' is not one of [A-Za-z] */ +#define isARG2_lower_or_UPPER_ARG1(c, v) \ + (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) + +/* + * Forward declarations for pregcomp()'s friends. + */ + +static const scan_data_t zero_scan_data = + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0}; + +#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) +#define SF_BEFORE_SEOL 0x0001 +#define SF_BEFORE_MEOL 0x0002 +#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) +#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) + +#define SF_FIX_SHIFT_EOL (+2) +#define SF_FL_SHIFT_EOL (+4) + +#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) +#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) + +#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL) +#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */ +#define SF_IS_INF 0x0040 +#define SF_HAS_PAR 0x0080 +#define SF_IN_PAR 0x0100 +#define SF_HAS_EVAL 0x0200 +#define SCF_DO_SUBSTR 0x0400 +#define SCF_DO_STCLASS_AND 0x0800 +#define SCF_DO_STCLASS_OR 0x1000 +#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) +#define SCF_WHILEM_VISITED_POS 0x2000 + +#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ +#define SCF_SEEN_ACCEPT 0x8000 +#define SCF_TRIE_DOING_RESTUDY 0x10000 + +#define UTF cBOOL(RExC_utf8) + +/* The enums for all these are ordered so things work out correctly */ +#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ + == REGEX_DEPENDS_CHARSET) +#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ + >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_MORE_RESTRICTED_CHARSET) + +#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) + +/* For programs that want to be strictly Unicode compatible by dying if any + * attempt is made to match a non-Unicode code point against a Unicode + * property. */ +#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) + +#define OOB_NAMEDCLASS -1 + +/* There is no code point that is out-of-bounds, so this is problematic. But + * its only current use is to initialize a variable that is always set before + * looked at. */ +#define OOB_UNICODE 0xDEADBEEF + +#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) +#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) + + +/* length of regex to show in messages that don't mark a position within */ +#define RegexLengthToShowInErrorMessages 127 + +/* + * If MARKER[12] are adjusted, be sure to adjust the constants at the top + * of t/op/regmesg.t, the tests in t/op/re_tests, and those in + * op/pragma/warn/regcomp. + */ +#define MARKER1 "<-- HERE" /* marker as it appears in the description */ +#define MARKER2 " <-- HERE " /* marker as it appears within the regex */ + +#define REPORT_LOCATION " in regex; marked by " MARKER1 \ + " in m/%"UTF8f MARKER2 "%"UTF8f"/" + +#define REPORT_LOCATION_ARGS(offset) \ + UTF8fARG(UTF, offset, RExC_precomp), \ + UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * arg. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define _FAIL(code) STMT_START { \ + const char *ellipses = ""; \ + IV len = RExC_end - RExC_precomp; \ + \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + code; \ +} STMT_END + +#define FAIL(msg) _FAIL( \ + Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ + msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +#define FAIL2(msg,arg) _FAIL( \ + Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ + arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +/* + * Simple_vFAIL -- like FAIL, but marks the current location in the scan + */ +#define Simple_vFAIL(m) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() + */ +#define vFAIL(m) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL(m); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts two arguments. + */ +#define Simple_vFAIL2(m,a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). + */ +#define vFAIL2(m,a1) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL2(m, a1); \ +} STMT_END + + +/* + * Like Simple_vFAIL(), but accepts three arguments. + */ +#define Simple_vFAIL3(m, a1, a2) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). + */ +#define vFAIL3(m,a1,a2) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL3(m, a1, a2); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts four arguments. + */ +#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vFAIL4(m,a1,a2,a3) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + Simple_vFAIL4(m, a1, a2, a3); \ +} STMT_END + +/* A specialized version of vFAIL2 that works with UTF8f */ +#define vFAIL2utf8f(m, a1) STMT_START { \ + const IV offset = RExC_parse - RExC_precomp; \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + +/* m is not necessarily a "literal string", in this macro */ +#define reg_warn_non_literal_string(loc, m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARNreg(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN_dep(loc, m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARNdep(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARNregdep(loc,m) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN2reg_d(loc,m, a1) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN2reg(loc, m, a1) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN3(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN4(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + +#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ +} STMT_END + + +/* Allow for side effects in s */ +#define REGC(c,s) STMT_START { \ + if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ +} STMT_END + +/* Macros for recording node offsets. 20001227 mjd@plover.com + * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in + * element 2*n-1 of the array. Element #2n holds the byte length node #n. + * Element 0 holds the number n. + * Position is 1 indexed. + */ +#ifndef RE_TRACK_PATTERN_OFFSETS +#define Set_Node_Offset_To_R(node,byte) +#define Set_Node_Offset(node,byte) +#define Set_Cur_Node_Offset +#define Set_Node_Length_To_R(node,len) +#define Set_Node_Length(node,len) +#define Set_Node_Cur_Length(node,start) +#define Node_Offset(n) +#define Node_Length(n) +#define Set_Node_Offset_Length(node,offset,len) +#define ProgLen(ri) ri->u.proglen +#define SetProgLen(ri,x) ri->u.proglen = x +#else +#define ProgLen(ri) ri->u.offsets[0] +#define SetProgLen(ri,x) ri->u.offsets[0] = x +#define Set_Node_Offset_To_R(node,byte) STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ + __LINE__, (int)(node), (int)(byte))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", \ + (int)(node)); \ + } else { \ + RExC_offsets[2*(node)-1] = (byte); \ + } \ + } \ +} STMT_END + +#define Set_Node_Offset(node,byte) \ + Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) +#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) + +#define Set_Node_Length_To_R(node,len) STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ + __LINE__, (int)(node), (int)(len))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Length macro", \ + (int)(node)); \ + } else { \ + RExC_offsets[2*(node)] = (len); \ + } \ + } \ +} STMT_END + +#define Set_Node_Length(node,len) \ + Set_Node_Length_To_R((node)-RExC_emit_start, len) +#define Set_Node_Cur_Length(node, start) \ + Set_Node_Length(node, RExC_parse - start) + +/* Get offsets and lengths */ +#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) +#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) + +#define Set_Node_Offset_Length(node,offset,len) STMT_START { \ + Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ + Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ +} STMT_END +#endif + +#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS +#define EXPERIMENTAL_INPLACESCAN +#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ + +#define DEBUG_RExC_seen() \ + DEBUG_OPTIMISE_MORE_r({ \ + PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ + \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ + \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ + \ + if (RExC_seen & REG_GPOS_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ + \ + if (RExC_seen & REG_CANY_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ + \ + if (RExC_seen & REG_RECURSE_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ + \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ + \ + if (RExC_seen & REG_VERBARG_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ + \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ + \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ + \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ + \ + if (RExC_seen & REG_GOSTART_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ + PerlIO_printf(Perl_debug_log,"\n"); \ + }); + +#define DEBUG_STUDYDATA(str,data,depth) \ +DEBUG_OPTIMISE_MORE_r(if(data){ \ + PerlIO_printf(Perl_debug_log, \ + "%*s" str "Pos:%"IVdf"/%"IVdf \ + " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ + (int)(depth)*2, "", \ + (IV)((data)->pos_min), \ + (IV)((data)->pos_delta), \ + (UV)((data)->flags), \ + (IV)((data)->whilem_c), \ + (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ + is_inf ? "INF " : "" \ + ); \ + if ((data)->last_found) \ + PerlIO_printf(Perl_debug_log, \ + "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ + " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ + SvPVX_const((data)->last_found), \ + (IV)((data)->last_end), \ + (IV)((data)->last_start_min), \ + (IV)((data)->last_start_max), \ + ((data)->longest && \ + (data)->longest==&((data)->longest_fixed)) ? "*" : "", \ + SvPVX_const((data)->longest_fixed), \ + (IV)((data)->offset_fixed), \ + ((data)->longest && \ + (data)->longest==&((data)->longest_float)) ? "*" : "", \ + SvPVX_const((data)->longest_float), \ + (IV)((data)->offset_float_min), \ + (IV)((data)->offset_float_max) \ + ); \ + PerlIO_printf(Perl_debug_log,"\n"); \ +}); + +/* Mark that we cannot extend a found fixed substring at this point. + Update the longest found anchored substring and the longest found + floating substrings if needed. */ + +STATIC void +S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, + SSize_t *minlenp, int is_inf) +{ + const STRLEN l = CHR_SVLEN(data->last_found); + const STRLEN old_l = CHR_SVLEN(*data->longest); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_SCAN_COMMIT; + + if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { + SvSetMagicSV(*data->longest, data->last_found); + if (*data->longest == data->longest_fixed) { + data->offset_fixed = l ? data->last_start_min : data->pos_min; + if (data->flags & SF_BEFORE_EOL) + data->flags + |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); + else + data->flags &= ~SF_FIX_BEFORE_EOL; + data->minlen_fixed=minlenp; + data->lookbehind_fixed=0; + } + else { /* *data->longest == data->longest_float */ + data->offset_float_min = l ? data->last_start_min : data->pos_min; + data->offset_float_max = (l + ? data->last_start_max + : (data->pos_delta == SSize_t_MAX + ? SSize_t_MAX + : data->pos_min + data->pos_delta)); + if (is_inf + || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX) + data->offset_float_max = SSize_t_MAX; + if (data->flags & SF_BEFORE_EOL) + data->flags + |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); + else + data->flags &= ~SF_FL_BEFORE_EOL; + data->minlen_float=minlenp; + data->lookbehind_float=0; + } + } + SvCUR_set(data->last_found, 0); + { + SV * const sv = data->last_found; + if (SvUTF8(sv) && SvMAGICAL(sv)) { + MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); + if (mg) + mg->mg_len = 0; + } + } + data->last_end = -1; + data->flags &= ~SF_BEFORE_EOL; + DEBUG_STUDYDATA("commit: ",data,0); +} + +/* An SSC is just a regnode_charclass_posix with an extra field: the inversion + * list that describes which code points it matches */ + +STATIC void +S_ssc_anything(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to match an empty string or any code point */ + + PERL_ARGS_ASSERT_SSC_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ + _append_range_to_invlist(ssc->invlist, 0, UV_MAX); + ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ +} + +STATIC int +S_ssc_is_anything(const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' can match the empty string and any code + * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys + * us anything: if the function returns TRUE, 'ssc' hasn't been restricted + * in any way, so there's no point in using it */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { + return FALSE; + } + + /* See if the list consists solely of the range 0 - Infinity */ + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (ret) { + return TRUE; + } + + /* If e.g., both \w and \W are set, matches everything */ + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { + return TRUE; + } + } + } + + return FALSE; +} + +STATIC void +S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* Initializes the SSC 'ssc'. This includes setting it to match an empty + * string, any code point, or any posix class under locale */ + + PERL_ARGS_ASSERT_SSC_INIT; + + Zero(ssc, 1, regnode_ssc); + set_ANYOF_SYNTHETIC(ssc); + ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); + ssc_anything(ssc); + + /* If any portion of the regex is to operate under locale rules, + * initialization includes it. The reason this isn't done for all regexes + * is that the optimizer was written under the assumption that locale was + * all-or-nothing. Given the complexity and lack of documentation in the + * optimizer, and that there are inadequate test cases for locale, many + * parts of it may not work properly, it is safest to avoid locale unless + * necessary. */ + if (RExC_contains_locale) { + ANYOF_POSIXL_SETALL(ssc); + } + else { + ANYOF_POSIXL_ZERO(ssc); + } +} + +STATIC int +S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, + const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only + * to the list of code points matched, and locale posix classes; hence does + * not check its flags) */ + + UV start, end; + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (! ret) { + return FALSE; + } + + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { + return FALSE; + } + + return TRUE; +} + +STATIC SV* +S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, + const regnode_charclass* const node) +{ + /* Returns a mortal inversion list defining which code points are matched + * by 'node', which is of type ANYOF. Handles complementing the result if + * appropriate. If some code points aren't knowable at this time, the + * returned list must, and will, contain every code point that is a + * possibility. */ + + SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; + unsigned int i; + const U32 n = ARG(node); + bool new_node_has_latin1 = FALSE; + + PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; + + /* Look at the data structure created by S_set_ANYOF_arg() */ + if (n != ANYOF_NONBITMAP_EMPTY) { + SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + assert(RExC_rxi->data->what[n] == 's'); + + if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ + invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]))); + } + else if (ary[0] && ary[0] != &PL_sv_undef) { + + /* Here, no compile-time swash, and there are things that won't be + * known until runtime -- we have to assume it could be anything */ + return _add_range_to_invlist(invlist, 0, UV_MAX); + } + else if (ary[3] && ary[3] != &PL_sv_undef) { + + /* Here no compile-time swash, and no run-time only data. Use the + * node's inversion list */ + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; + } + } + + /* An ANYOF node contains a bitmap for the first 256 code points, and an + * inversion list for the others, but if there are code points that should + * match only conditionally on the target string being UTF-8, those are + * placed in the inversion list, and not the bitmap. Since there are + * circumstances under which they could match, they are included in the + * SSC. But if the ANYOF node is to be inverted, we have to exclude them + * here, so that when we invert below, the end result actually does include + * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here + * before we add the unconditionally matched code points */ + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_intersection_complement_2nd(invlist, + PL_UpperLatin1, + &invlist); + } + + /* Add in the points from the bit map */ + for (i = 0; i < 256; i++) { + if (ANYOF_BITMAP_TEST(node, i)) { + invlist = add_cp_to_invlist(invlist, i); + new_node_has_latin1 = TRUE; + } + } + + /* If this can match all upper Latin1 code points, have to add them + * as well */ + if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + _invlist_union(invlist, PL_UpperLatin1, &invlist); + } + + /* Similarly for these */ + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + } + + if (ANYOF_FLAGS(node) & ANYOF_INVERT) { + _invlist_invert(invlist); + } + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + + /* Under /li, any 0-255 could fold to any other 0-255, depending on the + * locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { + _invlist_union_maybe_complement_2nd(invlist, + only_utf8_locale_invlist, + ANYOF_FLAGS(node) & ANYOF_INVERT, + &invlist); + } + + return invlist; +} + +/* These two functions currently do the exact same thing */ +#define ssc_init_zero ssc_init + +#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) +#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) + +/* 'AND' a given class with another one. Can create false positives. 'ssc' + * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ + +STATIC void +S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *and_with) +{ + /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either + * another SSC or a regular ANYOF class. Can create false positives. */ + + SV* anded_cp_list; + U8 anded_flags; + + PERL_ARGS_ASSERT_SSC_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(and_with)) { + anded_cp_list = ((regnode_ssc *)and_with)->invlist; + anded_flags = ANYOF_FLAGS(and_with); + + /* XXX This is a kludge around what appears to be deficiencies in the + * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, + * there are paths through the optimizer where it doesn't get weeded + * out when it should. And if we don't make some extra provision for + * it like the code just below, it doesn't get added when it should. + * This solution is to add it only when AND'ing, which is here, and + * only when what is being AND'ed is the pristine, original node + * matching anything. Thus it is like adding it to ssc_anything() but + * only when the result is to be AND'ed. Probably the same solution + * could be adopted for the same problem we have with /l matching, + * which is solved differently in S_ssc_init(), and that would lead to + * fewer false positives than that solution has. But if this solution + * creates bugs, the consequences are only that a warning isn't raised + * that should be; while the consequences for having /l bugs is + * incorrect matches */ + if (ssc_is_anything((regnode_ssc *)and_with)) { + anded_flags |= ANYOF_WARN_SUPER; + } + } + else { + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } + + ANYOF_FLAGS(ssc) &= anded_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'and-with'; P2, its posix classes. + * 'and_with' may be inverted. When not inverted, we have the situation of + * computing: + * (C1 | P1) & (C2 | P2) + * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) + * <= ((C1 & C2) | P1 | P2) + * Alternatively, the last few steps could be: + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) + * <= (C1 | C2 | (P1 & P2)) + * We favor the second approach if either P1 or P2 is non-empty. This is + * because these components are a barrier to doing optimizations, as what + * they match cannot be known until the moment of matching as they are + * dependent on the current locale, 'AND"ing them likely will reduce or + * eliminate them. + * But we can do better if we know that C1,P1 are in their initial state (a + * frequent occurrence), each matching everything: + * () & (C2 | P2) = C2 | P2 + * Similarly, if C2,P2 are in their initial state (again a frequent + * occurrence), the result is a no-op + * (C1 | P1) & () = C1 | P1 + * + * Inverted, we have + * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) + * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) + * <= (C1 & ~C2) | (P1 & ~P2) + * */ + + if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(and_with)) + { + unsigned int i; + + ssc_intersection(ssc, + anded_cp_list, + FALSE /* Has already been inverted */ + ); + + /* If either P1 or P2 is empty, the intersection will be also; can skip + * the loop */ + if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { + ANYOF_POSIXL_ZERO(ssc); + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + + /* Note that the Posix class component P from 'and_with' actually + * looks like: + * P = Pa | Pb | ... | Pn + * where each component is one posix class, such as in [\w\s]. + * Thus + * ~P = ~(Pa | Pb | ... | Pn) + * = ~Pa & ~Pb & ... & ~Pn + * <= ~Pa | ~Pb | ... | ~Pn + * The last is something we can easily calculate, but unfortunately + * is likely to have many false positives. We could do better + * in some (but certainly not all) instances if two classes in + * P have known relationships. For example + * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: + * So + * :lower: & :print: = :lower: + * And similarly for classes that must be disjoint. For example, + * since \s and \w can have no elements in common based on rules in + * the POSIX standard, + * \w & ^\S = nothing + * Unfortunately, some vendor locales do not meet the Posix + * standard, in particular almost everything by Microsoft. + * The loop below just changes e.g., \w into \W and vice versa */ + + regnode_charclass_posixl temp; + int add = 1; /* To calculate the index of the complement */ + + ANYOF_POSIXL_ZERO(&temp); + for (i = 0; i < ANYOF_MAX; i++) { + assert(i % 2 != 0 + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); + + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { + ANYOF_POSIXL_SET(&temp, i + add); + } + add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ + } + ANYOF_POSIXL_AND(&temp, ssc); + + } /* else ssc already has no posixes */ + } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC + in its initial state */ + else if (! is_ANYOF_SYNTHETIC(and_with) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) + { + /* But if 'ssc' is in its initial state, the result is just 'and_with'; + * copy it over 'ssc' */ + if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { + if (is_ANYOF_SYNTHETIC(and_with)) { + StructCopy(and_with, ssc, regnode_ssc); + } + else { + ssc->invlist = anded_cp_list; + ANYOF_POSIXL_ZERO(ssc); + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); + } + } + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + { + /* One or the other of P1, P2 is non-empty. */ + if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } + ssc_union(ssc, anded_cp_list, FALSE); + } + else { /* P1 = P2 = empty */ + ssc_intersection(ssc, anded_cp_list, FALSE); + } + } +} + +STATIC void +S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *or_with) +{ + /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either + * another SSC or a regular ANYOF class. Can create false positives if + * 'or_with' is to be inverted. */ + + SV* ored_cp_list; + U8 ored_flags; + + PERL_ARGS_ASSERT_SSC_OR; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(or_with)) { + ored_cp_list = ((regnode_ssc*) or_with)->invlist; + ored_flags = ANYOF_FLAGS(or_with); + } + else { + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); + ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + } + + ANYOF_FLAGS(ssc) |= ored_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'or-with'; P2, its posix classes. + * 'or_with' may be inverted. When not inverted, we have the simple + * situation of computing: + * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) + * If P1|P2 yields a situation with both a class and its complement are + * set, like having both \w and \W, this matches all code points, and we + * can delete these from the P component of the ssc going forward. XXX We + * might be able to delete all the P components, but I (khw) am not certain + * about this, and it is better to be safe. + * + * Inverted, we have + * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) + * <= (C1 | P1) | ~C2 + * <= (C1 | ~C2) | P1 + * (which results in actually simpler code than the non-inverted case) + * */ + + if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(or_with)) + { + /* We ignore P2, leaving P1 going forward */ + } /* else Not inverted */ + else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + unsigned int i; + for (i = 0; i < ANYOF_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) + { + ssc_match_all_cp(ssc); + ANYOF_POSIXL_CLEAR(ssc, i); + ANYOF_POSIXL_CLEAR(ssc, i+1); + } + } + } + } + + ssc_union(ssc, + ored_cp_list, + FALSE /* Already has been inverted */ + ); +} + +PERL_STATIC_INLINE void +S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_UNION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_union_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_intersection(pTHX_ regnode_ssc *ssc, + SV* const invlist, + const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_INTERSECTION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_intersection_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +PERL_STATIC_INLINE void +S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) +{ + PERL_ARGS_ASSERT_SSC_ADD_RANGE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); +} + +PERL_STATIC_INLINE void +S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) +{ + /* AND just the single code point 'cp' into the SSC 'ssc' */ + + SV* cp_list = _new_invlist(2); + + PERL_ARGS_ASSERT_SSC_CP_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + cp_list = add_cp_to_invlist(cp_list, cp); + ssc_intersection(ssc, cp_list, + FALSE /* Not inverted */ + ); + SvREFCNT_dec_NN(cp_list); +} + +PERL_STATIC_INLINE void +S_ssc_clear_locale(regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to not match any locale things */ + PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ANYOF_POSIXL_ZERO(ssc); + ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; +} + +STATIC void +S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* The inversion list in the SSC is marked mortal; now we need a more + * permanent copy, which is stored the same way that is done in a regular + * ANYOF node, with the first 256 code points in a bit map */ + + SV* invlist = invlist_clone(ssc->invlist); + + PERL_ARGS_ASSERT_SSC_FINALIZE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* The code in this file assumes that all but these flags aren't relevant + * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the + * time we reach here */ + assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + + populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); + + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); + + /* Make sure is clone-safe */ + ssc->invlist = NULL; + + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; + } + + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); +} + +#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] +#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) +#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) +#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ + ? (TRIE_LIST_CUR( idx ) - 1) \ + : 0 ) + + +#ifdef DEBUGGING +/* + dump_trie(trie,widecharmap,revcharmap) + dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) + dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) + + These routines dump out a trie in a somewhat readable format. + The _interim_ variants are used for debugging the interim + tables that are used to generate the final compressed + representation which is what dump_trie expects. + + Part of the reason for their existence is to provide a form + of documentation as to how the different representations function. + +*/ + +/* + Dumps the final compressed table form of the trie to Perl_debug_log. + Used for debugging make_trie(). +*/ + +STATIC void +S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, + AV *revcharmap, U32 depth) +{ + U32 state; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + U16 word; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE; + + PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", + (int)depth * 2 + 2,"", + "Match","Base","Ofs" ); + + for( state = 0 ; state < trie->uniquecharcount ; state++ ) { + SV ** const tmp = av_fetch( revcharmap, state, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + } + } + PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------", + (int)depth * 2 + 2,""); + + for( state = 0 ; state < trie->uniquecharcount ; state++ ) + PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); + PerlIO_printf( Perl_debug_log, "\n"); + + for( state = 1 ; state < trie->statecount ; state++ ) { + const U32 base = trie->states[ state ].trans.base; + + PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", + (int)depth * 2 + 2,"", (UV)state); + + if ( trie->states[ state ].wordnum ) { + PerlIO_printf( Perl_debug_log, " W%4X", + trie->states[ state ].wordnum ); + } else { + PerlIO_printf( Perl_debug_log, "%6s", "" ); + } + + PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base ); + + if ( base ) { + U32 ofs = 0; + + while( ( base + ofs < trie->uniquecharcount ) || + ( base + ofs - trie->uniquecharcount < trie->lasttrans + && trie->trans[ base + ofs - trie->uniquecharcount ].check + != state)) + ofs++; + + PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); + + for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { + if ( ( base + ofs >= trie->uniquecharcount ) + && ( base + ofs - trie->uniquecharcount + < trie->lasttrans ) + && trie->trans[ base + ofs + - trie->uniquecharcount ].check == state ) + { + PerlIO_printf( Perl_debug_log, "%*"UVXf, + colwidth, + (UV)trie->trans[ base + ofs + - trie->uniquecharcount ].next ); + } else { + PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); + } + } + + PerlIO_printf( Perl_debug_log, "]"); + + } + PerlIO_printf( Perl_debug_log, "\n" ); + } + PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", + (int)depth*2, ""); + for (word=1; word <= trie->wordcount; word++) { + PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", + (int)word, (int)(trie->wordinfo[word].prev), + (int)(trie->wordinfo[word].len)); + } + PerlIO_printf(Perl_debug_log, "\n" ); +} +/* + Dumps a fully constructed but uncompressed trie in list form. + List tries normally only are used for construction when the number of + possible chars (trie->uniquecharcount) is very high. + Used for debugging make_trie(). +*/ +STATIC void +S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, + HV *widecharmap, AV *revcharmap, U32 next_alloc, + U32 depth) +{ + U32 state; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; + + /* print out the table precompression. */ + PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", + (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", + "------:-----+-----------------\n" ); + + for( state=1 ; state < next_alloc ; state ++ ) { + U16 charid; + + PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", + (int)depth * 2 + 2,"", (UV)state ); + if ( ! trie->states[ state ].wordnum ) { + PerlIO_printf( Perl_debug_log, "%5s| ",""); + } else { + PerlIO_printf( Perl_debug_log, "W%4x| ", + trie->states[ state ].wordnum + ); + } + for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { + SV ** const tmp = av_fetch( revcharmap, + TRIE_LIST_ITEM(state,charid).forid, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), + colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + | PERL_PV_ESCAPE_FIRSTCHAR + ) , + TRIE_LIST_ITEM(state,charid).forid, + (UV)TRIE_LIST_ITEM(state,charid).newstate + ); + if (!(charid % 10)) + PerlIO_printf(Perl_debug_log, "\n%*s| ", + (int)((depth * 2) + 14), ""); + } + } + PerlIO_printf( Perl_debug_log, "\n"); + } +} + +/* + Dumps a fully constructed but uncompressed trie in table form. + This is the normal DFA style state transition table, with a few + twists to facilitate compression later. + Used for debugging make_trie(). +*/ +STATIC void +S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, + HV *widecharmap, AV *revcharmap, U32 next_alloc, + U32 depth) +{ + U32 state; + U16 charid; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; + + /* + print out the table precompression so that we can do a visual check + that they are identical. + */ + + PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); + + for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { + SV ** const tmp = av_fetch( revcharmap, charid, 0); + if ( tmp ) { + PerlIO_printf( Perl_debug_log, "%*s", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + } + } + + PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" ); + + for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { + PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------"); + } + + PerlIO_printf( Perl_debug_log, "\n" ); + + for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { + + PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", + (int)depth * 2 + 2,"", + (UV)TRIE_NODENUM( state ) ); + + for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { + UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); + if (v) + PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v ); + else + PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); + } + if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { + PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", + (UV)trie->trans[ state ].check ); + } else { + PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", + (UV)trie->trans[ state ].check, + trie->states[ TRIE_NODENUM( state ) ].wordnum ); + } + } +} + +#endif + + +/* make_trie(startbranch,first,last,tail,word_count,flags,depth) + startbranch: the first branch in the whole branch sequence + first : start branch of sequence of branch-exact nodes. + May be the same as startbranch + last : Thing following the last branch. + May be the same as tail. + tail : item following the branch sequence + count : words in the sequence + flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/ + depth : indent depth + +Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. + +A trie is an N'ary tree where the branches are determined by digital +decomposition of the key. IE, at the root node you look up the 1st character and +follow that branch repeat until you find the end of the branches. Nodes can be +marked as "accepting" meaning they represent a complete word. Eg: + + /he|she|his|hers/ + +would convert into the following structure. Numbers represent states, letters +following numbers represent valid transitions on the letter from that state, if +the number is in square brackets it represents an accepting state, otherwise it +will be in parenthesis. + + +-h->+-e->[3]-+-r->(8)-+-s->[9] + | | + | (2) + | | + (1) +-i->(6)-+-s->[7] + | + +-s->(3)-+-h->(4)-+-e->[5] + + Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) + +This shows that when matching against the string 'hers' we will begin at state 1 +read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, +then read 'r' and go to state 8 followed by 's' which takes us to state 9 which +is also accepting. Thus we know that we can match both 'he' and 'hers' with a +single traverse. We store a mapping from accepting to state to which word was +matched, and then when we have multiple possibilities we try to complete the +rest of the regex in the order in which they occured in the alternation. + +The only prior NFA like behaviour that would be changed by the TRIE support is +the silent ignoring of duplicate alternations which are of the form: + + / (DUPE|DUPE) X? (?{ ... }) Y /x + +Thus EVAL blocks following a trie may be called a different number of times with +and without the optimisation. With the optimisations dupes will be silently +ignored. This inconsistent behaviour of EVAL type nodes is well established as +the following demonstrates: + + 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ + +which prints out 'word' three times, but + + 'words'=~/(word|word|word)(?{ print $1 })S/ + +which doesnt print it out at all. This is due to other optimisations kicking in. + +Example of what happens on a structural level: + +The regexp /(ac|ad|ab)+/ will produce the following debug output: + + 1: CURLYM[1] {1,32767}(18) + 5: BRANCH(8) + 6: EXACT (16) + 8: BRANCH(11) + 9: EXACT (16) + 11: BRANCH(14) + 12: EXACT (16) + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +This would be optimizable with startbranch=5, first=5, last=16, tail=16 +and should turn into: + + 1: CURLYM[1] {1,32767}(18) + 5: TRIE(16) + [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] + + + + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +Cases where tail != last would be like /(?foo|bar)baz/: + + 1: BRANCH(4) + 2: EXACT (8) + 4: BRANCH(7) + 5: EXACT (8) + 7: TAIL(8) + 8: EXACT (10) + 10: END(0) + +which would be optimizable with startbranch=1, first=1, last=7, tail=8 +and would end up looking like: + + 1: TRIE(8) + [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] + + + 7: TAIL(8) + 8: EXACT (10) + 10: END(0) + + d = uvchr_to_utf8_flags(d, uv, 0); + +is the recommended Unicode-aware way of saying + + *(d++) = uv; +*/ + +#define TRIE_STORE_REVCHAR(val) \ + STMT_START { \ + if (UTF) { \ + SV *zlopp = newSV(7); /* XXX: optimize me */ \ + unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ + unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ + SvCUR_set(zlopp, kapow - flrbbbbb); \ + SvPOK_on(zlopp); \ + SvUTF8_on(zlopp); \ + av_push(revcharmap, zlopp); \ + } else { \ + char ooooff = (char)val; \ + av_push(revcharmap, newSVpvn(&ooooff, 1)); \ + } \ + } STMT_END + +/* This gets the next character from the input, folding it if not already + * folded. */ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need \ + * folding */ \ + uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* This folder implies Unicode rules, which in the range expressible \ + * by not UTF is the lower case, with the two exceptions, one of \ + * which should have been taken care of before calling this */ \ + assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ + uvc = toLOWER_L1(*uc); \ + if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ + len = 1; \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ +} STMT_END + + + +#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ + if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ + U32 ging = TRIE_LIST_LEN( state ) *= 2; \ + Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ + } \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ + TRIE_LIST_CUR( state )++; \ +} STMT_END + +#define TRIE_LIST_NEW(state) STMT_START { \ + Newxz( trie->states[ state ].trans.list, \ + 4, reg_trie_trans_le ); \ + TRIE_LIST_CUR( state ) = 1; \ + TRIE_LIST_LEN( state ) = 4; \ +} STMT_END + +#define TRIE_HANDLE_WORD(state) STMT_START { \ + U16 dupe= trie->states[ state ].wordnum; \ + regnode * const noper_next = regnext( noper ); \ + \ + DEBUG_r({ \ + /* store the word for dumping */ \ + SV* tmp; \ + if (OP(noper) != NOTHING) \ + tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ + else \ + tmp = newSVpvn_utf8( "", 0, UTF ); \ + av_push( trie_words, tmp ); \ + }); \ + \ + curword++; \ + trie->wordinfo[curword].prev = 0; \ + trie->wordinfo[curword].len = wordlen; \ + trie->wordinfo[curword].accept = state; \ + \ + if ( noper_next < tail ) { \ + if (!trie->jump) \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ + trie->jump[curword] = (U16)(noper_next - convert); \ + if (!jumper) \ + jumper = noper_next; \ + if (!nextbranch) \ + nextbranch= regnext(cur); \ + } \ + \ + if ( dupe ) { \ + /* It's a dupe. Pre-insert into the wordinfo[].prev */\ + /* chain, so that when the bits of chain are later */\ + /* linked together, the dups appear in the chain */\ + trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ + trie->wordinfo[dupe].prev = curword; \ + } else { \ + /* we haven't inserted this word yet. */ \ + trie->states[ state ].wordnum = curword; \ + } \ +} STMT_END + + +#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ + ( ( base + charid >= ucharcount \ + && base + charid < ubound \ + && state == trie->trans[ base - ucharcount + charid ].check \ + && trie->trans[ base - ucharcount + charid ].next ) \ + ? trie->trans[ base - ucharcount + charid ].next \ + : ( state==1 ? special : 0 ) \ + ) + +#define MADE_TRIE 1 +#define MADE_JUMP_TRIE 2 +#define MADE_EXACT_TRIE 4 + +STATIC I32 +S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + regnode *first, regnode *last, regnode *tail, + U32 word_count, U32 flags, U32 depth) +{ + /* first pass, loop through and scan words */ + reg_trie_data *trie; + HV *widecharmap = NULL; + AV *revcharmap = newAV(); + regnode *cur; + STRLEN len = 0; + UV uvc = 0; + U16 curword = 0; + U32 next_alloc = 0; + regnode *jumper = NULL; + regnode *nextbranch = NULL; + regnode *convert = NULL; + U32 *prev_states; /* temp array mapping each state to previous one */ + /* we just use folder as a flag in utf8 */ + const U8 * folder = NULL; + +#ifdef DEBUGGING + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu")); + AV *trie_words = NULL; + /* along with revcharmap, this only used during construction but both are + * useful during debugging so we store them in the struct when debugging. + */ +#else + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); + STRLEN trie_charcount=0; +#endif + SV *re_trie_maxbuff; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + switch (flags) { + case EXACT: break; + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: folder = PL_fold_latin1; break; + case EXACTF: folder = PL_fold; break; + default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); + } + + trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); + trie->refcount = 1; + trie->startstate = 1; + trie->wordcount = word_count; + RExC_rxi->data->data[ data_slot ] = (void*)trie; + trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); + if (flags == EXACT) + trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); + trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( + trie->wordcount+1, sizeof(reg_trie_wordinfo)); + + DEBUG_r({ + trie_words = newAV(); + }); + + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + assert(re_trie_maxbuff); + if (!SvIOK(re_trie_maxbuff)) { + sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); + } + DEBUG_TRIE_COMPILE_r({ + PerlIO_printf( Perl_debug_log, + "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + (int)depth * 2 + 2, "", + REG_NODE_NUM(startbranch),REG_NODE_NUM(first), + REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); + }); + + /* Find the node we are going to overwrite */ + if ( first == startbranch && OP( last ) != BRANCH ) { + /* whole branch chain */ + convert = first; + } else { + /* branch sub-chain */ + convert = NEXTOPER( first ); + } + + /* -- First loop and Setup -- + + We first traverse the branches and scan each word to determine if it + contains widechars, and how many unique chars there are, this is + important as we have to build a table with at least as many columns as we + have unique chars. + + We use an array of integers to represent the character codes 0..255 + (trie->charmap) and we use a an HV* to store Unicode characters. We use + the native representation of the character value as the key and IV's for + the coded index. + + *TODO* If we keep track of how many times each character is used we can + remap the columns so that the table compression later on is more + efficient in terms of memory by ensuring the most common value is in the + middle and the least common are on the outside. IMO this would be better + than a most to least common mapping as theres a decent chance the most + common letter will share a node with the least common, meaning the node + will not be compressible. With a middle is most common approach the worst + case is when we have the least common nodes twice. + + */ + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + regnode *noper = NEXTOPER( cur ); + const U8 *uc = (U8*)STRING( noper ); + const U8 *e = uc + STR_LEN( noper ); + int foldlen = 0; + U32 wordlen = 0; /* required init */ + STRLEN minchars = 0; + STRLEN maxchars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the + bitmap?*/ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + trie->minlen= STR_LEN(noper); + } else { + trie->minlen= 0; + continue; + } + } + + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ + TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte + regardless of encoding */ + if (OP( noper ) == EXACTFU_SS) { + /* false positives are ok, so just set this */ + TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); + } + } + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ + TRIE_CHARCOUNT(trie)++; + TRIE_READ_CHAR; + + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; + } + else { + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because for + * non-UTF, the latin1_safe macro is smart enough to account + * for all the unfolded characters, and because for UTF, the + * string will already have been folded earlier in the + * compilation process */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); + } + } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } + } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ + if ( uvc < 256 ) { + if ( folder ) { + U8 folded= folder[ (U8) uvc ]; + if ( !trie->charmap[ folded ] ) { + trie->charmap[ folded ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( folded ); + } + } + if ( !trie->charmap[ uvc ] ) { + trie->charmap[ uvc ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( uvc ); + } + if ( set_bit ) { + /* store the codepoint in the bitmap, and its folded + * equivalent. */ + TRIE_BITMAP_SET(trie, uvc); + + /* store the folded codepoint */ + if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); + + if ( !UTF ) { + /* store first byte of utf8 representation of + variant codepoints */ + if (! UVCHR_IS_INVARIANT(uvc)) { + TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); + } + } + set_bit = 0; /* We've done our bit :-) */ + } + } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + + SV** svpp; + if ( !widecharmap ) + widecharmap = newHV(); + + svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); + + if ( !svpp ) + Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc ); + + if ( !SvTRUE( *svpp ) ) { + sv_setiv( *svpp, ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR(uvc); + } + } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ + if( cur == first ) { + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; + } + } /* end first pass */ + DEBUG_TRIE_COMPILE_r( + PerlIO_printf( Perl_debug_log, + "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + (int)depth * 2 + 2,"", + ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, + (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, + (int)trie->minlen, (int)trie->maxlen ) + ); + + /* + We now know what we are dealing with in terms of unique chars and + string sizes so we can calculate how much memory a naive + representation using a flat table will take. If it's over a reasonable + limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory + conservative but potentially much slower representation using an array + of lists. + + At the end we convert both representations into the same compressed + form that will be used in regexec.c for matching with. The latter + is a form that cannot be used to construct with but has memory + properties similar to the list form and access properties similar + to the table form making it both suitable for fast searches and + small enough that its feasable to store for the duration of a program. + + See the comment in the code where the compressed table is produced + inplace from the flat tabe representation for an explanation of how + the compression works. + + */ + + + Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); + prev_states[1] = 0; + + if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) + > SvIV(re_trie_maxbuff) ) + { + /* + Second Pass -- Array Of Lists Representation + + Each state will be represented by a list of charid:state records + (reg_trie_trans_le) the first such element holds the CUR and LEN + points of the allocated array. (See defines above). + + We build the initial structure using the lists, and then convert + it into the compressed table form which allows faster lookups + (but cant be modified once converted). + */ + + STRLEN transcount = 1; + + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + "%*sCompiling trie using list compiler\n", + (int)depth * 2 + 2, "")); + + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); + TRIE_LIST_NEW(1); + next_alloc = 2; + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + + regnode *noper = NEXTOPER( cur ); + U8 *uc = (U8*)STRING( noper ); + const U8 *e = uc + STR_LEN( noper ); + U32 state = 1; /* required init */ + U16 charid = 0; /* sanity init */ + U32 wordlen = 0; /* required init */ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } + } + + if (OP(noper) != NOTHING) { + for ( ; uc < e ; uc += len ) { + + TRIE_READ_CHAR; + + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV** const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); + if ( !svpp ) { + charid = 0; + } else { + charid=(U16)SvIV( *svpp ); + } + } + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ + if ( charid ) { + + U16 check; + U32 newstate = 0; + + charid--; + if ( !trie->states[ state ].trans.list ) { + TRIE_LIST_NEW( state ); + } + for ( check = 1; + check <= TRIE_LIST_USED( state ); + check++ ) + { + if ( TRIE_LIST_ITEM( state, check ).forid + == charid ) + { + newstate = TRIE_LIST_ITEM( state, check ).newstate; + break; + } + } + if ( ! newstate ) { + newstate = next_alloc++; + prev_states[newstate] = state; + TRIE_LIST_PUSH( state, charid, newstate ); + transcount++; + } + state = newstate; + } else { + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); + } + } + } + TRIE_HANDLE_WORD(state); + + } /* end second pass */ + + /* next alloc is the NEXT state to be allocated */ + trie->statecount = next_alloc; + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, + next_alloc + * sizeof(reg_trie_state) ); + + /* and now dump it out before we compress it */ + DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, + revcharmap, next_alloc, + depth+1) + ); + + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); + { + U32 state; + U32 tp = 0; + U32 zp = 0; + + + for( state=1 ; state < next_alloc ; state ++ ) { + U32 base=0; + + /* + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp) + ); + */ + + if (trie->states[state].trans.list) { + U16 minid=TRIE_LIST_ITEM( state, 1).forid; + U16 maxid=minid; + U16 idx; + + for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { + const U16 forid = TRIE_LIST_ITEM( state, idx).forid; + if ( forid < minid ) { + minid=forid; + } else if ( forid > maxid ) { + maxid=forid; + } + } + if ( transcount < tp + maxid - minid + 1) { + transcount *= 2; + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, + transcount + * sizeof(reg_trie_trans) ); + Zero( trie->trans + (transcount / 2), + transcount / 2, + reg_trie_trans ); + } + base = trie->uniquecharcount + tp - minid; + if ( maxid == minid ) { + U32 set = 0; + for ( ; zp < tp ; zp++ ) { + if ( ! trie->trans[ zp ].next ) { + base = trie->uniquecharcount + zp - minid; + trie->trans[ zp ].next = TRIE_LIST_ITEM( state, + 1).newstate; + trie->trans[ zp ].check = state; + set = 1; + break; + } + } + if ( !set ) { + trie->trans[ tp ].next = TRIE_LIST_ITEM( state, + 1).newstate; + trie->trans[ tp ].check = state; + tp++; + zp = tp; + } + } else { + for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { + const U32 tid = base + - trie->uniquecharcount + + TRIE_LIST_ITEM( state, idx ).forid; + trie->trans[ tid ].next = TRIE_LIST_ITEM( state, + idx ).newstate; + trie->trans[ tid ].check = state; + } + tp += ( maxid - minid + 1 ); + } + Safefree(trie->states[ state ].trans.list); + } + /* + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, " base: %d\n",base); + ); + */ + trie->states[ state ].trans.base=base; + } + trie->lasttrans = tp + 1; + } + } else { + /* + Second Pass -- Flat Table Representation. + + we dont use the 0 slot of either trans[] or states[] so we add 1 to + each. We know that we will need Charcount+1 trans at most to store + the data (one row per char at worst case) So we preallocate both + structures assuming worst case. + + We then construct the trie using only the .next slots of the entry + structs. + + We use the .check field of the first entry of the node temporarily + to make compression both faster and easier by keeping track of how + many non zero fields are in the node. + + Since trans are numbered from 1 any 0 pointer in the table is a FAIL + transition. + + There are two terms at use here: state as a TRIE_NODEIDX() which is + a number representing the first entry of the node, and state as a + TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) + and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) + if there are 2 entrys per node. eg: + + A B A B + 1. 2 4 1. 3 7 + 2. 0 3 3. 0 5 + 3. 0 0 5. 0 0 + 4. 0 0 7. 0 0 + + The table is internally in the right hand, idx form. However as we + also have to deal with the states array which is indexed by nodenum + we have to use TRIE_NODENUM() to convert. + + */ + DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, + "%*sCompiling trie using table compiler\n", + (int)depth * 2 + 2, "")); + + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) + * trie->uniquecharcount + 1, + sizeof(reg_trie_trans) ); + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); + next_alloc = trie->uniquecharcount + 1; + + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + + regnode *noper = NEXTOPER( cur ); + const U8 *uc = (U8*)STRING( noper ); + const U8 *e = uc + STR_LEN( noper ); + + U32 state = 1; /* required init */ + + U16 charid = 0; /* sanity init */ + U32 accept_state = 0; /* sanity init */ + + U32 wordlen = 0; /* required init */ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } + } + + if ( OP(noper) != NOTHING ) { + for ( ; uc < e ; uc += len ) { + + TRIE_READ_CHAR; + + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV* const * const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); + charid = svpp ? (U16)SvIV(*svpp) : 0; + } + if ( charid ) { + charid--; + if ( !trie->trans[ state + charid ].next ) { + trie->trans[ state + charid ].next = next_alloc; + trie->trans[ state ].check++; + prev_states[TRIE_NODENUM(next_alloc)] + = TRIE_NODENUM(state); + next_alloc += trie->uniquecharcount; + } + state = trie->trans[ state + charid ].next; + } else { + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); + } + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ + } + } + accept_state = TRIE_NODENUM( state ); + TRIE_HANDLE_WORD(accept_state); + + } /* end second pass */ + + /* and now dump it out before we compress it */ + DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, + revcharmap, + next_alloc, depth+1)); + + { + /* + * Inplace compress the table.* + + For sparse data sets the table constructed by the trie algorithm will + be mostly 0/FAIL transitions or to put it another way mostly empty. + (Note that leaf nodes will not contain any transitions.) + + This algorithm compresses the tables by eliminating most such + transitions, at the cost of a modest bit of extra work during lookup: + + - Each states[] entry contains a .base field which indicates the + index in the state[] array wheres its transition data is stored. + + - If .base is 0 there are no valid transitions from that node. + + - If .base is nonzero then charid is added to it to find an entry in + the trans array. + + -If trans[states[state].base+charid].check!=state then the + transition is taken to be a 0/Fail transition. Thus if there are fail + transitions at the front of the node then the .base offset will point + somewhere inside the previous nodes data (or maybe even into a node + even earlier), but the .check field determines if the transition is + valid. + + XXX - wrong maybe? + The following process inplace converts the table to the compressed + table: We first do not compress the root node 1,and mark all its + .check pointers as 1 and set its .base pointer as 1 as well. This + allows us to do a DFA construction from the compressed table later, + and ensures that any .base pointers we calculate later are greater + than 0. + + - We set 'pos' to indicate the first entry of the second node. + + - We then iterate over the columns of the node, finding the first and + last used entry at l and m. We then copy l..m into pos..(pos+m-l), + and set the .check pointers accordingly, and advance pos + appropriately and repreat for the next node. Note that when we copy + the next pointers we have to convert them from the original + NODEIDX form to NODENUM form as the former is not valid post + compression. + + - If a node has no transitions used we mark its base as 0 and do not + advance the pos pointer. + + - If a node only has one transition we use a second pointer into the + structure to fill in allocated fail transitions from other states. + This pointer is independent of the main pointer and scans forward + looking for null transitions that are allocated to a state. When it + finds one it writes the single transition into the "hole". If the + pointer doesnt find one the single transition is appended as normal. + + - Once compressed we can Renew/realloc the structures to release the + excess space. + + See "Table-Compression Methods" in sec 3.9 of the Red Dragon, + specifically Fig 3.47 and the associated pseudocode. + + demq + */ + const U32 laststate = TRIE_NODENUM( next_alloc ); + U32 state, charid; + U32 pos = 0, zp=0; + trie->statecount = laststate; + + for ( state = 1 ; state < laststate ; state++ ) { + U8 flag = 0; + const U32 stateidx = TRIE_NODEIDX( state ); + const U32 o_used = trie->trans[ stateidx ].check; + U32 used = trie->trans[ stateidx ].check; + trie->trans[ stateidx ].check = 0; + + for ( charid = 0; + used && charid < trie->uniquecharcount; + charid++ ) + { + if ( flag || trie->trans[ stateidx + charid ].next ) { + if ( trie->trans[ stateidx + charid ].next ) { + if (o_used == 1) { + for ( ; zp < pos ; zp++ ) { + if ( ! trie->trans[ zp ].next ) { + break; + } + } + trie->states[ state ].trans.base + = zp + + trie->uniquecharcount + - charid ; + trie->trans[ zp ].next + = SAFE_TRIE_NODENUM( trie->trans[ stateidx + + charid ].next ); + trie->trans[ zp ].check = state; + if ( ++zp > pos ) pos = zp; + break; + } + used--; + } + if ( !flag ) { + flag = 1; + trie->states[ state ].trans.base + = pos + trie->uniquecharcount - charid ; + } + trie->trans[ pos ].next + = SAFE_TRIE_NODENUM( + trie->trans[ stateidx + charid ].next ); + trie->trans[ pos ].check = state; + pos++; + } + } + } + trie->lasttrans = pos + 1; + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, laststate + * sizeof(reg_trie_state) ); + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf( Perl_debug_log, + "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + (int)depth * 2 + 2,"", + (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + + 1 ), + (IV)next_alloc, + (IV)pos, + ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); + ); + + } /* end table compress */ + } + DEBUG_TRIE_COMPILE_MORE_r( + PerlIO_printf(Perl_debug_log, + "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", + (int)depth * 2 + 2, "", + (UV)trie->statecount, + (UV)trie->lasttrans) + ); + /* resize the trans array to remove unused space */ + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, trie->lasttrans + * sizeof(reg_trie_trans) ); + + { /* Modify the program and insert the new TRIE node */ + U8 nodetype =(U8)(flags & 0xFF); + char *str=NULL; + +#ifdef DEBUGGING + regnode *optimize = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + + U32 mjd_offset = 0; + U32 mjd_nodelen = 0; +#endif /* RE_TRACK_PATTERN_OFFSETS */ +#endif /* DEBUGGING */ + /* + This means we convert either the first branch or the first Exact, + depending on whether the thing following (in 'last') is a branch + or not and whther first is the startbranch (ie is it a sub part of + the alternation or is it the whole thing.) + Assuming its a sub part we convert the EXACT otherwise we convert + the whole branch sequence, including the first. + */ + /* Find the node we are going to overwrite */ + if ( first != startbranch || OP( last ) == BRANCH ) { + /* branch sub-chain */ + NEXT_OFF( first ) = (U16)(last - first); +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_r({ + mjd_offset= Node_Offset((convert)); + mjd_nodelen= Node_Length((convert)); + }); +#endif + /* whole branch chain */ + } +#ifdef RE_TRACK_PATTERN_OFFSETS + else { + DEBUG_r({ + const regnode *nop = NEXTOPER( convert ); + mjd_offset= Node_Offset((nop)); + mjd_nodelen= Node_Length((nop)); + }); + } + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", + (int)depth * 2 + 2, "", + (UV)mjd_offset, (UV)mjd_nodelen) + ); +#endif + /* But first we check to see if there is a common prefix we can + split out as an EXACT and put in front of the TRIE node. */ + trie->startstate= 1; + if ( trie->bitmap && !widecharmap && !trie->jump ) { + U32 state; + for ( state = 1 ; state < trie->statecount-1 ; state++ ) { + U32 ofs = 0; + I32 idx = -1; + U32 count = 0; + const U32 base = trie->states[ state ].trans.base; + + if ( trie->states[state].wordnum ) + count = 1; + + for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { + if ( ( base + ofs >= trie->uniquecharcount ) && + ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && + trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + { + if ( ++count > 1 ) { + SV **tmp = av_fetch( revcharmap, ofs, 0); + const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); + if ( state == 1 ) break; + if ( count == 2 ) { + Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*sNew Start State=%"UVuf" Class: [", + (int)depth * 2 + 2, "", + (UV)state)); + if (idx >= 0) { + SV ** const tmp = av_fetch( revcharmap, idx, 0); + const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); + + TRIE_BITMAP_SET(trie,*ch); + if ( folder ) + TRIE_BITMAP_SET(trie, folder[ *ch ]); + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, "%s", (char*)ch) + ); + } + } + TRIE_BITMAP_SET(trie,*ch); + if ( folder ) + TRIE_BITMAP_SET(trie,folder[ *ch ]); + DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch)); + } + idx = ofs; + } + } + if ( count == 1 ) { + SV **tmp = av_fetch( revcharmap, idx, 0); + STRLEN len; + char *ch = SvPV( *tmp, len ); + DEBUG_OPTIMISE_r({ + SV *sv=sv_newmortal(); + PerlIO_printf( Perl_debug_log, + "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", + (int)depth * 2 + 2, "", + (UV)state, (UV)idx, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + }); + if ( state==1 ) { + OP( convert ) = nodetype; + str=STRING(convert); + STR_LEN(convert)=0; + } + STR_LEN(convert) += len; + while (len--) + *str++ = *ch++; + } else { +#ifdef DEBUGGING + if (state>1) + DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); +#endif + break; + } + } + trie->prefixlen = (state-1); + if (str) { + regnode *n = convert+NODE_SZ_STR(convert); + NEXT_OFF(convert) = NODE_SZ_STR(convert); + trie->startstate = state; + trie->minlen -= (state - 1); + trie->maxlen -= (state - 1); +#ifdef DEBUGGING + /* At least the UNICOS C compiler choked on this + * being argument to DEBUG_r(), so let's just have + * it right here. */ + if ( +#ifdef PERL_EXT_RE_BUILD + 1 +#else + DEBUG_r_TEST +#endif + ) { + regnode *fix = convert; + U32 word = trie->wordcount; + mjd_nodelen++; + Set_Node_Offset_Length(convert, mjd_offset, state - 1); + while( ++fix < n ) { + Set_Node_Offset_Length(fix, 0, 0); + } + while (word--) { + SV ** const tmp = av_fetch( trie_words, word, 0 ); + if (tmp) { + if ( STR_LEN(convert) <= SvCUR(*tmp) ) + sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); + else + sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); + } + } + } +#endif + if (trie->maxlen) { + convert = n; + } else { + NEXT_OFF(convert) = (U16)(tail - convert); + DEBUG_r(optimize= n); + } + } + } + if (!jumper) + jumper = last; + if ( trie->maxlen ) { + NEXT_OFF( convert ) = (U16)(tail - convert); + ARG_SET( convert, data_slot ); + /* Store the offset to the first unabsorbed branch in + jump[0], which is otherwise unused by the jump logic. + We use this when dumping a trie and during optimisation. */ + if (trie->jump) + trie->jump[0] = (U16)(nextbranch - convert); + + /* If the start state is not accepting (meaning there is no empty string/NOTHING) + * and there is a bitmap + * and the first "jump target" node we found leaves enough room + * then convert the TRIE node into a TRIEC node, with the bitmap + * embedded inline in the opcode - this is hypothetically faster. + */ + if ( !trie->states[trie->startstate].wordnum + && trie->bitmap + && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) + { + OP( convert ) = TRIEC; + Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); + PerlMemShared_free(trie->bitmap); + trie->bitmap= NULL; + } else + OP( convert ) = TRIE; + + /* store the type in the flags */ + convert->flags = nodetype; + DEBUG_r({ + optimize = convert + + NODE_STEP_REGNODE + + regarglen[ OP( convert ) ]; + }); + /* XXX We really should free up the resource in trie now, + as we won't use them - (which resources?) dmq */ + } + /* needed for dumping*/ + DEBUG_r(if (optimize) { + regnode *opt = convert; + + while ( ++opt < optimize) { + Set_Node_Offset_Length(opt,0,0); + } + /* + Try to clean up some of the debris left after the + optimisation. + */ + while( optimize < jumper ) { + mjd_nodelen += Node_Length((optimize)); + OP( optimize ) = OPTIMIZED; + Set_Node_Offset_Length(optimize,0,0); + optimize++; + } + Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen); + }); + } /* end node insert */ + REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert); + + /* Finish populating the prev field of the wordinfo array. Walk back + * from each accept state until we find another accept state, and if + * so, point the first word's .prev field at the second word. If the + * second already has a .prev field set, stop now. This will be the + * case either if we've already processed that word's accept state, + * or that state had multiple words, and the overspill words were + * already linked up earlier. + */ + { + U16 word; + U32 state; + U16 prev; + + for (word=1; word <= trie->wordcount; word++) { + prev = 0; + if (trie->wordinfo[word].prev) + continue; + state = trie->wordinfo[word].accept; + while (state) { + state = prev_states[state]; + if (!state) + break; + prev = trie->states[state].wordnum; + if (prev) + break; + } + trie->wordinfo[word].prev = prev; + } + Safefree(prev_states); + } + + + /* and now dump out the compressed format */ + DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); + + RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; +#ifdef DEBUGGING + RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; + RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; +#else + SvREFCNT_dec_NN(revcharmap); +#endif + return trie->jump + ? MADE_JUMP_TRIE + : trie->startstate>1 + ? MADE_EXACT_TRIE + : MADE_TRIE; +} + +STATIC regnode * +S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) +{ +/* The Trie is constructed and compressed now so we can build a fail array if + * it's needed + + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and + 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, + Ullman 1985/88 + ISBN 0-201-10088-6 + + We find the fail state for each state in the trie, this state is the longest + proper suffix of the current state's 'word' that is also a proper prefix of + another word in our trie. State 1 represents the word '' and is thus the + default fail state. This allows the DFA not to have to restart after its + tried and failed a word at a given point, it simply continues as though it + had been matching the other word in the first place. + Consider + 'abcdgu'=~/abcdefg|cdgu/ + When we get to 'd' we are still matching the first word, we would encounter + 'g' which would fail, which would bring us to the state representing 'd' in + the second word where we would try 'g' and succeed, proceeding to match + 'cdgu'. + */ + /* add a fail transition */ + const U32 trie_offset = ARG(source); + reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; + U32 *q; + const U32 ucharcount = trie->uniquecharcount; + const U32 numstates = trie->statecount; + const U32 ubound = trie->lasttrans + ucharcount; + U32 q_read = 0; + U32 q_write = 0; + U32 charid; + U32 base = trie->states[ 1 ].trans.base; + U32 *fail; + reg_ac_data *aho; + const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); + regnode *stclass; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; + PERL_UNUSED_CONTEXT; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + if ( OP(source) == TRIE ) { + struct regnode_1 *op = (struct regnode_1 *) + PerlMemShared_calloc(1, sizeof(struct regnode_1)); + StructCopy(source,op,struct regnode_1); + stclass = (regnode *)op; + } else { + struct regnode_charclass *op = (struct regnode_charclass *) + PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); + StructCopy(source,op,struct regnode_charclass); + stclass = (regnode *)op; + } + OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */ + + ARG_SET( stclass, data_slot ); + aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); + RExC_rxi->data->data[ data_slot ] = (void*)aho; + aho->trie=trie_offset; + aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); + Copy( trie->states, aho->states, numstates, reg_trie_state ); + Newxz( q, numstates, U32); + aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); + aho->refcount = 1; + fail = aho->fail; + /* initialize fail[0..1] to be 1 so that we always have + a valid final fail state */ + fail[ 0 ] = fail[ 1 ] = 1; + + for ( charid = 0; charid < ucharcount ; charid++ ) { + const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); + if ( newstate ) { + q[ q_write ] = newstate; + /* set to point at the root */ + fail[ q[ q_write++ ] ]=1; + } + } + while ( q_read < q_write) { + const U32 cur = q[ q_read++ % numstates ]; + base = trie->states[ cur ].trans.base; + + for ( charid = 0 ; charid < ucharcount ; charid++ ) { + const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); + if (ch_state) { + U32 fail_state = cur; + U32 fail_base; + do { + fail_state = fail[ fail_state ]; + fail_base = aho->states[ fail_state ].trans.base; + } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); + + fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); + fail[ ch_state ] = fail_state; + if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) + { + aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; + } + q[ q_write++ % numstates] = ch_state; + } + } + } + /* restore fail[0..1] to 0 so that we "fall out" of the AC loop + when we fail in state 1, this allows us to use the + charclass scan to find a valid start char. This is based on the principle + that theres a good chance the string being searched contains lots of stuff + that cant be a start char. + */ + fail[ 0 ] = fail[ 1 ] = 0; + DEBUG_TRIE_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%*sStclass Failtable (%"UVuf" states): 0", + (int)(depth * 2), "", (UV)numstates + ); + for( q_read=1; q_read%3d: %s (%d)\n", \ + (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ + Next ? (REG_NODE_NUM(Next)) : 0 ); \ + }}); + + +/* The below joins as many adjacent EXACTish nodes as possible into a single + * one. The regop may be changed if the node(s) contain certain sequences that + * require special handling. The joining is only done if: + * 1) there is room in the current conglomerated node to entirely contain the + * next one. + * 2) they are the exact same node type + * + * The adjacent nodes actually may be separated by NOTHING-kind nodes, and + * these get optimized out + * + * If a node is to match under /i (folded), the number of characters it matches + * can be different than its character length if it contains a multi-character + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. + * + * And *unfolded_multi_char is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when whether the fold is valid or + * not won't be known until runtime; namely for EXACTF nodes that contain LATIN + * SMALL LETTER SHARP S, as only if the target string being matched against + * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose + * folding rules depend on the locale in force at runtime. (Multi-char folds + * whose components are all above the Latin1 range are not run-time locale + * dependent, and have already been folded by the time this function is + * called.) + * + * This is as good a place as any to discuss the design of handling these + * multi-character fold sequences. It's been wrong in Perl for a very long + * time. There are three code points in Unicode whose multi-character folds + * were long ago discovered to mess things up. The previous designs for + * dealing with these involved assigning a special node for them. This + * approach doesn't always work, as evidenced by this example: + * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches + * Both sides fold to "sss", but if the pattern is parsed to create a node that + * would match just the \xDF, it won't be able to handle the case where a + * successful match would have to cross the node's boundary. The new approach + * that hopefully generally solves the problem generates an EXACTFU_SS node + * that is "sss" in this case. + * + * It turns out that there are problems with all multi-character folds, and not + * just these three. Now the code is general, for all such cases. The + * approach taken is: + * 1) This routine examines each EXACTFish node that could contain multi- + * character folded sequences. Since a single character can fold into + * such a sequence, the minimum match length for this node is less than + * the number of characters in the node. This routine returns in + * *min_subtract how many characters to subtract from the the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. + * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS + * is used for an EXACTFU node that contains at least one "ss" sequence in + * it. For non-UTF-8 patterns and strings, this is the only case where + * there is a possible fold length change. That means that a regular + * EXACTFU node without UTF-8 involvement doesn't have to concern itself + * with length changes, and so can be processed faster. regexec.c takes + * advantage of this. Generally, an EXACTFish node that is in UTF-8 is + * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't + * known until runtime). This saves effort in regex matching. However, + * the pre-folding isn't done for non-UTF8 patterns because the fold of + * the MICRO SIGN requires UTF-8, and we don't want to slow things down by + * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and, + * again, EXACTFL) nodes fold to isn't known until runtime. The fold + * possibilities for the non-UTF8 patterns are quite simple, except for + * the sharp s. All the ones that don't involve a UTF-8 target string are + * members of a fold-pair, and arrays are set up for all of them so that + * the other member of the pair can be found quickly. Code elsewhere in + * this file makes sure that in EXACTFU nodes, the sharp s gets folded to + * 'ss', even if the pattern isn't UTF-8. This avoids the issues + * described in the next item. + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes + * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL + * nodes, violate the assumption, and they are the only instances where it + * is violated. I'm reluctant to try to change the assumption, as the + * code involved is impenetrable to me (khw), so instead the code here + * punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid + * assumption. Thus, there is no optimization based on string lengths for + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFA where it never does. In an EXACTFA node in + * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) + * + * Similarly, the code that generates tries doesn't currently handle + * not-already-folded multi-char folds, and it looks like a pain to change + * that. Therefore, trie generation of EXACTFA nodes with the sharp s + * doesn't work. Instead, such an EXACTFA is turned into a new regnode, + * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people + * using /iaa matching will be doing so almost entirely with ASCII + * strings, so this should rarely be encountered in practice */ + +#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \ + if (PL_regkind[OP(scan)] == EXACT) \ + join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1) + +STATIC U32 +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, + UV *min_subtract, bool *unfolded_multi_char, + U32 flags,regnode *val, U32 depth) +{ + /* Merge several consecutive EXACTish nodes into one. */ + regnode *n = regnext(scan); + U32 stringok = 1; + regnode *next = scan + NODE_SZ_STR(scan); + U32 merged = 0; + U32 stopnow = 0; +#ifdef DEBUGGING + regnode *stop = scan; + GET_RE_DEBUG_FLAGS_DECL; +#else + PERL_UNUSED_ARG(depth); +#endif + + PERL_ARGS_ASSERT_JOIN_EXACT; +#ifndef EXPERIMENTAL_INPLACESCAN + PERL_UNUSED_ARG(flags); + PERL_UNUSED_ARG(val); +#endif + DEBUG_PEEP("join",scan,depth); + + /* Look through the subsequent nodes in the chain. Skip NOTHING, merge + * EXACT ones that are mergeable to the current one. */ + while (n + && (PL_regkind[OP(n)] == NOTHING + || (stringok && OP(n) == OP(scan))) + && NEXT_OFF(n) + && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) + { + + if (OP(n) == TAIL || n > next) + stringok = 0; + if (PL_regkind[OP(n)] == NOTHING) { + DEBUG_PEEP("skip:",n,depth); + NEXT_OFF(scan) += NEXT_OFF(n); + next = n + NODE_STEP_REGNODE; +#ifdef DEBUGGING + if (stringok) + stop = n; +#endif + n = regnext(n); + } + else if (stringok) { + const unsigned int oldl = STR_LEN(scan); + regnode * const nnext = regnext(n); + + /* XXX I (khw) kind of doubt that this works on platforms (should + * Perl ever run on one) where U8_MAX is above 255 because of lots + * of other assumptions */ + /* Don't join if the sum can't fit into a single node */ + if (oldl + STR_LEN(n) > U8_MAX) + break; + + DEBUG_PEEP("merg",n,depth); + merged++; + + NEXT_OFF(scan) += NEXT_OFF(n); + STR_LEN(scan) += STR_LEN(n); + next = n + NODE_SZ_STR(n); + /* Now we can overwrite *n : */ + Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); +#ifdef DEBUGGING + stop = next - 1; +#endif + n = nnext; + if (stopnow) break; + } + +#ifdef EXPERIMENTAL_INPLACESCAN + if (flags && !NEXT_OFF(n)) { + DEBUG_PEEP("atch", val, depth); + if (reg_off_by_arg[OP(n)]) { + ARG_SET(n, val - n); + } + else { + NEXT_OFF(n) = val - n; + } + stopnow = 1; + } +#endif + } + + *min_subtract = 0; + *unfolded_multi_char = FALSE; + + /* Here, all the adjacent mergeable EXACTish nodes have been merged. We + * can now analyze for sequences of problematic code points. (Prior to + * this final joining, sequences could have been split over boundaries, and + * hence missed). The sequences only happen in folding, hence for any + * non-EXACT EXACTish node */ + if (OP(scan) != EXACT) { + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ + + /* One pass is made over the node's string looking for all the + * possibilities. To avoid some tests in the loop, there are two main + * cases, for UTF-8 patterns (which can't have EXACTF nodes) and + * non-UTF-8 */ + if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *unfolded_multi_char = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ + + /* Examine the string for a multi-character fold sequence. UTF-8 + * patterns have all characters pre-folded by the time this code is + * executed */ + while (s < s_end - 1) /* Can stop 1 before the end, as minimum + length sequence we are looking for is 2 */ + { + int count = 0; /* How many characters in a multi-char fold */ + int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); + if (! len) { /* Not a multi-char fold: get next char */ + s += UTF8SKIP(s); + continue; + } + + /* Nodes with 'ss' require special handling, except for + * EXACTFA-ish for which there is no multi-char fold to this */ + if (len == 2 && *s == 's' && *(s+1) == 's' + && OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE) + { + count = 2; + if (OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; + } + s += 2; + } + else { /* Here is a generic multi-char fold. */ + U8* multi_end = s + len; + + /* Count how many characters are in it. In the case of + * /aa, no folds which contain ASCII code points are + * allowed, so check for those, and skip if found. */ + if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { + count = utf8_length(s, multi_end); + s = multi_end; + } + else { + while (s < multi_end) { + if (isASCII(*s)) { + s++; + goto next_iteration; + } + else { + s += UTF8SKIP(s); + } + count++; + } + } + } + + /* The delta is how long the sequence is minus 1 (1 is how long + * the character that folds to the sequence is) */ + total_count_delta += count - 1; + next_iteration: ; + } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); + } + else if (OP(scan) == EXACTFA) { + + /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char + * fold to the ASCII range (and there are no existing ones in the + * upper latin1 range). But, as outlined in the comments preceding + * this function, we need to flag any occurrences of the sharp s. + * This character forbids trie formation (because of added + * complexity) */ + while (s < s_end) { + if (*s == LATIN_SMALL_LETTER_SHARP_S) { + OP(scan) = EXACTFA_NO_TRIE; + *unfolded_multi_char = TRUE; + break; + } + s++; + continue; + } + } + else { + + /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; + + while (s < upper) { + int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); + if (! len) { /* Not a multi-char fold. */ + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) + { + *unfolded_multi_char = TRUE; + } + s++; + continue; + } + + if (len == 2 + && isARG2_lower_or_UPPER_ARG1('s', *s) + && isARG2_lower_or_UPPER_ARG1('s', *(s+1))) + { + + /* EXACTF nodes need to know that the minimum length + * changed so that a sharp s in the string can match this + * ss in the pattern, but they remain EXACTF nodes, as they + * won't match this unless the target string is is UTF-8, + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { + OP(scan) = EXACTFU_SS; + } + } + + *min_subtract += len - 1; + s += len; + } + } + } + +#ifdef DEBUGGING + /* Allow dumping but overwriting the collection of skipped + * ops and/or strings with fake optimized ops */ + n = scan + NODE_SZ_STR(scan); + while (n <= stop) { + OP(n) = OPTIMIZED; + FLAGS(n) = 0; + NEXT_OFF(n) = 0; + n++; + } +#endif + DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)}); + return stopnow; +} + +/* REx optimizer. Converts nodes into quicker variants "in place". + Finds fixed substrings. */ + +/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set + to the position after last scanned or to NULL. */ + +#define INIT_AND_WITHP \ + assert(!and_withp); \ + Newx(and_withp,1, regnode_ssc); \ + SAVEFREEPV(and_withp) + +/* this is a chain of data about sub patterns we are processing that + need to be handled separately/specially in study_chunk. Its so + we can simulate recursion without losing state. */ +struct scan_frame; +typedef struct scan_frame { + regnode *last; /* last node to process in this frame */ + regnode *next; /* next node to process when last is reached */ + struct scan_frame *prev; /*previous frame*/ + U32 prev_recursed_depth; + I32 stop; /* what stopparen do we use */ +} scan_frame; + + +STATIC SSize_t +S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, + SSize_t *minlenp, SSize_t *deltap, + regnode *last, + scan_data_t *data, + I32 stopparen, + U32 recursed_depth, + regnode_ssc *and_withp, + U32 flags, U32 depth) + /* scanp: Start here (read-write). */ + /* deltap: Write maxlen-minlen here. */ + /* last: Stop before this one. */ + /* data: string data about the pattern */ + /* stopparen: treat close N as END */ + /* recursed: which subroutines have we recursed into */ + /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ +{ + /* There must be at least this number of characters to match */ + SSize_t min = 0; + I32 pars = 0, code; + regnode *scan = *scanp, *next; + SSize_t delta = 0; + int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); + int is_inf_internal = 0; /* The studied chunk is infinite */ + I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; + scan_data_t data_fake; + SV *re_trie_maxbuff = NULL; + regnode *first_non_open = scan; + SSize_t stopmin = SSize_t_MAX; + scan_frame *frame = NULL; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_STUDY_CHUNK; + +#ifdef DEBUGGING + StructCopy(&zero_scan_data, &data_fake, scan_data_t); +#endif + if ( depth == 0 ) { + while (first_non_open && OP(first_non_open) == OPEN) + first_non_open=regnext(first_non_open); + } + + + fake_study_recurse: + while ( scan && OP(scan) != END && scan < last ){ + UV min_subtract = 0; /* How mmany chars to subtract from the minimum + node length to get a real minimum (because + the folded version may be shorter) */ + bool unfolded_multi_char = FALSE; + /* Peephole optimizer: */ + DEBUG_OPTIMISE_MORE_r( + { + PerlIO_printf(Perl_debug_log, + "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ", + ((int) depth*2), "", (long)stopparen, + (unsigned long)depth, (unsigned long)recursed_depth); + if (recursed_depth) { + U32 i; + U32 j; + for ( j = 0 ; j < recursed_depth ; j++ ) { + PerlIO_printf(Perl_debug_log,"["); + for ( i = 0 ; i < (U32)RExC_npar ; i++ ) + PerlIO_printf(Perl_debug_log,"%d", + PAREN_TEST(RExC_study_chunk_recursed + + (j * RExC_study_chunk_recursed_bytes), i) + ? 1 : 0 + ); + PerlIO_printf(Perl_debug_log,"]"); + } + } + PerlIO_printf(Perl_debug_log,"\n"); + } + ); + DEBUG_STUDYDATA("Peep:", data, depth); + DEBUG_PEEP("Peep", scan, depth); + + + /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ + * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled + * by a different invocation of reg() -- Yves + */ + JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); + + /* Follow the next-chain of the current node and optimize + away all the NOTHINGs from it. */ + if (OP(scan) != CURLYX) { + const int max = (reg_off_by_arg[OP(scan)] + ? I32_MAX + /* I32 may be smaller than U16 on CRAYs! */ + : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); + int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); + int noff; + regnode *n = scan; + + /* Skip NOTHING and LONGJMP. */ + while ((n = regnext(n)) + && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) + || ((OP(n) == LONGJMP) && (noff = ARG(n)))) + && off + noff < max) + off += noff; + if (reg_off_by_arg[OP(scan)]) + ARG(scan) = off; + else + NEXT_OFF(scan) = off; + } + + + + /* The principal pseudo-switch. Cannot be a switch, since we + look into several different things. */ + if (OP(scan) == BRANCH || OP(scan) == BRANCHJ + || OP(scan) == IFTHEN) { + next = regnext(scan); + code = OP(scan); + /* demq: the op(next)==code check is to see if we have + * "branch-branch" AFAICT */ + + if (OP(next) == code || code == IFTHEN) { + /* NOTE - There is similar code to this block below for + * handling TRIE nodes on a re-study. If you change stuff here + * check there too. */ + SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; + regnode_ssc accum; + regnode * const startbranch=scan; + + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); + + while (OP(scan) == code) { + SSize_t deltanext, minnext, fake; + I32 f = 0; + regnode_ssc this_class; + + num++; + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + + data_fake.pos_delta = delta; + next = regnext(scan); + scan = NEXTOPER(scan); + if (code != BRANCH) + scan = NEXTOPER(scan); + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f = SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + /* we suppose the run is continuous, last=next...*/ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f,depth+1); + if (min1 > minnext) + min1 = minnext; + if (deltanext == SSize_t_MAX) { + is_inf = is_inf_internal = 1; + max1 = SSize_t_MAX; + } else if (max1 < minnext + deltanext) + max1 = minnext + deltanext; + scan = next; + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > minnext) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); + } + if (code == IFTHEN && num < 2) /* Empty ELSE branch */ + min1 = 0; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + if (data->pos_delta >= SSize_t_MAX - (max1 - min1)) + data->pos_delta = SSize_t_MAX; + else + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->longest = &(data->longest_float); + } + min += min1; + if (delta == SSize_t_MAX + || SSize_t_MAX - delta - (max1 - min1) < 0) + delta = SSize_t_MAX; + else + delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + } + } + + if (PERL_ENABLE_TRIE_OPTIMISATION && + OP( startbranch ) == BRANCH ) + { + /* demq. + + Assuming this was/is a branch we are dealing with: 'scan' + now points at the item that follows the branch sequence, + whatever it is. We now start at the beginning of the + sequence and look for subsequences of + + BRANCH->EXACT=>x1 + BRANCH->EXACT=>x2 + tail + + which would be constructed from a pattern like + /A|LIST|OF|WORDS/ + + If we can find such a subsequence we need to turn the first + element into a trie and then add the subsequent branch exact + strings to the trie. + + We have two cases + + 1. patterns where the whole set of branches can be + converted. + + 2. patterns where only a subset can be converted. + + In case 1 we can replace the whole set with a single regop + for the trie. In case 2 we need to keep the start and end + branches so + + 'BRANCH EXACT; BRANCH EXACT; BRANCH X' + becomes BRANCH TRIE; BRANCH X; + + There is an additional case, that being where there is a + common prefix, which gets split out into an EXACT like node + preceding the TRIE node. + + If x(1..n)==tail then we can do a simple trie, if not we make + a "jump" trie, such that when we match the appropriate word + we "jump" to the appropriate tail node. Essentially we turn + a nested if into a case structure of sorts. + + */ + + int made=0; + if (!re_trie_maxbuff) { + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + if (!SvIOK(re_trie_maxbuff)) + sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); + } + if ( SvIV(re_trie_maxbuff)>=0 ) { + regnode *cur; + regnode *first = (regnode *)NULL; + regnode *last = (regnode *)NULL; + regnode *tail = scan; + U8 trietype = 0; + U32 count=0; + +#ifdef DEBUGGING + SV * const mysv = sv_newmortal(); /* for dumping */ +#endif + /* var tail is used because there may be a TAIL + regop in the way. Ie, the exacts will point to the + thing following the TAIL, but the last branch will + point at the TAIL. So we advance tail. If we + have nested (?:) we may have to move through several + tails. + */ + + while ( OP( tail ) == TAIL ) { + /* this is the TAIL generated by (?:) */ + tail = regnext( tail ); + } + + + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, tail, NULL); + PerlIO_printf( Perl_debug_log, "%*s%s%s\n", + (int)depth * 2 + 2, "", + "Looking for TRIE'able sequences. Tail node is: ", + SvPV_nolen_const( mysv ) + ); + }); + + /* + + Step through the branches + cur represents each branch, + noper is the first thing to be matched as part + of that branch + noper_next is the regnext() of that node. + + We normally handle a case like this + /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also + support building with NOJUMPTRIE, which restricts + the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is + a possible optimization target. If we are building + under NOJUMPTRIE then we require that noper_next is + the same as scan (our current position in the regex + program). + + Once we have two or more consecutive such branches + we can create a trie of the EXACT's contents and + stitch it in place into the program. + + If the sequence represents all of the branches in + the alternation we replace the entire thing with a + single TRIE node. + + Otherwise when it is a subsequence we need to + stitch it in place and replace only the relevant + branches. This means the first branch has to remain + as it is used by the alternation logic, and its + next pointer, and needs to be repointed at the item + on the branch chain following the last branch we + have optimized away. + + This could be either a BRANCH, in which case the + subsequence is internal, or it could be the item + following the branch sequence in which case the + subsequence is at the end (which does not + necessarily mean the first node is the start of the + alternation). + + TRIE_TYPE(X) is a define which maps the optype to a + trietype. + + optype | trietype + ----------------+----------- + NOTHING | NOTHING + EXACT | EXACT + EXACTFU | EXACTFU + EXACTFU_SS | EXACTFU + EXACTFA | EXACTFA + + + */ +#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ + ( EXACT == (X) ) ? EXACT : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ + ( EXACTFA == (X) ) ? EXACTFA : \ + 0 ) + + /* dont use tail as the end marker for this traverse */ + for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { + regnode * const noper = NEXTOPER( cur ); + U8 noper_type = OP( noper ); + U8 noper_trietype = TRIE_TYPE( noper_type ); +#if defined(DEBUGGING) || defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0; +#endif + + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur, NULL); + PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", + (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); + + regprop(RExC_rx, mysv, noper, NULL); + PerlIO_printf( Perl_debug_log, " -> %s", + SvPV_nolen_const(mysv)); + + if ( noper_next ) { + regprop(RExC_rx, mysv, noper_next, NULL); + PerlIO_printf( Perl_debug_log,"\t=> %s\t", + SvPV_nolen_const(mysv)); + } + PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", + REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), + PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] + ); + }); + + /* Is noper a trieable nodetype that can be merged + * with the current trie (if there is one)? */ + if ( noper_trietype + && + ( + ( noper_trietype == NOTHING) + || ( trietype == NOTHING ) + || ( trietype == noper_trietype ) + ) +#ifdef NOJUMPTRIE + && noper_next == tail +#endif + && count < U16_MAX) + { + /* Handle mergable triable node Either we are + * the first node in a new trieable sequence, + * in which case we do some bookkeeping, + * otherwise we update the end pointer. */ + if ( !first ) { + first = cur; + if ( noper_trietype == NOTHING ) { +#if !defined(DEBUGGING) && !defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; +#endif + + if ( noper_next_trietype ) { + trietype = noper_next_trietype; + } else if (noper_next_type) { + /* a NOTHING regop is 1 regop wide. + * We need at least two for a trie + * so we can't merge this in */ + first = NULL; + } + } else { + trietype = noper_trietype; + } + } else { + if ( trietype == NOTHING ) + trietype = noper_trietype; + last = cur; + } + if (first) + count++; + } /* end handle mergable triable node */ + else { + /* handle unmergable node - + * noper may either be a triable node which can + * not be tried together with the current trie, + * or a non triable node */ + if ( last ) { + /* If last is set and trietype is not + * NOTHING then we have found at least two + * triable branch sequences in a row of a + * similar trietype so we can turn them + * into a trie. If/when we allow NOTHING to + * start a trie sequence this condition + * will be required, and it isn't expensive + * so we leave it in for now. */ + if ( trietype && trietype != NOTHING ) + make_trie( pRExC_state, + startbranch, first, cur, tail, + count, trietype, depth+1 ); + last = NULL; /* note: we clear/update + first, trietype etc below, + so we dont do it here */ + } + if ( noper_trietype +#ifdef NOJUMPTRIE + && noper_next == tail +#endif + ){ + /* noper is triable, so we can start a new + * trie sequence */ + count = 1; + first = cur; + trietype = noper_trietype; + } else if (first) { + /* if we already saw a first but the + * current node is not triable then we have + * to reset the first information. */ + count = 0; + first = NULL; + trietype = 0; + } + } /* end handle unmergable node */ + } /* loop over branches */ + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur, NULL); + PerlIO_printf( Perl_debug_log, + "%*s- %s (%d) \n", + (int)depth * 2 + 2, + "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + + }); + if ( last && trietype ) { + if ( trietype != NOTHING ) { + /* the last branch of the sequence was part of + * a trie, so we have to construct it here + * outside of the loop */ + made= make_trie( pRExC_state, startbranch, + first, scan, tail, count, + trietype, depth+1 ); +#ifdef TRIE_STUDY_OPT + if ( ((made == MADE_EXACT_TRIE && + startbranch == first) + || ( first_non_open == first )) && + depth==0 ) { + flags |= SCF_TRIE_RESTUDY; + if ( startbranch == first + && scan == tail ) + { + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; + } + } +#endif + } else { + /* at this point we know whatever we have is a + * NOTHING sequence/branch AND if 'startbranch' + * is 'first' then we can turn the whole thing + * into a NOTHING + */ + if ( startbranch == first ) { + regnode *opt; + /* the entire thing is a NOTHING sequence, + * something like this: (?:|) So we can + * turn it into a plain NOTHING op. */ + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur, NULL); + PerlIO_printf( Perl_debug_log, + "%*s- %s (%d) \n", (int)depth * 2 + 2, + "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + + }); + OP(startbranch)= NOTHING; + NEXT_OFF(startbranch)= tail - startbranch; + for ( opt= startbranch + 1; opt < tail ; opt++ ) + OP(opt)= OPTIMIZED; + } + } + } /* end if ( last) */ + } /* TRIE_MAXBUF is non zero */ + + } /* do trie */ + + } + else if ( code == BRANCHJ ) { /* single branch is optimized. */ + scan = NEXTOPER(NEXTOPER(scan)); + } else /* single branch is optimized. */ + scan = NEXTOPER(scan); + continue; + } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) { + scan_frame *newframe = NULL; + I32 paren; + regnode *start; + regnode *end; + U32 my_recursed_depth= recursed_depth; + + if (OP(scan) != SUSPEND) { + /* set the pointer */ + if (OP(scan) == GOSUB) { + paren = ARG(scan); + RExC_recurse[ARG2L(scan)] = scan; + start = RExC_open_parens[paren-1]; + end = RExC_close_parens[paren-1]; + } else { + paren = 0; + start = RExC_rxi->program + 1; + end = RExC_opend; + } + if (!recursed_depth + || + !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) + ) { + if (!recursed_depth) { + Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); + } else { + Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), + RExC_study_chunk_recursed_bytes, U8); + } + /* we havent recursed into this paren yet, so recurse into it */ + DEBUG_STUDYDATA("set:", data,depth); + PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); + my_recursed_depth= recursed_depth + 1; + Newx(newframe,1,scan_frame); + } else { + DEBUG_STUDYDATA("inf:", data,depth); + /* some form of infinite recursion, assume infinite length + * */ + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; + } + } else { + Newx(newframe,1,scan_frame); + paren = stopparen; + start = scan+2; + end = regnext(scan); + } + if (newframe) { + assert(start); + assert(end); + SAVEFREEPV(newframe); + newframe->next = regnext(scan); + newframe->last = last; + newframe->stop = stopparen; + newframe->prev = frame; + newframe->prev_recursed_depth = recursed_depth; + + DEBUG_STUDYDATA("frame-new:",data,depth); + DEBUG_PEEP("fnew", scan, depth); + + frame = newframe; + scan = start; + stopparen = paren; + last = end; + depth = depth + 1; + recursed_depth= my_recursed_depth; + + continue; + } + } + else if (OP(scan) == EXACT) { + SSize_t l = STR_LEN(scan); + UV uc; + if (UTF) { + const U8 * const s = (U8*)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + l, NULL); + l = utf8_length(s, s + l); + } else { + uc = *((U8*)STRING(scan)); + } + min += l; + if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ + /* The code below prefers earlier match for fixed + offset, later match for variable offset. */ + if (data->last_end == -1) { /* Update the start info. */ + data->last_start_min = data->pos_min; + data->last_start_max = is_inf + ? SSize_t_MAX : data->pos_min + data->pos_delta; + } + sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); + if (UTF) + SvUTF8_on(data->last_found); + { + SV * const sv = data->last_found; + MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += utf8_length((U8*)STRING(scan), + (U8*)STRING(scan)+STR_LEN(scan)); + } + data->last_end = data->pos_min + l; + data->pos_min += l; /* As in the first entry. */ + data->flags &= ~SF_BEFORE_EOL; + } + + /* ANDing the code point leaves at most it, and not in locale, and + * can't match null string */ + if (flags & SCF_DO_STCLASS_AND) { + ssc_cp_and(data->start_class, uc); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ssc_clear_locale(data->start_class); + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_add_cp(data->start_class, uc); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + } + else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is + EXACTFish */ + SSize_t l = STR_LEN(scan); + UV uc = *((U8*)STRING(scan)); + SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 + separate code points */ + const U8 * s = (U8*)STRING(scan); + + /* Search for fixed substrings supports EXACT only. */ + if (flags & SCF_DO_SUBSTR) { + assert(data); + scan_commit(pRExC_state, data, minlenp, is_inf); + } + if (UTF) { + uc = utf8_to_uvchr_buf(s, s + l, NULL); + l = utf8_length(s, s + l); + } + if (unfolded_multi_char) { + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; + } + min += l - min_subtract; + assert (min >= 0); + delta += min_subtract; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += l - min_subtract; + if (data->pos_min < 0) { + data->pos_min = 0; + } + data->pos_delta += min_subtract; + if (min_subtract) { + data->longest = &(data->longest_float); + } + } + + if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) { + ssc_clear_locale(data->start_class); + } + + if (! UTF) { + + /* We punt and assume can match anything if the node begins + * with a multi-character fold. Things are complicated. For + * example, /ffi/i could match any of: + * "\N{LATIN SMALL LIGATURE FFI}" + * "\N{LATIN SMALL LIGATURE FF}I" + * "F\N{LATIN SMALL LIGATURE FI}" + * plus several other things; and making sure we have all the + * possibilities is hard. */ + if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); + } + else { + + /* Any Latin1 range character can potentially match any + * other depending on the locale */ + if (OP(scan) == EXACTFL) { + _invlist_union(EXACTF_invlist, PL_Latin1, + &EXACTF_invlist); + } + else { + /* But otherwise, it matches at least itself. We can + * quickly tell if it has a distinct fold, and if so, + * it matches that as well */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (IS_IN_SOME_FOLD_L1(uc)) { + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, + PL_fold_latin1[uc]); + } + } + + /* Some characters match above-Latin1 ones under /i. This + * is true of EXACTFL ones when the locale is UTF-8 */ + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) + && (! isASCII(uc) || (OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE))) + { + add_above_Latin1_folds(pRExC_state, + (U8) uc, + &EXACTF_invlist); + } + } + } + else { /* Pattern is UTF-8 */ + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + STRLEN foldlen = UTF8SKIP(s); + const U8* e = s + STR_LEN(scan); + SV** listp; + + /* The only code points that aren't folded in a UTF EXACTFish + * node are are the problematic ones in EXACTFL nodes */ + if (OP(scan) == EXACTFL + && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) + { + /* We need to check for the possibility that this EXACTFL + * node begins with a multi-char fold. Therefore we fold + * the first few characters of it so that we can make that + * check */ + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD(*s); + s++; + } + else { + STRLEN len; + to_utf8_fold(s, d, &len); + d += len; + s += UTF8SKIP(s); + } + } + + /* And set up so the code below that looks in this folded + * buffer instead of the node's string */ + e = d; + foldlen = UTF8SKIP(folded); + s = folded; + } + + /* When we reach here 's' points to the fold of the first + * character(s) of the node; and 'e' points to far enough along + * the folded string to be just past any possible multi-char + * fold. 'foldlen' is the length in bytes of the first + * character in 's' + * + * Unlike the non-UTF-8 case, the macro for determining if a + * string is a multi-char fold requires all the characters to + * already be folded. This is because of all the complications + * if not. Note that they are folded anyway, except in EXACTFL + * nodes. Like the non-UTF case above, we punt if the node + * begins with a multi-char fold */ + + if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); + } + else { /* Single char fold */ + + /* It matches all the things that fold to it, which are + * found in PL_utf8_foldclosures (including itself) */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) s, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE) + && isASCII(c) != isASCII(uc)) + { + continue; + } + + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c); + } + } + } + } + if (flags & SCF_DO_STCLASS_AND) { + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); + } + else if (REGNODE_VARIES(OP(scan))) { + SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; + I32 fl = 0, f = flags; + regnode * const oscan = scan; + regnode_ssc this_class; + regnode_ssc *oclass = NULL; + I32 next_is_eval = 0; + + switch (PL_regkind[OP(scan)]) { + case WHILEM: /* End of (?:...)* . */ + scan = NEXTOPER(scan); + goto finish; + case PLUS: + if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { + next = NEXTOPER(scan); + if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { + mincount = 1; + maxcount = REG_INFTY; + next = regnext(scan); + scan = NEXTOPER(scan); + goto do_curly; + } + } + if (flags & SCF_DO_SUBSTR) + data->pos_min++; + min++; + /* FALLTHROUGH */ + case STAR: + if (flags & SCF_DO_STCLASS) { + mincount = 0; + maxcount = REG_INFTY; + next = regnext(scan); + scan = NEXTOPER(scan); + goto do_curly; + } + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + scan = regnext(scan); + goto optimize_curly_tail; + case CURLY: + if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) + && (scan->flags == stopparen)) + { + mincount = 1; + maxcount = 1; + } else { + mincount = ARG1(scan); + maxcount = ARG2(scan); + } + next = regnext(scan); + if (OP(scan) == CURLYX) { + I32 lp = (data ? *(data->last_closep) : 0); + scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); + } + scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; + next_is_eval = (OP(scan) == EVAL); + do_curly: + if (flags & SCF_DO_SUBSTR) { + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ + pos_before = data->pos_min; + } + if (data) { + fl = data->flags; + data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); + if (is_inf) + data->flags |= SF_IS_INF; + } + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + oclass = data->start_class; + data->start_class = &this_class; + f |= SCF_DO_STCLASS_AND; + f &= ~SCF_DO_STCLASS_OR; + } + /* Exclude from super-linear cache processing any {n,m} + regops for which the combination of input pos and regex + pos is not enough information to determine if a match + will be possible. + + For example, in the regex /foo(bar\s*){4,8}baz/ with the + regex pos at the \s*, the prospects for a match depend not + only on the input position but also on how many (bar\s*) + repeats into the {4,8} we are. */ + if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) + f &= ~SCF_WHILEM_VISITED_POS; + + /* This will finish on WHILEM, setting scan, or on NULL: */ + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, + last, data, stopparen, recursed_depth, NULL, + (mincount == 0 + ? (f & ~SCF_DO_SUBSTR) + : f) + ,depth+1); + + if (flags & SCF_DO_STCLASS) + data->start_class = oclass; + if (mincount == 0 || minnext == 0) { + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + } + else if (flags & SCF_DO_STCLASS_AND) { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&this_class, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + } + } else { /* Non-zero len */ + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + } + else if (flags & SCF_DO_STCLASS_AND) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + flags &= ~SCF_DO_STCLASS; + } + if (!scan) /* It was not CURLYX, but CURLY. */ + scan = next; + if (!(flags & SCF_TRIE_DOING_RESTUDY) + /* ? quantifier ok, except for (?{ ... }) */ + && (next_is_eval || !(mincount == 0 && maxcount == 1)) + && (minnext == 0) && (deltanext == 0) + && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) + && maxcount <= REG_INFTY/3) /* Complement check for big + count */ + { + /* Fatal warnings may leak the regexp without this: */ + SAVEFREESV(RExC_rx_sv); + ckWARNreg(RExC_parse, + "Quantifier unexpected on zero-length expression"); + (void)ReREFCNT_inc(RExC_rx_sv); + } + + min += minnext * mincount; + is_inf_internal |= deltanext == SSize_t_MAX + || (maxcount == REG_INFTY && minnext + deltanext > 0); + is_inf |= is_inf_internal; + if (is_inf) { + delta = SSize_t_MAX; + } else { + delta += (minnext + deltanext) * maxcount + - minnext * mincount; + } + /* Try powerful optimization CURLYX => CURLYN. */ + if ( OP(oscan) == CURLYX && data + && data->flags & SF_IN_PAR + && !(data->flags & SF_HAS_EVAL) + && !deltanext && minnext == 1 ) { + /* Try to optimize to CURLYN. */ + regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; + regnode * const nxt1 = nxt; +#ifdef DEBUGGING + regnode *nxt2; +#endif + + /* Skip open. */ + nxt = regnext(nxt); + if (!REGNODE_SIMPLE(OP(nxt)) + && !(PL_regkind[OP(nxt)] == EXACT + && STR_LEN(nxt) == 1)) + goto nogo; +#ifdef DEBUGGING + nxt2 = nxt; +#endif + nxt = regnext(nxt); + if (OP(nxt) != CLOSE) + goto nogo; + if (RExC_open_parens) { + RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ + } + /* Now we know that nxt2 is the only contents: */ + oscan->flags = (U8)ARG(nxt); + OP(oscan) = CURLYN; + OP(nxt1) = NOTHING; /* was OPEN. */ + +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ +#endif + } + nogo: + + /* Try optimization CURLYX => CURLYM. */ + if ( OP(oscan) == CURLYX && data + && !(data->flags & SF_HAS_PAR) + && !(data->flags & SF_HAS_EVAL) + && !deltanext /* atom is fixed width */ + && minnext != 0 /* CURLYM can't handle zero width */ + + /* Nor characters whose fold at run-time may be + * multi-character */ + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) + ) { + /* XXXX How to optimize if data == 0? */ + /* Optimize to a simpler form. */ + regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */ + regnode *nxt2; + + OP(oscan) = CURLYM; + while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ + && (OP(nxt2) != WHILEM)) + nxt = nxt2; + OP(nxt2) = SUCCEED; /* Whas WHILEM */ + /* Need to optimize away parenths. */ + if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { + /* Set the parenth number. */ + regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ + + oscan->flags = (U8)ARG(nxt); + if (RExC_open_parens) { + RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ + } + OP(nxt1) = OPTIMIZED; /* was OPEN. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ +#endif +#if 0 + while ( nxt1 && (OP(nxt1) != WHILEM)) { + regnode *nnxt = regnext(nxt1); + if (nnxt == nxt) { + if (reg_off_by_arg[OP(nxt1)]) + ARG_SET(nxt1, nxt2 - nxt1); + else if (nxt2 - nxt1 < U16_MAX) + NEXT_OFF(nxt1) = nxt2 - nxt1; + else + OP(nxt) = NOTHING; /* Cannot beautify */ + } + nxt1 = nnxt; + } +#endif + /* Optimize again: */ + study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, + NULL, stopparen, recursed_depth, NULL, 0,depth+1); + } + else + oscan->flags = 0; + } + else if ((OP(oscan) == CURLYX) + && (flags & SCF_WHILEM_VISITED_POS) + /* See the comment on a similar expression above. + However, this time it's not a subexpression + we care about, but the expression itself. */ + && (maxcount == REG_INFTY) + && data && ++data->whilem_c < 16) { + /* This stays as CURLYX, we can put the count/of pair. */ + /* Find WHILEM (as in regexec.c) */ + regnode *nxt = oscan + NEXT_OFF(oscan); + + if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ + nxt += ARG(nxt); + PREVOPER(nxt)->flags = (U8)(data->whilem_c + | (RExC_whilem_seen << 4)); /* On WHILEM */ + } + if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (flags & SCF_DO_SUBSTR) { + SV *last_str = NULL; + STRLEN last_chrs = 0; + int counted = mincount != 0; + + if (data->last_end > 0 && mincount != 0) { /* Ends with a + string. */ + SSize_t b = pos_before >= data->last_start_min + ? pos_before : data->last_start_min; + STRLEN l; + const char * const s = SvPV_const(data->last_found, l); + SSize_t old = b - data->last_start_min; + + if (UTF) + old = utf8_hop((U8*)s, old) - (U8*)s; + l -= old; + /* Get the added string: */ + last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; + if (deltanext == 0 && pos_before == b) { + /* What was added is a constant string */ + if (mincount > 1) { + + SvGROW(last_str, (mincount * l) + 1); + repeatcpy(SvPVX(last_str) + l, + SvPVX_const(last_str), l, + mincount - 1); + SvCUR_set(last_str, SvCUR(last_str) * mincount); + /* Add additional parts. */ + SvCUR_set(data->last_found, + SvCUR(data->last_found) - l); + sv_catsv(data->last_found, last_str); + { + SV * sv = data->last_found; + MAGIC *mg = + SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += last_chrs * (mincount-1); + } + last_chrs *= mincount; + data->last_end += l * (mincount - 1); + } + } else { + /* start offset must point into the last copy */ + data->last_start_min += minnext * (mincount - 1); + data->last_start_max += is_inf ? SSize_t_MAX + : (maxcount - 1) * (minnext + data->pos_delta); + } + } + /* It is counted once already... */ + data->pos_min += minnext * (mincount - counted); +#if 0 +PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf + " SSize_t_MAX=%"UVuf" minnext=%"UVuf + " maxcount=%"UVuf" mincount=%"UVuf"\n", + (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, + (UV)mincount); +if (deltanext != SSize_t_MAX) +PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", + (UV)(-counted * deltanext + (minnext + deltanext) * maxcount + - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); +#endif + if (deltanext == SSize_t_MAX + || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta) + data->pos_delta = SSize_t_MAX; + else + data->pos_delta += - counted * deltanext + + (minnext + deltanext) * maxcount - minnext * mincount; + if (mincount != maxcount) { + /* Cannot extend fixed substrings found inside + the group. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + if (mincount && last_str) { + SV * const sv = data->last_found; + MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + + if (mg) + mg->mg_len = -1; + sv_setsv(sv, last_str); + data->last_end = data->pos_min; + data->last_start_min = data->pos_min - last_chrs; + data->last_start_max = is_inf + ? SSize_t_MAX + : data->pos_min + data->pos_delta - last_chrs; + } + data->longest = &(data->longest_float); + } + SvREFCNT_dec(last_str); + } + if (data && (fl & SF_HAS_EVAL)) + data->flags |= SF_HAS_EVAL; + optimize_curly_tail: + if (OP(oscan) != CURLYX) { + while (PL_regkind[OP(next = regnext(oscan))] == NOTHING + && NEXT_OFF(next)) + NEXT_OFF(oscan) += NEXT_OFF(next); + } + continue; + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", + OP(scan)); +#endif + case REF: + case CLUMP: + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) { + if (OP(scan) == CLUMP) { + /* Actually is any start char, but very few code points + * aren't start characters */ + ssc_match_all_cp(data->start_class); + } + else { + ssc_anything(data->start_class); + } + } + flags &= ~SCF_DO_STCLASS; + break; + } + } + else if (OP(scan) == LNBREAK) { + if (flags & SCF_DO_STCLASS) { + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + PL_XPosix_ptrs[_CC_VERTSPACE], + FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg for + * 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + } + min++; + delta++; /* Because of the 2 char string cr-lf */ + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min += 1; + data->pos_delta += 1; + data->longest = &(data->longest_float); + } + } + else if (REGNODE_SIMPLE(OP(scan))) { + + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min++; + } + min++; + if (flags & SCF_DO_STCLASS) { + bool invert = 0; + SV* my_invlist = sv_2mortal(_new_invlist(0)); + U8 namedclass; + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + + /* Some of the logic below assumes that switching + locale on will only add false positives. */ + switch (OP(scan)) { + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", + OP(scan)); +#endif + case CANY: + case SANY: + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_match_all_cp(data->start_class); + break; + + case REG_ANY: + { + SV* REG_ANY_invlist = _new_invlist(2); + REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, + '\n'); + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert, hence all but \n + */ + ); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert */ + ); + ssc_clear_locale(data->start_class); + } + SvREFCNT_dec_NN(REG_ANY_invlist); + } + break; + + case ANYOF: + if (flags & SCF_DO_STCLASS_AND) + ssc_and(pRExC_state, data->start_class, + (regnode_charclass *) scan); + else + ssc_or(pRExC_state, data->start_class, + (regnode_charclass *) scan); + break; + + case NPOSIXL: + invert = 1; + /* FALLTHROUGH */ + + case POSIXL: + namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; + if (flags & SCF_DO_STCLASS_AND) { + bool was_there = cBOOL( + ANYOF_POSIXL_TEST(data->start_class, + namedclass)); + ANYOF_POSIXL_ZERO(data->start_class); + if (was_there) { /* Do an AND */ + ANYOF_POSIXL_SET(data->start_class, namedclass); + } + /* No individual code points can now match */ + data->start_class->invlist + = sv_2mortal(_new_invlist(0)); + } + else { + int complement = namedclass + ((invert) ? -1 : 1); + + assert(flags & SCF_DO_STCLASS_OR); + + /* If the complement of this class was already there, + * the result is that they match all code points, + * (\d + \D == everything). Remove the classes from + * future consideration. Locale is not relevant in + * this case */ + if (ANYOF_POSIXL_TEST(data->start_class, complement)) { + ssc_match_all_cp(data->start_class); + ANYOF_POSIXL_CLEAR(data->start_class, namedclass); + ANYOF_POSIXL_CLEAR(data->start_class, complement); + } + else { /* The usual case; just add this class to the + existing set */ + ANYOF_POSIXL_SET(data->start_class, namedclass); + } + } + break; + + case NPOSIXA: /* For these, we always know the exact set of + what's matched */ + invert = 1; + /* FALLTHROUGH */ + case POSIXA: + if (FLAGS(scan) == _CC_ASCII) { + my_invlist = PL_XPosix_ptrs[_CC_ASCII]; + } + else { + _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)], + PL_XPosix_ptrs[_CC_ASCII], + &my_invlist); + } + goto join_posix; + + case NPOSIXD: + case NPOSIXU: + invert = 1; + /* FALLTHROUGH */ + case POSIXD: + case POSIXU: + my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); + + /* NPOSIXD matches all upper Latin1 code points unless the + * target string being matched is UTF-8, which is + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); + } + + join_posix: + + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, my_invlist, invert); + ssc_clear_locale(data->start_class); + } + else { + assert(flags & SCF_DO_STCLASS_OR); + ssc_union(data->start_class, my_invlist, invert); + } + } + if (flags & SCF_DO_STCLASS_OR) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { + data->flags |= (OP(scan) == MEOL + ? SF_BEFORE_MEOL + : SF_BEFORE_SEOL); + scan_commit(pRExC_state, data, minlenp, is_inf); + + } + else if ( PL_regkind[OP(scan)] == BRANCHJ + /* Lookbehind, or need to calculate parens/evals/stclass: */ + && (scan->flags || data || (flags & SCF_DO_STCLASS)) + && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) + { + if ( OP(scan) == UNLESSM && + scan->flags == 0 && + OP(NEXTOPER(NEXTOPER(scan))) == NOTHING && + OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED + ) { + regnode *opt; + regnode *upto= regnext(scan); + DEBUG_PARSE_r({ + SV * const mysv_val=sv_newmortal(); + DEBUG_STUDYDATA("OPFAIL",data,depth); + + /*DEBUG_PARSE_MSG("opfail");*/ + regprop(RExC_rx, mysv_val, upto, NULL); + PerlIO_printf(Perl_debug_log, + "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(upto), + (IV)(upto - scan) + ); + }); + OP(scan) = OPFAIL; + NEXT_OFF(scan) = upto - scan; + for (opt= scan + 1; opt < upto ; opt++) + OP(opt) = OPTIMIZED; + scan= upto; + continue; + } + if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY + || OP(scan) == UNLESSM ) + { + /* Negative Lookahead/lookbehind + In this case we can't do fixed string optimisation. + */ + + SSize_t deltanext, minnext, fake = 0; + regnode *nscan; + regnode_ssc intrnl; + int f = 0; + + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + data_fake.pos_delta = delta; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + ssc_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, + last, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1); + if (scan->flags) { + if (deltanext) { + FAIL("Variable length lookbehind not implemented"); + } + else if (minnext > (I32)U8_MAX) { + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); + } + scan->flags = (U8)minnext; + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (f & SCF_DO_STCLASS_AND) { + if (flags & SCF_DO_STCLASS_OR) { + /* OR before, AND after: ideally we would recurse with + * data_fake to get the AND applied by study of the + * remainder of the pattern, and then derecurse; + * *** HACK *** for now just treat as "no information". + * See [perl #56690]. + */ + ssc_init(pRExC_state, data->start_class); + } else { + /* AND before and after: combine and continue. These + * assertions are zero-length, so can match an EMPTY + * string */ + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + } + } + } +#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY + else { + /* Positive Lookahead/lookbehind + In this case we can do fixed string optimisation, + but we must be careful about it. Note in the case of + lookbehind the positions will be offset by the minimum + length of the pattern, something we won't know about + until after the recurse. + */ + SSize_t deltanext, fake = 0; + regnode *nscan; + regnode_ssc intrnl; + int f = 0; + /* We use SAVEFREEPV so that when the full compile + is finished perl will clean up the allocated + minlens when it's all done. This way we don't + have to worry about freeing them when we know + they wont be used, which would be a pain. + */ + SSize_t *minnextp; + Newx( minnextp, 1, SSize_t ); + SAVEFREEPV(minnextp); + + if (data) { + StructCopy(data, &data_fake, scan_data_t); + if ((flags & SCF_DO_SUBSTR) && data->last_found) { + f |= SCF_DO_SUBSTR; + if (scan->flags) + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); + data_fake.last_found=newSVsv(data->last_found); + } + } + else + data_fake.last_closep = &fake; + data_fake.flags = 0; + data_fake.pos_delta = delta; + if (is_inf) + data_fake.flags |= SF_IS_INF; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + ssc_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = NEXTOPER(NEXTOPER(scan)); + + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, + &deltanext, last, &data_fake, + stopparen, recursed_depth, NULL, + f,depth+1); + if (scan->flags) { + if (deltanext) { + FAIL("Variable length lookbehind not implemented"); + } + else if (*minnextp > (I32)U8_MAX) { + FAIL2("Lookbehind longer than %"UVuf" not implemented", + (UV)U8_MAX); + } + scan->flags = (U8)*minnextp; + } + + *minnextp += min; + + if (f & SCF_DO_STCLASS_AND) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); + ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { + if (RExC_rx->minlen<*minnextp) + RExC_rx->minlen=*minnextp; + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); + SvREFCNT_dec_NN(data_fake.last_found); + + if ( data_fake.minlen_fixed != minlenp ) + { + data->offset_fixed= data_fake.offset_fixed; + data->minlen_fixed= data_fake.minlen_fixed; + data->lookbehind_fixed+= scan->flags; + } + if ( data_fake.minlen_float != minlenp ) + { + data->minlen_float= data_fake.minlen_float; + data->offset_float_min=data_fake.offset_float_min; + data->offset_float_max=data_fake.offset_float_max; + data->lookbehind_float+= scan->flags; + } + } + } + } +#endif + } + else if (OP(scan) == OPEN) { + if (stopparen != (I32)ARG(scan)) + pars++; + } + else if (OP(scan) == CLOSE) { + if (stopparen == (I32)ARG(scan)) { + break; + } + if ((I32)ARG(scan) == is_par) { + next = regnext(scan); + + if ( next && (OP(next) != WHILEM) && next < last) + is_par = 0; /* Disable optimization */ + } + if (data) + *(data->last_closep) = ARG(scan); + } + else if (OP(scan) == EVAL) { + if (data) + data->flags |= SF_HAS_EVAL; + } + else if ( PL_regkind[OP(scan)] == ENDLIKE ) { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + flags &= ~SCF_DO_SUBSTR; + } + if (data && OP(scan)==ACCEPT) { + data->flags |= SCF_SEEN_ACCEPT; + if (stopmin > min) + stopmin = min; + } + } + else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ + { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->longest = &(data->longest_float); + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; + } + else if (OP(scan) == GPOS) { + if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && + !(delta || is_inf || (data && data->pos_delta))) + { + if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->intflags |= PREGf_ANCH_GPOS; + if (RExC_rx->gofs < (STRLEN)min) + RExC_rx->gofs = min; + } else { + RExC_rx->intflags |= PREGf_GPOS_FLOAT; + RExC_rx->gofs = 0; + } + } +#ifdef TRIE_STUDY_OPT +#ifdef FULL_TRIE_STUDY + else if (PL_regkind[OP(scan)] == TRIE) { + /* NOTE - There is similar code to this block above for handling + BRANCH nodes on the initial study. If you change stuff here + check there too. */ + regnode *trie_node= scan; + regnode *tail= regnext(scan); + reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; + SSize_t max1 = 0, min1 = SSize_t_MAX; + regnode_ssc accum; + + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); + + if (!trie->jump) { + min1= trie->minlen; + max1= trie->maxlen; + } else { + const regnode *nextbranch= NULL; + U32 word; + + for ( word=1 ; word <= trie->wordcount ; word++) + { + SSize_t deltanext=0, minnext=0, f = 0, fake; + regnode_ssc this_class; + + data_fake.flags = 0; + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; + data_fake.pos_delta = delta; + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f = SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + if (trie->jump[word]) { + if (!nextbranch) + nextbranch = trie_node + trie->jump[0]; + scan= trie_node + trie->jump[word]; + /* We go from the jump point to the branch that follows + it. Note this means we need the vestigal unused + branches even though they arent otherwise used. */ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, (regnode *)nextbranch, &data_fake, + stopparen, recursed_depth, NULL, f,depth+1); + } + if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) + nextbranch= regnext((regnode*)nextbranch); + + if (min1 > (SSize_t)(minnext + trie->minlen)) + min1 = minnext + trie->minlen; + if (deltanext == SSize_t_MAX) { + is_inf = is_inf_internal = 1; + max1 = SSize_t_MAX; + } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) + max1 = minnext + deltanext + trie->maxlen; + + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > min + min1) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); + } + } + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->longest = &(data->longest_float); + } + min += min1; + delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + } + } + scan= tail; + continue; + } +#else + else if (PL_regkind[OP(scan)] == TRIE) { + reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; + U8*bang=NULL; + + min += trie->minlen; + delta += (trie->maxlen - trie->minlen); + flags &= ~SCF_DO_STCLASS; /* xxx */ + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min += trie->minlen; + data->pos_delta += (trie->maxlen - trie->minlen); + if (trie->maxlen != trie->minlen) + data->longest = &(data->longest_float); + } + if (trie->jump) /* no more substrings -- for now /grr*/ + flags &= ~SCF_DO_SUBSTR; + } +#endif /* old or new */ +#endif /* TRIE_STUDY_OPT */ + + /* Else: zero-length, ignore. */ + scan = regnext(scan); + } + /* If we are exiting a recursion we can unset its recursed bit + * and allow ourselves to enter it again - no danger of an + * infinite loop there. + if (stopparen > -1 && recursed) { + DEBUG_STUDYDATA("unset:", data,depth); + PAREN_UNSET( recursed, stopparen); + } + */ + if (frame) { + DEBUG_STUDYDATA("frame-end:",data,depth); + DEBUG_PEEP("fend", scan, depth); + /* restore previous context */ + last = frame->last; + scan = frame->next; + stopparen = frame->stop; + recursed_depth = frame->prev_recursed_depth; + depth = depth - 1; + + frame = frame->prev; + goto fake_study_recurse; + } + + finish: + assert(!frame); + DEBUG_STUDYDATA("pre-fin:",data,depth); + + *scanp = scan; + *deltap = is_inf_internal ? SSize_t_MAX : delta; + + if (flags & SCF_DO_SUBSTR && is_inf) + data->pos_delta = SSize_t_MAX - data->pos_min; + if (is_par > (I32)U8_MAX) + is_par = 0; + if (is_par && pars==1 && data) { + data->flags |= SF_IN_PAR; + data->flags &= ~SF_HAS_PAR; + } + else if (pars && data) { + data->flags |= SF_HAS_PAR; + data->flags &= ~SF_IN_PAR; + } + if (flags & SCF_DO_STCLASS_OR) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + if (flags & SCF_TRIE_RESTUDY) + data->flags |= SCF_TRIE_RESTUDY; + + DEBUG_STUDYDATA("post-fin:",data,depth); + + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { + RExC_maxlen = final_minlen + delta; + } + return final_minlen; + } + /* not-reached */ +} + +STATIC U32 +S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) +{ + U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; + + PERL_ARGS_ASSERT_ADD_DATA; + + Renewc(RExC_rxi->data, + sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), + char, struct reg_data); + if(count) + Renew(RExC_rxi->data->what, count + n, U8); + else + Newx(RExC_rxi->data->what, n, U8); + RExC_rxi->data->count = count + n; + Copy(s, RExC_rxi->data->what + count, n, U8); + return count; +} + +/*XXX: todo make this not included in a non debugging perl, but appears to be + * used anyway there, in 'use re' */ +#ifndef PERL_IN_XSUB_RE +void +Perl_reginitcolors(pTHX) +{ + const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); + if (s) { + char *t = savepv(s); + int i = 0; + PL_colors[0] = t; + while (++i < 6) { + t = strchr(t, '\t'); + if (t) { + *t = '\0'; + PL_colors[i] = ++t; + } + else + PL_colors[i] = t = (char *)""; + } + } else { + int i = 0; + while (i < 6) + PL_colors[i++] = (char *)""; + } + PL_colorset = 1; +} +#endif + + +#ifdef TRIE_STUDY_OPT +#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \ + STMT_START { \ + if ( \ + (data.flags & SCF_TRIE_RESTUDY) \ + && ! restudied++ \ + ) { \ + dOsomething; \ + goto reStudy; \ + } \ + } STMT_END +#else +#define CHECK_RESTUDY_GOTO_butfirst +#endif + +/* + * pregcomp - compile a regular expression into internal code + * + * Decides which engine's compiler to call based on the hint currently in + * scope + */ + +#ifndef PERL_IN_XSUB_RE + +/* return the currently in-scope regex engine (or the default if none) */ + +regexp_engine const * +Perl_current_re_engine(pTHX) +{ + if (IN_PERL_COMPILETIME) { + HV * const table = GvHV(PL_hintgv); + SV **ptr; + + if (!table || !(PL_hints & HINT_LOCALIZE_HH)) + return &reh_regexp_engine; + ptr = hv_fetchs(table, "regcomp", FALSE); + if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) + return &reh_regexp_engine; + return INT2PTR(regexp_engine*,SvIV(*ptr)); + } + else { + SV *ptr; + if (!PL_curcop->cop_hints_hash) + return &reh_regexp_engine; + ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); + if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) + return &reh_regexp_engine; + return INT2PTR(regexp_engine*,SvIV(ptr)); + } +} + + +REGEXP * +Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) +{ + regexp_engine const *eng = current_re_engine(); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_PREGCOMP; + + /* Dispatch a request to compile a regexp to correct regexp engine. */ + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", + PTR2UV(eng)); + }); + return CALLREGCOMP_ENG(eng, pattern, flags); +} +#endif + +/* public(ish) entry point for the perl core's own regex compiling code. + * It's actually a wrapper for Perl_re_op_compile that only takes an SV + * pattern rather than a list of OPs, and uses the internal engine rather + * than the current one */ + +REGEXP * +Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) +{ + SV *pat = pattern; /* defeat constness! */ + PERL_ARGS_ASSERT_RE_COMPILE; + return Perl_re_op_compile(aTHX_ &pat, 1, NULL, +#ifdef PERL_IN_XSUB_RE + &my_reg_engine, +#else + &reh_regexp_engine, +#endif + NULL, NULL, rx_flags, 0); +} + + +/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code + * blocks, recalculate the indices. Update pat_p and plen_p in-place to + * point to the realloced string and length. + * + * This is essentially a copy of Perl_bytes_to_utf8() with the code index + * stuff added */ + +static void +S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, + char **pat_p, STRLEN *plen_p, int num_code_blocks) +{ + U8 *const src = (U8*)*pat_p; + U8 *dst; + int n=0; + STRLEN s = 0, d = 0; + bool do_end = 0; + GET_RE_DEBUG_FLAGS_DECL; + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); + + Newx(dst, *plen_p * 2 + 1, U8); + + while (s < *plen_p) { + if (NATIVE_BYTE_IS_INVARIANT(src[s])) + dst[d] = src[s]; + else { + dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); + dst[d] = UTF8_EIGHT_BIT_LO(src[s]); + } + if (n < num_code_blocks) { + if (!do_end && pRExC_state->code_blocks[n].start == s) { + pRExC_state->code_blocks[n].start = d; + assert(dst[d] == '('); + do_end = 1; + } + else if (do_end && pRExC_state->code_blocks[n].end == s) { + pRExC_state->code_blocks[n].end = d; + assert(dst[d] == ')'); + do_end = 0; + n++; + } + } + s++; + d++; + } + dst[d] = '\0'; + *plen_p = d; + *pat_p = (char*) dst; + SAVEFREEPV(*pat_p); + RExC_orig_utf8 = RExC_utf8 = 1; +} + + + +/* S_concat_pat(): concatenate a list of args to the pattern string pat, + * while recording any code block indices, and handling overloading, + * nested qr// objects etc. If pat is null, it will allocate a new + * string, or just return the first arg, if there's only one. + * + * Returns the malloced/updated pat. + * patternp and pat_count is the array of SVs to be concatted; + * oplist is the optional list of ops that generated the SVs; + * recompile_p is a pointer to a boolean that will be set if + * the regex will need to be recompiled. + * delim, if non-null is an SV that will be inserted between each element + */ + +static SV* +S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, + SV *pat, SV ** const patternp, int pat_count, + OP *oplist, bool *recompile_p, SV *delim) +{ + SV **svp; + int n = 0; + bool use_delim = FALSE; + bool alloced = FALSE; + + /* if we know we have at least two args, create an empty string, + * then concatenate args to that. For no args, return an empty string */ + if (!pat && pat_count != 1) { + pat = newSVpvs(""); + SAVEFREESV(pat); + alloced = TRUE; + } + + for (svp = patternp; svp < patternp + pat_count; svp++) { + SV *sv; + SV *rx = NULL; + STRLEN orig_patlen = 0; + bool code = 0; + SV *msv = use_delim ? delim : *svp; + if (!msv) msv = &PL_sv_undef; + + /* if we've got a delimiter, we go round the loop twice for each + * svp slot (except the last), using the delimiter the second + * time round */ + if (use_delim) { + svp--; + use_delim = FALSE; + } + else if (delim) + use_delim = TRUE; + + if (SvTYPE(msv) == SVt_PVAV) { + /* we've encountered an interpolated array within + * the pattern, e.g. /...@a..../. Expand the list of elements, + * then recursively append elements. + * The code in this block is based on S_pushav() */ + + AV *const av = (AV*)msv; + const SSize_t maxarg = AvFILL(av) + 1; + SV **array; + + if (oplist) { + assert(oplist->op_type == OP_PADAV + || oplist->op_type == OP_RV2AV); + oplist = OP_SIBLING(oplist); + } + + if (SvRMAGICAL(av)) { + SSize_t i; + + Newx(array, maxarg, SV*); + SAVEFREEPV(array); + for (i=0; i < maxarg; i++) { + SV ** const svp = av_fetch(av, i, FALSE); + array[i] = svp ? *svp : &PL_sv_undef; + } + } + else + array = AvARRAY(av); + + pat = S_concat_pat(aTHX_ pRExC_state, pat, + array, maxarg, NULL, recompile_p, + /* $" */ + GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV)))); + + continue; + } + + + /* we make the assumption here that each op in the list of + * op_siblings maps to one SV pushed onto the stack, + * except for code blocks, with have both an OP_NULL and + * and OP_CONST. + * This allows us to match up the list of SVs against the + * list of OPs to find the next code block. + * + * Note that PUSHMARK PADSV PADSV .. + * is optimised to + * PADRANGE PADSV PADSV .. + * so the alignment still works. */ + + if (oplist) { + if (oplist->op_type == OP_NULL + && (oplist->op_flags & OPf_SPECIAL)) + { + assert(n < pRExC_state->num_code_blocks); + pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0; + pRExC_state->code_blocks[n].block = oplist; + pRExC_state->code_blocks[n].src_regex = NULL; + n++; + code = 1; + oplist = OP_SIBLING(oplist); /* skip CONST */ + assert(oplist); + } + oplist = OP_SIBLING(oplist);; + } + + /* apply magic and QR overloading to arg */ + + SvGETMAGIC(msv); + if (SvROK(msv) && SvAMAGIC(msv)) { + SV *sv = AMG_CALLunary(msv, regexp_amg); + if (sv) { + if (SvROK(sv)) + sv = SvRV(sv); + if (SvTYPE(sv) != SVt_REGEXP) + Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP"); + msv = sv; + } + } + + /* try concatenation overload ... */ + if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) && + (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) + { + sv_setsv(pat, sv); + /* overloading involved: all bets are off over literal + * code. Pretend we haven't seen it */ + pRExC_state->num_code_blocks -= n; + n = 0; + } + else { + /* ... or failing that, try "" overload */ + while (SvAMAGIC(msv) + && (sv = AMG_CALLunary(msv, string_amg)) + && sv != msv + && !( SvROK(msv) + && SvROK(sv) + && SvRV(msv) == SvRV(sv)) + ) { + msv = sv; + SvGETMAGIC(msv); + } + if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) + msv = SvRV(msv); + + if (pat) { + /* this is a partially unrolled + * sv_catsv_nomg(pat, msv); + * that allows us to adjust code block indices if + * needed */ + STRLEN dlen; + char *dst = SvPV_force_nomg(pat, dlen); + orig_patlen = dlen; + if (SvUTF8(msv) && !SvUTF8(pat)) { + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); + sv_setpvn(pat, dst, dlen); + SvUTF8_on(pat); + } + sv_catsv_nomg(pat, msv); + rx = msv; + } + else + pat = msv; + + if (code) + pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; + } + + /* extract any code blocks within any embedded qr//'s */ + if (rx && SvTYPE(rx) == SVt_REGEXP + && RX_ENGINE((REGEXP*)rx)->op_comp) + { + + RXi_GET_DECL(ReANY((REGEXP *)rx), ri); + if (ri->num_code_blocks) { + int i; + /* the presence of an embedded qr// with code means + * we should always recompile: the text of the + * qr// may not have changed, but it may be a + * different closure than last time */ + *recompile_p = 1; + Renew(pRExC_state->code_blocks, + pRExC_state->num_code_blocks + ri->num_code_blocks, + struct reg_code_block); + pRExC_state->num_code_blocks += ri->num_code_blocks; + + for (i=0; i < ri->num_code_blocks; i++) { + struct reg_code_block *src, *dst; + STRLEN offset = orig_patlen + + ReANY((REGEXP *)rx)->pre_prefix; + assert(n < pRExC_state->num_code_blocks); + src = &ri->code_blocks[i]; + dst = &pRExC_state->code_blocks[n]; + dst->start = src->start + offset; + dst->end = src->end + offset; + dst->block = src->block; + dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) + src->src_regex + ? src->src_regex + : (REGEXP*)rx); + n++; + } + } + } + } + /* avoid calling magic multiple times on a single element e.g. =~ $qr */ + if (alloced) + SvSETMAGIC(pat); + + return pat; +} + + + +/* see if there are any run-time code blocks in the pattern. + * False positives are allowed */ + +static bool +S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + int n = 0; + STRLEN s; + + PERL_UNUSED_CONTEXT; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + s = pRExC_state->code_blocks[n].end; + n++; + continue; + } + /* TODO ideally should handle [..], (#..), /#.../x to reduce false + * positives here */ + if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' && + (pat[s+2] == '{' + || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{')) + ) + return 1; + } + return 0; +} + +/* Handle run-time code blocks. We will already have compiled any direct + * or indirect literal code blocks. Now, take the pattern 'pat' and make a + * copy of it, but with any literal code blocks blanked out and + * appropriate chars escaped; then feed it into + * + * eval "qr'modified_pattern'" + * + * For example, + * + * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno + * + * becomes + * + * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno' + * + * After eval_sv()-ing that, grab any new code blocks from the returned qr + * and merge them with any code blocks of the original regexp. + * + * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; + * instead, just save the qr and return FALSE; this tells our caller that + * the original pattern needs upgrading to utf8. + */ + +static bool +S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + SV *qr; + + GET_RE_DEBUG_FLAGS_DECL; + + if (pRExC_state->runtime_code_qr) { + /* this is the second time we've been called; this should + * only happen if the main pattern got upgraded to utf8 + * during compilation; re-use the qr we compiled first time + * round (which should be utf8 too) + */ + qr = pRExC_state->runtime_code_qr; + pRExC_state->runtime_code_qr = NULL; + assert(RExC_utf8 && SvUTF8(qr)); + } + else { + int n = 0; + STRLEN s; + char *p, *newpat; + int newlen = plen + 6; /* allow for "qr''x\0" extra chars */ + SV *sv, *qr_ref; + dSP; + + /* determine how many extra chars we need for ' and \ escaping */ + for (s = 0; s < plen; s++) { + if (pat[s] == '\'' || pat[s] == '\\') + newlen++; + } + + Newx(newpat, newlen, char); + p = newpat; + *p++ = 'q'; *p++ = 'r'; *p++ = '\''; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + /* blank out literal code block */ + assert(pat[s] == '('); + while (s <= pRExC_state->code_blocks[n].end) { + *p++ = '_'; + s++; + } + s--; + n++; + continue; + } + if (pat[s] == '\'' || pat[s] == '\\') + *p++ = '\\'; + *p++ = pat[s]; + } + *p++ = '\''; + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) + *p++ = 'x'; + *p++ = '\0'; + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%sre-parsing pattern for runtime code:%s %s\n", + PL_colors[4],PL_colors[5],newpat); + }); + + sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); + Safefree(newpat); + + ENTER; + SAVETMPS; + save_re_context(); + PUSHSTACKi(PERLSI_REQUIRE); + /* G_RE_REPARSING causes the toker to collapse \\ into \ when + * parsing qr''; normally only q'' does this. It also alters + * hints handling */ + eval_sv(sv, G_SCALAR|G_RE_REPARSING); + SvREFCNT_dec_NN(sv); + SPAGAIN; + qr_ref = POPs; + PUTBACK; + { + SV * const errsv = ERRSV; + if (SvTRUE_NN(errsv)) + { + Safefree(pRExC_state->code_blocks); + /* use croak_sv ? */ + Perl_croak_nocontext("%"SVf, SVfARG(errsv)); + } + } + assert(SvROK(qr_ref)); + qr = SvRV(qr_ref); + assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); + /* the leaving below frees the tmp qr_ref. + * Give qr a life of its own */ + SvREFCNT_inc(qr); + POPSTACK; + FREETMPS; + LEAVE; + + } + + if (!RExC_utf8 && SvUTF8(qr)) { + /* first time through; the pattern got upgraded; save the + * qr for the next time through */ + assert(!pRExC_state->runtime_code_qr); + pRExC_state->runtime_code_qr = qr; + return 0; + } + + + /* extract any code blocks within the returned qr// */ + + + /* merge the main (r1) and run-time (r2) code blocks into one */ + { + RXi_GET_DECL(ReANY((REGEXP *)qr), r2); + struct reg_code_block *new_block, *dst; + RExC_state_t * const r1 = pRExC_state; /* convenient alias */ + int i1 = 0, i2 = 0; + + if (!r2->num_code_blocks) /* we guessed wrong */ + { + SvREFCNT_dec_NN(qr); + return 1; + } + + Newx(new_block, + r1->num_code_blocks + r2->num_code_blocks, + struct reg_code_block); + dst = new_block; + + while ( i1 < r1->num_code_blocks + || i2 < r2->num_code_blocks) + { + struct reg_code_block *src; + bool is_qr = 0; + + if (i1 == r1->num_code_blocks) { + src = &r2->code_blocks[i2++]; + is_qr = 1; + } + else if (i2 == r2->num_code_blocks) + src = &r1->code_blocks[i1++]; + else if ( r1->code_blocks[i1].start + < r2->code_blocks[i2].start) + { + src = &r1->code_blocks[i1++]; + assert(src->end < r2->code_blocks[i2].start); + } + else { + assert( r1->code_blocks[i1].start + > r2->code_blocks[i2].start); + src = &r2->code_blocks[i2++]; + is_qr = 1; + assert(src->end < r1->code_blocks[i1].start); + } + + assert(pat[src->start] == '('); + assert(pat[src->end] == ')'); + dst->start = src->start; + dst->end = src->end; + dst->block = src->block; + dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) + : src->src_regex; + dst++; + } + r1->num_code_blocks += r2->num_code_blocks; + Safefree(r1->code_blocks); + r1->code_blocks = new_block; + } + + SvREFCNT_dec_NN(qr); + return 1; +} + + +STATIC bool +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, + SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift, + SSize_t lookbehind, SSize_t offset, SSize_t *minlen, + STRLEN longest_length, bool eol, bool meol) +{ + /* This is the common code for setting up the floating and fixed length + * string data extracted from Perl_re_op_compile() below. Returns a boolean + * as to whether succeeded or not */ + + I32 t; + SSize_t ml; + + if (! (longest_length + || (eol /* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) + ) + /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ + || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) + { + return FALSE; + } + + /* copy the information about the longest from the reg_scan_data + over to the program. */ + if (SvUTF8(sv_longest)) { + *rx_utf8 = sv_longest; + *rx_substr = NULL; + } else { + *rx_substr = sv_longest; + *rx_utf8 = NULL; + } + /* end_shift is how many chars that must be matched that + follow this item. We calculate it ahead of time as once the + lookbehind offset is added in we lose the ability to correctly + calculate it.*/ + ml = minlen ? *(minlen) : (SSize_t)longest_length; + *rx_end_shift = ml - offset + - longest_length + (SvTAIL(sv_longest) != 0) + + lookbehind; + + t = (eol/* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))); + fbm_compile(sv_longest, t ? FBMcf_TAIL : 0); + + return TRUE; +} + +/* + * Perl_re_op_compile - the perl internal RE engine's function to compile a + * regular expression into internal code. + * The pattern may be passed either as: + * a list of SVs (patternp plus pat_count) + * a list of OPs (expr) + * If both are passed, the SV list is used, but the OP list indicates + * which SVs are actually pre-compiled code blocks + * + * The SVs in the list have magic and qr overloading applied to them (and + * the list may be modified in-place with replacement SVs in the latter + * case). + * + * If the pattern hasn't changed from old_re, then old_re will be + * returned. + * + * eng is the current engine. If that engine has an op_comp method, then + * handle directly (i.e. we assume that op_comp was us); otherwise, just + * do the initial concatenation of arguments and pass on to the external + * engine. + * + * If is_bare_re is not null, set it to a boolean indicating whether the + * arg list reduced (after overloading) to a single bare regex which has + * been returned (i.e. /$qr/). + * + * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. + * + * pm_flags contains the PMf_* flags, typically based on those from the + * pm_flags field of the related PMOP. Currently we're only interested in + * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL. + * + * We can't allocate space until we know how big the compiled form will be, + * but we can't compile it (and thus know how big it is) until we've got a + * place to put the code. So we cheat: we compile it twice, once with code + * generation turned off and size counting turned on, and once "for real". + * This also means that we don't allocate space until we are sure that the + * thing really will compile successfully, and we never have to move the + * code and thus invalidate pointers into it. (Note that it has to be in + * one piece because free() must be able to free it all.) [NB: not true in perl] + * + * Beware that the optimization-preparation code in here knows about some + * of the structure of the compiled regexp. [I'll say.] + */ + +REGEXP * +Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, + OP *expr, const regexp_engine* eng, REGEXP *old_re, + bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags) +{ + REGEXP *rx; + struct regexp *r; + regexp_internal *ri; + STRLEN plen; + char *exp; + regnode *scan; + I32 flags; + SSize_t minlen = 0; + U32 rx_flags; + SV *pat; + SV *code_blocksv = NULL; + SV** new_patternp = patternp; + + /* these are all flags - maybe they should be turned + * into a single int with different bit masks */ + I32 sawlookahead = 0; + I32 sawplus = 0; + I32 sawopen = 0; + I32 sawminmod = 0; + + regex_charset initial_charset = get_regex_charset(orig_rx_flags); + bool recompile = 0; + bool runtime_code = 0; + scan_data_t data; + RExC_state_t RExC_state; + RExC_state_t * const pRExC_state = &RExC_state; +#ifdef TRIE_STUDY_OPT + int restudied = 0; + RExC_state_t copyRExC_state; +#endif + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_OP_COMPILE; + + DEBUG_r(if (!PL_colorset) reginitcolors()); + +#ifndef PERL_IN_XSUB_RE + /* Initialize these here instead of as-needed, as is quick and avoids + * having to test them each time otherwise */ + if (! PL_AboveLatin1) { + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); + PL_Latin1 = _new_invlist_C_array(Latin1_invlist); + PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); + PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); + PL_HasMultiCharFold = + _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); + } +#endif + + pRExC_state->code_blocks = NULL; + pRExC_state->num_code_blocks = 0; + + if (is_bare_re) + *is_bare_re = FALSE; + + if (expr && (expr->op_type == OP_LIST || + (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { + /* allocate code_blocks if needed */ + OP *o; + int ncode = 0; + + for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) + if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) + ncode++; /* count of DO blocks */ + if (ncode) { + pRExC_state->num_code_blocks = ncode; + Newx(pRExC_state->code_blocks, ncode, struct reg_code_block); + } + } + + if (!pat_count) { + /* compile-time pattern with just OP_CONSTs and DO blocks */ + + int n; + OP *o; + + /* find how many CONSTs there are */ + assert(expr); + n = 0; + if (expr->op_type == OP_CONST) + n = 1; + else + for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) { + if (o->op_type == OP_CONST) + n++; + } + + /* fake up an SV array */ + + assert(!new_patternp); + Newx(new_patternp, n, SV*); + SAVEFREEPV(new_patternp); + pat_count = n; + + n = 0; + if (expr->op_type == OP_CONST) + new_patternp[n] = cSVOPx_sv(expr); + else + for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) { + if (o->op_type == OP_CONST) + new_patternp[n++] = cSVOPo_sv; + } + + } + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "Assembling pattern from %d elements%s\n", pat_count, + orig_rx_flags & RXf_SPLIT ? " for split" : "")); + + /* set expr to the first arg op */ + + if (pRExC_state->num_code_blocks + && expr->op_type != OP_CONST) + { + expr = cLISTOPx(expr)->op_first; + assert( expr->op_type == OP_PUSHMARK + || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK) + || expr->op_type == OP_PADRANGE); + expr = OP_SIBLING(expr); + } + + pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count, + expr, &recompile, NULL); + + /* handle bare (possibly after overloading) regex: foo =~ $re */ + { + SV *re = pat; + if (SvROK(re)) + re = SvRV(re); + if (SvTYPE(re) == SVt_REGEXP) { + if (is_bare_re) + *is_bare_re = TRUE; + SvREFCNT_inc(re); + Safefree(pRExC_state->code_blocks); + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "Precompiled pattern%s\n", + orig_rx_flags & RXf_SPLIT ? " for split" : "")); + + return (REGEXP*)re; + } + } + + exp = SvPV_nomg(pat, plen); + + if (!eng->op_comp) { + if ((SvUTF8(pat) && IN_BYTES) + || SvGMAGICAL(pat) || SvAMAGIC(pat)) + { + /* make a temporary copy; either to convert to bytes, + * or to avoid repeating get-magic / overloaded stringify */ + pat = newSVpvn_flags(exp, plen, SVs_TEMP | + (IN_BYTES ? 0 : SvUTF8(pat))); + } + Safefree(pRExC_state->code_blocks); + return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); + } + + /* ignore the utf8ness if the pattern is 0 length */ + RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); + RExC_uni_semantics = 0; + RExC_contains_locale = 0; + RExC_contains_i = 0; + pRExC_state->runtime_code_qr = NULL; + + DEBUG_COMPILE_r({ + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", + PL_colors[4],PL_colors[5],s); + }); + + redo_first_pass: + /* we jump here if we upgrade the pattern to utf8 and have to + * recompile */ + + if ((pm_flags & PMf_USE_RE_EVAL) + /* this second condition covers the non-regex literal case, + * i.e. $foo =~ '(?{})'. */ + || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL)) + ) + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); + + /* return old regex if pattern hasn't changed */ + /* XXX: note in the below we have to check the flags as well as the + * pattern. + * + * Things get a touch tricky as we have to compare the utf8 flag + * independently from the compile flags. */ + + if ( old_re + && !recompile + && !!RX_UTF8(old_re) == !!RExC_utf8 + && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) ) + && RX_PRECOMP(old_re) + && RX_PRELEN(old_re) == plen + && memEQ(RX_PRECOMP(old_re), exp, plen) + && !runtime_code /* with runtime code, always recompile */ ) + { + Safefree(pRExC_state->code_blocks); + return old_re; + } + + rx_flags = orig_rx_flags; + + if (rx_flags & PMf_FOLD) { + RExC_contains_i = 1; + } + if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + + /* Set to use unicode semantics if the pattern is in utf8 and has the + * 'depends' charset specified, as it means unicode when utf8 */ + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); + } + + RExC_precomp = exp; + RExC_flags = rx_flags; + RExC_pm_flags = pm_flags; + + if (runtime_code) { + if (TAINTING_get && TAINT_get) + Perl_croak(aTHX_ "Eval-group in insecure regular expression"); + + if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { + /* whoops, we have a non-utf8 pattern, whilst run-time code + * got compiled as utf8. Try again with a utf8 pattern */ + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, + pRExC_state->num_code_blocks); + goto redo_first_pass; + } + } + assert(!pRExC_state->runtime_code_qr); + + RExC_sawback = 0; + + RExC_seen = 0; + RExC_maxlen = 0; + RExC_in_lookbehind = 0; + RExC_seen_zerolen = *exp == '^' ? -1 : 0; + RExC_extralen = 0; + RExC_override_recoding = 0; + RExC_in_multi_char_class = 0; + + /* First pass: determine size, legality. */ + RExC_parse = exp; + RExC_start = exp; + RExC_end = exp + plen; + RExC_naughty = 0; + RExC_npar = 1; + RExC_nestroot = 0; + RExC_size = 0L; + RExC_emit = (regnode *) &RExC_emit_dummy; + RExC_whilem_seen = 0; + RExC_open_parens = NULL; + RExC_close_parens = NULL; + RExC_opend = NULL; + RExC_paren_names = NULL; +#ifdef DEBUGGING + RExC_paren_name_list = NULL; +#endif + RExC_recurse = NULL; + RExC_study_chunk_recursed = NULL; + RExC_study_chunk_recursed_bytes= 0; + RExC_recurse_count = 0; + pRExC_state->code_index = 0; + +#if 0 /* REGC() is (currently) a NOP at the first pass. + * Clever compilers notice this and complain. --jhi */ + REGC((U8)REG_MAGIC, (char*)RExC_emit); +#endif + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); + RExC_lastnum=0; + RExC_lastparse=NULL; + ); + /* reg may croak on us, not giving us a chance to free + pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may + need it to survive as long as the regexp (qr/(?{})/). + We must check that code_blocksv is not already set, because we may + have jumped back to restart the sizing pass. */ + if (pRExC_state->code_blocks && !code_blocksv) { + code_blocksv = newSV_type(SVt_PV); + SAVEFREESV(code_blocksv); + SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks); + SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/ + } + if (reg(pRExC_state, 0, &flags,1) == NULL) { + /* It's possible to write a regexp in ascii that represents Unicode + codepoints outside of the byte range, such as via \x{100}. If we + detect such a sequence we have to convert the entire pattern to utf8 + and then recompile, as our sizing calculation will have been based + on 1 byte == 1 character, but we will need to use utf8 to encode + at least some part of the pattern, and therefore must convert the whole + thing. + -- dmq */ + if (flags & RESTART_UTF8) { + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, + pRExC_state->num_code_blocks); + goto redo_first_pass; + } + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); + } + if (code_blocksv) + SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ + + DEBUG_PARSE_r({ + PerlIO_printf(Perl_debug_log, + "Required size %"IVdf" nodes\n" + "Starting second pass (creation)\n", + (IV)RExC_size); + RExC_lastnum=0; + RExC_lastparse=NULL; + }); + + /* The first pass could have found things that force Unicode semantics */ + if ((RExC_utf8 || RExC_uni_semantics) + && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET) + { + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); + } + + /* Small enough for pointer-storage convention? + If extralen==0, this means that we will not need long jumps. */ + if (RExC_size >= 0x10000L && RExC_extralen) + RExC_size += RExC_extralen; + else + RExC_extralen = 0; + if (RExC_whilem_seen > 15) + RExC_whilem_seen = 15; + + /* Allocate space and zero-initialize. Note, the two step process + of zeroing when in debug mode, thus anything assigned has to + happen after that */ + rx = (REGEXP*) newSV_type(SVt_REGEXP); + r = ReANY(rx); + Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char, regexp_internal); + if ( r == NULL || ri == NULL ) + FAIL("Regexp out of space"); +#ifdef DEBUGGING + /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ + Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), + char); +#else + /* bulk initialize base fields with 0. */ + Zero(ri, sizeof(regexp_internal), char); +#endif + + /* non-zero initialization begins here */ + RXi_SET( r, ri ); + r->engine= eng; + r->extflags = rx_flags; + RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK; + + if (pm_flags & PMf_IS_QR) { + ri->code_blocks = pRExC_state->code_blocks; + ri->num_code_blocks = pRExC_state->num_code_blocks; + } + else + { + int n; + for (n = 0; n < pRExC_state->num_code_blocks; n++) + if (pRExC_state->code_blocks[n].src_regex) + SAVEFREESV(pRExC_state->code_blocks[n].src_regex); + SAVEFREEPV(pRExC_state->code_blocks); + } + + { + bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_charset = (get_regex_charset(r->extflags) + != REGEX_DEPENDS_CHARSET); + + /* The caret is output if there are any defaults: if not all the STD + * flags are set, or if no character set specifier is needed */ + bool has_default = + (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) + || ! has_charset); + bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) + == REG_RUN_ON_COMMENT_SEEN); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) + >> RXf_PMf_STD_PMMOD_SHIFT); + const char *fptr = STD_PAT_MODS; /*"msix"*/ + char *p; + /* Allocate for the worst case, which is all the std flags are turned + * on. If more precision is desired, we could do a population count of + * the flags set. This could be done with a small lookup table, or by + * shifting, masking and adding, or even, when available, assembly + * language for a machine-language population count. + * We never output a minus, as all those are defaults, so are + * covered by the caret */ + const STRLEN wraplen = plen + has_p + has_runon + + has_default /* If needs a caret */ + + /* If needs a character set specifier */ + + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) + + (sizeof(STD_PAT_MODS) - 1) + + (sizeof("(?:)") - 1); + + Newx(p, wraplen + 1, char); /* +1 for the ending NUL */ + r->xpv_len_u.xpvlenu_pv = p; + if (RExC_utf8) + SvFLAGS(rx) |= SVf_UTF8; + *p++='('; *p++='?'; + + /* If a default, cover it using the caret */ + if (has_default) { + *p++= DEFAULT_PAT_MOD; + } + if (has_charset) { + STRLEN len; + const char* const name = get_regex_charset_name(r->extflags, &len); + Copy(name, p, len, char); + p += len; + } + if (has_p) + *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ + { + char ch; + while((ch = *fptr++)) { + if(reganch & 1) + *p++ = ch; + reganch >>= 1; + } + } + + *p++ = ':'; + Copy(RExC_precomp, p, plen, char); + assert ((RX_WRAPPED(rx) - p) < 16); + r->pre_prefix = p - RX_WRAPPED(rx); + p += plen; + if (has_runon) + *p++ = '\n'; + *p++ = ')'; + *p = 0; + SvCUR_set(rx, p - RX_WRAPPED(rx)); + } + + r->intflags = 0; + r->nparens = RExC_npar - 1; /* set early to validate backrefs */ + + /* setup various meta data about recursion, this all requires + * RExC_npar to be correctly set, and a bit later on we clear it */ + if (RExC_seen & REG_RECURSE_SEEN) { + Newxz(RExC_open_parens, RExC_npar,regnode *); + SAVEFREEPV(RExC_open_parens); + Newxz(RExC_close_parens,RExC_npar,regnode *); + SAVEFREEPV(RExC_close_parens); + } + if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { + /* Note, RExC_npar is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + + ((RExC_npar & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } + + /* Useful during FAIL. */ +#ifdef RE_TRACK_PATTERN_OFFSETS + Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ + DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, + "%s %"UVuf" bytes for offset annotations.\n", + ri->u.offsets ? "Got" : "Couldn't get", + (UV)((2*RExC_size+1) * sizeof(U32)))); +#endif + SetProgLen(ri,RExC_size); + RExC_rx_sv = rx; + RExC_rx = r; + RExC_rxi = ri; + REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx); + + /* Second pass: emit code. */ + RExC_flags = rx_flags; /* don't let top level (?i) bleed */ + RExC_pm_flags = pm_flags; + RExC_parse = exp; + RExC_end = exp + plen; + RExC_naughty = 0; + RExC_npar = 1; + RExC_emit_start = ri->program; + RExC_emit = ri->program; + RExC_emit_bound = ri->program + RExC_size + 1; + pRExC_state->code_index = 0; + + REGC((U8)REG_MAGIC, (char*) RExC_emit++); + if (reg(pRExC_state, 0, &flags,1) == NULL) { + ReREFCNT_dec(rx); + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); + } + /* XXXX To minimize changes to RE engine we always allocate + 3-units-long substrs field. */ + Newx(r->substrs, 1, struct reg_substr_data); + if (RExC_recurse_count) { + Newxz(RExC_recurse,RExC_recurse_count,regnode *); + SAVEFREEPV(RExC_recurse); + } + +reStudy: + r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; + Zero(r->substrs, 1, struct reg_substr_data); + if (RExC_study_chunk_recursed) + Zero(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + +#ifdef TRIE_STUDY_OPT + if (!restudied) { + StructCopy(&zero_scan_data, &data, scan_data_t); + copyRExC_state = RExC_state; + } else { + U32 seen=RExC_seen; + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); + + RExC_state = copyRExC_state; + if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; + else + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; + StructCopy(&zero_scan_data, &data, scan_data_t); + } +#else + StructCopy(&zero_scan_data, &data, scan_data_t); +#endif + + /* Dig out information for optimizations. */ + r->extflags = RExC_flags; /* was pm_op */ + /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ + + if (UTF) + SvUTF8_on(rx); /* Unicode in it? */ + ri->regstclass = NULL; + if (RExC_naughty >= 10) /* Probably an expensive pattern. */ + r->intflags |= PREGf_NAUGHTY; + scan = ri->program + 1; /* First BRANCH. */ + + /* testing for BRANCH here tells us whether there is "must appear" + data in the pattern. If there is then we can use it for optimisations */ + if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. + */ + SSize_t fake; + STRLEN longest_float_length, longest_fixed_length; + regnode_ssc ch_class; /* pointed to by data */ + int stclass_flag; + SSize_t last_close = 0; /* pointed to by data */ + regnode *first= scan; + regnode *first_next= regnext(first); + /* + * Skip introductions and multiplicators >= 1 + * so that we can extract the 'meat' of the pattern that must + * match in the large if() sequence following. + * NOTE that EXACT is NOT covered here, as it is normally + * picked up by the optimiser separately. + * + * This is unfortunate as the optimiser isnt handling lookahead + * properly currently. + * + */ + while ((OP(first) == OPEN && (sawopen = 1)) || + /* An OR of *one* alternative - should not happen now. */ + (OP(first) == BRANCH && OP(first_next) != BRANCH) || + /* for now we can't handle lookbehind IFMATCH*/ + (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) || + (OP(first) == PLUS) || + (OP(first) == MINMOD) || + /* An {n,m} with n>0 */ + (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || + (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) + { + /* + * the only op that could be a regnode is PLUS, all the rest + * will be regnode_1 or regnode_2. + * + * (yves doesn't think this is true) + */ + if (OP(first) == PLUS) + sawplus = 1; + else { + if (OP(first) == MINMOD) + sawminmod = 1; + first += regarglen[OP(first)]; + } + first = NEXTOPER(first); + first_next= regnext(first); + } + + /* Starting-point info. */ + again: + DEBUG_PEEP("first:",first,0); + /* Ignore EXACT as we deal with it later. */ + if (PL_regkind[OP(first)] == EXACT) { + if (OP(first) == EXACT) + NOOP; /* Empty, get anchored substr later. */ + else + ri->regstclass = first; + } +#ifdef TRIE_STCLASS + else if (PL_regkind[OP(first)] == TRIE && + ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) + { + /* this can happen only on restudy */ + ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); + } +#endif + else if (REGNODE_SIMPLE(OP(first))) + ri->regstclass = first; + else if (PL_regkind[OP(first)] == BOUND || + PL_regkind[OP(first)] == NBOUND) + ri->regstclass = first; + else if (PL_regkind[OP(first)] == BOL) { + r->intflags |= (OP(first) == MBOL + ? PREGf_ANCH_MBOL + : (OP(first) == SBOL + ? PREGf_ANCH_SBOL + : PREGf_ANCH_BOL)); + first = NEXTOPER(first); + goto again; + } + else if (OP(first) == GPOS) { + r->intflags |= PREGf_ANCH_GPOS; + first = NEXTOPER(first); + goto again; + } + else if ((!sawopen || !RExC_sawback) && + !sawlookahead && + (OP(first) == STAR && + PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && + !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) + { + /* turn .* into ^.* with an implied $*=1 */ + const int type = + (OP(NEXTOPER(first)) == REG_ANY) + ? PREGf_ANCH_MBOL + : PREGf_ANCH_SBOL; + r->intflags |= (type | PREGf_IMPLICIT); + first = NEXTOPER(first); + goto again; + } + if (sawplus && !sawminmod && !sawlookahead + && (!sawopen || !RExC_sawback) + && !pRExC_state->num_code_blocks) /* May examine pos and $& */ + /* x+ must match at the 1st pos of run of x's */ + r->intflags |= PREGf_SKIP; + + /* Scan is after the zeroth branch, first is atomic matcher. */ +#ifdef TRIE_STUDY_OPT + DEBUG_PARSE_r( + if (!restudied) + PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + (IV)(first - scan + 1)) + ); +#else + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + (IV)(first - scan + 1)) + ); +#endif + + + /* + * If there's something expensive in the r.e., find the + * longest literal string that must appear and make it the + * regmust. Resolve ties in favor of later strings, since + * the regstart check works with the beginning of the r.e. + * and avoiding duplication strengthens checking. Not a + * strong reason, but sufficient in the absence of others. + * [Now we resolve ties in favor of the earlier string if + * it happens that c_offset_min has been invalidated, since the + * earlier string may buy us something the later one won't.] + */ + + data.longest_fixed = newSVpvs(""); + data.longest_float = newSVpvs(""); + data.last_found = newSVpvs(""); + data.longest = &(data.longest_fixed); + ENTER_with_name("study_chunk"); + SAVEFREESV(data.longest_fixed); + SAVEFREESV(data.longest_float); + SAVEFREESV(data.last_found); + first = scan; + if (!ri->regstclass) { + ssc_init(pRExC_state, &ch_class); + data.start_class = &ch_class; + stclass_flag = SCF_DO_STCLASS_AND; + } else /* XXXX Check for BOUND? */ + stclass_flag = 0; + data.last_closep = &last_close; + + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, &first, &minlen, &fake, + scan + RExC_size, /* Up to end */ + &data, -1, 0, NULL, + SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag + | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), + 0); + + + CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); + + + if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) + && data.last_start_min == 0 && data.last_end > 0 + && !RExC_seen_zerolen + && !(RExC_seen & REG_VERBARG_SEEN) + && !(RExC_seen & REG_GPOS_SEEN) + ){ + r->extflags |= RXf_CHECK_ALL; + } + scan_commit(pRExC_state, &data,&minlen,0); + + longest_float_length = CHR_SVLEN(data.longest_float); + + if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) + && S_setup_longest (aTHX_ pRExC_state, + data.longest_float, + &(r->float_utf8), + &(r->float_substr), + &(r->float_end_shift), + data.lookbehind_float, + data.offset_float_min, + data.minlen_float, + longest_float_length, + cBOOL(data.flags & SF_FL_BEFORE_EOL), + cBOOL(data.flags & SF_FL_BEFORE_MEOL))) + { + r->float_min_offset = data.offset_float_min - data.lookbehind_float; + r->float_max_offset = data.offset_float_max; + if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */ + r->float_max_offset -= data.lookbehind_float; + SvREFCNT_inc_simple_void_NN(data.longest_float); + } + else { + r->float_substr = r->float_utf8 = NULL; + longest_float_length = 0; + } + + longest_fixed_length = CHR_SVLEN(data.longest_fixed); + + if (S_setup_longest (aTHX_ pRExC_state, + data.longest_fixed, + &(r->anchored_utf8), + &(r->anchored_substr), + &(r->anchored_end_shift), + data.lookbehind_fixed, + data.offset_fixed, + data.minlen_fixed, + longest_fixed_length, + cBOOL(data.flags & SF_FIX_BEFORE_EOL), + cBOOL(data.flags & SF_FIX_BEFORE_MEOL))) + { + r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; + SvREFCNT_inc_simple_void_NN(data.longest_fixed); + } + else { + r->anchored_substr = r->anchored_utf8 = NULL; + longest_fixed_length = 0; + } + LEAVE_with_name("study_chunk"); + + if (ri->regstclass + && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) + ri->regstclass = NULL; + + if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) + && stclass_flag + && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && !ssc_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); + StructCopy(data.start_class, + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); + ri->regstclass = (regnode*)RExC_rxi->data->data[n]; + r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); + regprop(r, sv, (regnode*)data.start_class, NULL); + PerlIO_printf(Perl_debug_log, + "synthetic stclass \"%s\".\n", + SvPVX_const(sv));}); + data.start_class = NULL; + } + + /* A temporary algorithm prefers floated substr to fixed one to dig + * more info. */ + if (longest_fixed_length > longest_float_length) { + r->substrs->check_ix = 0; + r->check_end_shift = r->anchored_end_shift; + r->check_substr = r->anchored_substr; + r->check_utf8 = r->anchored_utf8; + r->check_offset_min = r->check_offset_max = r->anchored_offset; + if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)) + r->intflags |= PREGf_NOSCAN; + } + else { + r->substrs->check_ix = 1; + r->check_end_shift = r->float_end_shift; + r->check_substr = r->float_substr; + r->check_utf8 = r->float_utf8; + r->check_offset_min = r->float_min_offset; + r->check_offset_max = r->float_max_offset; + } + if ((r->check_substr || r->check_utf8) ) { + r->extflags |= RXf_USE_INTUIT; + if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) + r->extflags |= RXf_INTUIT_TAIL; + } + r->substrs->data[0].max_offset = r->substrs->data[0].min_offset; + + /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) + if ( (STRLEN)minlen < longest_float_length ) + minlen= longest_float_length; + if ( (STRLEN)minlen < longest_fixed_length ) + minlen= longest_fixed_length; + */ + } + else { + /* Several toplevels. Best we can is to set minlen. */ + SSize_t fake; + regnode_ssc ch_class; + SSize_t last_close = 0; + + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); + + scan = ri->program + 1; + ssc_init(pRExC_state, &ch_class); + data.start_class = &ch_class; + data.last_closep = &last_close; + + DEBUG_RExC_seen(); + minlen = study_chunk(pRExC_state, + &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL, + SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied + ? SCF_TRIE_DOING_RESTUDY + : 0), + 0); + + CHECK_RESTUDY_GOTO_butfirst(NOOP); + + r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 + = r->float_substr = r->float_utf8 = NULL; + + if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) + && ! ssc_is_anything(data.start_class)) + { + const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + + ssc_finalize(pRExC_state, data.start_class); + + Newx(RExC_rxi->data->data[n], 1, regnode_ssc); + StructCopy(data.start_class, + (regnode_ssc*)RExC_rxi->data->data[n], + regnode_ssc); + ri->regstclass = (regnode*)RExC_rxi->data->data[n]; + r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); + regprop(r, sv, (regnode*)data.start_class, NULL); + PerlIO_printf(Perl_debug_log, + "synthetic stclass \"%s\".\n", + SvPVX_const(sv));}); + data.start_class = NULL; + } + } + + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + r->maxlen = REG_INFTY; + } + else { + r->maxlen = RExC_maxlen; + } + + /* Guard against an embedded (?=) or (?<=) with a longer minlen than + the "real" pattern. */ + DEBUG_OPTIMISE_r({ + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); + }); + r->minlenret = minlen; + if (r->minlen < minlen) + r->minlen = minlen; + + if (RExC_seen & REG_GPOS_SEEN) + r->intflags |= PREGf_GPOS_SEEN; + if (RExC_seen & REG_LOOKBEHIND_SEEN) + r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the + lookbehind */ + if (pRExC_state->num_code_blocks) + r->extflags |= RXf_EVAL_SEEN; + if (RExC_seen & REG_CANY_SEEN) + r->intflags |= PREGf_CANY_SEEN; + if (RExC_seen & REG_VERBARG_SEEN) + { + r->intflags |= PREGf_VERBARG_SEEN; + r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ + } + if (RExC_seen & REG_CUTGROUP_SEEN) + r->intflags |= PREGf_CUTGROUP_SEEN; + if (pm_flags & PMf_USE_RE_EVAL) + r->intflags |= PREGf_USE_RE_EVAL; + if (RExC_paren_names) + RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); + else + RXp_PAREN_NAMES(r) = NULL; + + /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED + * so it can be used in pp.c */ + if (r->intflags & PREGf_ANCH) + r->extflags |= RXf_IS_ANCHORED; + + + { + /* this is used to identify "special" patterns that might result + * in Perl NOT calling the regex engine and instead doing the match "itself", + * particularly special cases in split//. By having the regex compiler + * do this pattern matching at a regop level (instead of by inspecting the pattern) + * we avoid weird issues with equivalent patterns resulting in different behavior, + * AND we allow non Perl engines to get the same optimizations by the setting the + * flags appropriately - Yves */ + regnode *first = ri->program + 1; + U8 fop = OP(first); + regnode *next = NEXTOPER(first); + U8 nop = OP(next); + + if (PL_regkind[fop] == NOTHING && nop == END) + r->extflags |= RXf_NULL; + else if (PL_regkind[fop] == BOL && nop == END) + r->extflags |= RXf_START_ONLY; + else if (fop == PLUS + && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE + && OP(regnext(first)) == END) + r->extflags |= RXf_WHITE; + else if ( r->extflags & RXf_SPLIT + && fop == EXACT + && STR_LEN(first) == 1 + && *(STRING(first)) == ' ' + && OP(regnext(first)) == END ) + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + + } + + if (RExC_contains_locale) { + RXp_EXTFLAGS(r) |= RXf_TAINTED; + } + +#ifdef DEBUGGING + if (RExC_paren_names) { + ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); + ri->data->data[ri->name_list_idx] + = (void*)SvREFCNT_inc(RExC_paren_name_list); + } else +#endif + ri->name_list_idx = 0; + + if (RExC_recurse_count) { + for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { + const regnode *scan = RExC_recurse[RExC_recurse_count-1]; + ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); + } + } + Newxz(r->offs, RExC_npar, regexp_paren_pair); + /* assume we don't need to swap parens around before we match */ + + DEBUG_DUMP_r({ + DEBUG_RExC_seen(); + PerlIO_printf(Perl_debug_log,"Final program:\n"); + regdump(r); + }); +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_OFFSETS_r(if (ri->u.offsets) { + const STRLEN len = ri->u.offsets[0]; + STRLEN i; + GET_RE_DEBUG_FLAGS_DECL; + PerlIO_printf(Perl_debug_log, + "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); + for (i = 1; i <= len; i++) { + if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) + PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", + (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); + } + PerlIO_printf(Perl_debug_log, "\n"); + }); +#endif + +#ifdef USE_ITHREADS + /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated + * by setting the regexp SV to readonly-only instead. If the + * pattern's been recompiled, the USEDness should remain. */ + if (old_re && SvREADONLY(old_re)) + SvREADONLY_on(rx); +#endif + return rx; +} + + +SV* +Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF; + + PERL_UNUSED_ARG(value); + + if (flags & RXapif_FETCH) { + return reg_named_buff_fetch(rx, key, flags); + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { + Perl_croak_no_modify(); + return NULL; + } else if (flags & RXapif_EXISTS) { + return reg_named_buff_exists(rx, key, flags) + ? &PL_sv_yes + : &PL_sv_no; + } else if (flags & RXapif_REGNAMES) { + return reg_named_buff_all(rx, flags); + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { + return reg_named_buff_scalar(rx, flags); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; + PERL_UNUSED_ARG(lastkey); + + if (flags & RXapif_FIRSTKEY) + return reg_named_buff_firstkey(rx, flags); + else if (flags & RXapif_NEXTKEY) + return reg_named_buff_nextkey(rx, flags); + else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", + (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, + const U32 flags) +{ + AV *retarray = NULL; + SV *ret; + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; + + if (flags & RXapif_ALL) + retarray=newAV(); + + if (rx && RXp_PAREN_NAMES(rx)) { + HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); + if (he_str) { + IV i; + SV* sv_dat=HeVAL(he_str); + I32 *nums=(I32*)SvPVX(sv_dat); + for ( i=0; inparens) >= nums[i] + && rx->offs[nums[i]].start != -1 + && rx->offs[nums[i]].end != -1) + { + ret = newSVpvs(""); + CALLREG_NUMBUF_FETCH(r,nums[i],ret); + if (!retarray) + return ret; + } else { + if (retarray) + ret = newSVsv(&PL_sv_undef); + } + if (retarray) + av_push(retarray, ret); + } + if (retarray) + return newRV_noinc(MUTABLE_SV(retarray)); + } + } + return NULL; +} + +bool +Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, + const U32 flags) +{ + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & RXapif_ALL) { + return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); + } else { + SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); + if (sv) { + SvREFCNT_dec_NN(sv); + return TRUE; + } else { + return FALSE; + } + } + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; + + if ( rx && RXp_PAREN_NAMES(rx) ) { + (void)hv_iterinit(RXp_PAREN_NAMES(rx)); + + return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv = RXp_PAREN_NAMES(rx); + HE *temphe; + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + return newSVhek(HeKEY_hek(temphe)); + } + } + } + return NULL; +} + +SV* +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) +{ + SV *ret; + AV *av; + SSize_t length; + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { + return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); + av = MUTABLE_AV(SvRV(ret)); + length = av_tindex(av); + SvREFCNT_dec_NN(ret); + return newSViv(length + 1); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", + (int)flags); + return NULL; + } + } + return &PL_sv_undef; +} + +SV* +Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + AV *av = newAV(); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv= RXp_PAREN_NAMES(rx); + HE *temphe; + (void)hv_iterinit(hv); + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + av_push(av, newSVhek(HeKEY_hek(temphe))); + } + } + } + + return newRV_noinc(MUTABLE_SV(av)); +} + +void +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, + SV * const sv) +{ + struct regexp *const rx = ReANY(r); + char *s = NULL; + SSize_t i = 0; + SSize_t s1, t1; + I32 n = paren; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; + + if ( n == RX_BUFF_IDX_CARET_PREMATCH + || n == RX_BUFF_IDX_CARET_FULLMATCH + || n == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } + + if (!rx->subbeg) + goto ret_undef; + + if (n == RX_BUFF_IDX_CARET_FULLMATCH) + /* no need to distinguish between them any more */ + n = RX_BUFF_IDX_FULLMATCH; + + if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) + && rx->offs[0].start != -1) + { + /* $`, ${^PREMATCH} */ + i = rx->offs[0].start; + s = rx->subbeg; + } + else + if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) + && rx->offs[0].end != -1) + { + /* $', ${^POSTMATCH} */ + s = rx->subbeg - rx->suboffset + rx->offs[0].end; + i = rx->sublen + rx->suboffset - rx->offs[0].end; + } + else + if ( 0 <= n && n <= (I32)rx->nparens && + (s1 = rx->offs[n].start) != -1 && + (t1 = rx->offs[n].end) != -1) + { + /* $&, ${^MATCH}, $1 ... */ + i = t1 - s1; + s = rx->subbeg + s1 - rx->suboffset; + } else { + goto ret_undef; + } + + assert(s >= rx->subbeg); + assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); + if (i >= 0) { +#ifdef NO_TAINT_SUPPORT + sv_setpvn(sv, s, i); +#else + const int oldtainted = TAINT_get; + TAINT_NOT; + sv_setpvn(sv, s, i); + TAINT_set(oldtainted); +#endif + if ( (rx->intflags & PREGf_CANY_SEEN) + ? (RXp_MATCH_UTF8(rx) + && (!i || is_utf8_string((U8*)s, i))) + : (RXp_MATCH_UTF8(rx)) ) + { + SvUTF8_on(sv); + } + else + SvUTF8_off(sv); + if (TAINTING_get) { + if (RXp_MATCH_TAINTED(rx)) { + if (SvTYPE(sv) >= SVt_PVMG) { + MAGIC* const mg = SvMAGIC(sv); + MAGIC* mgt; + TAINT; + SvMAGIC_set(sv, mg->mg_moremagic); + SvTAINT(sv); + if ((mgt = SvMAGIC(sv))) { + mg->mg_moremagic = mgt; + SvMAGIC_set(sv, mg); + } + } else { + TAINT; + SvTAINT(sv); + } + } else + SvTAINTED_off(sv); + } + } else { + ret_undef: + sv_setsv(sv,&PL_sv_undef); + return; + } +} + +void +Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, + SV const * const value) +{ + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; + + PERL_UNUSED_ARG(rx); + PERL_UNUSED_ARG(paren); + PERL_UNUSED_ARG(value); + + if (!PL_localizing) + Perl_croak_no_modify(); +} + +I32 +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, + const I32 paren) +{ + struct regexp *const rx = ReANY(r); + I32 i; + I32 s1, t1; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + + /* Some of this code was originally in C in F */ + switch (paren) { + case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ + case RX_BUFF_IDX_PREMATCH: /* $` */ + if (rx->offs[0].start != -1) { + i = rx->offs[0].start; + if (i > 0) { + s1 = 0; + t1 = i; + goto getlen; + } + } + return 0; + + case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ + case RX_BUFF_IDX_POSTMATCH: /* $' */ + if (rx->offs[0].end != -1) { + i = rx->sublen - rx->offs[0].end; + if (i > 0) { + s1 = rx->offs[0].end; + t1 = rx->sublen; + goto getlen; + } + } + return 0; + + default: /* $& / ${^MATCH}, $1, $2, ... */ + if (paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + i = t1 - s1; + goto getlen; + } else { + warn_undef: + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit((const SV *)sv); + return 0; + } + } + getlen: + if (i > 0 && RXp_MATCH_UTF8(rx)) { + const char * const s = rx->subbeg - rx->suboffset + s1; + const U8 *ep; + STRLEN el; + + i = t1 - s1; + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; + } + return i; +} + +SV* +Perl_reg_qr_package(pTHX_ REGEXP * const rx) +{ + PERL_ARGS_ASSERT_REG_QR_PACKAGE; + PERL_UNUSED_ARG(rx); + if (0) + return NULL; + else + return newSVpvs("Regexp"); +} + +/* Scans the name of a named buffer from the pattern. + * If flags is REG_RSN_RETURN_NULL returns null. + * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name + * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding + * to the parsed name as looked up in the RExC_paren_names hash. + * If there is an error throws a vFAIL().. type exception. + */ + +#define REG_RSN_RETURN_NULL 0 +#define REG_RSN_RETURN_NAME 1 +#define REG_RSN_RETURN_DATA 2 + +STATIC SV* +S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) +{ + char *name_start = RExC_parse; + + PERL_ARGS_ASSERT_REG_SCAN_NAME; + + assert (RExC_parse <= RExC_end); + if (RExC_parse == RExC_end) NOOP; + else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + /* skip IDFIRST by using do...while */ + if (UTF) + do { + RExC_parse += UTF8SKIP(RExC_parse); + } while (isWORDCHAR_utf8((U8*)RExC_parse)); + else + do { + RExC_parse++; + } while (isWORDCHAR(*RExC_parse)); + } else { + RExC_parse++; /* so the <- from the vFAIL is after the offending + character */ + vFAIL("Group name must start with a non-digit word character"); + } + if ( flags ) { + SV* sv_name + = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), + SVs_TEMP | (UTF ? SVf_UTF8 : 0)); + if ( flags == REG_RSN_RETURN_NAME) + return sv_name; + else if (flags==REG_RSN_RETURN_DATA) { + HE *he_str = NULL; + SV *sv_dat = NULL; + if ( ! sv_name ) /* should not happen*/ + Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); + if (RExC_paren_names) + he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! sv_dat ) + vFAIL("Reference to nonexistent named group"); + return sv_dat; + } + else { + Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", + (unsigned long) flags); + } + assert(0); /* NOT REACHED */ + } + return NULL; +} + +#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ + int rem=(int)(RExC_end - RExC_parse); \ + int cut; \ + int num; \ + int iscut=0; \ + if (rem>10) { \ + rem=10; \ + iscut=1; \ + } \ + cut=10-rem; \ + if (RExC_lastparse!=RExC_parse) \ + PerlIO_printf(Perl_debug_log," >%.*s%-*s", \ + rem, RExC_parse, \ + cut + 4, \ + iscut ? "..." : "<" \ + ); \ + else \ + PerlIO_printf(Perl_debug_log,"%16s",""); \ + \ + if (SIZE_ONLY) \ + num = RExC_size + 1; \ + else \ + num=REG_NODE_NUM(RExC_emit); \ + if (RExC_lastnum!=num) \ + PerlIO_printf(Perl_debug_log,"|%4d",num); \ + else \ + PerlIO_printf(Perl_debug_log,"|%4s",""); \ + PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ + (int)((depth*2)), "", \ + (funcname) \ + ); \ + RExC_lastnum=num; \ + RExC_lastparse=RExC_parse; \ +}) + + + +#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ + DEBUG_PARSE_MSG((funcname)); \ + PerlIO_printf(Perl_debug_log,"%4s","\n"); \ +}) +#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ + DEBUG_PARSE_MSG((funcname)); \ + PerlIO_printf(Perl_debug_log,fmt "\n",args); \ +}) + +/* This section of code defines the inversion list object and its methods. The + * interfaces are highly subject to change, so as much as possible is static to + * this file. An inversion list is here implemented as a malloc'd C UV array + * as an SVt_INVLIST scalar. + * + * An inversion list for Unicode is an array of code points, sorted by ordinal + * number. The zeroth element is the first code point in the list. The 1th + * element is the first element beyond that not in the list. In other words, + * the first range is + * invlist[0]..(invlist[1]-1) + * The other ranges follow. Thus every element whose index is divisible by two + * marks the beginning of a range that is in the list, and every element not + * divisible by two marks the beginning of a range not in the list. A single + * element inversion list that contains the single code point N generally + * consists of two elements + * invlist[0] == N + * invlist[1] == N+1 + * (The exception is when N is the highest representable value on the + * machine, in which case the list containing just it would be a single + * element, itself. By extension, if the last range in the list extends to + * infinity, then the first element of that range will be in the inversion list + * at a position that is divisible by two, and is the final element in the + * list.) + * Taking the complement (inverting) an inversion list is quite simple, if the + * first element is 0, remove it; otherwise add a 0 element at the beginning. + * This implementation reserves an element at the beginning of each inversion + * list to always contain 0; there is an additional flag in the header which + * indicates if the list begins at the 0, or is offset to begin at the next + * element. + * + * More about inversion lists can be found in "Unicode Demystified" + * Chapter 13 by Richard Gillam, published by Addison-Wesley. + * More will be coming when functionality is added later. + * + * The inversion list data structure is currently implemented as an SV pointing + * to an array of UVs that the SV thinks are bytes. This allows us to have an + * array of UV whose memory management is automatically handled by the existing + * facilities for SV's. + * + * Some of the methods should always be private to the implementation, and some + * should eventually be made public */ + +/* The header definitions are in F */ + +PERL_STATIC_INLINE UV* +S__invlist_array_init(SV* const invlist, const bool will_have_0) +{ + /* Returns a pointer to the first element in the inversion list's array. + * This is called upon initialization of an inversion list. Where the + * array begins depends on whether the list has the code point U+0000 in it + * or not. The other parameter tells it whether the code that follows this + * call is about to put a 0 in the inversion list or not. The first + * element is either the element reserved for 0, if TRUE, or the element + * after it, if FALSE */ + + bool* offset = get_invlist_offset_addr(invlist); + UV* zero_addr = (UV *) SvPVX(invlist); + + PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; + + /* Must be empty */ + assert(! _invlist_len(invlist)); + + *zero_addr = 0; + + /* 1^1 = 0; 1^0 = 1 */ + *offset = 1 ^ will_have_0; + return zero_addr + *offset; +} + +PERL_STATIC_INLINE UV* +S_invlist_array(SV* const invlist) +{ + /* Returns the pointer to the inversion list's array. Every time the + * length changes, this needs to be called in case malloc or realloc moved + * it */ + + PERL_ARGS_ASSERT_INVLIST_ARRAY; + + /* Must not be empty. If these fail, you probably didn't check for + * being non-zero before trying to get the array */ + assert(_invlist_len(invlist)); + + /* The very first element always contains zero, The array begins either + * there, or if the inversion list is offset, at the element after it. + * The offset header field determines which; it contains 0 or 1 to indicate + * how much additionally to add */ + assert(0 == *(SvPVX(invlist))); + return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist)); +} + +PERL_STATIC_INLINE void +S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) +{ + /* Sets the current number of elements stored in the inversion list. + * Updates SvCUR correspondingly */ + PERL_UNUSED_CONTEXT; + PERL_ARGS_ASSERT_INVLIST_SET_LEN; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + SvCUR_set(invlist, + (len == 0) + ? 0 + : TO_INTERNAL_SIZE(len + offset)); + assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); +} + +PERL_STATIC_INLINE IV* +S_get_invlist_previous_index_addr(SV* invlist) +{ + /* Return the address of the IV that is reserved to hold the cached index + * */ + PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->prev_index); +} + +PERL_STATIC_INLINE IV +S_invlist_previous_index(SV* const invlist) +{ + /* Returns cached index of previous search */ + + PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; + + return *get_invlist_previous_index_addr(invlist); +} + +PERL_STATIC_INLINE void +S_invlist_set_previous_index(SV* const invlist, const IV index) +{ + /* Caches for later retrieval */ + + PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; + + assert(index == 0 || index < (int) _invlist_len(invlist)); + + *get_invlist_previous_index_addr(invlist) = index; +} + +PERL_STATIC_INLINE UV +S_invlist_max(SV* const invlist) +{ + /* Returns the maximum number of elements storable in the inversion list's + * array, without having to realloc() */ + + PERL_ARGS_ASSERT_INVLIST_MAX; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Assumes worst case, in which the 0 element is not counted in the + * inversion list, so subtracts 1 for that */ + return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ + ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 + : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; +} + +#ifndef PERL_IN_XSUB_RE +SV* +Perl__new_invlist(pTHX_ IV initial_size) +{ + + /* Return a pointer to a newly constructed inversion list, with enough + * space to store 'initial_size' elements. If that number is negative, a + * system default is used instead */ + + SV* new_list; + + if (initial_size < 0) { + initial_size = 10; + } + + /* Allocate the initial space */ + new_list = newSV_type(SVt_INVLIST); + + /* First 1 is in case the zero element isn't in the list; second 1 is for + * trailing NUL */ + SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1); + invlist_set_len(new_list, 0, 0); + + /* Force iterinit() to be used to get iteration to work */ + *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX; + + *get_invlist_previous_index_addr(new_list) = 0; + + return new_list; +} + +SV* +Perl__new_invlist_C_array(pTHX_ const UV* const list) +{ + /* Return a pointer to a newly constructed inversion list, initialized to + * point to , which has to be in the exact correct inversion list + * form, including internal fields. Thus this is a dangerous routine that + * should not be used in the wrong hands. The passed in 'list' contains + * several header fields at the beginning that are not part of the + * inversion list body proper */ + + const STRLEN length = (STRLEN) list[0]; + const UV version_id = list[1]; + const bool offset = cBOOL(list[2]); +#define HEADER_LENGTH 3 + /* If any of the above changes in any way, you must change HEADER_LENGTH + * (if appropriate) and regenerate INVLIST_VERSION_ID by running + * perl -E 'say int(rand 2**31-1)' + */ +#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and + data structure type, so that one being + passed in can be validated to be an + inversion list of the correct vintage. + */ + + SV* invlist = newSV_type(SVt_INVLIST); + + PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; + + if (version_id != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + /* The generated array passed in includes header elements that aren't part + * of the list proper, so start it just after them */ + SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); + + SvLEN_set(invlist, 0); /* Means we own the contents, and the system + shouldn't touch it */ + + *(get_invlist_offset_addr(invlist)) = offset; + + /* The 'length' passed to us is the physical number of elements in the + * inversion list. But if there is an offset the logical number is one + * less than that */ + invlist_set_len(invlist, length - offset, offset); + + invlist_set_previous_index(invlist, 0); + + /* Initialize the iteration pointer. */ + invlist_iterfinish(invlist); + + SvREADONLY_on(invlist); + + return invlist; +} +#endif /* ifndef PERL_IN_XSUB_RE */ + +STATIC void +S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) +{ + /* Grow the maximum size of an inversion list */ + + PERL_ARGS_ASSERT_INVLIST_EXTEND; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Add one to account for the zero element at the beginning which may not + * be counted by the calling parameters */ + SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); +} + +PERL_STATIC_INLINE void +S_invlist_trim(SV* const invlist) +{ + PERL_ARGS_ASSERT_INVLIST_TRIM; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + /* Change the length of the inversion list to how many entries it currently + * has */ + SvPV_shrink_to_cur((SV *) invlist); +} + +STATIC void +S__append_range_to_invlist(pTHX_ SV* const invlist, + const UV start, const UV end) +{ + /* Subject to change or removal. Append the range from 'start' to 'end' at + * the end of the inversion list. The range must be above any existing + * ones. */ + + UV* array; + UV max = invlist_max(invlist); + UV len = _invlist_len(invlist); + bool offset; + + PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; + + if (len == 0) { /* Empty lists must be initialized */ + offset = start != 0; + array = _invlist_array_init(invlist, ! offset); + } + else { + /* Here, the existing list is non-empty. The current max entry in the + * list is generally the first value not in the set, except when the + * set extends to the end of permissible values, in which case it is + * the first entry in that final set, and so this call is an attempt to + * append out-of-order */ + + UV final_element = len - 1; + array = invlist_array(invlist); + if (array[final_element] > start + || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) + { + Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", + array[final_element], start, + ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); + } + + /* Here, it is a legal append. If the new range begins with the first + * value not in the set, it is extending the set, so the new first + * value not in the set is one greater than the newly extended range. + * */ + offset = *get_invlist_offset_addr(invlist); + if (array[final_element] == start) { + if (end != UV_MAX) { + array[final_element] = end + 1; + } + else { + /* But if the end is the maximum representable on the machine, + * just let the range that this would extend to have no end */ + invlist_set_len(invlist, len - 1, offset); + } + return; + } + } + + /* Here the new range doesn't extend any existing set. Add it */ + + len += 2; /* Includes an element each for the start and end of range */ + + /* If wll overflow the existing space, extend, which may cause the array to + * be moved */ + if (max < len) { + invlist_extend(invlist, len); + + /* Have to set len here to avoid assert failure in invlist_array() */ + invlist_set_len(invlist, len, offset); + + array = invlist_array(invlist); + } + else { + invlist_set_len(invlist, len, offset); + } + + /* The next item on the list starts the range, the one after that is + * one past the new range. */ + array[len - 2] = start; + if (end != UV_MAX) { + array[len - 1] = end + 1; + } + else { + /* But if the end is the maximum representable on the machine, just let + * the range have no end */ + invlist_set_len(invlist, len - 1, offset); + } +} + +#ifndef PERL_IN_XSUB_RE + +IV +Perl__invlist_search(SV* const invlist, const UV cp) +{ + /* Searches the inversion list for the entry that contains the input code + * point . If is not in the list, -1 is returned. Otherwise, the + * return value is the index into the list's array of the range that + * contains */ + + IV low = 0; + IV mid; + IV high = _invlist_len(invlist); + const IV highest_element = high - 1; + const UV* array; + + PERL_ARGS_ASSERT__INVLIST_SEARCH; + + /* If list is empty, return failure. */ + if (high == 0) { + return -1; + } + + /* (We can't get the array unless we know the list is non-empty) */ + array = invlist_array(invlist); + + mid = invlist_previous_index(invlist); + assert(mid >=0 && mid <= highest_element); + + /* contains the cache of the result of the previous call to this + * function (0 the first time). See if this call is for the same result, + * or if it is for mid-1. This is under the theory that calls to this + * function will often be for related code points that are near each other. + * And benchmarks show that caching gives better results. We also test + * here if the code point is within the bounds of the list. These tests + * replace others that would have had to be made anyway to make sure that + * the array bounds were not exceeded, and these give us extra information + * at the same time */ + if (cp >= array[mid]) { + if (cp >= array[highest_element]) { + return highest_element; + } + + /* Here, array[mid] <= cp < array[highest_element]. This means that + * the final element is not the answer, so can exclude it; it also + * means that is not the final element, so can refer to 'mid + 1' + * safely */ + if (cp < array[mid + 1]) { + return mid; + } + high--; + low = mid + 1; + } + else { /* cp < aray[mid] */ + if (cp < array[0]) { /* Fail if outside the array */ + return -1; + } + high = mid; + if (cp >= array[mid - 1]) { + goto found_entry; + } + } + + /* Binary search. What we are looking for is such that + * array[i] <= cp < array[i+1] + * The loop below converges on the i+1. Note that there may not be an + * (i+1)th element in the array, and things work nonetheless */ + while (low < high) { + mid = (low + high) / 2; + assert(mid <= highest_element); + if (array[mid] <= cp) { /* cp >= array[mid] */ + low = mid + 1; + + /* We could do this extra test to exit the loop early. + if (cp < array[low]) { + return mid; + } + */ + } + else { /* cp < array[mid] */ + high = mid; + } + } + + found_entry: + high--; + invlist_set_previous_index(invlist, high); + return high; +} + +void +Perl__invlist_populate_swatch(SV* const invlist, + const UV start, const UV end, U8* swatch) +{ + /* populates a swatch of a swash the same way swatch_get() does in utf8.c, + * but is used when the swash has an inversion list. This makes this much + * faster, as it uses a binary search instead of a linear one. This is + * intimately tied to that function, and perhaps should be in utf8.c, + * except it is intimately tied to inversion lists as well. It assumes + * that is all 0's on input */ + + UV current = start; + const IV len = _invlist_len(invlist); + IV i; + const UV * array; + + PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH; + + if (len == 0) { /* Empty inversion list */ + return; + } + + array = invlist_array(invlist); + + /* Find which element it is */ + i = _invlist_search(invlist, start); + + /* We populate from to */ + while (current < end) { + UV upper; + + /* The inversion list gives the results for every possible code point + * after the first one in the list. Only those ranges whose index is + * even are ones that the inversion list matches. For the odd ones, + * and if the initial code point is not in the list, we have to skip + * forward to the next element */ + if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) { + i++; + if (i >= len) { /* Finished if beyond the end of the array */ + return; + } + current = array[i]; + if (current >= end) { /* Finished if beyond the end of what we + are populating */ + if (LIKELY(end < UV_MAX)) { + return; + } + + /* We get here when the upper bound is the maximum + * representable on the machine, and we are looking for just + * that code point. Have to special case it */ + i = len; + goto join_end_of_list; + } + } + assert(current >= start); + + /* The current range ends one below the next one, except don't go past + * */ + i++; + upper = (i < len && array[i] < end) ? array[i] : end; + + /* Here we are in a range that matches. Populate a bit in the 3-bit U8 + * for each code point in it */ + for (; current < upper; current++) { + const STRLEN offset = (STRLEN)(current - start); + swatch[offset >> 3] |= 1 << (offset & 7); + } + + join_end_of_list: + + /* Quit if at the end of the list */ + if (i >= len) { + + /* But first, have to deal with the highest possible code point on + * the platform. The previous code assumes that is one + * beyond where we want to populate, but that is impossible at the + * platform's infinity, so have to handle it specially */ + if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1))) + { + const STRLEN offset = (STRLEN)(end - start); + swatch[offset >> 3] |= 1 << (offset & 7); + } + return; + } + + /* Advance to the next range, which will be for code points not in the + * inversion list */ + current = array[i]; + } + + return; +} + +void +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** output) +{ + /* Take the union of two inversion lists and point to it. *output + * SHOULD BE DEFINED upon input, and if it points to one of the two lists, + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *output will be made correspondingly + * mortal. The first list, , may be NULL, in which case a copy of the + * second list is returned. If is TRUE, the union is taken + * of the complement (inversion) of instead of b itself. + * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. + * + * The algorithm is like a merge sort. + * + * XXX A potential performance improvement is to keep track as we go along + * if only one of the inputs contributes to the result, meaning the other + * is a subset of that one. In that case, we can skip the final copy and + * return the larger of the input lists, but then outside code might need + * to keep track of whether to free the input list or not */ + + const UV* array_a; /* a's array */ + const UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; + + SV* u; /* the resulting union */ + UV* array_u; + UV len_u; + + UV i_a = 0; /* current index into a's array */ + UV i_b = 0; + UV i_u = 0; + + /* running count, as explained in the algorithm source book; items are + * stopped accumulating and are output when the count changes to/from 0. + * The count is incremented when we start a range that's in the set, and + * decremented when we start a range that's not in the set. So its range + * is 0 to 2. Only when the count is zero is something not in the set. + */ + UV count = 0; + + PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + + /* If either one is empty, the union is the other one */ + if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + bool make_temp = FALSE; /* Should we mortalize the result? */ + + if (*output == a) { + if (a != NULL) { + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } + } + } + if (*output != b) { + *output = invlist_clone(b); + if (complement_b) { + _invlist_invert(*output); + } + } /* else *output already = b; */ + + if (make_temp) { + sv_2mortal(*output); + } + return; + } + else if ((len_b = _invlist_len(b)) == 0) { + bool make_temp = FALSE; + if (*output == b) { + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } + } + + /* The complement of an empty list is a list that has everything in it, + * so the union with includes everything too */ + if (complement_b) { + if (a == *output) { + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } + } + *output = _new_invlist(1); + _append_range_to_invlist(*output, 0, UV_MAX); + } + else if (*output != a) { + *output = invlist_clone(a); + } + /* else *output already = a; */ + + if (make_temp) { + sv_2mortal(*output); + } + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the union of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Size the union for the worst case: that the sets are completely + * disjoint */ + u = _new_invlist(len_a + len_b); + + /* Will contain U+0000 if either component does */ + array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0) + || (len_b > 0 && array_b[0] == 0)); + + /* Go through each list item by item, stopping when exhausted one of + * them */ + while (i_a < len_a && i_b < len_b) { + UV cp; /* The element to potentially add to the union's array */ + bool cp_in_set; /* is it in the the input list's set or not */ + + /* We need to take one or the other of the two inputs for the union. + * Since we are merging two sorted lists, we take the smaller of the + * next items. In case of a tie, we take the one that is in its set + * first. If we took one not in the set first, it would decrement the + * count, possibly to 0 which would cause it to be output as ending the + * range, and the next time through we would take the same number, and + * output it again as beginning the next range. By doing it the + * opposite way, there is no possibility that the count will be + * momentarily decremented to 0, and thus the two adjoining ranges will + * be seamlessly merged. (In a tie and both are in the set or both not + * in the set, it doesn't matter which we take first.) */ + if (array_a[i_a] < array_b[i_b] + || (array_a[i_a] == array_b[i_b] + && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) + { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); + cp= array_a[i_a++]; + } + else { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); + cp = array_b[i_b++]; + } + + /* Here, have chosen which of the two inputs to look at. Only output + * if the running count changes to/from 0, which marks the + * beginning/end of a range in that's in the set */ + if (cp_in_set) { + if (count == 0) { + array_u[i_u++] = cp; + } + count++; + } + else { + count--; + if (count == 0) { + array_u[i_u++] = cp; + } + } + } + + /* Here, we are finished going through at least one of the lists, which + * means there is something remaining in at most one. We check if the list + * that hasn't been exhausted is positioned such that we are in the middle + * of a range in its set or not. (i_a and i_b point to the element beyond + * the one we care about.) If in the set, we decrement 'count'; if 0, there + * is potentially more to output. + * There are four cases: + * 1) Both weren't in their sets, count is 0, and remains 0. What's left + * in the union is entirely from the non-exhausted set. + * 2) Both were in their sets, count is 2. Nothing further should + * be output, as everything that remains will be in the exhausted + * list's set, hence in the union; decrementing to 1 but not 0 insures + * that + * 3) the exhausted was in its set, non-exhausted isn't, count is 1. + * Nothing further should be output because the union includes + * everything from the exhausted set. Not decrementing ensures that. + * 4) the exhausted wasn't in its set, non-exhausted is, count is 1; + * decrementing to 0 insures that we look at the remainder of the + * non-exhausted set */ + if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + { + count--; + } + + /* The final length is what we've output so far, plus what else is about to + * be output. (If 'count' is non-zero, then the input list we exhausted + * has everything remaining up to the machine's limit in its set, and hence + * in the union, so there will be no further output. */ + len_u = i_u; + if (count == 0) { + /* At most one of the subexpressions will be non-zero */ + len_u += (len_a - i_a) + (len_b - i_b); + } + + /* Set result to final length, which can change the pointer to array_u, so + * re-find it */ + if (len_u != _invlist_len(u)) { + invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); + invlist_trim(u); + array_u = invlist_array(u); + } + + /* When 'count' is 0, the list that was exhausted (if one was shorter than + * the other) ended with everything above it not in its set. That means + * that the remaining part of the union is precisely the same as the + * non-exhausted list, so can just copy it unchanged. (If both list were + * exhausted at the same time, then the operations below will be both 0.) + */ + if (count == 0) { + IV copy_count; /* At most one will have a non-zero copy count */ + if ((copy_count = len_a - i_a) > 0) { + Copy(array_a + i_a, array_u + i_u, copy_count, UV); + } + else if ((copy_count = len_b - i_b) > 0) { + Copy(array_b + i_b, array_u + i_u, copy_count, UV); + } + } + + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ + if (a == *output || b == *output) { + assert(! invlist_is_iterating(*output)); + if ((SvTEMP(*output))) { + sv_2mortal(u); + } + else { + SvREFCNT_dec_NN(*output); + } + } + + *output = u; + + return; +} + +void +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** i) +{ + /* Take the intersection of two inversion lists and point to it. *i + * SHOULD BE DEFINED upon input, and if it points to one of the two lists, + * the reference count to that list will be decremented if not already a + * temporary (mortal); otherwise *i will be made correspondingly mortal. + * The first list, , may be NULL, in which case an empty list is + * returned. If is TRUE, the result will be the + * intersection of and the complement (or inversion) of instead of + * directly. + * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. In fact, it had bugs + * + * The algorithm is like a merge sort, and is essentially the same as the + * union above + */ + + const UV* array_a; /* a's array */ + const UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; + + SV* r; /* the resulting intersection */ + UV* array_r; + UV len_r; + + UV i_a = 0; /* current index into a's array */ + UV i_b = 0; + UV i_r = 0; + + /* running count, as explained in the algorithm source book; items are + * stopped accumulating and are output when the count changes to/from 2. + * The count is incremented when we start a range that's in the set, and + * decremented when we start a range that's not in the set. So its range + * is 0 to 2. Only when the count is 2 is something in the intersection. + */ + UV count = 0; + + PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + + /* Special case if either one is empty */ + len_a = (a == NULL) ? 0 : _invlist_len(a); + if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { + bool make_temp = FALSE; + + if (len_a != 0 && complement_b) { + + /* Here, 'a' is not empty, therefore from the above 'if', 'b' must + * be empty. Here, also we are using 'b's complement, which hence + * must be every possible code point. Thus the intersection is + * simply 'a'. */ + if (*i != a) { + if (*i == b) { + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } + } + + *i = invlist_clone(a); + } + /* else *i is already 'a' */ + + if (make_temp) { + sv_2mortal(*i); + } + return; + } + + /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The + * intersection must be empty */ + if (*i == a) { + if (! (make_temp = cBOOL(SvTEMP(a)))) { + SvREFCNT_dec_NN(a); + } + } + else if (*i == b) { + if (! (make_temp = cBOOL(SvTEMP(b)))) { + SvREFCNT_dec_NN(b); + } + } + *i = _new_invlist(0); + if (make_temp) { + sv_2mortal(*i); + } + + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the intersection of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Size the intersection for the worst case: that the intersection ends up + * fragmenting everything to be completely disjoint */ + r= _new_invlist(len_a + len_b); + + /* Will contain U+0000 iff both components do */ + array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 + && len_b > 0 && array_b[0] == 0); + + /* Go through each list item by item, stopping when exhausted one of + * them */ + while (i_a < len_a && i_b < len_b) { + UV cp; /* The element to potentially add to the intersection's + array */ + bool cp_in_set; /* Is it in the input list's set or not */ + + /* We need to take one or the other of the two inputs for the + * intersection. Since we are merging two sorted lists, we take the + * smaller of the next items. In case of a tie, we take the one that + * is not in its set first (a difference from the union algorithm). If + * we took one in the set first, it would increment the count, possibly + * to 2 which would cause it to be output as starting a range in the + * intersection, and the next time through we would take that same + * number, and output it again as ending the set. By doing it the + * opposite of this, there is no possibility that the count will be + * momentarily incremented to 2. (In a tie and both are in the set or + * both not in the set, it doesn't matter which we take first.) */ + if (array_a[i_a] < array_b[i_b] + || (array_a[i_a] == array_b[i_b] + && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) + { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); + cp= array_a[i_a++]; + } + else { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); + cp= array_b[i_b++]; + } + + /* Here, have chosen which of the two inputs to look at. Only output + * if the running count changes to/from 2, which marks the + * beginning/end of a range that's in the intersection */ + if (cp_in_set) { + count++; + if (count == 2) { + array_r[i_r++] = cp; + } + } + else { + if (count == 2) { + array_r[i_r++] = cp; + } + count--; + } + } + + /* Here, we are finished going through at least one of the lists, which + * means there is something remaining in at most one. We check if the list + * that has been exhausted is positioned such that we are in the middle + * of a range in its set or not. (i_a and i_b point to elements 1 beyond + * the ones we care about.) There are four cases: + * 1) Both weren't in their sets, count is 0, and remains 0. There's + * nothing left in the intersection. + * 2) Both were in their sets, count is 2 and perhaps is incremented to + * above 2. What should be output is exactly that which is in the + * non-exhausted set, as everything it has is also in the intersection + * set, and everything it doesn't have can't be in the intersection + * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and + * gets incremented to 2. Like the previous case, the intersection is + * everything that remains in the non-exhausted set. + * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and + * remains 1. And the intersection has nothing more. */ + if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + { + count++; + } + + /* The final length is what we've output so far plus what else is in the + * intersection. At most one of the subexpressions below will be non-zero + * */ + len_r = i_r; + if (count >= 2) { + len_r += (len_a - i_a) + (len_b - i_b); + } + + /* Set result to final length, which can change the pointer to array_r, so + * re-find it */ + if (len_r != _invlist_len(r)) { + invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); + invlist_trim(r); + array_r = invlist_array(r); + } + + /* Finish outputting any remaining */ + if (count >= 2) { /* At most one will have a non-zero copy count */ + IV copy_count; + if ((copy_count = len_a - i_a) > 0) { + Copy(array_a + i_a, array_r + i_r, copy_count, UV); + } + else if ((copy_count = len_b - i_b) > 0) { + Copy(array_b + i_b, array_r + i_r, copy_count, UV); + } + } + + /* We may be removing a reference to one of the inputs. If so, the output + * is made mortal if the input was. (Mortal SVs shouldn't have their ref + * count decremented) */ + if (a == *i || b == *i) { + assert(! invlist_is_iterating(*i)); + if (SvTEMP(*i)) { + sv_2mortal(r); + } + else { + SvREFCNT_dec_NN(*i); + } + } + + *i = r; + + return; +} + +SV* +Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) +{ + /* Add the range from 'start' to 'end' inclusive to the inversion list's + * set. A pointer to the inversion list is returned. This may actually be + * a new list, in which case the passed in one has been destroyed. The + * passed in inversion list can be NULL, in which case a new one is created + * with just the one range in it */ + + SV* range_invlist; + UV len; + + if (invlist == NULL) { + invlist = _new_invlist(2); + len = 0; + } + else { + len = _invlist_len(invlist); + } + + /* If comes after the final entry actually in the list, can just append it + * to the end, */ + if (len == 0 + || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1) + && start >= invlist_array(invlist)[len - 1])) + { + _append_range_to_invlist(invlist, start, end); + return invlist; + } + + /* Here, can't just append things, create and return a new inversion list + * which is the union of this range and the existing inversion list */ + range_invlist = _new_invlist(2); + _append_range_to_invlist(range_invlist, start, end); + + _invlist_union(invlist, range_invlist, &invlist); + + /* The temporary can be freed */ + SvREFCNT_dec_NN(range_invlist); + + return invlist; +} + +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, + UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + _append_range_to_invlist(invlist, element0, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; +} + +#endif + +PERL_STATIC_INLINE SV* +S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { + return _add_range_to_invlist(invlist, cp, cp); +} + +#ifndef PERL_IN_XSUB_RE +void +Perl__invlist_invert(pTHX_ SV* const invlist) +{ + /* Complement the input inversion list. This adds a 0 if the list didn't + * have a zero; removes it otherwise. As described above, the data + * structure is set up so that this is very efficient */ + + PERL_ARGS_ASSERT__INVLIST_INVERT; + + assert(! invlist_is_iterating(invlist)); + + /* The inverse of matching nothing is matching everything */ + if (_invlist_len(invlist) == 0) { + _append_range_to_invlist(invlist, 0, UV_MAX); + return; + } + + *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); +} + +#endif + +PERL_STATIC_INLINE SV* +S_invlist_clone(pTHX_ SV* const invlist) +{ + + /* Return a new inversion list that is a copy of the input one, which is + * unchanged. The new list will not be mortal even if the old one was. */ + + /* Need to allocate extra space to accommodate Perl's addition of a + * trailing NUL to SvPV's, since it thinks they are always strings */ + SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); + STRLEN physical_length = SvCUR(invlist); + bool offset = *(get_invlist_offset_addr(invlist)); + + PERL_ARGS_ASSERT_INVLIST_CLONE; + + *(get_invlist_offset_addr(new_invlist)) = offset; + invlist_set_len(new_invlist, _invlist_len(invlist), offset); + Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); + + return new_invlist; +} + +PERL_STATIC_INLINE STRLEN* +S_get_invlist_iter_addr(SV* invlist) +{ + /* Return the address of the UV that contains the current iteration + * position */ + + PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + return &(((XINVLIST*) SvANY(invlist))->iterator); +} + +PERL_STATIC_INLINE void +S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */ +{ + PERL_ARGS_ASSERT_INVLIST_ITERINIT; + + *get_invlist_iter_addr(invlist) = 0; +} + +PERL_STATIC_INLINE void +S_invlist_iterfinish(SV* invlist) +{ + /* Terminate iterator for invlist. This is to catch development errors. + * Any iteration that is interrupted before completed should call this + * function. Functions that add code points anywhere else but to the end + * of an inversion list assert that they are not in the middle of an + * iteration. If they were, the addition would make the iteration + * problematical: if the iteration hadn't reached the place where things + * were being added, it would be ok */ + + PERL_ARGS_ASSERT_INVLIST_ITERFINISH; + + *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX; +} + +STATIC bool +S_invlist_iternext(SV* invlist, UV* start, UV* end) +{ + /* An C call on must be used to set this up. + * This call sets in <*start> and <*end>, the next range in . + * Returns if successful and the next call will return the next + * range; if was already at the end of the list. If the latter, + * <*start> and <*end> are unchanged, and the next call to this function + * will start over at the beginning of the list */ + + STRLEN* pos = get_invlist_iter_addr(invlist); + UV len = _invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_ITERNEXT; + + if (*pos >= len) { + *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */ + return FALSE; + } + + array = invlist_array(invlist); + + *start = array[(*pos)++]; + + if (*pos >= len) { + *end = UV_MAX; + } + else { + *end = array[(*pos)++] - 1; + } + + return TRUE; +} + +PERL_STATIC_INLINE bool +S_invlist_is_iterating(SV* const invlist) +{ + PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; + + return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; +} + +PERL_STATIC_INLINE UV +S_invlist_highest(SV* const invlist) +{ + /* Returns the highest code point that matches an inversion list. This API + * has an ambiguity, as it returns 0 under either the highest is actually + * 0, or if the list is empty. If this distinction matters to you, check + * for emptiness before calling this function */ + + UV len = _invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_HIGHEST; + + if (len == 0) { + return 0; + } + + array = invlist_array(invlist); + + /* The last element in the array in the inversion list always starts a + * range that goes to infinity. That range may be for code points that are + * matched in the inversion list, or it may be for ones that aren't + * matched. In the latter case, the highest code point in the set is one + * less than the beginning of this range; otherwise it is the final element + * of this range: infinity */ + return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) + ? UV_MAX + : array[len - 1] - 1; +} + +#ifndef PERL_IN_XSUB_RE +SV * +Perl__invlist_contents(pTHX_ SV* const invlist) +{ + /* Get the contents of an inversion list into a string SV so that they can + * be printed out. It uses the format traditionally done for debug tracing + */ + + UV start, end; + SV* output = newSVpvs("\n"); + + PERL_ARGS_ASSERT__INVLIST_CONTENTS; + + assert(! invlist_is_iterating(invlist)); + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start); + } + else if (end != start) { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n", + start, end); + } + else { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start); + } + } + + return output; +} +#endif + +#ifndef PERL_IN_XSUB_RE +void +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, + const char * const indent, SV* const invlist) +{ + /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the + * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by + * the string 'indent'. The output looks like this: + [0] 0x000A .. 0x000D + [2] 0x0085 + [4] 0x2028 .. 0x2029 + [6] 0x3104 .. INFINITY + * This means that the first range of code points matched by the list are + * 0xA through 0xD; the second range contains only the single code point + * 0x85, etc. An inversion list is an array of UVs. Two array elements + * are used to define each range (except if the final range extends to + * infinity, only a single element is needed). The array index of the + * first element for the corresponding range is given in brackets. */ + + UV start, end; + STRLEN count = 0; + + PERL_ARGS_ASSERT__INVLIST_DUMP; + + if (invlist_is_iterating(invlist)) { + Perl_dump_indent(aTHX_ level, file, + "%sCan't dump inversion list because is in middle of iterating\n", + indent); + return; + } + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", + indent, (UV)count, start); + } + else if (end != start) { + Perl_dump_indent(aTHX_ level, file, + "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", + indent, (UV)count, start, end); + } + else { + Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", + indent, (UV)count, start); + } + count += 2; + } +} + +void +Perl__load_PL_utf8_foldclosures (pTHX) +{ + assert(! PL_utf8_foldclosures); + + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* This string is just a short named one above \xff */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ + } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); +} +#endif + +#ifdef PERL_ARGS_ASSERT__INVLISTEQ +bool +S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) +{ + /* Return a boolean as to if the two passed in inversion lists are + * identical. The final argument, if TRUE, says to take the complement of + * the second inversion list before doing the comparison */ + + const UV* array_a = invlist_array(a); + const UV* array_b = invlist_array(b); + UV len_a = _invlist_len(a); + UV len_b = _invlist_len(b); + + UV i = 0; /* current index into the arrays */ + bool retval = TRUE; /* Assume are identical until proven otherwise */ + + PERL_ARGS_ASSERT__INVLISTEQ; + + /* If are to compare 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* The complement of nothing is everything, so would have to have + * just one element, starting at zero (ending at infinity) */ + if (len_b == 0) { + return (len_a == 1 && array_a[0] == 0); + } + else if (array_b[0] == 0) { + + /* Otherwise, to complement, we invert. Here, the first element is + * 0, just remove it. To do this, we just pretend the array starts + * one later */ + + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Make sure that the lengths are the same, as well as the final element + * before looping through the remainder. (Thus we test the length, final, + * and first elements right off the bat) */ + if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) { + retval = FALSE; + } + else for (i = 0; i < len_a - 1; i++) { + if (array_a[i] != array_b[i]) { + retval = FALSE; + break; + } + } + + return retval; +} +#endif + +#undef HEADER_LENGTH +#undef TO_INTERNAL_SIZE +#undef FROM_INTERNAL_SIZE +#undef INVLIST_VERSION_ID + +/* End of inversion list object */ + +STATIC void +S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) +{ + /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' + * constructs, and updates RExC_flags with them. On input, RExC_parse + * should point to the first flag; it is updated on output to point to the + * final ')' or ':'. There needs to be at least one flag, or this will + * abort */ + + /* for (?g), (?gc), and (?o) warnings; warning + about (?c) will warn about (?g) -- japhy */ + +#define WASTED_O 0x01 +#define WASTED_G 0x02 +#define WASTED_C 0x04 +#define WASTED_GC (WASTED_G|WASTED_C) + I32 wastedflags = 0x00; + U32 posflags = 0, negflags = 0; + U32 *flagsp = &posflags; + char has_charset_modifier = '\0'; + regex_charset cs; + bool has_use_defaults = FALSE; + const char* const seqstart = RExC_parse - 1; /* Point to the '?' */ + + PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS; + + /* '^' as an initial flag sets certain defaults */ + if (UCHARAT(RExC_parse) == '^') { + RExC_parse++; + has_use_defaults = TRUE; + STD_PMMOD_FLAGS_CLEAR(&RExC_flags); + set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics) + ? REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET); + } + + cs = get_regex_charset(RExC_flags); + if (cs == REGEX_DEPENDS_CHARSET + && (RExC_utf8 || RExC_uni_semantics)) + { + cs = REGEX_UNICODE_CHARSET; + } + + while (*RExC_parse) { + /* && strchr("iogcmsx", *RExC_parse) */ + /* (?g), (?gc) and (?o) are useless here + and must be globally applied -- japhy */ + switch (*RExC_parse) { + + /* Code for the imsx flags */ + CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + + case LOCALE_PAT_MOD: + if (has_charset_modifier) { + goto excess_modifier; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + cs = REGEX_LOCALE_CHARSET; + has_charset_modifier = LOCALE_PAT_MOD; + break; + case UNICODE_PAT_MOD: + if (has_charset_modifier) { + goto excess_modifier; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + cs = REGEX_UNICODE_CHARSET; + has_charset_modifier = UNICODE_PAT_MOD; + break; + case ASCII_RESTRICT_PAT_MOD: + if (flagsp == &negflags) { + goto neg_modifier; + } + if (has_charset_modifier) { + if (cs != REGEX_ASCII_RESTRICTED_CHARSET) { + goto excess_modifier; + } + /* Doubled modifier implies more restricted */ + cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; + } + else { + cs = REGEX_ASCII_RESTRICTED_CHARSET; + } + has_charset_modifier = ASCII_RESTRICT_PAT_MOD; + break; + case DEPENDS_PAT_MOD: + if (has_use_defaults) { + goto fail_modifiers; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + else if (has_charset_modifier) { + goto excess_modifier; + } + + /* The dual charset means unicode semantics if the + * pattern (or target, not known until runtime) are + * utf8, or something in the pattern indicates unicode + * semantics */ + cs = (RExC_utf8 || RExC_uni_semantics) + ? REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET; + has_charset_modifier = DEPENDS_PAT_MOD; + break; + excess_modifier: + RExC_parse++; + if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { + vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); + } + else if (has_charset_modifier == *(RExC_parse - 1)) { + vFAIL2("Regexp modifier \"%c\" may not appear twice", + *(RExC_parse - 1)); + } + else { + vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); + } + /*NOTREACHED*/ + neg_modifier: + RExC_parse++; + vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", + *(RExC_parse - 1)); + /*NOTREACHED*/ + case ONCE_PAT_MOD: /* 'o' */ + case GLOBAL_PAT_MOD: /* 'g' */ + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + const I32 wflagbit = *RExC_parse == 'o' + ? WASTED_O + : WASTED_G; + if (! (wastedflags & wflagbit) ) { + wastedflags |= wflagbit; + /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ + vWARN5( + RExC_parse + 1, + "Useless (%s%c) - %suse /%c modifier", + flagsp == &negflags ? "?-" : "?", + *RExC_parse, + flagsp == &negflags ? "don't " : "", + *RExC_parse + ); + } + } + break; + + case CONTINUE_PAT_MOD: /* 'c' */ + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + if (! (wastedflags & WASTED_C) ) { + wastedflags |= WASTED_GC; + /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ + vWARN3( + RExC_parse + 1, + "Useless (%sc) - %suse /gc modifier", + flagsp == &negflags ? "?-" : "?", + flagsp == &negflags ? "don't " : "" + ); + } + } + break; + case KEEPCOPY_PAT_MOD: /* 'p' */ + if (flagsp == &negflags) { + if (SIZE_ONLY) + ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); + } else { + *flagsp |= RXf_PMf_KEEPCOPY; + } + break; + case '-': + /* A flag is a default iff it is following a minus, so + * if there is a minus, it means will be trying to + * re-specify a default which is an error */ + if (has_use_defaults || flagsp == &negflags) { + goto fail_modifiers; + } + flagsp = &negflags; + wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case ':': + case ')': + RExC_flags |= posflags; + RExC_flags &= ~negflags; + set_regex_charset(&RExC_flags, cs); + if (RExC_flags & RXf_PMf_FOLD) { + RExC_contains_i = 1; + } + return; + /*NOTREACHED*/ + default: + fail_modifiers: + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); + /*NOTREACHED*/ + } + + ++RExC_parse; + } +} + +/* + - reg - regular expression, i.e. main body or parenthesized thing + * + * Caller must absorb opening parenthesis. + * + * Combining parenthesis handling with the base level of regular expression + * is a trifle forced, but the need to tie the tails of the branches to what + * follows makes it hard to avoid. + */ +#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) +#ifdef DEBUGGING +#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) +#else +#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) +#endif + +/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets + flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan + needs to be restarted. + Otherwise would only return NULL if regbranch() returns NULL, which + cannot happen. */ +STATIC regnode * +S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) + /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. + * 2 is like 1, but indicates that nextchar() has been called to advance + * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and + * this flag alerts us to the need to check for that */ +{ + regnode *ret; /* Will be the head of the group. */ + regnode *br; + regnode *lastbr; + regnode *ender = NULL; + I32 parno = 0; + I32 flags; + U32 oregflags = RExC_flags; + bool have_branch = 0; + bool is_open = 0; + I32 freeze_paren = 0; + I32 after_freeze = 0; + I32 num; /* numeric backreferences */ + + char * parse_start = RExC_parse; /* MJD */ + char * const oregcomp_parse = RExC_parse; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG; + DEBUG_PARSE("reg "); + + *flagp = 0; /* Tentatively. */ + + + /* Make an OPEN node, if parenthesized. */ + if (paren) { + + /* Under /x, space and comments can be gobbled up between the '(' and + * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such + * intervening space, as the sequence is a token, and a token should be + * indivisible */ + bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '('; + + if ( *RExC_parse == '*') { /* (*VERB:ARG) */ + char *start_verb = RExC_parse; + STRLEN verb_len = 0; + char *start_arg = NULL; + unsigned char op = 0; + int argok = 1; + int internal_argval = 0; /* internal_argval is only useful if + !argok */ + + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); + } + while ( *RExC_parse && *RExC_parse != ')' ) { + if ( *RExC_parse == ':' ) { + start_arg = RExC_parse + 1; + break; + } + RExC_parse++; + } + ++start_verb; + verb_len = RExC_parse - start_verb; + if ( start_arg ) { + RExC_parse++; + while ( *RExC_parse && *RExC_parse != ')' ) + RExC_parse++; + if ( *RExC_parse != ')' ) + vFAIL("Unterminated verb pattern argument"); + if ( RExC_parse == start_arg ) + start_arg = NULL; + } else { + if ( *RExC_parse != ')' ) + vFAIL("Unterminated verb pattern"); + } + + switch ( *start_verb ) { + case 'A': /* (*ACCEPT) */ + if ( memEQs(start_verb,verb_len,"ACCEPT") ) { + op = ACCEPT; + internal_argval = RExC_nestroot; + } + break; + case 'C': /* (*COMMIT) */ + if ( memEQs(start_verb,verb_len,"COMMIT") ) + op = COMMIT; + break; + case 'F': /* (*FAIL) */ + if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { + op = OPFAIL; + argok = 0; + } + break; + case ':': /* (*:NAME) */ + case 'M': /* (*MARK:NAME) */ + if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { + op = MARKPOINT; + argok = -1; + } + break; + case 'P': /* (*PRUNE) */ + if ( memEQs(start_verb,verb_len,"PRUNE") ) + op = PRUNE; + break; + case 'S': /* (*SKIP) */ + if ( memEQs(start_verb,verb_len,"SKIP") ) + op = SKIP; + break; + case 'T': /* (*THEN) */ + /* [19:06] :: is then */ + if ( memEQs(start_verb,verb_len,"THEN") ) { + op = CUTGROUP; + RExC_seen |= REG_CUTGROUP_SEEN; + } + break; + } + if ( ! op ) { + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL2utf8f( + "Unknown verb pattern '%"UTF8f"'", + UTF8fARG(UTF, verb_len, start_verb)); + } + if ( argok ) { + if ( start_arg && internal_argval ) { + vFAIL3("Verb pattern '%.*s' may not have an argument", + verb_len, start_verb); + } else if ( argok < 0 && !start_arg ) { + vFAIL3("Verb pattern '%.*s' has a mandatory argument", + verb_len, start_verb); + } else { + ret = reganode(pRExC_state, op, internal_argval); + if ( ! internal_argval && ! SIZE_ONLY ) { + if (start_arg) { + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); + RExC_rxi->data->data[ARG(ret)]=(void*)sv; + ret->flags = 0; + } else { + ret->flags = 1; + } + } + } + if (!internal_argval) + RExC_seen |= REG_VERBARG_SEEN; + } else if ( start_arg ) { + vFAIL3("Verb pattern '%.*s' may not have an argument", + verb_len, start_verb); + } else { + ret = reg_node(pRExC_state, op); + } + nextchar(pRExC_state); + return ret; + } + else if (*RExC_parse == '?') { /* (?...) */ + bool is_logical = 0; + const char * const seqstart = RExC_parse; + const char * endptr; + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(?...)', the '(' and '?' must be adjacent"); + } + + RExC_parse++; + paren = *RExC_parse++; + ret = NULL; /* For look-ahead/behind. */ + switch (paren) { + + case 'P': /* (?P...) variants for those used to PCRE/Python */ + paren = *RExC_parse++; + if ( paren == '<') /* (?P<...>) named capture */ + goto named_capture; + else if (paren == '>') { /* (?P>name) named recursion */ + goto named_recursion; + } + else if (paren == '=') { /* (?P=...) named backref */ + /* this pretty much dupes the code for \k in + * regatom(), if you change this make sure you change that + * */ + char* name_start = RExC_parse; + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + if (RExC_parse == name_start || *RExC_parse != ')') + /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.3s... not terminated",parse_start); + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (ASCII_FOLD_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); + *flagp |= HASWIDTH; + + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + + nextchar(pRExC_state); + return ret; + } + RExC_parse++; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL3("Sequence (%.*s...) not recognized", + RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + case '<': /* (?<...) */ + if (*RExC_parse == '!') + paren = ','; + else if (*RExC_parse != '=') + named_capture: + { /* (?<...>) */ + char *name_start; + SV *svname; + paren= '>'; + case '\'': /* (?'...') */ + name_start= RExC_parse; + svname = reg_scan_name(pRExC_state, + SIZE_ONLY /* reverse test from the others */ + ? REG_RSN_RETURN_NAME + : REG_RSN_RETURN_NULL); + if (RExC_parse == name_start || *RExC_parse != paren) + vFAIL2("Sequence (?%c... not terminated", + paren=='>' ? '<' : paren); + if (SIZE_ONLY) { + HE *he_str; + SV *sv_dat = NULL; + if (!svname) /* shouldn't happen */ + Perl_croak(aTHX_ + "panic: reg_scan_name returned NULL"); + if (!RExC_paren_names) { + RExC_paren_names= newHV(); + sv_2mortal(MUTABLE_SV(RExC_paren_names)); +#ifdef DEBUGGING + RExC_paren_name_list= newAV(); + sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); +#endif + } + he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! sv_dat ) { + /* croak baby croak */ + Perl_croak(aTHX_ + "panic: paren_name hash element allocation failed"); + } else if ( SvPOK(sv_dat) ) { + /* (?|...) can mean we have dupes so scan to check + its already been stored. Maybe a flag indicating + we are inside such a construct would be useful, + but the arrays are likely to be quite small, so + for now we punt -- dmq */ + IV count = SvIV(sv_dat); + I32 *pv = (I32*)SvPVX(sv_dat); + IV i; + for ( i = 0 ; i < count ; i++ ) { + if ( pv[i] == RExC_npar ) { + count = 0; + break; + } + } + if ( count ) { + pv = (I32*)SvGROW(sv_dat, + SvCUR(sv_dat) + sizeof(I32)+1); + SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); + pv[count] = RExC_npar; + SvIV_set(sv_dat, SvIVX(sv_dat) + 1); + } + } else { + (void)SvUPGRADE(sv_dat,SVt_PVNV); + sv_setpvn(sv_dat, (char *)&(RExC_npar), + sizeof(I32)); + SvIOK_on(sv_dat); + SvIV_set(sv_dat, 1); + } +#ifdef DEBUGGING + /* Yes this does cause a memory leak in debugging Perls + * */ + if (!av_store(RExC_paren_name_list, + RExC_npar, SvREFCNT_inc(svname))) + SvREFCNT_dec_NN(svname); +#endif + + /*sv_dump(sv_dat);*/ + } + nextchar(pRExC_state); + paren = 1; + goto capturing_parens; + } + RExC_seen |= REG_LOOKBEHIND_SEEN; + RExC_in_lookbehind++; + RExC_parse++; + /* FALLTHROUGH */ + case '=': /* (?=...) */ + RExC_seen_zerolen++; + break; + case '!': /* (?!...) */ + RExC_seen_zerolen++; + if (*RExC_parse == ')') { + ret=reg_node(pRExC_state, OPFAIL); + nextchar(pRExC_state); + return ret; + } + break; + case '|': /* (?|...) */ + /* branch reset, behave like a (?:...) except that + buffers in alternations share the same numbers */ + paren = ':'; + after_freeze = freeze_paren = RExC_npar; + break; + case ':': /* (?:...) */ + case '>': /* (?>...) */ + break; + case '$': /* (?$...) */ + case '@': /* (?@...) */ + vFAIL2("Sequence (?%c...) not implemented", (int)paren); + break; + case '0' : /* (?0) */ + case 'R' : /* (?R) */ + if (*RExC_parse != ')') + FAIL("Sequence (?R) not terminated"); + ret = reg_node(pRExC_state, GOSTART); + RExC_seen |= REG_GOSTART_SEEN; + *flagp |= POSTPONED; + nextchar(pRExC_state); + return ret; + /*notreached*/ + /* named and numeric backreferences */ + case '&': /* (?&NAME) */ + parse_start = RExC_parse - 1; + named_recursion: + { + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + } + if (RExC_parse == RExC_end || *RExC_parse != ')') + vFAIL("Sequence (?&... not terminated"); + goto gen_recurse_regop; + assert(0); /* NOT REACHED */ + case '+': + if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { + RExC_parse++; + vFAIL("Illegal pattern"); + } + goto parse_recursion; + /* NOT REACHED*/ + case '-': /* (?-1) */ + if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { + RExC_parse--; /* rewind to let it be handled later */ + goto parse_flags; + } + /* FALLTHROUGH */ + case '1': case '2': case '3': case '4': /* (?1) */ + case '5': case '6': case '7': case '8': case '9': + RExC_parse--; + parse_recursion: + { + bool is_neg = FALSE; + parse_start = RExC_parse - 1; /* MJD */ + if (*RExC_parse == '-') { + RExC_parse++; + is_neg = TRUE; + } + num = grok_atou(RExC_parse, &endptr); + if (endptr) + RExC_parse = (char*)endptr; + if (is_neg) { + /* Some limit for num? */ + num = -num; + } + } + if (*RExC_parse!=')') + vFAIL("Expecting close bracket"); + + gen_recurse_regop: + if ( paren == '-' ) { + /* + Diagram of capture buffer numbering. + Top line is the normal capture buffer numbers + Bottom line is the negative indexing as from + the X (the (?-2)) + + + 1 2 3 4 5 X 6 7 + /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ + - 5 4 3 2 1 X x x + + */ + num = RExC_npar + num; + if (num < 1) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + } else if ( paren == '+' ) { + num = RExC_npar + num - 1; + } + + ret = reganode(pRExC_state, GOSUB, num); + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) { + RExC_parse++; + vFAIL("Reference to nonexistent group"); + } + ARG2L_SET( ret, RExC_recurse_count++); + RExC_emit++; + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Recurse #%"UVuf" to %"IVdf"\n", + (UV)ARG(ret), (IV)ARG2L(ret))); + } else { + RExC_size++; + } + RExC_seen |= REG_RECURSE_SEEN; + Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ + Set_Node_Offset(ret, parse_start); /* MJD */ + + *flagp |= POSTPONED; + nextchar(pRExC_state); + return ret; + + assert(0); /* NOT REACHED */ + + case '?': /* (??...) */ + is_logical = 1; + if (*RExC_parse != '{') { + RExC_parse++; + /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f( + "Sequence (%"UTF8f"...) not recognized", + UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); + /*NOTREACHED*/ + } + *flagp |= POSTPONED; + paren = *RExC_parse++; + /* FALLTHROUGH */ + case '{': /* (?{...}) */ + { + U32 n = 0; + struct reg_code_block *cb; + + RExC_seen_zerolen++; + + if ( !pRExC_state->num_code_blocks + || pRExC_state->code_index >= pRExC_state->num_code_blocks + || pRExC_state->code_blocks[pRExC_state->code_index].start + != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) + - RExC_start) + ) { + if (RExC_pm_flags & PMf_USE_RE_EVAL) + FAIL("panic: Sequence (?{...}): no code block found\n"); + FAIL("Eval-group not allowed at runtime, use re 'eval'"); + } + /* this is a pre-compiled code block (?{...}) */ + cb = &pRExC_state->code_blocks[pRExC_state->code_index]; + RExC_parse = RExC_start + cb->end; + if (!SIZE_ONLY) { + OP *o = cb->block; + if (cb->src_regex) { + n = add_data(pRExC_state, STR_WITH_LEN("rl")); + RExC_rxi->data->data[n] = + (void*)SvREFCNT_inc((SV*)cb->src_regex); + RExC_rxi->data->data[n+1] = (void*)o; + } + else { + n = add_data(pRExC_state, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); + RExC_rxi->data->data[n] = (void*)o; + } + } + pRExC_state->code_index++; + nextchar(pRExC_state); + + if (is_logical) { + regnode *eval; + ret = reg_node(pRExC_state, LOGICAL); + eval = reganode(pRExC_state, EVAL, n); + if (!SIZE_ONLY) { + ret->flags = 2; + /* for later propagation into (??{}) return value */ + eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME); + } + REGTAIL(pRExC_state, ret, eval); + /* deal with the length of this later - MJD */ + return ret; + } + ret = reganode(pRExC_state, EVAL, n); + Set_Node_Length(ret, RExC_parse - parse_start + 1); + Set_Node_Offset(ret, parse_start); + return ret; + } + case '(': /* (?(?{...})...) and (?(?=...)...) */ + { + int is_define= 0; + if (RExC_parse[0] == '?') { /* (?(?...)) */ + if (RExC_parse[1] == '=' || RExC_parse[1] == '!' + || RExC_parse[1] == '<' + || RExC_parse[1] == '{') { /* Lookahead or eval. */ + I32 flag; + regnode *tail; + + ret = reg_node(pRExC_state, LOGICAL); + if (!SIZE_ONLY) + ret->flags = 1; + + tail = reg(pRExC_state, 1, &flag, depth+1); + if (flag & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + REGTAIL(pRExC_state, ret, tail); + goto insert_if; + } + /* Fall through to ‘Unknown switch condition’ at the + end of the if/else chain. */ + } + else if ( RExC_parse[0] == '<' /* (?()...) */ + || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ + { + char ch = RExC_parse[0] == '<' ? '>' : '\''; + char *name_start= RExC_parse++; + U32 num = 0; + SV *sv_dat=reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + if (RExC_parse == name_start || *RExC_parse != ch) + vFAIL2("Sequence (?(%c... not terminated", + (ch == '>' ? '<' : ch)); + RExC_parse++; + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + ret = reganode(pRExC_state,NGROUPP,num); + goto insert_if_check_paren; + } + else if (RExC_parse[0] == 'D' && + RExC_parse[1] == 'E' && + RExC_parse[2] == 'F' && + RExC_parse[3] == 'I' && + RExC_parse[4] == 'N' && + RExC_parse[5] == 'E') + { + ret = reganode(pRExC_state,DEFINEP,0); + RExC_parse +=6 ; + is_define = 1; + goto insert_if_check_paren; + } + else if (RExC_parse[0] == 'R') { + RExC_parse++; + parno = 0; + if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + parno = grok_atou(RExC_parse, &endptr); + if (endptr) + RExC_parse = (char*)endptr; + } else if (RExC_parse[0] == '&') { + SV *sv_dat; + RExC_parse++; + sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY + ? REG_RSN_RETURN_NULL + : REG_RSN_RETURN_DATA); + parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + } + ret = reganode(pRExC_state,INSUBP,parno); + goto insert_if_check_paren; + } + else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + /* (?(1)...) */ + char c; + char *tmp; + parno = grok_atou(RExC_parse, &endptr); + if (endptr) + RExC_parse = (char*)endptr; + ret = reganode(pRExC_state, GROUPP, parno); + + insert_if_check_paren: + if (*(tmp = nextchar(pRExC_state)) != ')') { + /* nextchar also skips comments, so undo its work + * and skip over the the next character. + */ + RExC_parse = tmp; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Switch condition not recognized"); + } + insert_if: + REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); + br = regbranch(pRExC_state, &flags, 1,depth+1); + if (br == NULL) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", + (UV) flags); + } else + REGTAIL(pRExC_state, br, reganode(pRExC_state, + LONGJMP, 0)); + c = *nextchar(pRExC_state); + if (flags&HASWIDTH) + *flagp |= HASWIDTH; + if (c == '|') { + if (is_define) + vFAIL("(?(DEFINE)....) does not allow branches"); + + /* Fake one for optimizer. */ + lastbr = reganode(pRExC_state, IFTHEN, 0); + + if (!regbranch(pRExC_state, &flags, 1,depth+1)) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", + (UV) flags); + } + REGTAIL(pRExC_state, ret, lastbr); + if (flags&HASWIDTH) + *flagp |= HASWIDTH; + c = *nextchar(pRExC_state); + } + else + lastbr = NULL; + if (c != ')') + vFAIL("Switch (?(condition)... contains too many branches"); + ender = reg_node(pRExC_state, TAIL); + REGTAIL(pRExC_state, br, ender); + if (lastbr) { + REGTAIL(pRExC_state, lastbr, ender); + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); + } + else + REGTAIL(pRExC_state, ret, ender); + RExC_size++; /* XXX WHY do we need this?!! + For large programs it seems to be required + but I can't figure out why. -- dmq*/ + return ret; + } + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unknown switch condition (?(...))"); + } + case '[': /* (?[ ... ]) */ + return handle_regex_sets(pRExC_state, NULL, flagp, depth, + oregcomp_parse); + case 0: + RExC_parse--; /* for vFAIL to print correctly */ + vFAIL("Sequence (? incomplete"); + break; + default: /* e.g., (?i) */ + --RExC_parse; + parse_flags: + parse_lparen_question_flags(pRExC_state); + if (UCHARAT(RExC_parse) != ':') { + nextchar(pRExC_state); + *flagp = TRYAGAIN; + return NULL; + } + paren = ':'; + nextchar(pRExC_state); + ret = NULL; + goto parse_rest; + } /* end switch */ + } + else { /* (...) */ + capturing_parens: + parno = RExC_npar; + RExC_npar++; + + ret = reganode(pRExC_state, OPEN, parno); + if (!SIZE_ONLY ){ + if (!RExC_nestroot) + RExC_nestroot = parno; + if (RExC_seen & REG_RECURSE_SEEN + && !RExC_open_parens[parno-1]) + { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Setting open paren #%"IVdf" to %d\n", + (IV)parno, REG_NODE_NUM(ret))); + RExC_open_parens[parno-1]= ret; + } + } + Set_Node_Length(ret, 1); /* MJD */ + Set_Node_Offset(ret, RExC_parse); /* MJD */ + is_open = 1; + } + } + else /* ! paren */ + ret = NULL; + + parse_rest: + /* Pick up the branches, linking them together. */ + parse_start = RExC_parse; /* MJD */ + br = regbranch(pRExC_state, &flags, 1,depth+1); + + /* branch_len = (paren != 0); */ + + if (br == NULL) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + } + if (*RExC_parse == '|') { + if (!SIZE_ONLY && RExC_extralen) { + reginsert(pRExC_state, BRANCHJ, br, depth+1); + } + else { /* MJD */ + reginsert(pRExC_state, BRANCH, br, depth+1); + Set_Node_Length(br, paren != 0); + Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); + } + have_branch = 1; + if (SIZE_ONLY) + RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ + } + else if (paren == ':') { + *flagp |= flags&SIMPLE; + } + if (is_open) { /* Starts with OPEN. */ + REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */ + } + else if (paren != '?') /* Not Conditional */ + ret = br; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); + lastbr = br; + while (*RExC_parse == '|') { + if (!SIZE_ONLY && RExC_extralen) { + ender = reganode(pRExC_state, LONGJMP,0); + + /* Append to the previous. */ + REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); + } + if (SIZE_ONLY) + RExC_extralen += 2; /* Account for LONGJMP. */ + nextchar(pRExC_state); + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } + br = regbranch(pRExC_state, &flags, 0, depth+1); + + if (br == NULL) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + } + REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ + lastbr = br; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); + } + + if (have_branch || paren != ':') { + /* Make a closing node, and hook it on the end. */ + switch (paren) { + case ':': + ender = reg_node(pRExC_state, TAIL); + break; + case 1: case 2: + ender = reganode(pRExC_state, CLOSE, parno); + if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { + DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + "Setting close paren #%"IVdf" to %d\n", + (IV)parno, REG_NODE_NUM(ender))); + RExC_close_parens[parno-1]= ender; + if (RExC_nestroot == parno) + RExC_nestroot = 0; + } + Set_Node_Offset(ender,RExC_parse+1); /* MJD */ + Set_Node_Length(ender,1); /* MJD */ + break; + case '<': + case ',': + case '=': + case '!': + *flagp &= ~HASWIDTH; + /* FALLTHROUGH */ + case '>': + ender = reg_node(pRExC_state, SUCCEED); + break; + case 0: + ender = reg_node(pRExC_state, END); + if (!SIZE_ONLY) { + assert(!RExC_opend); /* there can only be one! */ + RExC_opend = ender; + } + break; + } + DEBUG_PARSE_r(if (!SIZE_ONLY) { + SV * const mysv_val1=sv_newmortal(); + SV * const mysv_val2=sv_newmortal(); + DEBUG_PARSE_MSG("lsbr"); + regprop(RExC_rx, mysv_val1, lastbr, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); + PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val1), + (IV)REG_NODE_NUM(lastbr), + SvPV_nolen_const(mysv_val2), + (IV)REG_NODE_NUM(ender), + (IV)(ender - lastbr) + ); + }); + REGTAIL(pRExC_state, lastbr, ender); + + if (have_branch && !SIZE_ONLY) { + char is_nothing= 1; + if (depth==1) + RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; + + /* Hook the tails of the branches to the closing node. */ + for (br = ret; br; br = regnext(br)) { + const U8 op = PL_regkind[OP(br)]; + if (op == BRANCH) { + REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); + if ( OP(NEXTOPER(br)) != NOTHING + || regnext(NEXTOPER(br)) != ender) + is_nothing= 0; + } + else if (op == BRANCHJ) { + REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); + /* for now we always disable this optimisation * / + if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING + || regnext(NEXTOPER(NEXTOPER(br))) != ender) + */ + is_nothing= 0; + } + } + if (is_nothing) { + br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret; + DEBUG_PARSE_r(if (!SIZE_ONLY) { + SV * const mysv_val1=sv_newmortal(); + SV * const mysv_val2=sv_newmortal(); + DEBUG_PARSE_MSG("NADA"); + regprop(RExC_rx, mysv_val1, ret, NULL); + regprop(RExC_rx, mysv_val2, ender, NULL); + PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val1), + (IV)REG_NODE_NUM(ret), + SvPV_nolen_const(mysv_val2), + (IV)REG_NODE_NUM(ender), + (IV)(ender - ret) + ); + }); + OP(br)= NOTHING; + if (OP(ender) == TAIL) { + NEXT_OFF(br)= 0; + RExC_emit= br + 1; + } else { + regnode *opt; + for ( opt= br + 1; opt < ender ; opt++ ) + OP(opt)= OPTIMIZED; + NEXT_OFF(br)= ender - br; + } + } + } + } + + { + const char *p; + static const char parens[] = "=!<,>"; + + if (paren && (p = strchr(parens, paren))) { + U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; + int flag = (p - parens) > 1; + + if (paren == '>') + node = SUSPEND, flag = 0; + reginsert(pRExC_state, node,ret, depth+1); + Set_Node_Cur_Length(ret, parse_start); + Set_Node_Offset(ret, parse_start + 1); + ret->flags = flag; + REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); + } + } + + /* Check for proper termination. */ + if (paren) { + /* restore original flags, but keep (?p) */ + RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); + if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ("); + } + } + else if (!paren && RExC_parse < RExC_end) { + if (*RExC_parse == ')') { + RExC_parse++; + vFAIL("Unmatched )"); + } + else + FAIL("Junk on end of regexp"); /* "Can't happen". */ + assert(0); /* NOTREACHED */ + } + + if (RExC_in_lookbehind) { + RExC_in_lookbehind--; + } + if (after_freeze > RExC_npar) + RExC_npar = after_freeze; + return(ret); +} + +/* + - regbranch - one alternative of an | operator + * + * Implements the concatenation operator. + * + * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be + * restarted. + */ +STATIC regnode * +S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) +{ + regnode *ret; + regnode *chain = NULL; + regnode *latest; + I32 flags = 0, c = 0; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGBRANCH; + + DEBUG_PARSE("brnc"); + + if (first) + ret = NULL; + else { + if (!SIZE_ONLY && RExC_extralen) + ret = reganode(pRExC_state, BRANCHJ,0); + else { + ret = reg_node(pRExC_state, BRANCH); + Set_Node_Length(ret, 1); + } + } + + if (!first && SIZE_ONLY) + RExC_extralen += 1; /* BRANCHJ */ + + *flagp = WORST; /* Tentatively. */ + + RExC_parse--; + nextchar(pRExC_state); + while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { + flags &= ~TRYAGAIN; + latest = regpiece(pRExC_state, &flags,depth+1); + if (latest == NULL) { + if (flags & TRYAGAIN) + continue; + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); + } + else if (ret == NULL) + ret = latest; + *flagp |= flags&(HASWIDTH|POSTPONED); + if (chain == NULL) /* First piece. */ + *flagp |= flags&SPSTART; + else { + RExC_naughty++; + REGTAIL(pRExC_state, chain, latest); + } + chain = latest; + c++; + } + if (chain == NULL) { /* Loop ran zero times. */ + chain = reg_node(pRExC_state, NOTHING); + if (ret == NULL) + ret = chain; + } + if (c == 1) { + *flagp |= flags&SIMPLE; + } + + return ret; +} + +/* + - regpiece - something followed by possible [*+?] + * + * Note that the branching code sequences used for ? and the general cases + * of * and + are somewhat optimized: they use the same NOTHING node as + * both the endmarker for their branch list and the body of the last branch. + * It might seem that this node could be dispensed with entirely, but the + * endmarker role is not redundant. + * + * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with + * TRYAGAIN. + * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be + * restarted. + */ +STATIC regnode * +S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) +{ + regnode *ret; + char op; + char *next; + I32 flags; + const char * const origparse = RExC_parse; + I32 min; + I32 max = REG_INFTY; +#ifdef RE_TRACK_PATTERN_OFFSETS + char *parse_start; +#endif + const char *maxpos = NULL; + + /* Save the original in case we change the emitted regop to a FAIL. */ + regnode * const orig_emit = RExC_emit; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPIECE; + + DEBUG_PARSE("piec"); + + ret = regatom(pRExC_state, &flags,depth+1); + if (ret == NULL) { + if (flags & (TRYAGAIN|RESTART_UTF8)) + *flagp |= flags & (TRYAGAIN|RESTART_UTF8); + else + FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); + return(NULL); + } + + op = *RExC_parse; + + if (op == '{' && regcurly(RExC_parse)) { + maxpos = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + parse_start = RExC_parse; /* MJD */ +#endif + next = RExC_parse + 1; + while (isDIGIT(*next) || *next == ',') { + if (*next == ',') { + if (maxpos) + break; + else + maxpos = next; + } + next++; + } + if (*next == '}') { /* got one */ + const char* endptr; + if (!maxpos) + maxpos = next; + RExC_parse++; + min = grok_atou(RExC_parse, &endptr); + if (*maxpos == ',') + maxpos++; + else + maxpos = RExC_parse; + max = grok_atou(maxpos, &endptr); + if (!max && *maxpos != '0') + max = REG_INFTY; /* meaning "infinity" */ + else if (max >= REG_INFTY) + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + RExC_parse = next; + nextchar(pRExC_state); + if (max < min) { /* If can't match, warn and optimize to fail + unconditionally */ + if (SIZE_ONLY) { + ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); + + /* We can't back off the size because we have to reserve + * enough space for all the things we are about to throw + * away, but we can shrink it by the ammount we are about + * to re-use here */ + RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; + } + else { + RExC_emit = orig_emit; + } + ret = reg_node(pRExC_state, OPFAIL); + return ret; + } + else if (min == max + && RExC_parse < RExC_end + && (*RExC_parse == '?' || *RExC_parse == '+')) + { + if (SIZE_ONLY) { + ckWARN2reg(RExC_parse + 1, + "Useless use of greediness modifier '%c'", + *RExC_parse); + } + /* Absorb the modifier, so later code doesn't see nor use + * it */ + nextchar(pRExC_state); + } + + do_curly: + if ((flags&SIMPLE)) { + RExC_naughty += 2 + RExC_naughty / 2; + reginsert(pRExC_state, CURLY, ret, depth+1); + Set_Node_Offset(ret, parse_start+1); /* MJD */ + Set_Node_Cur_Length(ret, parse_start); + } + else { + regnode * const w = reg_node(pRExC_state, WHILEM); + + w->flags = 0; + REGTAIL(pRExC_state, ret, w); + if (!SIZE_ONLY && RExC_extralen) { + reginsert(pRExC_state, LONGJMP,ret, depth+1); + reginsert(pRExC_state, NOTHING,ret, depth+1); + NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ + } + reginsert(pRExC_state, CURLYX,ret, depth+1); + /* MJD hk */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Length(ret, + op == '{' ? (RExC_parse - parse_start) : 1); + + if (!SIZE_ONLY && RExC_extralen) + NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ + REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); + if (SIZE_ONLY) + RExC_whilem_seen++, RExC_extralen += 3; + RExC_naughty += 4 + RExC_naughty; /* compound interest */ + } + ret->flags = 0; + + if (min > 0) + *flagp = WORST; + if (max > 0) + *flagp |= HASWIDTH; + if (!SIZE_ONLY) { + ARG1_SET(ret, (U16)min); + ARG2_SET(ret, (U16)max); + } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + + goto nest_check; + } + } + + if (!ISMULT1(op)) { + *flagp = flags; + return(ret); + } + +#if 0 /* Now runtime fix should be reliable. */ + + /* if this is reinstated, don't forget to put this back into perldiag: + + =item Regexp *+ operand could be empty at {#} in regex m/%s/ + + (F) The part of the regexp subject to either the * or + quantifier + could match an empty string. The {#} shows in the regular + expression about where the problem was discovered. + + */ + + if (!(flags&HASWIDTH) && op != '?') + vFAIL("Regexp *+ operand could be empty"); +#endif + +#ifdef RE_TRACK_PATTERN_OFFSETS + parse_start = RExC_parse; +#endif + nextchar(pRExC_state); + + *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); + + if (op == '*' && (flags&SIMPLE)) { + reginsert(pRExC_state, STAR, ret, depth+1); + ret->flags = 0; + RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + } + else if (op == '*') { + min = 0; + goto do_curly; + } + else if (op == '+' && (flags&SIMPLE)) { + reginsert(pRExC_state, PLUS, ret, depth+1); + ret->flags = 0; + RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + } + else if (op == '+') { + min = 1; + goto do_curly; + } + else if (op == '?') { + min = 0; max = 1; + goto do_curly; + } + nest_check: + if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { + SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ + ckWARN2reg(RExC_parse, + "%"UTF8f" matches null string many times", + UTF8fARG(UTF, (RExC_parse >= origparse + ? RExC_parse - origparse + : 0), + origparse)); + (void)ReREFCNT_inc(RExC_rx_sv); + } + + if (RExC_parse < RExC_end && *RExC_parse == '?') { + nextchar(pRExC_state); + reginsert(pRExC_state, MINMOD, ret, depth+1); + REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); + } + else + if (RExC_parse < RExC_end && *RExC_parse == '+') { + regnode *ender; + nextchar(pRExC_state); + ender = reg_node(pRExC_state, SUCCEED); + REGTAIL(pRExC_state, ret, ender); + reginsert(pRExC_state, SUSPEND, ret, depth+1); + ret->flags = 0; + ender = reg_node(pRExC_state, TAIL); + REGTAIL(pRExC_state, ret, ender); + } + + if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { + RExC_parse++; + vFAIL("Nested quantifiers"); + } + + return(ret); +} + +STATIC bool +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, + UV *valuep, I32 *flagp, U32 depth, bool in_char_class, + const bool strict /* Apply stricter parsing rules? */ + ) +{ + + /* This is expected to be called by a parser routine that has recognized '\N' + and needs to handle the rest. RExC_parse is expected to point at the first + char following the N at the time of the call. On successful return, + RExC_parse has been updated to point to just after the sequence identified + by this routine, and <*flagp> has been updated. + + The \N may be inside (indicated by the boolean ) or outside a + character class. + + \N may begin either a named sequence, or if outside a character class, mean + to match a non-newline. For non single-quoted regexes, the tokenizer has + attempted to decide which, and in the case of a named sequence, converted it + into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, + where c1... are the characters in the sequence. For single-quoted regexes, + the tokenizer passes the \N sequence through unchanged; this code will not + attempt to determine this nor expand those, instead raising a syntax error. + The net effect is that if the beginning of the passed-in pattern isn't '{U+' + or there is no '}', it signals that this \N occurrence means to match a + non-newline. + + Only the \N{U+...} form should occur in a character class, for the same + reason that '.' inside a character class means to just match a period: it + just doesn't make sense. + + The function raises an error (via vFAIL), and doesn't return for various + syntax errors. Otherwise it returns TRUE and sets or on + success; it returns FALSE otherwise. Returns FALSE, setting *flagp to + RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is + only possible if node_p is non-NULL. + + + If is non-null, it means the caller can accept an input sequence + consisting of a just a single code point; <*valuep> is set to that value + if the input is such. + + If is non-null it signifies that the caller can accept any other + legal sequence (i.e., one that isn't just a single code point). <*node_p> + is set as follows: + 1) \N means not-a-NL: points to a newly created REG_ANY node; + 2) \N{}: points to a new NOTHING node; + 3) otherwise: points to a new EXACT node containing the resolved + string. + Note that FALSE is returned for single code point sequences if is + null. + */ + + char * endbrace; /* '}' following the name */ + char* p; + char *endchar; /* Points to '.' or '}' ending cur char in the input + stream */ + bool has_multiple_chars; /* true if the input stream contains a sequence of + more than one character */ + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_GROK_BSLASH_N; + + GET_RE_DEBUG_FLAGS; + + assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ + + /* The [^\n] meaning of \N ignores spaces and comments under the /x + * modifier. The other meaning does not, so use a temporary until we find + * out which we are being called with */ + p = (RExC_flags & RXf_PMf_EXTENDED) + ? regpatws(pRExC_state, RExC_parse, + TRUE) /* means recognize comments */ + : RExC_parse; + + /* Disambiguate between \N meaning a named character versus \N meaning + * [^\n]. The former is assumed when it can't be the latter. */ + if (*p != '{' || regcurly(p)) { + RExC_parse = p; + if (! node_p) { + /* no bare \N allowed in a charclass */ + if (in_char_class) { + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + return FALSE; + } + RExC_parse--; /* Need to back off so nextchar() doesn't skip the + current char */ + nextchar(pRExC_state); + *node_p = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + Set_Node_Length(*node_p, 1); /* MJD */ + return TRUE; + } + + /* Here, we have decided it should be a named character or sequence */ + + /* The test above made sure that the next real character is a '{', but + * under the /x modifier, it could be separated by space (or a comment and + * \n) and this is not allowed (for consistency with \x{...} and the + * tokenizer handling of \N{NAME}). */ + if (*RExC_parse != '{') { + vFAIL("Missing braces on \\N{}"); + } + + RExC_parse++; /* Skip past the '{' */ + + if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ + || ! (endbrace == RExC_parse /* nothing between the {} */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below + */ + && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) + */ + { + if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ + vFAIL("\\N{NAME} must be resolved by the lexer"); + } + + if (endbrace == RExC_parse) { /* empty: \N{} */ + bool ret = TRUE; + if (node_p) { + *node_p = reg_node(pRExC_state,NOTHING); + } + else if (in_char_class) { + if (SIZE_ONLY && in_char_class) { + if (strict) { + RExC_parse++; /* Position after the "}" */ + vFAIL("Zero length \\N{}"); + } + else { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class"); + } + } + ret = FALSE; + } + else { + return FALSE; + } + nextchar(pRExC_state); + return ret; + } + + RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ + RExC_parse += 2; /* Skip past the 'U+' */ + + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + + /* Code points are separated by dots. If none, there is only one code + * point, and is terminated by the brace */ + has_multiple_chars = (endchar < endbrace); + + if (valuep && (! has_multiple_chars || in_char_class)) { + /* We only pay attention to the first char of + multichar strings being returned in char classes. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. XXX Solution is to recharacterize as + [rest-of-class]|multi1|multi2... */ + + STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); + I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); + + *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); + + /* The tokenizer should have guaranteed validity, but it's possible to + * bypass it by using single quoting, so check */ + if (length_of_hex == 0 + || length_of_hex != (STRLEN)(endchar - RExC_parse) ) + { + RExC_parse += length_of_hex; /* Includes all the valid */ + RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ + ? UTF8SKIP(RExC_parse) + : 1; + /* Guard against malformed utf8 */ + if (RExC_parse >= endchar) { + RExC_parse = endchar; + } + vFAIL("Invalid hexadecimal number in \\N{U+...}"); + } + + if (in_char_class && has_multiple_chars) { + if (strict) { + RExC_parse = endbrace; + vFAIL("\\N{} in character class restricted to one character"); + } + else { + ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + } + } + + RExC_parse = endbrace + 1; + } + else if (! node_p || ! has_multiple_chars) { + + /* Here, the input is legal, but not according to the caller's + * options. We fail without advancing the parse, so that the + * caller can try again */ + RExC_parse = p; + return FALSE; + } + else { + + /* What is done here is to convert this to a sub-pattern of the form + * (?:\x{char1}\x{char2}...) + * and then call reg recursively. That way, it retains its atomicness, + * while not having to worry about special handling that some code + * points may have. toke.c has converted the original Unicode values + * to native, so that we can just pass on the hex values unchanged. We + * do have to set a flag to keep recoding from happening in the + * recursion */ + + SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); + STRLEN len; + char *orig_end = RExC_end; + I32 flags; + + while (RExC_parse < endbrace) { + + /* Convert to notation the rest of the code understands */ + sv_catpv(substitute_parse, "\\x{"); + sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); + sv_catpv(substitute_parse, "}"); + + /* Point to the beginning of the next character in the sequence. */ + RExC_parse = endchar + 1; + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + } + sv_catpv(substitute_parse, ")"); + + RExC_parse = SvPV(substitute_parse, len); + + /* Don't allow empty number */ + if (len < 8) { + vFAIL("Invalid hexadecimal number in \\N{U+...}"); + } + RExC_end = RExC_parse + len; + + /* The values are Unicode, and therefore not subject to recoding */ + RExC_override_recoding = 1; + + if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return FALSE; + } + FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", + (UV) flags); + } + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); + + RExC_parse = endbrace; + RExC_end = orig_end; + RExC_override_recoding = 0; + + nextchar(pRExC_state); + } + + return TRUE; +} + + +/* + * reg_recode + * + * It returns the code point in utf8 for the value in *encp. + * value: a code value in the source encoding + * encp: a pointer to an Encode object + * + * If the result from Encode is not a single character, + * it returns U+FFFD (Replacement character) and sets *encp to NULL. + */ +STATIC UV +S_reg_recode(pTHX_ const char value, SV **encp) +{ + STRLEN numlen = 1; + SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); + const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); + const STRLEN newlen = SvCUR(sv); + UV uv = UNICODE_REPLACEMENT; + + PERL_ARGS_ASSERT_REG_RECODE; + + if (newlen) + uv = SvUTF8(sv) + ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) + : *(U8*)s; + + if (!newlen || numlen != newlen) { + uv = UNICODE_REPLACEMENT; + *encp = NULL; + } + return uv; +} + +PERL_STATIC_INLINE U8 +S_compute_EXACTish(RExC_state_t *pRExC_state) +{ + U8 op; + + PERL_ARGS_ASSERT_COMPUTE_EXACTISH; + + if (! FOLD) { + return EXACT; + } + + op = get_regex_charset(RExC_flags); + if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { + op--; /* /a is same as /u, and map /aa's offset to what /a's would have + been, so there is no hole */ + } + + return op + EXACTF; +} + +PERL_STATIC_INLINE void +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, + regnode *node, I32* flagp, STRLEN len, UV code_point, + bool downgradable) +{ + /* This knows the details about sizing an EXACTish node, setting flags for + * it (by setting <*flagp>, and potentially populating it with a single + * character. + * + * If (the length in bytes) is non-zero, this function assumes that + * the node has already been populated, and just does the sizing. In this + * case should be the final code point that has already been + * placed into the node. This value will be ignored except that under some + * circumstances <*flagp> is set based on it. + * + * If is zero, the function assumes that the node is to contain only + * the single character given by and calculates what + * should be. In pass 1, it sizes the node appropriately. In pass 2, it + * additionally will populate the node's STRING with or its + * fold if folding. + * + * In both cases <*flagp> is appropriately set + * + * It knows that under FOLD, the Latin Sharp S and UTF characters above + * 255, must be folded (the former only when the rules indicate it can + * match 'ss') + * + * When it does the populating, it looks at the flag 'downgradable'. If + * true with a node that folds, it checks if the single code point + * participates in a fold, and if not downgrades the node to an EXACT. + * This helps the optimizer */ + + bool len_passed_in = cBOOL(len != 0); + U8 character[UTF8_MAXBYTES_CASE+1]; + + PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + + /* Don't bother to check for downgrading in PASS1, as it doesn't make any + * sizing difference, and is extra work that is thrown away */ + if (downgradable && ! PASS2) { + downgradable = FALSE; + } + + if (! len_passed_in) { + if (UTF) { + if (UNI_IS_INVARIANT(code_point)) { + if (LOC || ! FOLD) { /* /l defers folding until runtime */ + *character = (U8) code_point; + } + else { /* Here is /i and not /l (toFOLD() is defined on just + ASCII, which isn't the same thing as INVARIANT on + EBCDIC, but it works there, as the extra invariants + fold to themselves) */ + *character = toFOLD((U8) code_point); + if (downgradable + && *character == code_point + && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point)) + { + OP(node) = EXACT; + } + } + len = 1; + } + else if (FOLD && (! LOC + || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) + { /* Folding, and ok to do so now */ + UV folded = _to_uni_fold_flags( + code_point, + character, + &len, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + if (downgradable + && folded == code_point + && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) + { + OP(node) = EXACT; + } + } + else if (code_point <= MAX_UTF8_TWO_BYTE) { + + /* Not folding this cp, and can output it directly */ + *character = UTF8_TWO_BYTE_HI(code_point); + *(character + 1) = UTF8_TWO_BYTE_LO(code_point); + len = 2; + } + else { + uvchr_to_utf8( character, code_point); + len = UTF8SKIP(character); + } + } /* Else pattern isn't UTF8. */ + else if (! FOLD) { + *character = (U8) code_point; + len = 1; + } /* Else is folded non-UTF8 */ + else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { + + /* We don't fold any non-UTF8 except possibly the Sharp s (see + * comments at join_exact()); */ + *character = (U8) code_point; + len = 1; + + /* Can turn into an EXACT node if we know the fold at compile time, + * and it folds to itself and doesn't particpate in other folds */ + if (downgradable + && ! LOC + && PL_fold_latin1[code_point] == code_point + && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) + || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) + { + OP(node) = EXACT; + } + } /* else is Sharp s. May need to fold it */ + else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { + *character = 's'; + *(character + 1) = 's'; + len = 2; + } + else { + *character = LATIN_SMALL_LETTER_SHARP_S; + len = 1; + } + } + + if (SIZE_ONLY) { + RExC_size += STR_SZ(len); + } + else { + RExC_emit += STR_SZ(len); + STR_LEN(node) = len; + if (! len_passed_in) { + Copy((char *) character, STRING(node), len, char); + } + } + + *flagp |= HASWIDTH; + + /* A single character node is SIMPLE, except for the special-cased SHARP S + * under /di. */ + if ((len == 1 || (UTF && len == UNISKIP(code_point))) + && (code_point != LATIN_SMALL_LETTER_SHARP_S + || ! FOLD || ! DEPENDS_SEMANTICS)) + { + *flagp |= SIMPLE; + } + + /* The OP may not be well defined in PASS1 */ + if (PASS2 && OP(node) == EXACTFL) { + RExC_contains_locale = 1; + } +} + + +/* Parse backref decimal value, unless it's too big to sensibly be a backref, + * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ + +static I32 +S_backref_value(char *p) +{ + const char* endptr; + UV val = grok_atou(p, &endptr); + if (endptr == p || endptr == NULL || val > I32_MAX) + return I32_MAX; + return (I32)val; +} + + +/* + - regatom - the lowest level + + Try to identify anything special at the start of the pattern. If there + is, then handle it as required. This may involve generating a single regop, + such as for an assertion; or it may involve recursing, such as to + handle a () structure. + + If the string doesn't start with something special then we gobble up + as much literal text as we can. + + Once we have been able to handle whatever type of thing started the + sequence, we return. + + Note: we have to be careful with escapes, as they can be both literal + and special, and in the case of \10 and friends, context determines which. + + A summary of the code structure is: + + switch (first_byte) { + cases for each special: + handle this special; + break; + case '\\': + switch (2nd byte) { + cases for each unambiguous special: + handle this special; + break; + cases for each ambigous special/literal: + disambiguate; + if (special) handle here + else goto defchar; + default: // unambiguously literal: + goto defchar; + } + default: // is a literal char + // FALL THROUGH + defchar: + create EXACTish node for literal; + while (more input and node isn't full) { + switch (input_byte) { + cases for each special; + make sure parse pointer is set so that the next call to + regatom will see this special first + goto loopdone; // EXACTish node terminated by prev. char + default: + append char to EXACTISH node; + } + get next input byte; + } + loopdone: + } + return the generated node; + + Specifically there are two separate switches for handling + escape sequences, with the one for handling literal escapes requiring + a dummy entry for all of the special escapes that are actually handled + by the other. + + Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with + TRYAGAIN. + Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be + restarted. + Otherwise does not return NULL. +*/ + +STATIC regnode * +S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) +{ + regnode *ret = NULL; + I32 flags = 0; + char *parse_start = RExC_parse; + U8 op; + int invert = 0; + U8 arg; + + GET_RE_DEBUG_FLAGS_DECL; + + *flagp = WORST; /* Tentatively. */ + + DEBUG_PARSE("atom"); + + PERL_ARGS_ASSERT_REGATOM; + +tryagain: + switch ((U8)*RExC_parse) { + case '^': + RExC_seen_zerolen++; + nextchar(pRExC_state); + if (RExC_flags & RXf_PMf_MULTILINE) + ret = reg_node(pRExC_state, MBOL); + else if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SBOL); + else + ret = reg_node(pRExC_state, BOL); + Set_Node_Length(ret, 1); /* MJD */ + break; + case '$': + nextchar(pRExC_state); + if (*RExC_parse) + RExC_seen_zerolen++; + if (RExC_flags & RXf_PMf_MULTILINE) + ret = reg_node(pRExC_state, MEOL); + else if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SEOL); + else + ret = reg_node(pRExC_state, EOL); + Set_Node_Length(ret, 1); /* MJD */ + break; + case '.': + nextchar(pRExC_state); + if (RExC_flags & RXf_PMf_SINGLELINE) + ret = reg_node(pRExC_state, SANY); + else + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + Set_Node_Length(ret, 1); /* MJD */ + break; + case '[': + { + char * const oregcomp_parse = ++RExC_parse; + ret = regclass(pRExC_state, flagp,depth+1, + FALSE, /* means parse the whole char class */ + TRUE, /* allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. */ + NULL); + if (*RExC_parse != ']') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ["); + } + if (ret == NULL) { + if (*flagp & RESTART_UTF8) + return NULL; + FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", + (UV) *flagp); + } + nextchar(pRExC_state); + Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ + break; + } + case '(': + nextchar(pRExC_state); + ret = reg(pRExC_state, 2, &flags,depth+1); + if (ret == NULL) { + if (flags & TRYAGAIN) { + if (RExC_parse == RExC_end) { + /* Make parent create an empty node if needed. */ + *flagp |= TRYAGAIN; + return(NULL); + } + goto tryagain; + } + if (flags & RESTART_UTF8) { + *flagp = RESTART_UTF8; + return NULL; + } + FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", + (UV) flags); + } + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); + break; + case '|': + case ')': + if (flags & TRYAGAIN) { + *flagp |= TRYAGAIN; + return NULL; + } + vFAIL("Internal urp"); + /* Supposed to be caught earlier. */ + break; + case '?': + case '+': + case '*': + RExC_parse++; + vFAIL("Quantifier follows nothing"); + break; + case '\\': + /* Special Escapes + + This switch handles escape sequences that resolve to some kind + of special regop and not to literal text. Escape sequnces that + resolve to literal text are handled below in the switch marked + "Literal Escapes". + + Every entry in this switch *must* have a corresponding entry + in the literal escape switch. However, the opposite is not + required, as the default for this switch is to jump to the + literal text handling code. + */ + switch ((U8)*++RExC_parse) { + /* Special Escapes */ + case 'A': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, SBOL); + *flagp |= SIMPLE; + goto finish_meta_pat; + case 'G': + ret = reg_node(pRExC_state, GPOS); + RExC_seen |= REG_GPOS_SEEN; + *flagp |= SIMPLE; + goto finish_meta_pat; + case 'K': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, KEEPS); + *flagp |= SIMPLE; + /* XXX:dmq : disabling in-place substitution seems to + * be necessary here to avoid cases of memory corruption, as + * with: C<$_="x" x 80; s/x\K/y/> -- rgs + */ + RExC_seen |= REG_LOOKBEHIND_SEEN; + goto finish_meta_pat; + case 'Z': + ret = reg_node(pRExC_state, SEOL); + *flagp |= SIMPLE; + RExC_seen_zerolen++; /* Do not optimize RE away */ + goto finish_meta_pat; + case 'z': + ret = reg_node(pRExC_state, EOS); + *flagp |= SIMPLE; + RExC_seen_zerolen++; /* Do not optimize RE away */ + goto finish_meta_pat; + case 'C': + ret = reg_node(pRExC_state, CANY); + RExC_seen |= REG_CANY_SEEN; + *flagp |= HASWIDTH|SIMPLE; + if (SIZE_ONLY) { + ckWARNdep(RExC_parse+1, "\\C is deprecated"); + } + goto finish_meta_pat; + case 'X': + ret = reg_node(pRExC_state, CLUMP); + *flagp |= HASWIDTH; + goto finish_meta_pat; + + case 'W': + invert = 1; + /* FALLTHROUGH */ + case 'w': + arg = ANYOF_WORDCHAR; + goto join_posix; + + case 'b': + RExC_seen_zerolen++; + RExC_seen |= REG_LOOKBEHIND_SEEN; + op = BOUND + get_regex_charset(RExC_flags); + if (op > BOUNDA) { /* /aa is same as /a */ + op = BOUNDA; + } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } + ret = reg_node(pRExC_state, op); + FLAGS(ret) = get_regex_charset(RExC_flags); + *flagp |= SIMPLE; + if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); + } + goto finish_meta_pat; + case 'B': + RExC_seen_zerolen++; + RExC_seen |= REG_LOOKBEHIND_SEEN; + op = NBOUND + get_regex_charset(RExC_flags); + if (op > NBOUNDA) { /* /aa is same as /a */ + op = NBOUNDA; + } + else if (op == NBOUNDL) { + RExC_contains_locale = 1; + } + ret = reg_node(pRExC_state, op); + FLAGS(ret) = get_regex_charset(RExC_flags); + *flagp |= SIMPLE; + if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + /* diag_listed_as: Use "%s" instead of "%s" */ + vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); + } + goto finish_meta_pat; + + case 'D': + invert = 1; + /* FALLTHROUGH */ + case 'd': + arg = ANYOF_DIGIT; + goto join_posix; + + case 'R': + ret = reg_node(pRExC_state, LNBREAK); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + + case 'H': + invert = 1; + /* FALLTHROUGH */ + case 'h': + arg = ANYOF_BLANK; + op = POSIXU; + goto join_posix_op_known; + + case 'V': + invert = 1; + /* FALLTHROUGH */ + case 'v': + arg = ANYOF_VERTWS; + op = POSIXU; + goto join_posix_op_known; + + case 'S': + invert = 1; + /* FALLTHROUGH */ + case 's': + arg = ANYOF_SPACE; + + join_posix: + + op = POSIXD + get_regex_charset(RExC_flags); + if (op > POSIXA) { /* /aa is same as /a */ + op = POSIXA; + } + else if (op == POSIXL) { + RExC_contains_locale = 1; + } + + join_posix_op_known: + + if (invert) { + op += NPOSIXD - POSIXD; + } + + ret = reg_node(pRExC_state, op); + if (! SIZE_ONLY) { + FLAGS(ret) = namedclass_to_classnum(arg); + } + + *flagp |= HASWIDTH|SIMPLE; + /* FALLTHROUGH */ + + finish_meta_pat: + nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ + break; + case 'p': + case 'P': + { +#ifdef DEBUGGING + char* parse_start = RExC_parse - 2; +#endif + + RExC_parse--; + + ret = regclass(pRExC_state, flagp,depth+1, + TRUE, /* means just parse this element */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. + It would be a bug if these returned + non-portables */ + NULL); + /* regclass() can only return RESTART_UTF8 if multi-char folds + are allowed. */ + if (!ret) + FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", + (UV) *flagp); + + RExC_parse--; + + Set_Node_Offset(ret, parse_start + 2); + Set_Node_Cur_Length(ret, parse_start); + nextchar(pRExC_state); + } + break; + case 'N': + /* Handle \N and \N{NAME} with multiple code points here and not + * below because it can be multicharacter. join_exact() will join + * them up later on. Also this makes sure that things like + * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq. + * The options to the grok function call causes it to fail if the + * sequence is just a single code point. We then go treat it as + * just another character in the current EXACT node, and hence it + * gets uniform treatment with all the other characters. The + * special treatment for quantifiers is not needed for such single + * character sequences */ + ++RExC_parse; + if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE, + FALSE /* not strict */ )) { + if (*flagp & RESTART_UTF8) + return NULL; + RExC_parse--; + goto defchar; + } + break; + case 'k': /* Handle \k and \k'NAME' */ + parse_named_seq: + { + char ch= RExC_parse[1]; + if (ch != '<' && ch != '\'' && ch != '{') { + RExC_parse++; + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.2s... not terminated",parse_start); + } else { + /* this pretty much dupes the code for (?P=...) in reg(), if + you change this make sure you change that */ + char* name_start = (RExC_parse += 2); + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; + if (RExC_parse == name_start || *RExC_parse != ch) + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.3s... not terminated",parse_start); + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (ASCII_FOLD_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); + *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + nextchar(pRExC_state); + + } + break; + } + case 'g': + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + { + I32 num; + bool hasbrace = 0; + + if (*RExC_parse == 'g') { + bool isrel = 0; + + RExC_parse++; + if (*RExC_parse == '{') { + RExC_parse++; + hasbrace = 1; + } + if (*RExC_parse == '-') { + RExC_parse++; + isrel = 1; + } + if (hasbrace && !isDIGIT(*RExC_parse)) { + if (isrel) RExC_parse--; + RExC_parse -= 2; + goto parse_named_seq; + } + + num = S_backref_value(RExC_parse); + if (num == 0) + vFAIL("Reference to invalid group 0"); + else if (num == I32_MAX) { + if (isDIGIT(*RExC_parse)) + vFAIL("Reference to nonexistent group"); + else + vFAIL("Unterminated \\g... pattern"); + } + + if (isrel) { + num = RExC_npar - num; + if (num < 1) + vFAIL("Reference to nonexistent or unclosed group"); + } + } + else { + num = S_backref_value(RExC_parse); + /* bare \NNN might be backref or octal - if it is larger than or equal + * RExC_npar then it is assumed to be and octal escape. + * Note RExC_npar is +1 from the actual number of parens*/ + if (num == I32_MAX || (num > 9 && num >= RExC_npar + && *RExC_parse != '8' && *RExC_parse != '9')) + { + /* Probably a character specified in octal, e.g. \35 */ + goto defchar; + } + } + + /* at this point RExC_parse definitely points to a backref + * number */ + { +#ifdef RE_TRACK_PATTERN_OFFSETS + char * const parse_start = RExC_parse - 1; /* MJD */ +#endif + while (isDIGIT(*RExC_parse)) + RExC_parse++; + if (hasbrace) { + if (*RExC_parse != '}') + vFAIL("Unterminated \\g{...} pattern"); + RExC_parse++; + } + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) + vFAIL("Reference to nonexistent group"); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? REF + : (ASCII_FOLD_RESTRICTED) + ? REFFA + : (AT_LEAST_UNI_SEMANTICS) + ? REFFU + : (LOC) + ? REFFL + : REFF), + num); + *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + RExC_parse--; + nextchar(pRExC_state); + } + } + break; + case '\0': + if (RExC_parse >= RExC_end) + FAIL("Trailing \\"); + /* FALLTHROUGH */ + default: + /* Do not generate "unrecognized" warnings here, we fall + back into the quick-grab loop below */ + parse_start--; + goto defchar; + } + break; + + case '#': + if (RExC_flags & RXf_PMf_EXTENDED) { + RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); + if (RExC_parse < RExC_end) + goto tryagain; + } + /* FALLTHROUGH */ + + default: + + parse_start = RExC_parse - 1; + + RExC_parse++; + + defchar: { + STRLEN len = 0; + UV ender = 0; + char *p; + char *s; +#define MAX_NODE_STRING_SIZE 127 + char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; + char *s0; + U8 upper_parse = MAX_NODE_STRING_SIZE; + U8 node_type = compute_EXACTish(pRExC_state); + bool next_is_quantifier; + char * oldp = NULL; + + /* We can convert EXACTF nodes to EXACTFU if they contain only + * characters that match identically regardless of the target + * string's UTF8ness. The reason to do this is that EXACTF is not + * trie-able, EXACTFU is. + * + * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * contain only above-Latin1 characters (hence must be in UTF8), + * which don't participate in folds with Latin1-range characters, + * as the latter's folds aren't known until runtime. (We don't + * need to figure this out until pass 2) */ + bool maybe_exactfu = PASS2 + && (node_type == EXACTF || node_type == EXACTFL); + + /* If a folding node contains only code points that don't + * participate in folds, it can be changed into an EXACT node, + * which allows the optimizer more things to look for */ + bool maybe_exact; + + ret = reg_node(pRExC_state, node_type); + + /* In pass1, folded, we use a temporary buffer instead of the + * actual node, as the node doesn't exist yet */ + s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret); + + s0 = s; + + reparse: + + /* We do the EXACTFish to EXACT node only if folding. (And we + * don't need to figure this out until pass 2) */ + maybe_exact = FOLD && PASS2; + + /* XXX The node can hold up to 255 bytes, yet this only goes to + * 127. I (khw) do not know why. Keeping it somewhat less than + * 255 allows us to not have to worry about overflow due to + * converting to utf8 and fold expansion, but that value is + * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes + * split up by this limit into a single one using the real max of + * 255. Even at 127, this breaks under rare circumstances. If + * folding, we do not want to split a node at a character that is a + * non-final in a multi-char fold, as an input string could just + * happen to want to match across the node boundary. The join + * would solve that problem if the join actually happens. But a + * series of more than two nodes in a row each of 127 would cause + * the first join to succeed to get to 254, but then there wouldn't + * be room for the next one, which could at be one of those split + * multi-char folds. I don't know of any fool-proof solution. One + * could back off to end with only a code point that isn't such a + * non-final, but it is possible for there not to be any in the + * entire node. */ + for (p = RExC_parse - 1; + len < upper_parse && p < RExC_end; + len++) + { + oldp = p; + + if (RExC_flags & RXf_PMf_EXTENDED) + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ + switch ((U8)*p) { + case '^': + case '$': + case '.': + case '[': + case '(': + case ')': + case '|': + goto loopdone; + case '\\': + /* Literal Escapes Switch + + This switch is meant to handle escape sequences that + resolve to a literal character. + + Every escape sequence that represents something + else, like an assertion or a char class, is handled + in the switch marked 'Special Escapes' above in this + routine, but also has an entry here as anything that + isn't explicitly mentioned here will be treated as + an unescaped equivalent literal. + */ + + switch ((U8)*++p) { + /* These are all the special escapes. */ + case 'A': /* Start assertion */ + case 'b': case 'B': /* Word-boundary assertion*/ + case 'C': /* Single char !DANGEROUS! */ + case 'd': case 'D': /* digit class */ + case 'g': case 'G': /* generic-backref, pos assertion */ + case 'h': case 'H': /* HORIZWS */ + case 'k': case 'K': /* named backref, keep marker */ + case 'p': case 'P': /* Unicode property */ + case 'R': /* LNBREAK */ + case 's': case 'S': /* space class */ + case 'v': case 'V': /* VERTWS */ + case 'w': case 'W': /* word class */ + case 'X': /* eXtended Unicode "combining + character sequence" */ + case 'z': case 'Z': /* End of line/string assertion */ + --p; + goto loopdone; + + /* Anything after here is an escape that resolves to a + literal. (Except digits, which may or may not) + */ + case 'n': + ender = '\n'; + p++; + break; + case 'N': /* Handle a single-code point named character. */ + /* The options cause it to fail if a multiple code + * point sequence. Handle those in the switch() above + * */ + RExC_parse = p + 1; + if (! grok_bslash_N(pRExC_state, NULL, &ender, + flagp, depth, FALSE, + FALSE /* not strict */ )) + { + if (*flagp & RESTART_UTF8) + FAIL("panic: grok_bslash_N set RESTART_UTF8"); + RExC_parse = p = oldp; + goto loopdone; + } + p = RExC_parse; + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + case 'r': + ender = '\r'; + p++; + break; + case 't': + ender = '\t'; + p++; + break; + case 'f': + ender = '\f'; + p++; + break; + case 'e': + ender = ASCII_TO_NATIVE('\033'); + p++; + break; + case 'a': + ender = '\a'; + p++; + break; + case 'o': + { + UV result; + const char* error_msg; + + bool valid = grok_bslash_o(&p, + &result, + &error_msg, + TRUE, /* out warnings */ + FALSE, /* not strict */ + TRUE, /* Output warnings + for non- + portables */ + UTF); + if (! valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); + } + ender = result; + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + } + case 'x': + { + UV result = UV_MAX; /* initialize to erroneous + value */ + const char* error_msg; + + bool valid = grok_bslash_x(&p, + &result, + &error_msg, + TRUE, /* out warnings */ + FALSE, /* not strict */ + TRUE, /* Output warnings + for non- + portables */ + UTF); + if (! valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); + } + ender = result; + + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; + } + case 'c': + p++; + ender = grok_bslash_c(*p++, SIZE_ONLY); + break; + case '8': case '9': /* must be a backreference */ + --p; + goto loopdone; + case '1': case '2': case '3':case '4': + case '5': case '6': case '7': + /* When we parse backslash escapes there is ambiguity + * between backreferences and octal escapes. Any escape + * from \1 - \9 is a backreference, any multi-digit + * escape which does not start with 0 and which when + * evaluated as decimal could refer to an already + * parsed capture buffer is a backslash. Anything else + * is octal. + * + * Note this implies that \118 could be interpreted as + * 118 OR as "\11" . "8" depending on whether there + * were 118 capture buffers defined already in the + * pattern. */ + + /* NOTE, RExC_npar is 1 more than the actual number of + * parens we have seen so far, hence the < RExC_npar below. */ + + if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar) + { /* Not to be treated as an octal constant, go + find backref */ + --p; + goto loopdone; + } + /* FALLTHROUGH */ + case '0': + { + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; + STRLEN numlen = 3; + ender = grok_oct(p, &numlen, &flags, NULL); + if (ender > 0xff) { + REQUIRE_UTF8; + } + p += numlen; + if (SIZE_ONLY /* like \08, \178 */ + && numlen < 3 + && p < RExC_end + && isDIGIT(*p) && ckWARN(WARN_REGEXP)) + { + reg_warn_non_literal_string( + p + 1, + form_short_octal_warning(p, numlen)); + } + } + if (PL_encoding && ender < 0x100) + goto recode_encoding; + break; + recode_encoding: + if (! RExC_override_recoding) { + SV* enc = PL_encoding; + ender = reg_recode((const char)(U8)ender, &enc); + if (!enc && SIZE_ONLY) + ckWARNreg(p, "Invalid escape in the specified encoding"); + REQUIRE_UTF8; + } + break; + case '\0': + if (p >= RExC_end) + FAIL("Trailing \\"); + /* FALLTHROUGH */ + default: + if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { + /* Include any { following the alpha to emphasize + * that it could be part of an escape at some point + * in the future */ + int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1; + ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p); + } + goto normal_default; + } /* End of switch on '\' */ + break; + case '{': + /* Currently we don't warn when the lbrace is at the start + * of a construct. This catches it in the middle of a + * literal string, or when its the first thing after + * something like "\b" */ + if (! SIZE_ONLY + && (len || (p > RExC_start && isALPHA_A(*(p -1))))) + { + ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through"); + } + /*FALLTHROUGH*/ + default: /* A literal character */ + normal_default: + if (UTF8_IS_START(*p) && UTF) { + STRLEN numlen; + ender = utf8n_to_uvchr((U8*)p, RExC_end - p, + &numlen, UTF8_ALLOW_DEFAULT); + p += numlen; + } + else + ender = (U8) *p++; + break; + } /* End of switch on the literal */ + + /* Here, have looked at the literal character and + * contains its ordinal,

points to the character after it + */ + + if ( RExC_flags & RXf_PMf_EXTENDED) + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ + + /* If the next thing is a quantifier, it applies to this + * character only, which means that this character has to be in + * its own node and can't just be appended to the string in an + * existing node, so if there are already other characters in + * the node, close the node with just them, and set up to do + * this character again next time through, when it will be the + * only thing in its new node */ + if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len) + { + p = oldp; + goto loopdone; + } + + if (! FOLD /* The simple case, just append the literal */ + || (LOC /* Also don't fold for tricky chars under /l */ + && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) + { + if (UTF) { + const STRLEN unilen = reguni(pRExC_state, ender, s); + if (unilen > 0) { + s += unilen; + len += unilen; + } + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len--; + } + else { + REGC((char)ender, s++); + } + + /* Can get here if folding only if is one of the /l + * characters whose fold depends on the locale. The + * occurrence of any of these indicate that we can't + * simplify things */ + if (FOLD) { + maybe_exact = FALSE; + maybe_exactfu = FALSE; + } + } + else /* FOLD */ + if (! ( UTF + /* See comments for join_exact() as to why we fold this + * non-UTF at compile time */ + || (node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S))) + { + /* Here, are folding and are not UTF-8 encoded; therefore + * the character must be in the range 0-255, and is not /l + * (Not /l because we already handled these under /l in + * is_PROBLEMATIC_LOCALE_FOLD_cp */ + if (IS_IN_SOME_FOLD_L1(ender)) { + maybe_exact = FALSE; + + /* See if the character's fold differs between /d and + * /u. This includes the multi-char fold SHARP S to + * 'ss' */ + if (maybe_exactfu + && (PL_fold[ender] != PL_fold_latin1[ender] + || ender == LATIN_SMALL_LETTER_SHARP_S + || (len > 0 + && isARG2_lower_or_UPPER_ARG1('s', ender) + && isARG2_lower_or_UPPER_ARG1('s', + *(s-1))))) + { + maybe_exactfu = FALSE; + } + } + + /* Even when folding, we store just the input character, as + * we have an array that finds its fold quickly */ + *(s++) = (char) ender; + } + else { /* FOLD and UTF */ + /* Unlike the non-fold case, we do actually have to + * calculate the results here in pass 1. This is for two + * reasons, the folded length may be longer than the + * unfolded, and we have to calculate how many EXACTish + * nodes it will take; and we may run out of room in a node + * in the middle of a potential multi-char fold, and have + * to back off accordingly. (Hence we can't use REGC for + * the simple case just below.) */ + + UV folded; + if (isASCII(ender)) { + folded = toFOLD(ender); + *(s)++ = (U8) folded; + } + else { + STRLEN foldlen; + + folded = _to_uni_fold_flags( + ender, + (U8 *) s, + &foldlen, + FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + s += foldlen; + + /* The loop increments each time, as all but this + * path (and one other) through it add a single byte to + * the EXACTish node. But this one has changed len to + * be the correct final value, so subtract one to + * cancel out the increment that follows */ + len += foldlen - 1; + } + /* If this node only contains non-folding code points so + * far, see if this new one is also non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have to check + * further to see if anything folds to it */ + if (_invlist_contains_cp(PL_utf8_foldable, + ender)) + { + maybe_exact = FALSE; + } + } + } + ender = folded; + } + + if (next_is_quantifier) { + + /* Here, the next input is a quantifier, and to get here, + * the current character is the only one in the node. + * Also, here doesn't include the final byte for this + * character */ + len++; + goto loopdone; + } + + } /* End of loop through literal characters */ + + /* Here we have either exhausted the input or ran out of room in + * the node. (If we encountered a character that can't be in the + * node, transfer is made directly to , and so we + * wouldn't have fallen off the end of the loop.) In the latter + * case, we artificially have to split the node into two, because + * we just don't have enough space to hold everything. This + * creates a problem if the final character participates in a + * multi-character fold in the non-final position, as a match that + * should have occurred won't, due to the way nodes are matched, + * and our artificial boundary. So back off until we find a non- + * problematic character -- one that isn't at the beginning or + * middle of such a fold. (Either it doesn't participate in any + * folds, or appears only in the final position of all the folds it + * does participate in.) A better solution with far fewer false + * positives, and that would fill the nodes more completely, would + * be to actually have available all the multi-character folds to + * test against, and to back-off only far enough to be sure that + * this node isn't ending with a partial one. is set + * further below (if we need to reparse the node) to include just + * up through that final non-problematic character that this code + * identifies, so when it is set to less than the full node, we can + * skip the rest of this */ + if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { + + const STRLEN full_len = len; + + assert(len >= MAX_NODE_STRING_SIZE); + + /* Here, points to the final byte of the final character. + * Look backwards through the string until find a non- + * problematic character */ + + if (! UTF) { + + /* This has no multi-char folds to non-UTF characters */ + if (ASCII_FOLD_RESTRICTED) { + goto loopdone; + } + + while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { } + len = s - s0 + 1; + } + else { + if (! PL_NonL1NonFinalFold) { + PL_NonL1NonFinalFold = _new_invlist_C_array( + NonL1_Perl_Non_Final_Folds_invlist); + } + + /* Point to the first byte of the final character */ + s = (char *) utf8_hop((U8 *) s, -1); + + while (s >= s0) { /* Search backwards until find + non-problematic char */ + if (UTF8_IS_INVARIANT(*s)) { + + /* There are no ascii characters that participate + * in multi-char folds under /aa. In EBCDIC, the + * non-ascii invariants are all control characters, + * so don't ever participate in any folds. */ + if (ASCII_FOLD_RESTRICTED + || ! IS_NON_FINAL_FOLD(*s)) + { + break; + } + } + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + *s, *(s+1)))) + { + break; + } + } + else if (! _invlist_contains_cp( + PL_NonL1NonFinalFold, + valid_utf8_to_uvchr((U8 *) s, NULL))) + { + break; + } + + /* Here, the current character is problematic in that + * it does occur in the non-final position of some + * fold, so try the character before it, but have to + * special case the very first byte in the string, so + * we don't read outside the string */ + s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); + } /* End of loop backwards through the string */ + + /* If there were only problematic characters in the string, + * will point to before s0, in which case the length + * should be 0, otherwise include the length of the + * non-problematic character just found */ + len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); + } + + /* Here, have found the final character, if any, that is + * non-problematic as far as ending the node without splitting + * it across a potential multi-char fold. contains the + * number of bytes in the node up-to and including that + * character, or is 0 if there is no such character, meaning + * the whole node contains only problematic characters. In + * this case, give up and just take the node as-is. We can't + * do any better */ + if (len == 0) { + len = full_len; + + /* If the node ends in an 's' we make sure it stays EXACTF, + * as if it turns into an EXACTFU, it could later get + * joined with another 's' that would then wrongly match + * the sharp s */ + if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) + { + maybe_exactfu = FALSE; + } + } else { + + /* Here, the node does contain some characters that aren't + * problematic. If one such is the final character in the + * node, we are done */ + if (len == full_len) { + goto loopdone; + } + else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) { + + /* If the final character is problematic, but the + * penultimate is not, back-off that last character to + * later start a new node with it */ + p = oldp; + goto loopdone; + } + + /* Here, the final non-problematic character is earlier + * in the input than the penultimate character. What we do + * is reparse from the beginning, going up only as far as + * this final ok one, thus guaranteeing that the node ends + * in an acceptable character. The reason we reparse is + * that we know how far in the character is, but we don't + * know how to correlate its position with the input parse. + * An alternate implementation would be to build that + * correlation as we go along during the original parse, + * but that would entail extra work for every node, whereas + * this code gets executed only when the string is too + * large for the node, and the final two characters are + * problematic, an infrequent occurrence. Yet another + * possible strategy would be to save the tail of the + * string, and the next time regatom is called, initialize + * with that. The problem with this is that unless you + * back off one more character, you won't be guaranteed + * regatom will get called again, unless regbranch, + * regpiece ... are also changed. If you do back off that + * extra character, so that there is input guaranteed to + * force calling regatom, you can't handle the case where + * just the first character in the node is acceptable. I + * (khw) decided to try this method which doesn't have that + * pitfall; if performance issues are found, we can do a + * combination of the current approach plus that one */ + upper_parse = len; + len = 0; + s = s0; + goto reparse; + } + } /* End of verifying node ends with an appropriate char */ + + loopdone: /* Jumped to when encounters something that shouldn't be in + the node */ + + /* I (khw) don't know if you can get here with zero length, but the + * old code handled this situation by creating a zero-length EXACT + * node. Might as well be NOTHING instead */ + if (len == 0) { + OP(ret) = NOTHING; + } + else { + if (FOLD) { + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds; + * similarly for 'maybe_exactfu' and code points that match + * differently depending on UTF8ness of the target string + * (for /u), or depending on locale for /l */ + if (maybe_exact) { + OP(ret) = EXACT; + } + else if (maybe_exactfu) { + OP(ret) = EXACTFU; + } + } + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, + FALSE /* Don't look to see if could + be turned into an EXACT + node, as we have already + computed that */ + ); + } + + RExC_parse = p - 1; + Set_Node_Cur_Length(ret, parse_start); + nextchar(pRExC_state); + { + /* len is STRLEN which is unsigned, need to copy to signed */ + IV iv = len; + if (iv < 0) + vFAIL("Internal disaster"); + } + + } /* End of label 'defchar:' */ + break; + } /* End of giant switch on input character */ + + return(ret); +} + +STATIC char * +S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) +{ + /* Returns the next non-pattern-white space, non-comment character (the + * latter only if 'recognize_comment is true) in the string p, which is + * ended by RExC_end. See also reg_skipcomment */ + const char *e = RExC_end; + + PERL_ARGS_ASSERT_REGPATWS; + + while (p < e) { + STRLEN len; + if ((len = is_PATWS_safe(p, e, UTF))) { + p += len; + } + else if (recognize_comment && *p == '#') { + p = reg_skipcomment(pRExC_state, p); + } + else + break; + } + return p; +} + +STATIC void +S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) +{ + /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It + * sets up the bitmap and any flags, removing those code points from the + * inversion list, setting it to NULL should it become completely empty */ + + PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; + assert(PL_regkind[OP(node)] == ANYOF); + + ANYOF_BITMAP_ZERO(node); + if (*invlist_ptr) { + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through *invlist_ptr */ + invlist_iterinit(*invlist_ptr); + while (invlist_iternext(*invlist_ptr, &start, &end)) { + UV high; + int i; + + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; + } + else if (end >= 256) { + ANYOF_FLAGS(node) |= ANYOF_UTF8; + } + + /* Quit if are above what we should change */ + if (start > 255) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < 255) ? end : 255; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(node, i)) { + ANYOF_BITMAP_SET(node, i); + } + } + } + invlist_iterfinish(*invlist_ptr); + + /* Done with loop; remove any code points that are in the bitmap from + * *invlist_ptr; similarly for code points above latin1 if we have a + * flag to match all of them anyways */ + if (change_invlist) { + _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + } + if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { + _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + } + + /* If have completely emptied it, remove it completely */ + if (_invlist_len(*invlist_ptr) == 0) { + SvREFCNT_dec_NN(*invlist_ptr); + *invlist_ptr = NULL; + } + } +} + +/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. + Character classes ([:foo:]) can also be negated ([:^foo:]). + Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. + Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, + but trigger failures because they are currently unimplemented. */ + +#define POSIXCC_DONE(c) ((c) == ':') +#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') +#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) + +PERL_STATIC_INLINE I32 +S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) +{ + I32 namedclass = OOB_NAMEDCLASS; + + PERL_ARGS_ASSERT_REGPPOSIXCC; + + if (value == '[' && RExC_parse + 1 < RExC_end && + /* I smell either [: or [= or [. -- POSIX has been here, right? */ + POSIXCC(UCHARAT(RExC_parse))) + { + const char c = UCHARAT(RExC_parse); + char* const s = RExC_parse++; + + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) + RExC_parse++; + if (RExC_parse == RExC_end) { + if (strict) { + + /* Try to give a better location for the error (than the end of + * the string) by looking for the matching ']' */ + RExC_parse = s; + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { + RExC_parse++; + } + vFAIL2("Unmatched '%c' in POSIX class", c); + } + /* Grandfather lone [:, [=, [. */ + RExC_parse = s; + } + else { + const char* const t = RExC_parse++; /* skip over the c */ + assert(*t == c); + + if (UCHARAT(RExC_parse) == ']') { + const char *posixcc = s + 1; + RExC_parse++; /* skip over the ending ] */ + + if (*s == ':') { + const I32 complement = *posixcc == '^' ? *posixcc++ : 0; + const I32 skip = t - posixcc; + + /* Initially switch on the length of the name. */ + switch (skip) { + case 4: + if (memEQ(posixcc, "word", 4)) /* this is not POSIX, + this is the Perl \w + */ + namedclass = ANYOF_WORDCHAR; + break; + case 5: + /* Names all of length 5. */ + /* alnum alpha ascii blank cntrl digit graph lower + print punct space upper */ + /* Offset 4 gives the best switch position. */ + switch (posixcc[4]) { + case 'a': + if (memEQ(posixcc, "alph", 4)) /* alpha */ + namedclass = ANYOF_ALPHA; + break; + case 'e': + if (memEQ(posixcc, "spac", 4)) /* space */ + namedclass = ANYOF_PSXSPC; + break; + case 'h': + if (memEQ(posixcc, "grap", 4)) /* graph */ + namedclass = ANYOF_GRAPH; + break; + case 'i': + if (memEQ(posixcc, "asci", 4)) /* ascii */ + namedclass = ANYOF_ASCII; + break; + case 'k': + if (memEQ(posixcc, "blan", 4)) /* blank */ + namedclass = ANYOF_BLANK; + break; + case 'l': + if (memEQ(posixcc, "cntr", 4)) /* cntrl */ + namedclass = ANYOF_CNTRL; + break; + case 'm': + if (memEQ(posixcc, "alnu", 4)) /* alnum */ + namedclass = ANYOF_ALPHANUMERIC; + break; + case 'r': + if (memEQ(posixcc, "lowe", 4)) /* lower */ + namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; + else if (memEQ(posixcc, "uppe", 4)) /* upper */ + namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; + break; + case 't': + if (memEQ(posixcc, "digi", 4)) /* digit */ + namedclass = ANYOF_DIGIT; + else if (memEQ(posixcc, "prin", 4)) /* print */ + namedclass = ANYOF_PRINT; + else if (memEQ(posixcc, "punc", 4)) /* punct */ + namedclass = ANYOF_PUNCT; + break; + } + break; + case 6: + if (memEQ(posixcc, "xdigit", 6)) + namedclass = ANYOF_XDIGIT; + break; + } + + if (namedclass == OOB_NAMEDCLASS) + vFAIL2utf8f( + "POSIX class [:%"UTF8f":] unknown", + UTF8fARG(UTF, t - s - 1, s + 1)); + + /* The #defines are structured so each complement is +1 to + * the normal one */ + if (complement) { + namedclass++; + } + assert (posixcc[skip] == ':'); + assert (posixcc[skip+1] == ']'); + } else if (!SIZE_ONLY) { + /* [[=foo=]] and [[.foo.]] are still future. */ + + /* adjust RExC_parse so the warning shows after + the class closes */ + while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') + RExC_parse++; + vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + } + } else { + /* Maternal grandfather: + * "[:" ending in ":" but not in ":]" */ + if (strict) { + vFAIL("Unmatched '[' in POSIX class"); + } + + /* Grandfather lone [:, [=, [. */ + RExC_parse = s; + } + } + } + + return namedclass; +} + +STATIC bool +S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state) +{ + /* This applies some heuristics at the current parse position (which should + * be at a '[') to see if what follows might be intended to be a [:posix:] + * class. It returns true if it really is a posix class, of course, but it + * also can return true if it thinks that what was intended was a posix + * class that didn't quite make it. + * + * It will return true for + * [:alphanumerics: + * [:alphanumerics] (as long as the ] isn't followed immediately by a + * ')' indicating the end of the (?[ + * [:any garbage including %^&$ punctuation:] + * + * This is designed to be called only from S_handle_regex_sets; it could be + * easily adapted to be called from the spot at the beginning of regclass() + * that checks to see in a normal bracketed class if the surrounding [] + * have been omitted ([:word:] instead of [[:word:]]). But doing so would + * change long-standing behavior, so I (khw) didn't do that */ + char* p = RExC_parse + 1; + char first_char = *p; + + PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS; + + assert(*(p - 1) == '['); + + if (! POSIXCC(first_char)) { + return FALSE; + } + + p++; + while (p < RExC_end && isWORDCHAR(*p)) p++; + + if (p >= RExC_end) { + return FALSE; + } + + if (p - RExC_parse > 2 /* Got at least 1 word character */ + && (*p == first_char + || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')'))) + { + return TRUE; + } + + p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse); + + return (p + && p - RExC_parse > 2 /* [:] evaluates to colon; + [::] is a bad posix class. */ + && first_char == *(p - 1)); +} + +STATIC regnode * +S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, + I32 *flagp, U32 depth, + char * const oregcomp_parse) +{ + /* Handle the (?[...]) construct to do set operations */ + + U8 curchar; + UV start, end; /* End points of code point ranges */ + SV* result_string; + char *save_end, *save_parse; + SV* final; + STRLEN len; + regnode* node; + AV* stack; + const bool save_fold = FOLD; + + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; + + if (LOC) { + vFAIL("(?[...]) not valid in locale"); + } + RExC_uni_semantics = 1; + + /* This will return only an ANYOF regnode, or (unlikely) something smaller + * (such as EXACT). Thus we can skip most everything if just sizing. We + * call regclass to handle '[]' so as to not have to reinvent its parsing + * rules here (throwing away the size it computes each time). And, we exit + * upon an unescaped ']' that isn't one ending a regclass. To do both + * these things, we need to realize that something preceded by a backslash + * is escaped, so we have to keep track of backslashes */ + if (SIZE_ONLY) { + UV depth = 0; /* how many nested (?[...]) constructs */ + + Perl_ck_warner_d(aTHX_ + packWARN(WARN_EXPERIMENTAL__REGEX_SETS), + "The regex_sets feature is experimental" REPORT_LOCATION, + UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), + UTF8fARG(UTF, + RExC_end - RExC_start - (RExC_parse - RExC_precomp), + RExC_precomp + (RExC_parse - RExC_precomp))); + + while (RExC_parse < RExC_end) { + SV* current = NULL; + RExC_parse = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + switch (*RExC_parse) { + case '?': + if (RExC_parse[1] == '[') depth++, RExC_parse++; + /* FALLTHROUGH */ + default: + break; + case '\\': + /* Skip the next byte (which could cause us to end up in + * the middle of a UTF-8 character, but since none of those + * are confusable with anything we currently handle in this + * switch (invariants all), it's safe. We'll just hit the + * default: case next time and keep on incrementing until + * we find one of the invariants we do handle. */ + RExC_parse++; + break; + case '[': + { + /* If this looks like it is a [:posix:] class, leave the + * parse pointer at the '[' to fool regclass() into + * thinking it is part of a '[[:posix:]]'. That function + * will use strict checking to force a syntax error if it + * doesn't work out to a legitimate class */ + bool is_posix_class + = could_it_be_a_POSIX_class(pRExC_state); + if (! is_posix_class) { + RExC_parse++; + } + + /* regclass() can only return RESTART_UTF8 if multi-char + folds are allowed. */ + if (!regclass(pRExC_state, flagp,depth+1, + is_posix_class, /* parse the whole char + class only if not a + posix class */ + FALSE, /* don't allow multi-char folds */ + TRUE, /* silence non-portable warnings. */ + ¤t)) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", + (UV) *flagp); + + /* function call leaves parse pointing to the ']', except + * if we faked it */ + if (is_posix_class) { + RExC_parse--; + } + + SvREFCNT_dec(current); /* In case it returned something */ + break; + } + + case ']': + if (depth--) break; + RExC_parse++; + if (RExC_parse < RExC_end + && *RExC_parse == ')') + { + node = reganode(pRExC_state, ANYOF, 0); + RExC_size += ANYOF_SKIP; + nextchar(pRExC_state); + Set_Node_Length(node, + RExC_parse - oregcomp_parse + 1); /* MJD */ + return node; + } + goto no_close; + } + RExC_parse++; + } + + no_close: + FAIL("Syntax error in (?[...])"); + } + + /* Pass 2 only after this. Everything in this construct is a + * metacharacter. Operands begin with either a '\' (for an escape + * sequence), or a '[' for a bracketed character class. Any other + * character should be an operator, or parenthesis for grouping. Both + * types of operands are handled by calling regclass() to parse them. It + * is called with a parameter to indicate to return the computed inversion + * list. The parsing here is implemented via a stack. Each entry on the + * stack is a single character representing one of the operators, or the + * '('; or else a pointer to an operand inversion list. */ + +#define IS_OPERAND(a) (! SvIOK(a)) + + /* The stack starts empty. It is a syntax error if the first thing parsed + * is a binary operator; everything else is pushed on the stack. When an + * operand is parsed, the top of the stack is examined. If it is a binary + * operator, the item before it should be an operand, and both are replaced + * by the result of doing that operation on the new operand and the one on + * the stack. Thus a sequence of binary operands is reduced to a single + * one before the next one is parsed. + * + * A unary operator may immediately follow a binary in the input, for + * example + * [a] + ! [b] + * When an operand is parsed and the top of the stack is a unary operator, + * the operation is performed, and then the stack is rechecked to see if + * this new operand is part of a binary operation; if so, it is handled as + * above. + * + * A '(' is simply pushed on the stack; it is valid only if the stack is + * empty, or the top element of the stack is an operator or another '(' + * (for which the parenthesized expression will become an operand). By the + * time the corresponding ')' is parsed everything in between should have + * been parsed and evaluated to a single operand (or else is a syntax + * error), and is handled as a regular operand */ + + sv_2mortal((SV *)(stack = newAV())); + + while (RExC_parse < RExC_end) { + I32 top_index = av_tindex(stack); + SV** top_ptr; + SV* current = NULL; + + /* Skip white space */ + RExC_parse = regpatws(pRExC_state, RExC_parse, + TRUE /* means recognize comments */ ); + if (RExC_parse >= RExC_end) { + Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); + } + if ((curchar = UCHARAT(RExC_parse)) == ']') { + break; + } + + switch (curchar) { + + case '?': + if (av_tindex(stack) >= 0 /* This makes sure that we can + safely subtract 1 from + RExC_parse in the next clause. + If we have something on the + stack, we have parsed something + */ + && UCHARAT(RExC_parse - 1) == '(' + && RExC_parse < RExC_end) + { + /* If is a '(?', could be an embedded '(?flags:(?[...])'. + * This happens when we have some thing like + * + * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; + * ... + * qr/(?[ \p{Digit} & $thai_or_lao ])/; + * + * Here we would be handling the interpolated + * '$thai_or_lao'. We handle this by a recursive call to + * ourselves which returns the inversion list the + * interpolated expression evaluates to. We use the flags + * from the interpolated pattern. */ + U32 save_flags = RExC_flags; + const char * const save_parse = ++RExC_parse; + + parse_lparen_question_flags(pRExC_state); + + if (RExC_parse == save_parse /* Makes sure there was at + least one flag (or this + embedding wasn't compiled) + */ + || RExC_parse >= RExC_end - 4 + || UCHARAT(RExC_parse) != ':' + || UCHARAT(++RExC_parse) != '(' + || UCHARAT(++RExC_parse) != '?' + || UCHARAT(++RExC_parse) != '[') + { + + /* In combination with the above, this moves the + * pointer to the point just after the first erroneous + * character (or if there are no flags, to where they + * should have been) */ + if (RExC_parse >= RExC_end - 4) { + RExC_parse = RExC_end; + } + else if (RExC_parse != save_parse) { + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + } + vFAIL("Expecting '(?flags:(?[...'"); + } + RExC_parse++; + (void) handle_regex_sets(pRExC_state, ¤t, flagp, + depth+1, oregcomp_parse); + + /* Here, 'current' contains the embedded expression's + * inversion list, and RExC_parse points to the trailing + * ']'; the next character should be the ')' which will be + * paired with the '(' that has been put on the stack, so + * the whole embedded expression reduces to '(operand)' */ + RExC_parse++; + + RExC_flags = save_flags; + goto handle_operand; + } + /* FALLTHROUGH */ + + default: + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unexpected character"); + + case '\\': + /* regclass() can only return RESTART_UTF8 if multi-char + folds are allowed. */ + if (!regclass(pRExC_state, flagp,depth+1, + TRUE, /* means parse just the next thing */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. */ + ¤t)) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", + (UV) *flagp); + /* regclass() will return with parsing just the \ sequence, + * leaving the parse pointer at the next thing to parse */ + RExC_parse--; + goto handle_operand; + + case '[': /* Is a bracketed character class */ + { + bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state); + + if (! is_posix_class) { + RExC_parse++; + } + + /* regclass() can only return RESTART_UTF8 if multi-char + folds are allowed. */ + if(!regclass(pRExC_state, flagp,depth+1, + is_posix_class, /* parse the whole char class + only if not a posix class */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. */ + ¤t)) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", + (UV) *flagp); + /* function call leaves parse pointing to the ']', except if we + * faked it */ + if (is_posix_class) { + RExC_parse--; + } + + goto handle_operand; + } + + case '&': + case '|': + case '+': + case '-': + case '^': + if (top_index < 0 + || ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) + || ! IS_OPERAND(*top_ptr)) + { + RExC_parse++; + vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar); + } + av_push(stack, newSVuv(curchar)); + break; + + case '!': + av_push(stack, newSVuv(curchar)); + break; + + case '(': + if (top_index >= 0) { + top_ptr = av_fetch(stack, top_index, FALSE); + assert(top_ptr); + if (IS_OPERAND(*top_ptr)) { + RExC_parse++; + vFAIL("Unexpected '(' with no preceding operator"); + } + } + av_push(stack, newSVuv(curchar)); + break; + + case ')': + { + SV* lparen; + if (top_index < 1 + || ! (current = av_pop(stack)) + || ! IS_OPERAND(current) + || ! (lparen = av_pop(stack)) + || IS_OPERAND(lparen) + || SvUV(lparen) != '(') + { + SvREFCNT_dec(current); + RExC_parse++; + vFAIL("Unexpected ')'"); + } + top_index -= 2; + SvREFCNT_dec_NN(lparen); + + /* FALLTHROUGH */ + } + + handle_operand: + + /* Here, we have an operand to process, in 'current' */ + + if (top_index < 0) { /* Just push if stack is empty */ + av_push(stack, current); + } + else { + SV* top = av_pop(stack); + SV *prev = NULL; + char current_operator; + + if (IS_OPERAND(top)) { + SvREFCNT_dec_NN(top); + SvREFCNT_dec_NN(current); + vFAIL("Operand with no preceding operator"); + } + current_operator = (char) SvUV(top); + switch (current_operator) { + case '(': /* Push the '(' back on followed by the new + operand */ + av_push(stack, top); + av_push(stack, current); + SvREFCNT_inc(top); /* Counters the '_dec' done + just after the 'break', so + it doesn't get wrongly freed + */ + break; + + case '!': + _invlist_invert(current); + + /* Unlike binary operators, the top of the stack, + * now that this unary one has been popped off, may + * legally be an operator, and we now have operand + * for it. */ + top_index--; + SvREFCNT_dec_NN(top); + goto handle_operand; + + case '&': + prev = av_pop(stack); + _invlist_intersection(prev, + current, + ¤t); + av_push(stack, current); + break; + + case '|': + case '+': + prev = av_pop(stack); + _invlist_union(prev, current, ¤t); + av_push(stack, current); + break; + + case '-': + prev = av_pop(stack);; + _invlist_subtract(prev, current, ¤t); + av_push(stack, current); + break; + + case '^': /* The union minus the intersection */ + { + SV* i = NULL; + SV* u = NULL; + SV* element; + + prev = av_pop(stack); + _invlist_union(prev, current, &u); + _invlist_intersection(prev, current, &i); + /* _invlist_subtract will overwrite current + without freeing what it already contains */ + element = current; + _invlist_subtract(u, i, ¤t); + av_push(stack, current); + SvREFCNT_dec_NN(i); + SvREFCNT_dec_NN(u); + SvREFCNT_dec_NN(element); + break; + } + + default: + Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); + } + SvREFCNT_dec_NN(top); + SvREFCNT_dec(prev); + } + } + + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + } + + if (av_tindex(stack) < 0 /* Was empty */ + || ((final = av_pop(stack)) == NULL) + || ! IS_OPERAND(final) + || av_tindex(stack) >= 0) /* More left on stack */ + { + vFAIL("Incomplete expression within '(?[ ])'"); + } + + /* Here, 'final' is the resultant inversion list from evaluating the + * expression. Return it if so requested */ + if (return_invlist) { + *return_invlist = final; + return END; + } + + /* Otherwise generate a resultant node, based on 'final'. regclass() is + * expecting a string of ranges and individual code points */ + invlist_iterinit(final); + result_string = newSVpvs(""); + while (invlist_iternext(final, &start, &end)) { + if (start == end) { + Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start); + } + else { + Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}", + start, end); + } + } + + save_parse = RExC_parse; + RExC_parse = SvPV(result_string, len); + save_end = RExC_end; + RExC_end = RExC_parse + len; + + /* We turn off folding around the call, as the class we have constructed + * already has all folding taken into consideration, and we don't want + * regclass() to add to that */ + RExC_flags &= ~RXf_PMf_FOLD; + /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. + */ + node = regclass(pRExC_state, flagp,depth+1, + FALSE, /* means parse the whole char class */ + FALSE, /* don't allow multi-char folds */ + TRUE, /* silence non-portable warnings. The above may very + well have generated non-portable code points, but + they're valid on this machine */ + NULL); + if (!node) + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, + PTR2UV(flagp)); + if (save_fold) { + RExC_flags |= RXf_PMf_FOLD; + } + RExC_parse = save_parse + 1; + RExC_end = save_end; + SvREFCNT_dec_NN(final); + SvREFCNT_dec_NN(result_string); + + nextchar(pRExC_state); + Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ + return node; +} +#undef IS_OPERAND + +STATIC void +S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) +{ + /* This hard-codes the Latin1/above-Latin1 folding rules, so that an + * innocent-looking character class, like /[ks]/i won't have to go out to + * disk to find the possible matches. + * + * This should be called only for a Latin1-range code points, cp, which is + * known to be involved in a simple fold with other code points above + * Latin1. It would give false results if /aa has been specified. + * Multi-char folds are outside the scope of this, and must be handled + * specially. + * + * XXX It would be better to generate these via regen, in case a new + * version of the Unicode standard adds new mappings, though that is not + * really likely, and may be caught by the default: case of the switch + * below. */ + + PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; + + assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp)); + + switch (cp) { + case 'k': + case 'K': + *invlist = + add_cp_to_invlist(*invlist, KELVIN_SIGN); + break; + case 's': + case 'S': + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); + break; + case MICRO_SIGN: + *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); + *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist = add_cp_to_invlist(*invlist, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); + break; + default: + /* Use deprecated warning to increase the chances of this being + * output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); + break; + } +} + +/* The names of properties whose definitions are not known at compile time are + * stored in this SV, after a constant heading. So if the length has been + * changed since initialization, then there is a run-time definition. */ +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ + (SvCUR(listsv) != initial_listsv_len) + +STATIC regnode * +S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, + const bool stop_at_1, /* Just parse the next thing, don't + look for a full character class */ + bool allow_multi_folds, + const bool silence_non_portable, /* Don't output warnings + about too large + characters */ + SV** ret_invlist) /* Return an inversion list, not a node */ +{ + /* parse a bracketed class specification. Most of these will produce an + * ANYOF node; but something like [a] will produce an EXACT node; [aA], an + * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex + * under /i with multi-character folds: it will be rewritten following the + * paradigm of this example, where the s are characters which + * fold to multiple character sequences: + * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i + * gets effectively rewritten as: + * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i + * reg() gets called (recursively) on the rewritten version, and this + * function will return what it constructs. (Actually the s + * aren't physically removed from the [abcdefghi], it's just that they are + * ignored in the recursion by means of a flag: + * .) + * + * ANYOF nodes contain a bit map for the first 256 characters, with the + * corresponding bit set if that character is in the list. For characters + * above 255, a range list or swash is used. There are extra bits for \w, + * etc. in locale ANYOFs, as what these match is not determinable at + * compile time + * + * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs + * to be restarted. This can only happen if ret_invlist is non-NULL. + */ + + UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; + IV range = 0; + UV value = OOB_UNICODE, save_value = OOB_UNICODE; + regnode *ret; + STRLEN numlen; + IV namedclass = OOB_NAMEDCLASS; + char *rangebegin = NULL; + bool need_class = 0; + SV *listsv = NULL; + STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more + than just initialized. */ + SV* properties = NULL; /* Code points that match \p{} \P{} */ + SV* posixes = NULL; /* Code points that match classes like [:word:], + extended beyond the Latin1 range. These have to + be kept separate from other code points for much + of this function because their handling is + different under /i, and for most classes under + /d as well */ + SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept + separate for a while from the non-complemented + versions because of complications with /d + matching */ + UV element_count = 0; /* Number of distinct elements in the class. + Optimizations may be possible if this is tiny */ + AV * multi_char_matches = NULL; /* Code points that fold to more than one + character; used under /i */ + UV n; + char * stop_ptr = RExC_end; /* where to stop parsing */ + const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white + space? */ + const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */ + + /* Unicode properties are stored in a swash; this holds the current one + * being parsed. If this swash is the only above-latin1 component of the + * character class, an optimization is to pass it directly on to the + * execution engine. Otherwise, it is set to NULL to indicate that there + * are other things in the class that have to be dealt with at execution + * time */ + SV* swash = NULL; /* Code points that match \p{} \P{} */ + + /* Set if a component of this character class is user-defined; just passed + * on to the engine */ + bool has_user_defined_property = FALSE; + + /* inversion list of code points this node matches only when the target + * string is in UTF-8. (Because is under /d) */ + SV* depends_list = NULL; + + /* Inversion list of code points this node matches regardless of things + * like locale, folding, utf8ness of the target string */ + SV* cp_list = NULL; + + /* Like cp_list, but code points on this list need to be checked for things + * that fold to/from them under /i */ + SV* cp_foldable_list = NULL; + + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + +#ifdef EBCDIC + /* In a range, counts how many 0-2 of the ends of it came from literals, + * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ + UV literal_endpoint = 0; +#endif + bool invert = FALSE; /* Is this class to be complemented */ + + bool warn_super = ALWAYS_WARN_SUPER; + + regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in + case we need to change the emitted regop to an EXACT. */ + const char * orig_parse = RExC_parse; + const SSize_t orig_size = RExC_size; + bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCLASS; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + DEBUG_PARSE("clas"); + + /* Assume we are going to generate an ANYOF node. */ + ret = reganode(pRExC_state, ANYOF, 0); + + if (SIZE_ONLY) { + RExC_size += ANYOF_SKIP; + listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ + } + else { + ANYOF_FLAGS(ret) = 0; + + RExC_emit += ANYOF_SKIP; + listsv = newSVpvs_flags("# comment\n", SVs_TEMP); + initial_listsv_len = SvCUR(listsv); + SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ + } + + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + + if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ + RExC_parse++; + invert = TRUE; + allow_multi_folds = FALSE; + RExC_naughty++; + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + } + + /* Check that they didn't say [:posix:] instead of [[:posix:]] */ + if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) { + const char *s = RExC_parse; + const char c = *s++; + + while (isWORDCHAR(*s)) + s++; + if (*s && c == *s && s[1] == ']') { + SAVEFREESV(RExC_rx_sv); + ckWARN3reg(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); + (void)ReREFCNT_inc(RExC_rx_sv); + } + } + + /* If the caller wants us to just parse a single element, accomplish this + * by faking the loop ending condition */ + if (stop_at_1 && RExC_end > RExC_parse) { + stop_ptr = RExC_parse + 1; + } + + /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */ + if (UCHARAT(RExC_parse) == ']') + goto charclassloop; + +parseit: + while (1) { + if (RExC_parse >= stop_ptr) { + break; + } + + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + + if (UCHARAT(RExC_parse) == ']') { + break; + } + + charclassloop: + + namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ + save_value = value; + save_prevvalue = prevvalue; + + if (!range) { + rangebegin = RExC_parse; + element_count++; + } + if (UTF) { + value = utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); + + if (value == '[' + && RExC_parse < RExC_end + && POSIXCC(UCHARAT(RExC_parse))) + { + namedclass = regpposixcc(pRExC_state, value, strict); + } + else if (value == '\\') { + if (UTF) { + value = utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); + + /* Some compilers cannot handle switching on 64-bit integer + * values, therefore value cannot be an UV. Yes, this will + * be a problem later if we want switch on Unicode. + * A similar issue a little bit later when switching on + * namedclass. --jhi */ + + /* If the \ is escaping white space when white space is being + * skipped, it means that that white space is wanted literally, and + * is already in 'value'. Otherwise, need to translate the escape + * into what it signifies. */ + if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) { + + case 'w': namedclass = ANYOF_WORDCHAR; break; + case 'W': namedclass = ANYOF_NWORDCHAR; break; + case 's': namedclass = ANYOF_SPACE; break; + case 'S': namedclass = ANYOF_NSPACE; break; + case 'd': namedclass = ANYOF_DIGIT; break; + case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; + case 'N': /* Handle \N{NAME} in class */ + { + /* We only pay attention to the first char of + multichar strings being returned. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. */ + if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, + TRUE, /* => charclass */ + strict)) + { + if (*flagp & RESTART_UTF8) + FAIL("panic: grok_bslash_N set RESTART_UTF8"); + goto parseit; + } + } + break; + case 'p': + case 'P': + { + char *e; + + /* We will handle any undefined properties ourselves */ + U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF + /* And we actually would prefer to get + * the straight inversion list of the + * swash, since we will be accessing it + * anyway, to save a little time */ + |_CORE_SWASH_INIT_ACCEPT_INVLIST; + + if (RExC_parse >= RExC_end) + vFAIL2("Empty \\%c{}", (U8)value); + if (*RExC_parse == '{') { + const U8 c = (U8)value; + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL2("Missing right brace on \\%c{}", c); + while (isSPACE(*RExC_parse)) + RExC_parse++; + if (e == RExC_parse) + vFAIL2("Empty \\%c{}", c); + n = e - RExC_parse; + while (isSPACE(*(RExC_parse + n - 1))) + n--; + } + else { + e = RExC_parse; + n = 1; + } + if (!SIZE_ONLY) { + SV* invlist; + char* name; + + if (UCHARAT(RExC_parse) == '^') { + RExC_parse++; + n--; + /* toggle. (The rhs xor gets the single bit that + * differs between P and p; the other xor inverts just + * that bit) */ + value ^= 'P' ^ 'p'; + + while (isSPACE(*RExC_parse)) { + RExC_parse++; + n--; + } + } + /* Try to get the definition of the property into + * . If /i is in effect, the effective property + * will have its name be <__NAME_i>. The design is + * discussed in commit + * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ + name = savepv(Perl_form(aTHX_ + "%s%.*s%s\n", + (FOLD) ? "__" : "", + (int)n, + RExC_parse, + (FOLD) ? "_i" : "" + )); + + /* Look up the property name, and get its swash and + * inversion list, if the property is found */ + if (swash) { + SvREFCNT_dec_NN(swash); + } + swash = _core_swash_init("utf8", name, &PL_sv_undef, + 1, /* binary */ + 0, /* not tr/// */ + NULL, /* No inversion list */ + &swash_init_flags + ); + if (! swash || ! (invlist = _get_swash_invlist(swash))) { + HV* curpkg = (IN_PERL_COMPILETIME) + ? PL_curstash + : CopSTASH(PL_curcop); + if (swash) { + SvREFCNT_dec_NN(swash); + swash = NULL; + } + + /* Here didn't find it. It could be a user-defined + * property that will be available at run-time. If we + * accept only compile-time properties, is an error; + * otherwise add it to the list for run-time look up */ + if (ret_invlist) { + RExC_parse = e + 1; + vFAIL2utf8f( + "Property '%"UTF8f"' is unknown", + UTF8fARG(UTF, n, name)); + } + + /* If the property name doesn't already have a package + * name, add the current one to it so that it can be + * referred to outside it. [perl #121777] */ + if (curpkg && ! instr(name, "::")) { + char* pkgname = HvNAME(curpkg); + if (strNE(pkgname, "main")) { + char* full_name = Perl_form(aTHX_ + "%s::%s", + pkgname, + name); + n = strlen(full_name); + Safefree(name); + name = savepvn(full_name, n); + } + } + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", + (value == 'p' ? '+' : '!'), + UTF8fARG(UTF, n, name)); + has_user_defined_property = TRUE; + + /* We don't know yet, so have to assume that the + * property could match something in the Latin1 range, + * hence something that isn't utf8. Note that this + * would cause things in to match + * inappropriately, except that any \p{}, including + * this one forces Unicode semantics, which means there + * is no */ + ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; + } + else { + + /* Here, did get the swash and its inversion list. If + * the swash is from a user-defined property, then this + * whole character class should be regarded as such */ + if (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) + { + has_user_defined_property = TRUE; + } + else if + /* We warn on matching an above-Unicode code point + * if the match would return true, except don't + * warn for \p{All}, which has exactly one element + * = 0 */ + (_invlist_contains_cp(invlist, 0x110000) + && (! (_invlist_len(invlist) == 1 + && *invlist_array(invlist) == 0))) + { + warn_super = TRUE; + } + + + /* Invert if asking for the complement */ + if (value == 'P') { + _invlist_union_complement_2nd(properties, + invlist, + &properties); + + /* The swash can't be used as-is, because we've + * inverted things; delay removing it to here after + * have copied its invlist above */ + SvREFCNT_dec_NN(swash); + swash = NULL; + } + else { + _invlist_union(properties, invlist, &properties); + } + } + Safefree(name); + } + RExC_parse = e + 1; + namedclass = ANYOF_UNIPROP; /* no official name, but it's + named */ + + /* \p means they want Unicode semantics */ + RExC_uni_semantics = 1; + } + break; + case 'n': value = '\n'; break; + case 'r': value = '\r'; break; + case 't': value = '\t'; break; + case 'f': value = '\f'; break; + case 'b': value = '\b'; break; + case 'e': value = ASCII_TO_NATIVE('\033');break; + case 'a': value = '\a'; break; + case 'o': + RExC_parse--; /* function expects to be pointed at the 'o' */ + { + const char* error_msg; + bool valid = grok_bslash_o(&RExC_parse, + &value, + &error_msg, + SIZE_ONLY, /* warnings in pass + 1 only */ + strict, + silence_non_portable, + UTF); + if (! valid) { + vFAIL(error_msg); + } + } + if (PL_encoding && value < 0x100) { + goto recode_encoding; + } + break; + case 'x': + RExC_parse--; /* function expects to be pointed at the 'x' */ + { + const char* error_msg; + bool valid = grok_bslash_x(&RExC_parse, + &value, + &error_msg, + TRUE, /* Output warnings */ + strict, + silence_non_portable, + UTF); + if (! valid) { + vFAIL(error_msg); + } + } + if (PL_encoding && value < 0x100) + goto recode_encoding; + break; + case 'c': + value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); + break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + { + /* Take 1-3 octal digits */ + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; + numlen = (strict) ? 4 : 3; + value = grok_oct(--RExC_parse, &numlen, &flags, NULL); + RExC_parse += numlen; + if (numlen != 3) { + if (strict) { + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Need exactly 3 octal digits"); + } + else if (! SIZE_ONLY /* like \08, \178 */ + && numlen < 3 + && RExC_parse < RExC_end + && isDIGIT(*RExC_parse) + && ckWARN(WARN_REGEXP)) + { + SAVEFREESV(RExC_rx_sv); + reg_warn_non_literal_string( + RExC_parse + 1, + form_short_octal_warning(RExC_parse, numlen)); + (void)ReREFCNT_inc(RExC_rx_sv); + } + } + if (PL_encoding && value < 0x100) + goto recode_encoding; + break; + } + recode_encoding: + if (! RExC_override_recoding) { + SV* enc = PL_encoding; + value = reg_recode((const char)(U8)value, &enc); + if (!enc) { + if (strict) { + vFAIL("Invalid escape in the specified encoding"); + } + else if (SIZE_ONLY) { + ckWARNreg(RExC_parse, + "Invalid escape in the specified encoding"); + } + } + break; + } + default: + /* Allow \_ to not give an error */ + if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') { + if (strict) { + vFAIL2("Unrecognized escape \\%c in character class", + (int)value); + } + else { + SAVEFREESV(RExC_rx_sv); + ckWARN2reg(RExC_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); + (void)ReREFCNT_inc(RExC_rx_sv); + } + } + break; + } /* End of switch on char following backslash */ + } /* end of handling backslash escape sequences */ +#ifdef EBCDIC + else + literal_endpoint++; +#endif + + /* Here, we have the current token in 'value' */ + + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + U8 classnum; + + /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a + * literal, as is the character that began the false range, i.e. + * the 'a' in the examples */ + if (range) { + if (!SIZE_ONLY) { + const int w = (RExC_parse >= rangebegin) + ? RExC_parse - rangebegin + : 0; + if (strict) { + vFAIL2utf8f( + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); + } + else { + SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ + ckWARN2reg(RExC_parse, + "False [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); + (void)ReREFCNT_inc(RExC_rx_sv); + cp_list = add_cp_to_invlist(cp_list, '-'); + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, + prevvalue); + } + } + + range = 0; /* this was not a true range */ + element_count += 2; /* So counts for three values */ + } + + classnum = namedclass_to_classnum(namedclass); + + if (LOC && namedclass < ANYOF_POSIXL_MAX +#ifndef HAS_ISASCII + && classnum != _CC_ASCII +#endif + ) { + /* What the Posix classes (like \w, [:space:]) match in locale + * isn't knowable under locale until actual match time. Room + * must be reserved (one time per outer bracketed class) to + * store such classes. The space will contain a bit for each + * named class that is to be matched against. This isn't + * needed for \p{} and pseudo-classes, as they are not affected + * by locale, and hence are dealt with separately */ + if (! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + else { + RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; + } + ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_POSIXL_ZERO(ret); + } + + /* Coverity thinks it is possible for this to be negative; both + * jhi and khw think it's not, but be safer */ + assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL) + || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0); + + /* See if it already matches the complement of this POSIX + * class */ + if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) + && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) + ? -1 + : 1))) + { + posixl_matches_all = TRUE; + break; /* No need to continue. Since it matches both + e.g., \w and \W, it matches everything, and the + bracketed class can be optimized into qr/./s */ + } + + /* Add this class to those that should be checked at runtime */ + ANYOF_POSIXL_SET(ret, namedclass); + + /* The above-Latin1 characters are not subject to locale rules. + * Just add them, in the second pass, to the + * unconditionally-matched list */ + if (! SIZE_ONLY) { + SV* scratch_list = NULL; + + /* Get the list of the above-Latin1 code points this + * matches */ + _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, + PL_XPosix_ptrs[classnum], + + /* Odd numbers are complements, like + * NDIGIT, NASCII, ... */ + namedclass % 2 != 0, + &scratch_list); + /* Checking if 'cp_list' is NULL first saves an extra + * clone. Its reference count will be decremented at the + * next union, etc, or if this is the only instance, at the + * end of the routine */ + if (! cp_list) { + cp_list = scratch_list; + } + else { + _invlist_union(cp_list, scratch_list, &cp_list); + SvREFCNT_dec_NN(scratch_list); + } + continue; /* Go get next character */ + } + } + else if (! SIZE_ONLY) { + + /* Here, not in pass1 (in that pass we skip calculating the + * contents of this class), and is /l, or is a POSIX class for + * which /l doesn't matter (or is a Unicode property, which is + * skipped here). */ + if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ + if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ + + /* Here, should be \h, \H, \v, or \V. None of /d, /i + * nor /l make a difference in what these match, + * therefore we just add what they match to cp_list. */ + if (classnum != _CC_VERTSPACE) { + assert( namedclass == ANYOF_HORIZWS + || namedclass == ANYOF_NHORIZWS); + + /* It turns out that \h is just a synonym for + * XPosixBlank */ + classnum = _CC_BLANK; + } + + _invlist_union_maybe_complement_2nd( + cp_list, + PL_XPosix_ptrs[classnum], + namedclass % 2 != 0, /* Complement if odd + (NHORIZWS, NVERTWS) + */ + &cp_list); + } + } + else { /* Garden variety class. If is NASCII, NDIGIT, ... + complement and use nposixes */ + SV** posixes_ptr = namedclass % 2 == 0 + ? &posixes + : &nposixes; + SV** source_ptr = &PL_XPosix_ptrs[classnum]; + _invlist_union_maybe_complement_2nd( + *posixes_ptr, + *source_ptr, + namedclass % 2 != 0, + posixes_ptr); + } + continue; /* Go get next character */ + } + } /* end of namedclass \blah */ + + /* Here, we have a single value. If 'range' is set, it is the ending + * of a range--check its validity. Later, we will handle each + * individual code point in the range. If 'range' isn't set, this + * could be the beginning of a range, so check for that by looking + * ahead to see if the next real character to be processed is the range + * indicator--the minus sign */ + + if (skip_white) { + RExC_parse = regpatws(pRExC_state, RExC_parse, + FALSE /* means don't recognize comments */ ); + } + + if (range) { + if (prevvalue > value) /* b-a */ { + const int w = RExC_parse - rangebegin; + vFAIL2utf8f( + "Invalid [] range \"%"UTF8f"\"", + UTF8fARG(UTF, w, rangebegin)); + range = 0; /* not a valid range */ + } + } + else { + prevvalue = value; /* save the beginning of the potential range */ + if (! stop_at_1 /* Can't be a range if parsing just one thing */ + && *RExC_parse == '-') + { + char* next_char_ptr = RExC_parse + 1; + if (skip_white) { /* Get the next real char after the '-' */ + next_char_ptr = regpatws(pRExC_state, + RExC_parse + 1, + FALSE); /* means don't recognize + comments */ + } + + /* If the '-' is at the end of the class (just before the ']', + * it is a literal minus; otherwise it is a range */ + if (next_char_ptr < RExC_end && *next_char_ptr != ']') { + RExC_parse = next_char_ptr; + + /* a bad range like \w-, [:word:]- ? */ + if (namedclass > OOB_NAMEDCLASS) { + if (strict || ckWARN(WARN_REGEXP)) { + const int w = + RExC_parse >= rangebegin ? + RExC_parse - rangebegin : 0; + if (strict) { + vFAIL4("False [] range \"%*.*s\"", + w, w, rangebegin); + } + else { + vWARN4(RExC_parse, + "False [] range \"%*.*s\"", + w, w, rangebegin); + } + } + if (!SIZE_ONLY) { + cp_list = add_cp_to_invlist(cp_list, '-'); + } + element_count++; + } else + range = 1; /* yeah, it's a range! */ + continue; /* but do it the next time */ + } + } + } + + /* Here, is the beginning of the range, if any; or + * if not */ + + /* non-Latin1 code point implies unicode semantics. Must be set in + * pass1 so is there for the whole of pass 2 */ + if (value > 255) { + RExC_uni_semantics = 1; + } + + /* Ready to process either the single value, or the completed range. + * For single-valued non-inverted ranges, we consider the possibility + * of multi-char folds. (We made a conscious decision to not do this + * for the other cases because it can often lead to non-intuitive + * results. For example, you have the peculiar case that: + * "s s" =~ /^[^\xDF]+$/i => Y + * "ss" =~ /^[^\xDF]+$/i => N + * + * See [perl #89750] */ + if (FOLD && allow_multi_folds && value == prevvalue) { + if (value == LATIN_SMALL_LETTER_SHARP_S + || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, + value))) + { + /* Here is indeed a multi-char fold. Get what it is */ + + U8 foldbuf[UTF8_MAXBYTES_CASE]; + STRLEN foldlen; + + UV folded = _to_uni_fold_flags( + value, + foldbuf, + &foldlen, + FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED + ? FOLD_FLAGS_NOMIX_ASCII + : 0) + ); + + /* Here, should be the first character of the + * multi-char fold of , with containing the + * whole thing. But, if this fold is not allowed (because of + * the flags), will be the same as , and should + * be processed like any other character, so skip the special + * handling */ + if (folded != value) { + + /* Skip if we are recursed, currently parsing the class + * again. Otherwise add this character to the list of + * multi-char folds. */ + if (! RExC_in_multi_char_class) { + AV** this_array_ptr; + AV* this_array; + STRLEN cp_count = utf8_length(foldbuf, + foldbuf + foldlen); + SV* multi_fold = sv_2mortal(newSVpvs("")); + + Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); + + + if (! multi_char_matches) { + multi_char_matches = newAV(); + } + + /* is actually an array of arrays. + * There will be one or two top-level elements: [2], + * and/or [3]. The [2] element is an array, each + * element thereof is a character which folds to TWO + * characters; [3] is for folds to THREE characters. + * (Unicode guarantees a maximum of 3 characters in any + * fold.) When we rewrite the character class below, + * we will do so such that the longest folds are + * written first, so that it prefers the longest + * matching strings first. This is done even if it + * turns out that any quantifier is non-greedy, out of + * programmer laziness. Tom Christiansen has agreed + * that this is ok. This makes the test for the + * ligature 'ffi' come before the test for 'ff' */ + if (av_exists(multi_char_matches, cp_count)) { + this_array_ptr = (AV**) av_fetch(multi_char_matches, + cp_count, FALSE); + this_array = *this_array_ptr; + } + else { + this_array = newAV(); + av_store(multi_char_matches, cp_count, + (SV*) this_array); + } + av_push(this_array, multi_fold); + } + + /* This element should not be processed further in this + * class */ + element_count--; + value = save_value; + prevvalue = save_prevvalue; + continue; + } + } + } + + /* Deal with this element of the class */ + if (! SIZE_ONLY) { +#ifndef EBCDIC + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); +#else + SV* this_range = _new_invlist(1); + _append_range_to_invlist(this_range, prevvalue, value); + + /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous. + * If this range was specified using something like 'i-j', we want + * to include only the 'i' and the 'j', and not anything in + * between, so exclude non-ASCII, non-alphabetics from it. + * However, if the range was specified with something like + * [\x89-\x91] or [\x89-j], all code points within it should be + * included. literal_endpoint==2 means both ends of the range used + * a literal character, not \x{foo} */ + if (literal_endpoint == 2 + && ((prevvalue >= 'a' && value <= 'z') + || (prevvalue >= 'A' && value <= 'Z'))) + { + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII], + &this_range); + + /* Since this above only contains ascii, the intersection of it + * with anything will still yield only ascii */ + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], + &this_range); + } + _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); + literal_endpoint = 0; +#endif + } + + range = 0; /* this range (if it was one) is done now */ + } /* End of loop through all the text within the brackets */ + + /* If anything in the class expands to more than one character, we have to + * deal with them by building up a substitute parse string, and recursively + * calling reg() on it, instead of proceeding */ + if (multi_char_matches) { + SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP); + I32 cp_count; + STRLEN len; + char *save_end = RExC_end; + char *save_parse = RExC_parse; + bool first_time = TRUE; /* First multi-char occurrence doesn't get + a "|" */ + I32 reg_flags; + + assert(! invert); +#if 0 /* Have decided not to deal with multi-char folds in inverted classes, + because too confusing */ + if (invert) { + sv_catpv(substitute_parse, "(?:"); + } +#endif + + /* Look at the longest folds first */ + for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { + + if (av_exists(multi_char_matches, cp_count)) { + AV** this_array_ptr; + SV* this_sequence; + + this_array_ptr = (AV**) av_fetch(multi_char_matches, + cp_count, FALSE); + while ((this_sequence = av_pop(*this_array_ptr)) != + &PL_sv_undef) + { + if (! first_time) { + sv_catpv(substitute_parse, "|"); + } + first_time = FALSE; + + sv_catpv(substitute_parse, SvPVX(this_sequence)); + } + } + } + + /* If the character class contains anything else besides these + * multi-character folds, have to include it in recursive parsing */ + if (element_count) { + sv_catpv(substitute_parse, "|["); + sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); + sv_catpv(substitute_parse, "]"); + } + + sv_catpv(substitute_parse, ")"); +#if 0 + if (invert) { + /* This is a way to get the parse to skip forward a whole named + * sequence instead of matching the 2nd character when it fails the + * first */ + sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)"); + } +#endif + + RExC_parse = SvPV(substitute_parse, len); + RExC_end = RExC_parse + len; + RExC_in_multi_char_class = 1; + RExC_emit = (regnode *)orig_emit; + + ret = reg(pRExC_state, 1, ®_flags, depth+1); + + *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8); + + RExC_parse = save_parse; + RExC_end = save_end; + RExC_in_multi_char_class = 0; + SvREFCNT_dec_NN(multi_char_matches); + return ret; + } + + /* Here, we've gone through the entire class and dealt with multi-char + * folds. We are now in a position that we can do some checks to see if we + * can optimize this ANYOF node into a simpler one, even in Pass 1. + * Currently we only do two checks: + * 1) is in the unlikely event that the user has specified both, eg. \w and + * \W under /l, then the class matches everything. (This optimization + * is done only to make the optimizer code run later work.) + * 2) if the character class contains only a single element (including a + * single range), we see if there is an equivalent node for it. + * Other checks are possible */ + if (! ret_invlist /* Can't optimize if returning the constructed + inversion list */ + && (UNLIKELY(posixl_matches_all) || element_count == 1)) + { + U8 op = END; + U8 arg = 0; + + if (UNLIKELY(posixl_matches_all)) { + op = SANY; + } + else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like + \w or [:digit:] or \p{foo} + */ + + /* All named classes are mapped into POSIXish nodes, with its FLAG + * argument giving which class it is */ + switch ((I32)namedclass) { + case ANYOF_UNIPROP: + break; + + /* These don't depend on the charset modifiers. They always + * match under /u rules */ + case ANYOF_NHORIZWS: + case ANYOF_HORIZWS: + namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS; + /* FALLTHROUGH */ + + case ANYOF_NVERTWS: + case ANYOF_VERTWS: + op = POSIXU; + goto join_posix; + + /* The actual POSIXish node for all the rest depends on the + * charset modifier. The ones in the first set depend only on + * ASCII or, if available on this platform, locale */ + case ANYOF_ASCII: + case ANYOF_NASCII: +#ifdef HAS_ISASCII + op = (LOC) ? POSIXL : POSIXA; +#else + op = POSIXA; +#endif + goto join_posix; + + case ANYOF_NCASED: + case ANYOF_LOWER: + case ANYOF_NLOWER: + case ANYOF_UPPER: + case ANYOF_NUPPER: + /* under /a could be alpha */ + if (FOLD) { + if (ASCII_RESTRICTED) { + namedclass = ANYOF_ALPHA + (namedclass % 2); + } + else if (! LOC) { + break; + } + } + /* FALLTHROUGH */ + + /* The rest have more possibilities depending on the charset. + * We take advantage of the enum ordering of the charset + * modifiers to get the exact node type, */ + default: + op = POSIXD + get_regex_charset(RExC_flags); + if (op > POSIXA) { /* /aa is same as /a */ + op = POSIXA; + } + + join_posix: + /* The odd numbered ones are the complements of the + * next-lower even number one */ + if (namedclass % 2 == 1) { + invert = ! invert; + namedclass--; + } + arg = namedclass_to_classnum(namedclass); + break; + } + } + else if (value == prevvalue) { + + /* Here, the class consists of just a single code point */ + + if (invert) { + if (! LOC && value == '\n') { + op = REG_ANY; /* Optimize [^\n] */ + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + else if (value < 256 || UTF) { + + /* Optimize a single value into an EXACTish node, but not if it + * would require converting the pattern to UTF-8. */ + op = compute_EXACTish(pRExC_state); + } + } /* Otherwise is a range */ + else if (! LOC) { /* locale could vary these */ + if (prevvalue == '0') { + if (value == '9') { + arg = _CC_DIGIT; + op = POSIXA; + } + } + else if (prevvalue == 'A') { + if (value == 'Z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; + op = POSIXA; + } + } + else if (prevvalue == 'a') { + if (value == 'z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; + op = POSIXA; + } + } + } + + /* Here, we have changed away from its initial value iff we found + * an optimization */ + if (op != END) { + + /* Throw away this ANYOF regnode, and emit the calculated one, + * which should correspond to the beginning, not current, state of + * the parse */ + const char * cur_parse = RExC_parse; + RExC_parse = (char *)orig_parse; + if ( SIZE_ONLY) { + if (! LOC) { + + /* To get locale nodes to not use the full ANYOF size would + * require moving the code above that writes the portions + * of it that aren't in other nodes to after this point. + * e.g. ANYOF_POSIXL_SET */ + RExC_size = orig_size; + } + } + else { + RExC_emit = (regnode *)orig_emit; + if (PL_regkind[op] == POSIXD) { + if (op == POSIXL) { + RExC_contains_locale = 1; + } + if (invert) { + op += NPOSIXD - POSIXD; + } + } + } + + ret = reg_node(pRExC_state, op); + + if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { + if (! SIZE_ONLY) { + FLAGS(ret) = arg; + } + *flagp |= HASWIDTH|SIMPLE; + } + else if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); + } + + RExC_parse = (char *) cur_parse; + + SvREFCNT_dec(posixes); + SvREFCNT_dec(nposixes); + SvREFCNT_dec(cp_list); + SvREFCNT_dec(cp_foldable_list); + return ret; + } + } + + if (SIZE_ONLY) + return ret; + /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/ + + /* If folding, we calculate all characters that could fold to or from the + * ones already on the list */ + if (cp_foldable_list) { + if (FOLD) { + UV start, end; /* End points of code point ranges */ + + SV* fold_intersection = NULL; + SV** use_list; + + /* Our calculated list will be for Unicode rules. For locale + * matching, we have to keep a separate list that is consulted at + * runtime only when the locale indicates Unicode rules. For + * non-locale, we just use to the general list */ + if (LOC) { + use_list = &only_utf8_locale_list; + } + else { + use_list = &cp_list; + } + + /* Only the characters in this class that participate in folds need + * be checked. Get the intersection of this class and all the + * possible characters that are foldable. This can quickly narrow + * down a large class */ + _invlist_intersection(PL_utf8_foldable, cp_foldable_list, + &fold_intersection); + + /* The folds for all the Latin1 characters are hard-coded into this + * program, but we have to go out to disk to get the others. */ + if (invlist_highest(cp_foldable_list) >= 256) { + + /* This is a hash that for a particular fold gives all + * characters that are involved in it */ + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + } + + /* Now look at the foldable characters in this class individually */ + invlist_iterinit(fold_intersection); + while (invlist_iternext(fold_intersection, &start, &end)) { + UV j; + + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + SV** listp; + + if (j < 256) { + + if (IS_IN_SOME_FOLD_L1(j)) { + + /* ASCII is always matched; non-ASCII is matched + * only under Unicode rules (which could happen + * under /l if the locale is a UTF-8 one */ + if (isASCII(j) || ! DEPENDS_SEMANTICS) { + *use_list = add_cp_to_invlist(*use_list, + PL_fold_latin1[j]); + } + else { + depends_list = + add_cp_to_invlist(depends_list, + PL_fold_latin1[j]); + } + } + + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + { + add_above_Latin1_folds(pRExC_state, + (U8) j, + use_list); + } + continue; + } + + /* Here is an above Latin1 character. We don't have the + * rules hard-coded for it. First, get its fold. This is + * the simple fold, as the multi-character folds have been + * handled earlier and separated out */ + _to_uni_fold_flags(j, foldbuf, &foldlen, + (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0); + + /* Single character fold of above Latin1. Add everything in + * its fold closure to the list that this node should match. + * The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j)))) + { + continue; + } + + /* Folds under /l which cross the 255/256 boundary + * are added to a separate list. (These are valid + * only when the locale is UTF-8.) */ + if (c < 256 && LOC) { + *use_list = add_cp_to_invlist(*use_list, c); + continue; + } + + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + /* Similarly folds involving non-ascii Latin1 + * characters under /d are added to their list */ + depends_list = add_cp_to_invlist(depends_list, + c); + } + } + } + } + } + SvREFCNT_dec_NN(fold_intersection); + } + + /* Now that we have finished adding all the folds, there is no reason + * to keep the foldable list separate */ + _invlist_union(cp_list, cp_foldable_list, &cp_list); + SvREFCNT_dec_NN(cp_foldable_list); + } + + /* And combine the result (if any) with any inversion list from posix + * classes. The lists are kept separate up to now because we don't want to + * fold the classes (folding of those is automatically handled by the swash + * fetching code) */ + if (posixes || nposixes) { + if (posixes && AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, nothing above ASCII matches these */ + _invlist_intersection(posixes, + PL_XPosix_ptrs[_CC_ASCII], + &posixes); + } + if (nposixes) { + if (DEPENDS_SEMANTICS) { + /* Under /d, everything in the upper half of the Latin1 range + * matches these complements */ + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; + } + else if (AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, everything above ASCII matches these + * complements */ + _invlist_union_complement_2nd(nposixes, + PL_XPosix_ptrs[_CC_ASCII], + &nposixes); + } + if (posixes) { + _invlist_union(posixes, nposixes, &posixes); + SvREFCNT_dec_NN(nposixes); + } + else { + posixes = nposixes; + } + } + if (! DEPENDS_SEMANTICS) { + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec_NN(posixes); + } + else { + cp_list = posixes; + } + } + else { + /* Under /d, we put into a separate list the Latin1 things that + * match only when the target string is utf8 */ + SV* nonascii_but_latin1_properties = NULL; + _invlist_intersection(posixes, PL_UpperLatin1, + &nonascii_but_latin1_properties); + _invlist_subtract(posixes, nonascii_but_latin1_properties, + &posixes); + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec_NN(posixes); + } + else { + cp_list = posixes; + } + + if (depends_list) { + _invlist_union(depends_list, nonascii_but_latin1_properties, + &depends_list); + SvREFCNT_dec_NN(nonascii_but_latin1_properties); + } + else { + depends_list = nonascii_but_latin1_properties; + } + } + } + + /* And combine the result (if any) with any inversion list from properties. + * The lists are kept separate up to now so that we can distinguish the two + * in regards to matching above-Unicode. A run-time warning is generated + * if a Unicode property is matched against a non-Unicode code point. But, + * we allow user-defined properties to match anything, without any warning, + * and we also suppress the warning if there is a portion of the character + * class that isn't a Unicode property, and which matches above Unicode, \W + * or [\x{110000}] for example. + * (Note that in this case, unlike the Posix one above, there is no + * , because having a Unicode property forces Unicode + * semantics */ + if (properties) { + if (cp_list) { + + /* If it matters to the final outcome, see if a non-property + * component of the class matches above Unicode. If so, the + * warning gets suppressed. This is true even if just a single + * such code point is specified, as though not strictly correct if + * another such code point is matched against, the fact that they + * are using above-Unicode code points indicates they should know + * the issues involved */ + if (warn_super) { + warn_super = ! (invert + ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX)); + } + + _invlist_union(properties, cp_list, &cp_list); + SvREFCNT_dec_NN(properties); + } + else { + cp_list = properties; + } + + if (warn_super) { + ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; + } + } + + /* Here, we have calculated what code points should be in the character + * class. + * + * Now we can see about various optimizations. Fold calculation (which we + * did above) needs to take place before inversion. Otherwise /[^k]/i + * would invert to include K, which under /i would match k, which it + * shouldn't. Therefore we can't invert folded locale now, as it won't be + * folded until runtime */ + + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching). We know to set the flag if we have a non-NULL list for UTF-8 + * locales, or the class matches at least one 0-255 range code point */ + if (LOC && FOLD) { + if (only_utf8_locale_list) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + else if (cp_list) { /* Look to see if there a 0-255 code point is in + the list */ + UV start, end; + invlist_iterinit(cp_list); + if (invlist_iternext(cp_list, &start, &end) && start < 256) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + invlist_iterfinish(cp_list); + } + } + + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known + * at compile time. Besides not inverting folded locale now, we can't + * invert if there are things such as \w, which aren't known until runtime + * */ + if (cp_list + && invert + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! depends_list + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + { + _invlist_invert(cp_list); + + /* Any swash can't be used as-is, because we've inverted things */ + if (swash) { + SvREFCNT_dec_NN(swash); + swash = NULL; + } + + /* Clear the invert flag since have just done it here */ + invert = FALSE; + } + + if (ret_invlist) { + *ret_invlist = cp_list; + SvREFCNT_dec(swash); + + /* Discard the generated node */ + if (SIZE_ONLY) { + RExC_size = orig_size; + } + else { + RExC_emit = orig_emit; + } + return orig_emit; + } + + /* Some character classes are equivalent to other nodes. Such nodes take + * up less room and generally fewer operations to execute than ANYOF nodes. + * Above, we checked for and optimized into some such equivalents for + * certain common classes that are easy to test. Getting to this point in + * the code means that the class didn't get optimized there. Since this + * code is only executed in Pass 2, it is too late to save space--it has + * been allocated in Pass 1, and currently isn't given back. But turning + * things into an EXACTish node can allow the optimizer to join it to any + * adjacent such nodes. And if the class is equivalent to things like /./, + * expensive run-time swashes can be avoided. Now that we have more + * complete information, we can find things necessarily missed by the + * earlier code. I (khw) am not sure how much to look for here. It would + * be easy, but perhaps too slow, to check any candidates against all the + * node types they could possibly match using _invlistEQ(). */ + + if (cp_list + && ! invert + && ! depends_list + && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + + /* We don't optimize if we are supposed to make sure all non-Unicode + * code points raise a warning, as only ANYOF nodes have this check. + * */ + && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) + { + UV start, end; + U8 op = END; /* The optimzation node-type */ + const char * cur_parse= RExC_parse; + + invlist_iterinit(cp_list); + if (! invlist_iternext(cp_list, &start, &end)) { + + /* Here, the list is empty. This happens, for example, when a + * Unicode property is the only thing in the character class, and + * it doesn't match anything. (perluniprops.pod notes such + * properties) */ + op = OPFAIL; + *flagp |= HASWIDTH|SIMPLE; + } + else if (start == end) { /* The range is a single code point */ + if (! invlist_iternext(cp_list, &start, &end) + + /* Don't do this optimization if it would require changing + * the pattern to UTF-8 */ + && (start < 256 || UTF)) + { + /* Here, the list contains a single code point. Can optimize + * into an EXACTish node */ + + value = start; + + if (! FOLD) { + op = EXACT; + } + else if (LOC) { + + /* A locale node under folding with one code point can be + * an EXACTFL, as its fold won't be calculated until + * runtime */ + op = EXACTFL; + } + else { + + /* Here, we are generally folding, but there is only one + * code point to match. If we have to, we use an EXACT + * node, but it would be better for joining with adjacent + * nodes in the optimization pass if we used the same + * EXACTFish node that any such are likely to be. We can + * do this iff the code point doesn't participate in any + * folds. For example, an EXACTF of a colon is the same as + * an EXACT one, since nothing folds to or from a colon. */ + if (value < 256) { + if (IS_IN_SOME_FOLD_L1(value)) { + op = EXACT; + } + } + else { + if (_invlist_contains_cp(PL_utf8_foldable, value)) { + op = EXACT; + } + } + + /* If we haven't found the node type, above, it means we + * can use the prevailing one */ + if (op == END) { + op = compute_EXACTish(pRExC_state); + } + } + } + } + else if (start == 0) { + if (end == UV_MAX) { + op = SANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + else if (end == '\n' - 1 + && invlist_iternext(cp_list, &start, &end) + && start == '\n' + 1 && end == UV_MAX) + { + op = REG_ANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + invlist_iterfinish(cp_list); + + if (op != END) { + RExC_parse = (char *)orig_parse; + RExC_emit = (regnode *)orig_emit; + + ret = reg_node(pRExC_state, op); + + RExC_parse = (char *)cur_parse; + + if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, + TRUE /* downgradable to EXACT */ + ); + } + + SvREFCNT_dec_NN(cp_list); + return ret; + } + } + + /* Here, contains all the code points we can determine at + * compile time that match under all conditions. Go through it, and + * for things that belong in the bitmap, put them there, and delete from + * . While we are at it, see if everything above 255 is in the + * list, and if so, set a flag to speed up execution */ + + populate_ANYOF_from_invlist(ret, &cp_list); + + if (invert) { + ANYOF_FLAGS(ret) |= ANYOF_INVERT; + } + + /* Here, the bitmap has been populated with all the Latin1 code points that + * always match. Can now add to the overall list those that match only + * when the target string is UTF-8 (). */ + if (depends_list) { + if (cp_list) { + _invlist_union(cp_list, depends_list, &cp_list); + SvREFCNT_dec_NN(depends_list); + } + else { + cp_list = depends_list; + } + ANYOF_FLAGS(ret) |= ANYOF_UTF8; + } + + /* If there is a swash and more than one element, we can't use the swash in + * the optimization below. */ + if (swash && element_count > 1) { + SvREFCNT_dec_NN(swash); + swash = NULL; + } + + set_ANYOF_arg(pRExC_state, ret, cp_list, + (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv : NULL, + only_utf8_locale_list, + swash, has_user_defined_property); + + *flagp |= HASWIDTH|SIMPLE; + + if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + + return ret; +} + +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION + +STATIC void +S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, + regnode* const node, + SV* const cp_list, + SV* const runtime_defns, + SV* const only_utf8_locale_list, + SV* const swash, + const bool has_user_defined_property) +{ + /* Sets the arg field of an ANYOF-type node 'node', using information about + * the node passed-in. If there is nothing outside the node's bitmap, the + * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to + * the count returned by add_data(), having allocated and stored an array, + * av, that that count references, as follows: + * av[0] stores the character class description in its textual form. + * This is used later (regexec.c:Perl_regclass_swash()) to + * initialize the appropriate swash, and is also useful for dumping + * the regnode. This is set to &PL_sv_undef if the textual + * description is not needed at run-time (as happens if the other + * elements completely define the class) + * av[1] if &PL_sv_undef, is a placeholder to later contain the swash + * computed from av[0]. But if no further computation need be done, + * the swash is stored here now (and av[0] is &PL_sv_undef). + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead + * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. + * (Otherwise everything needed is already in av[0] and av[1]) + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ + + UV n; + + PERL_ARGS_ASSERT_SET_ANYOF_ARG; + + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); + ARG_SET(node, ANYOF_NONBITMAP_EMPTY); + } + else { + AV * const av = newAV(); + SV *rv; + + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + av_store(av, 0, (runtime_defns) + ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); + if (swash) { + assert(cp_list); + av_store(av, 1, swash); + SvREFCNT_dec_NN(cp_list); + } + else { + av_store(av, 1, &PL_sv_undef); + if (cp_list) { + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); + } + } + + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + + rv = newRV_noinc(MUTABLE_SV(av)); + n = add_data(pRExC_state, STR_WITH_LEN("s")); + RExC_rxi->data->data[n] = (void*)rv; + ARG_SET(node, n); + } +} + + +/* reg_skipcomment() + + Absorbs an /x style # comment from the input stream, + returning a pointer to the first character beyond the comment, or if the + comment terminates the pattern without anything following it, this returns + one past the final character of the pattern (in other words, RExC_end) and + sets the REG_RUN_ON_COMMENT_SEEN flag. + + Note it's the callers responsibility to ensure that we are + actually in /x mode + +*/ + +PERL_STATIC_INLINE char* +S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) +{ + PERL_ARGS_ASSERT_REG_SKIPCOMMENT; + + assert(*p == '#'); + + while (p < RExC_end) { + if (*(++p) == '\n') { + return p+1; + } + } + + /* we ran off the end of the pattern without ending the comment, so we have + * to add an \n when wrapping */ + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; + return p; +} + +/* nextchar() + + Advances the parse position, and optionally absorbs + "whitespace" from the inputstream. + + Without /x "whitespace" means (?#...) style comments only, + with /x this means (?#...) and # comments and whitespace proper. + + Returns the RExC_parse point from BEFORE the scan occurs. + + This is the /x friendly way of saying RExC_parse++. +*/ + +STATIC char* +S_nextchar(pTHX_ RExC_state_t *pRExC_state) +{ + char* const retval = RExC_parse++; + + PERL_ARGS_ASSERT_NEXTCHAR; + + for (;;) { + if (RExC_end - RExC_parse >= 3 + && *RExC_parse == '(' + && RExC_parse[1] == '?' + && RExC_parse[2] == '#') + { + while (*RExC_parse != ')') { + if (RExC_parse == RExC_end) + FAIL("Sequence (?#... not terminated"); + RExC_parse++; + } + RExC_parse++; + continue; + } + if (RExC_flags & RXf_PMf_EXTENDED) { + char * p = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + if (p != RExC_parse) { + RExC_parse = p; + continue; + } + } + return retval; + } +} + +/* +- reg_node - emit a node +*/ +STATIC regnode * /* Location. */ +S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) +{ + regnode *ptr; + regnode * const ret = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NODE; + + if (SIZE_ONLY) { + SIZE_ALIGN(RExC_size); + RExC_size += 1; + return(ret); + } + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, (void*)RExC_emit, (void*)RExC_emit_bound); + + NODE_ALIGN_FILL(ret); + ptr = ret; + FILL_ADVANCE_NODE(ptr, op); + REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG( + ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", + "reg_node", __LINE__, + PL_reg_name[op], + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(RExC_emit - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); + } +#endif + RExC_emit = ptr; + return(ret); +} + +/* +- reganode - emit a node with an argument +*/ +STATIC regnode * /* Location. */ +S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) +{ + regnode *ptr; + regnode * const ret = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGANODE; + + if (SIZE_ONLY) { + SIZE_ALIGN(RExC_size); + RExC_size += 2; + /* + We can't do this: + + assert(2==regarglen[op]+1); + + Anything larger than this has to allocate the extra amount. + If we changed this to be: + + RExC_size += (1 + regarglen[op]); + + then it wouldn't matter. Its not clear what side effect + might come from that so its not done so far. + -- dmq + */ + return(ret); + } + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, (void*)RExC_emit, (void*)RExC_emit_bound); + + NODE_ALIGN_FILL(ret); + ptr = ret; + FILL_ADVANCE_NODE_ARG(ptr, op, arg); + REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + "reganode", + __LINE__, + PL_reg_name[op], + (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? + "Overwriting end of array!\n" : "OK", + (UV)(RExC_emit - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Cur_Node_Offset; + } +#endif + RExC_emit = ptr; + return(ret); +} + +/* +- reguni - emit (if appropriate) a Unicode character +*/ +PERL_STATIC_INLINE STRLEN +S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) +{ + PERL_ARGS_ASSERT_REGUNI; + + return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); +} + +/* +- reginsert - insert an operator in front of already-emitted operand +* +* Means relocating the operand. +*/ +STATIC void +S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) +{ + regnode *src; + regnode *dst; + regnode *place; + const int offset = regarglen[(U8)op]; + const int size = NODE_STEP_REGNODE + offset; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGINSERT; + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(depth); +/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ + DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); + if (SIZE_ONLY) { + RExC_size += size; + return; + } + + src = RExC_emit; + RExC_emit += size; + dst = RExC_emit; + if (RExC_open_parens) { + int paren; + /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ + for ( paren=0 ; paren < RExC_npar ; paren++ ) { + if ( RExC_open_parens[paren] >= opnd ) { + /*DEBUG_PARSE_FMT("open"," - %d",size);*/ + RExC_open_parens[paren] += size; + } else { + /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ + } + if ( RExC_close_parens[paren] >= opnd ) { + /*DEBUG_PARSE_FMT("close"," - %d",size);*/ + RExC_close_parens[paren] += size; + } else { + /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ + } + } + } + + while (src > opnd) { + StructCopy(--src, --dst, regnode); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD 20010112 */ + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", + "reg_insert", + __LINE__, + PL_reg_name[op], + (UV)(dst - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(src - RExC_emit_start), + (UV)(dst - RExC_emit_start), + (UV)RExC_offsets[0])); + Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); + Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); + } +#endif + } + + + place = opnd; /* Op node, where operand used to be. */ +#ifdef RE_TRACK_PATTERN_OFFSETS + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG( + ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + "reginsert", + __LINE__, + PL_reg_name[op], + (UV)(place - RExC_emit_start) > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + (UV)(place - RExC_emit_start), + (UV)(RExC_parse - RExC_start), + (UV)RExC_offsets[0])); + Set_Node_Offset(place, RExC_parse); + Set_Node_Length(place, 1); + } +#endif + src = NEXTOPER(place); + FILL_ADVANCE_NODE(place, op); + REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1); + Zero(src, offset, regnode); +} + +/* +- regtail - set the next-pointer at the end of a node chain of p to val. +- SEE ALSO: regtail_study +*/ +/* TODO: All three parms should be const */ +STATIC void +S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) +{ + regnode *scan; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + if (SIZE_ONLY) + return; + + /* Find last node. */ + scan = p; + for (;;) { + regnode * const temp = regnext(scan); + DEBUG_PARSE_r({ + SV * const mysv=sv_newmortal(); + DEBUG_PARSE_MSG((scan==p ? "tail" : "")); + regprop(RExC_rx, mysv, scan, NULL); + PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", + SvPV_nolen_const(mysv), REG_NODE_NUM(scan), + (temp == NULL ? "->" : ""), + (temp == NULL ? PL_reg_name[OP(val)] : "") + ); + }); + if (temp == NULL) + break; + scan = temp; + } + + if (reg_off_by_arg[OP(scan)]) { + ARG_SET(scan, val - scan); + } + else { + NEXT_OFF(scan) = val - scan; + } +} + +#ifdef DEBUGGING +/* +- regtail_study - set the next-pointer at the end of a node chain of p to val. +- Look for optimizable sequences at the same time. +- currently only looks for EXACT chains. + +This is experimental code. The idea is to use this routine to perform +in place optimizations on branches and groups as they are constructed, +with the long term intention of removing optimization from study_chunk so +that it is purely analytical. + +Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used +to control which is which. + +*/ +/* TODO: All four parms should be const */ + +STATIC U8 +S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, + const regnode *val,U32 depth) +{ + regnode *scan; + U8 exact = PSEUDO; +#ifdef EXPERIMENTAL_INPLACESCAN + I32 min = 0; +#endif + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL_STUDY; + + + if (SIZE_ONLY) + return exact; + + /* Find last node. */ + + scan = p; + for (;;) { + regnode * const temp = regnext(scan); +#ifdef EXPERIMENTAL_INPLACESCAN + if (PL_regkind[OP(scan)] == EXACT) { + bool unfolded_multi_char; /* Unexamined in this routine */ + if (join_exact(pRExC_state, scan, &min, + &unfolded_multi_char, 1, val, depth+1)) + return EXACT; + } +#endif + if ( exact ) { + switch (OP(scan)) { + case EXACT: + case EXACTF: + case EXACTFA_NO_TRIE: + case EXACTFA: + case EXACTFU: + case EXACTFU_SS: + case EXACTFL: + if( exact == PSEUDO ) + exact= OP(scan); + else if ( exact != OP(scan) ) + exact= 0; + case NOTHING: + break; + default: + exact= 0; + } + } + DEBUG_PARSE_r({ + SV * const mysv=sv_newmortal(); + DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); + regprop(RExC_rx, mysv, scan, NULL); + PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", + SvPV_nolen_const(mysv), + REG_NODE_NUM(scan), + PL_reg_name[exact]); + }); + if (temp == NULL) + break; + scan = temp; + } + DEBUG_PARSE_r({ + SV * const mysv_val=sv_newmortal(); + DEBUG_PARSE_MSG(""); + regprop(RExC_rx, mysv_val, val, NULL); + PerlIO_printf(Perl_debug_log, + "~ attach to %s (%"IVdf") offset to %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(val), + (IV)(val - scan) + ); + }); + if (reg_off_by_arg[OP(scan)]) { + ARG_SET(scan, val - scan); + } + else { + NEXT_OFF(scan) = val - scan; + } + + return exact; +} +#endif + +/* + - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form + */ +#ifdef DEBUGGING + +static void +S_regdump_intflags(pTHX_ const char *lead, const U32 flags) +{ + int bit; + int set=0; + + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bitprogram, ri->program + 1, NULL, NULL, sv, 0, 0); + + /* Header fields of interest. */ + if (r->anchored_substr) { + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), + RE_SV_DUMPLEN(r->anchored_substr), 30); + PerlIO_printf(Perl_debug_log, + "anchored %s%s at %"IVdf" ", + s, RE_SV_TAIL(r->anchored_substr), + (IV)r->anchored_offset); + } else if (r->anchored_utf8) { + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), + RE_SV_DUMPLEN(r->anchored_utf8), 30); + PerlIO_printf(Perl_debug_log, + "anchored utf8 %s%s at %"IVdf" ", + s, RE_SV_TAIL(r->anchored_utf8), + (IV)r->anchored_offset); + } + if (r->float_substr) { + RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), + RE_SV_DUMPLEN(r->float_substr), 30); + PerlIO_printf(Perl_debug_log, + "floating %s%s at %"IVdf"..%"UVuf" ", + s, RE_SV_TAIL(r->float_substr), + (IV)r->float_min_offset, (UV)r->float_max_offset); + } else if (r->float_utf8) { + RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), + RE_SV_DUMPLEN(r->float_utf8), 30); + PerlIO_printf(Perl_debug_log, + "floating utf8 %s%s at %"IVdf"..%"UVuf" ", + s, RE_SV_TAIL(r->float_utf8), + (IV)r->float_min_offset, (UV)r->float_max_offset); + } + if (r->check_substr || r->check_utf8) + PerlIO_printf(Perl_debug_log, + (const char *) + (r->check_substr == r->float_substr + && r->check_utf8 == r->float_utf8 + ? "(checking floating" : "(checking anchored")); + if (r->intflags & PREGf_NOSCAN) + PerlIO_printf(Perl_debug_log, " noscan"); + if (r->extflags & RXf_CHECK_ALL) + PerlIO_printf(Perl_debug_log, " isall"); + if (r->check_substr || r->check_utf8) + PerlIO_printf(Perl_debug_log, ") "); + + if (ri->regstclass) { + regprop(r, sv, ri->regstclass, NULL); + PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); + } + if (r->intflags & PREGf_ANCH) { + PerlIO_printf(Perl_debug_log, "anchored"); + if (r->intflags & PREGf_ANCH_BOL) + PerlIO_printf(Perl_debug_log, "(BOL)"); + if (r->intflags & PREGf_ANCH_MBOL) + PerlIO_printf(Perl_debug_log, "(MBOL)"); + if (r->intflags & PREGf_ANCH_SBOL) + PerlIO_printf(Perl_debug_log, "(SBOL)"); + if (r->intflags & PREGf_ANCH_GPOS) + PerlIO_printf(Perl_debug_log, "(GPOS)"); + PerlIO_putc(Perl_debug_log, ' '); + } + if (r->intflags & PREGf_GPOS_SEEN) + PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); + if (r->intflags & PREGf_SKIP) + PerlIO_printf(Perl_debug_log, "plus "); + if (r->intflags & PREGf_IMPLICIT) + PerlIO_printf(Perl_debug_log, "implicit "); + PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen); + if (r->extflags & RXf_EVAL_SEEN) + PerlIO_printf(Perl_debug_log, "with eval "); + PerlIO_printf(Perl_debug_log, "\n"); + DEBUG_FLAGS_r({ + regdump_extflags("r->extflags: ",r->extflags); + regdump_intflags("r->intflags: ",r->intflags); + }); +#else + PERL_ARGS_ASSERT_REGDUMP; + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(r); +#endif /* DEBUGGING */ +} + +/* +- regprop - printable representation of opcode, with run time support +*/ + +void +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) +{ +#ifdef DEBUGGING + int k; + + /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ + static const char * const anyofs[] = { +#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \ + || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \ + || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \ + || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \ + || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \ + || _CC_VERTSPACE != 16 + #error Need to adjust order of anyofs[] +#endif + "\\w", + "\\W", + "\\d", + "\\D", + "[:alpha:]", + "[:^alpha:]", + "[:lower:]", + "[:^lower:]", + "[:upper:]", + "[:^upper:]", + "[:punct:]", + "[:^punct:]", + "[:print:]", + "[:^print:]", + "[:alnum:]", + "[:^alnum:]", + "[:graph:]", + "[:^graph:]", + "[:cased:]", + "[:^cased:]", + "\\s", + "\\S", + "[:blank:]", + "[:^blank:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:ascii:]", + "[:^ascii:]", + "\\v", + "\\V" + }; + RXi_GET_DECL(prog,progi); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPROP; + + sv_setpvs(sv, ""); + + if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ + /* It would be nice to FAIL() here, but this may be called from + regexec.c, and it would be hard to supply pRExC_state. */ + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(o), (int)REGNODE_MAX); + sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ + + k = PL_regkind[OP(o)]; + + if (k == EXACT) { + sv_catpvs(sv, " "); + /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) + * is a crude hack but it may be the best for now since + * we have no flag "this EXACTish node was UTF-8" + * --jhi */ + pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], + PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_ESCAPE_NONASCII | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT | + PERL_PV_PRETTY_NOCLEAR + ); + } else if (k == TRIE) { + /* print the details of the trie in dumpuntil instead, as + * progi->data isn't available here */ + const char op = OP(o); + const U32 n = ARG(o); + const reg_ac_data * const ac = IS_TRIE_AC(op) ? + (reg_ac_data *)progi->data->data[n] : + NULL; + const reg_trie_data * const trie + = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; + + Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); + DEBUG_TRIE_COMPILE_r( + Perl_sv_catpvf(aTHX_ sv, + "", + (UV)trie->startstate, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ); + ); + if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { + sv_catpvs(sv, "["); + (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) + ? ANYOF_BITMAP(o) + : TRIE_BITMAP(trie)); + sv_catpvs(sv, "]"); + } + + } else if (k == CURLY) { + if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ + Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); + } + else if (k == WHILEM && o->flags) /* Ordinal/of */ + Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); + else if (k == REF || k == OPEN || k == CLOSE + || k == GROUPP || OP(o)==ACCEPT) + { + Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ + if ( RXp_PAREN_NAMES(prog) ) { + if ( k != REF || (OP(o) < NREF)) { + AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); + SV **name= av_fetch(list, ARG(o), 0 ); + if (name) + Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); + } + else { + AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); + SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); + I32 *nums=(I32*)SvPVX(sv_dat); + SV **name= av_fetch(list, nums[0], 0 ); + I32 n; + if (name) { + for ( n=0; noffs[n].start; + if (prog->lastparen < n || ln == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } + } else if (k == GOSUB) + /* Paren and offset */ + Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); + else if (k == VERB) { + if (!o->flags) + Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); + } else if (k == LOGICAL) + /* 2: embedded, otherwise 1 */ + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); + else if (k == ANYOF) { + const U8 flags = ANYOF_FLAGS(o); + int do_sep = 0; + + + if (flags & ANYOF_LOCALE_FLAGS) + sv_catpvs(sv, "{loc}"); + if (flags & ANYOF_LOC_FOLD) + sv_catpvs(sv, "{i}"); + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + if (flags & ANYOF_INVERT) + sv_catpvs(sv, "^"); + + /* output what the standard cp 0-255 bitmap matches */ + do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); + + /* output any special charclass tests (used entirely under use + * locale) * */ + if (ANYOF_POSIXL_TEST_ANY_SET(o)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(o,i)) { + sv_catpv(sv, anyofs[i]); + do_sep = 1; + } + } + } + + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) + { + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + if (flags & ANYOF_INVERT) + /*make sure the invert info is in each */ + sv_catpvs(sv, "^"); + } + + if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { + sv_catpvs(sv, "{non-utf8-latin1-all}"); + } + + /* output information about the unicode matching */ + if (flags & ANYOF_ABOVE_LATIN1_ALL) + sv_catpvs(sv, "{unicode_all}"); + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { + SV *lv; /* Set if there is something outside the bit map. */ + bool byte_output = FALSE; /* If something in the bitmap has + been output */ + SV *only_utf8_locale; + + /* Get the stuff that wasn't in the bitmap */ + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); + if (lv && lv != &PL_sv_undef) { + char *s = savesvpv(lv); + char * const origs = s; + + while (*s && *s != '\n') + s++; + + if (*s == '\n') { + const char * const t = ++s; + + if (flags & ANYOF_NONBITMAP_NON_UTF8) { + sv_catpvs(sv, "{outside bitmap}"); + } + else { + sv_catpvs(sv, "{utf8}"); + } + + if (byte_output) { + sv_catpvs(sv, " "); + } + + while (*s) { + if (*s == '\n') { + + /* Truncate very long output */ + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } + *s = ' '; + } + else if (*s == '\t') { + *s = '-'; + } + s++; + } + if (s[-1] == ' ') + s[-1] = 0; + + sv_catpv(sv, t); + } + + out_dump: + + Safefree(origs); + SvREFCNT_dec_NN(lv); + } + + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { + UV start, end; + int max_entries = 256; + + sv_catpvs(sv, "{utf8 locale}"); + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, + &start, &end)) { + put_range(sv, start, end); + max_entries --; + if (max_entries < 0) { + sv_catpvs(sv, "..."); + break; + } + } + invlist_iterfinish(only_utf8_locale); + } + } + } + + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + } + else if (k == POSIXD || k == NPOSIXD) { + U8 index = FLAGS(o) * 2; + if (index < C_ARRAY_LENGTH(anyofs)) { + if (*anyofs[index] != '[') { + sv_catpv(sv, "["); + } + sv_catpv(sv, anyofs[index]); + if (*anyofs[index] != '[') { + sv_catpv(sv, "]"); + } + } + else { + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + } + } + else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) + Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); +#else + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(sv); + PERL_UNUSED_ARG(o); + PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); +#endif /* DEBUGGING */ +} + + + +SV * +Perl_re_intuit_string(pTHX_ REGEXP * const r) +{ /* Assume that RE_INTUIT is set */ + struct regexp *const prog = ReANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_STRING; + PERL_UNUSED_CONTEXT; + + DEBUG_COMPILE_r( + { + const char * const s = SvPV_nolen_const(prog->check_substr + ? prog->check_substr : prog->check_utf8); + + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, + "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", + PL_colors[4], + prog->check_substr ? "" : "utf8 ", + PL_colors[5],PL_colors[0], + s, + PL_colors[1], + (strlen(s) > 60 ? "..." : "")); + } ); + + return prog->check_substr ? prog->check_substr : prog->check_utf8; +} + +/* + pregfree() + + handles refcounting and freeing the perl core regexp structure. When + it is necessary to actually free the structure the first thing it + does is call the 'free' method of the regexp_engine associated to + the regexp, allowing the handling of the void *pprivate; member + first. (This routine is not overridable by extensions, which is why + the extensions free is called first.) + + See regdupe and regdupe_internal if you change anything here. +*/ +#ifndef PERL_IN_XSUB_RE +void +Perl_pregfree(pTHX_ REGEXP *r) +{ + SvREFCNT_dec(r); +} + +void +Perl_pregfree2(pTHX_ REGEXP *rx) +{ + struct regexp *const r = ReANY(rx); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_PREGFREE2; + + if (r->mother_re) { + ReREFCNT_dec(r->mother_re); + } else { + CALLREGFREE_PVT(rx); /* free the private data */ + SvREFCNT_dec(RXp_PAREN_NAMES(r)); + Safefree(r->xpv_len_u.xpvlenu_pv); + } + if (r->substrs) { + SvREFCNT_dec(r->anchored_substr); + SvREFCNT_dec(r->anchored_utf8); + SvREFCNT_dec(r->float_substr); + SvREFCNT_dec(r->float_utf8); + Safefree(r->substrs); + } + RX_MATCH_COPY_FREE(rx); +#ifdef PERL_ANY_COW + SvREFCNT_dec(r->saved_copy); +#endif + Safefree(r->offs); + SvREFCNT_dec(r->qr_anoncv); + rx->sv_u.svu_rx = 0; +} + +/* reg_temp_copy() + + This is a hacky workaround to the structural issue of match results + being stored in the regexp structure which is in turn stored in + PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern + could be PL_curpm in multiple contexts, and could require multiple + result sets being associated with the pattern simultaneously, such + as when doing a recursive match with (??{$qr}) + + The solution is to make a lightweight copy of the regexp structure + when a qr// is returned from the code executed by (??{$qr}) this + lightweight copy doesn't actually own any of its data except for + the starp/end and the actual regexp structure itself. + +*/ + + +REGEXP * +Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) +{ + struct regexp *ret; + struct regexp *const r = ReANY(rx); + const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV; + + PERL_ARGS_ASSERT_REG_TEMP_COPY; + + if (!ret_x) + ret_x = (REGEXP*) newSV_type(SVt_REGEXP); + else { + SvOK_off((SV *)ret_x); + if (islv) { + /* For PVLVs, SvANY points to the xpvlv body while sv_u points + to the regexp. (For SVt_REGEXPs, sv_upgrade has already + made both spots point to the same regexp body.) */ + REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP); + assert(!SvPVX(ret_x)); + ret_x->sv_u.svu_rx = temp->sv_any; + temp->sv_any = NULL; + SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL; + SvREFCNT_dec_NN(temp); + /* SvCUR still resides in the xpvlv struct, so the regexp copy- + ing below will not set it. */ + SvCUR_set(ret_x, SvCUR(rx)); + } + } + /* This ensures that SvTHINKFIRST(sv) is true, and hence that + sv_force_normal(sv) is called. */ + SvFAKE_on(ret_x); + ret = ReANY(ret_x); + + SvFLAGS(ret_x) |= SvUTF8(rx); + /* We share the same string buffer as the original regexp, on which we + hold a reference count, incremented when mother_re is set below. + The string pointer is copied here, being part of the regexp struct. + */ + memcpy(&(ret->xpv_cur), &(r->xpv_cur), + sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); + if (r->offs) { + const I32 npar = r->nparens+1; + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + } + if (r->substrs) { + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + SvREFCNT_inc_void(ret->anchored_substr); + SvREFCNT_inc_void(ret->anchored_utf8); + SvREFCNT_inc_void(ret->float_substr); + SvREFCNT_inc_void(ret->float_utf8); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + } + RX_MATCH_COPIED_off(ret_x); +#ifdef PERL_ANY_COW + ret->saved_copy = NULL; +#endif + ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); + SvREFCNT_inc_void(ret->qr_anoncv); + + return ret_x; +} +#endif + +/* regfree_internal() + + Free the private data in a regexp. This is overloadable by + extensions. Perl takes care of the regexp structure in pregfree(), + this covers the *pprivate pointer which technically perl doesn't + know about, however of course we have to handle the + regexp_internal structure when no extension is in use. + + Note this is called before freeing anything in the regexp + structure. + */ + +void +Perl_regfree_internal(pTHX_ REGEXP * const rx) +{ + struct regexp *const r = ReANY(rx); + RXi_GET_DECL(r,ri); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGFREE_INTERNAL; + + DEBUG_COMPILE_r({ + if (!PL_colorset) + reginitcolors(); + { + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RX_UTF8(rx), + dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); + PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", + PL_colors[4],PL_colors[5],s); + } + }); +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) + Safefree(ri->u.offsets); /* 20010421 MJD */ +#endif + if (ri->code_blocks) { + int n; + for (n = 0; n < ri->num_code_blocks; n++) + SvREFCNT_dec(ri->code_blocks[n].src_regex); + Safefree(ri->code_blocks); + } + + if (ri->data) { + int n = ri->data->count; + + while (--n >= 0) { + /* If you add a ->what type here, update the comment in regcomp.h */ + switch (ri->data->what[n]) { + case 'a': + case 'r': + case 's': + case 'S': + case 'u': + SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); + break; + case 'f': + Safefree(ri->data->data[n]); + break; + case 'l': + case 'L': + break; + case 'T': + { /* Aho Corasick add-on structure for a trie node. + Used in stclass optimization only */ + U32 refcount; + reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; +#ifdef USE_ITHREADS + dVAR; +#endif + OP_REFCNT_LOCK; + refcount = --aho->refcount; + OP_REFCNT_UNLOCK; + if ( !refcount ) { + PerlMemShared_free(aho->states); + PerlMemShared_free(aho->fail); + /* do this last!!!! */ + PerlMemShared_free(ri->data->data[n]); + /* we should only ever get called once, so + * assert as much, and also guard the free + * which /might/ happen twice. At the least + * it will make code anlyzers happy and it + * doesn't cost much. - Yves */ + assert(ri->regstclass); + if (ri->regstclass) { + PerlMemShared_free(ri->regstclass); + ri->regstclass = 0; + } + } + } + break; + case 't': + { + /* trie structure. */ + U32 refcount; + reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; +#ifdef USE_ITHREADS + dVAR; +#endif + OP_REFCNT_LOCK; + refcount = --trie->refcount; + OP_REFCNT_UNLOCK; + if ( !refcount ) { + PerlMemShared_free(trie->charmap); + PerlMemShared_free(trie->states); + PerlMemShared_free(trie->trans); + if (trie->bitmap) + PerlMemShared_free(trie->bitmap); + if (trie->jump) + PerlMemShared_free(trie->jump); + PerlMemShared_free(trie->wordinfo); + /* do this last!!!! */ + PerlMemShared_free(ri->data->data[n]); + } + } + break; + default: + Perl_croak(aTHX_ "panic: regfree data code '%c'", + ri->data->what[n]); + } + } + Safefree(ri->data->what); + Safefree(ri->data); + } + + Safefree(ri); +} + +#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t)) +#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t)) +#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) + +/* + re_dup - duplicate a regexp. + + This routine is expected to clone a given regexp structure. It is only + compiled under USE_ITHREADS. + + After all of the core data stored in struct regexp is duplicated + the regexp_engine.dupe method is used to copy any private data + stored in the *pprivate pointer. This allows extensions to handle + any duplication it needs to do. + + See pregfree() and regfree_internal() if you change anything here. +*/ +#if defined(USE_ITHREADS) +#ifndef PERL_IN_XSUB_RE +void +Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) +{ + dVAR; + I32 npar; + const struct regexp *r = ReANY(sstr); + struct regexp *ret = ReANY(dstr); + + PERL_ARGS_ASSERT_RE_DUP_GUTS; + + npar = r->nparens+1; + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + + if (ret->substrs) { + /* Do it this way to avoid reading from *r after the StructCopy(). + That way, if any of the sv_dup_inc()s dislodge *r from the L1 + cache, it doesn't matter. */ + const bool anchored = r->check_substr + ? r->check_substr == r->anchored_substr + : r->check_utf8 == r->anchored_utf8; + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); + ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); + ret->float_substr = sv_dup_inc(ret->float_substr, param); + ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + + if (ret->check_substr) { + if (anchored) { + assert(r->check_utf8 == r->anchored_utf8); + ret->check_substr = ret->anchored_substr; + ret->check_utf8 = ret->anchored_utf8; + } else { + assert(r->check_substr == r->float_substr); + assert(r->check_utf8 == r->float_utf8); + ret->check_substr = ret->float_substr; + ret->check_utf8 = ret->float_utf8; + } + } else if (ret->check_utf8) { + if (anchored) { + ret->check_utf8 = ret->anchored_utf8; + } else { + ret->check_utf8 = ret->float_utf8; + } + } + } + + RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); + ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); + + if (ret->pprivate) + RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); + + if (RX_MATCH_COPIED(dstr)) + ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); + else + ret->subbeg = NULL; +#ifdef PERL_ANY_COW + ret->saved_copy = NULL; +#endif + + /* Whether mother_re be set or no, we need to copy the string. We + cannot refrain from copying it when the storage points directly to + our mother regexp, because that's + 1: a buffer in a different thread + 2: something we no longer hold a reference on + so we need to copy it locally. */ + RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); + ret->mother_re = NULL; +} +#endif /* PERL_IN_XSUB_RE */ + +/* + regdupe_internal() + + This is the internal complement to regdupe() which is used to copy + the structure pointed to by the *pprivate pointer in the regexp. + This is the core version of the extension overridable cloning hook. + The regexp structure being duplicated will be copied by perl prior + to this and will be provided as the regexp *r argument, however + with the /old/ structures pprivate pointer value. Thus this routine + may override any copying normally done by perl. + + It returns a pointer to the new regexp_internal structure. +*/ + +void * +Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) +{ + dVAR; + struct regexp *const r = ReANY(rx); + regexp_internal *reti; + int len; + RXi_GET_DECL(r,ri); + + PERL_ARGS_ASSERT_REGDUPE_INTERNAL; + + len = ProgLen(ri); + + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), + char, regexp_internal); + Copy(ri->program, reti->program, len+1, regnode); + + reti->num_code_blocks = ri->num_code_blocks; + if (ri->code_blocks) { + int n; + Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block, + struct reg_code_block); + Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks, + struct reg_code_block); + for (n = 0; n < ri->num_code_blocks; n++) + reti->code_blocks[n].src_regex = (REGEXP*) + sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param); + } + else + reti->code_blocks = NULL; + + reti->regstclass = NULL; + + if (ri->data) { + struct reg_data *d; + const int count = ri->data->count; + int i; + + Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), + char, struct reg_data); + Newx(d->what, count, U8); + + d->count = count; + for (i = 0; i < count; i++) { + d->what[i] = ri->data->what[i]; + switch (d->what[i]) { + /* see also regcomp.h and regfree_internal() */ + case 'a': /* actually an AV, but the dup function is identical. */ + case 'r': + case 's': + case 'S': + case 'u': /* actually an HV, but the dup function is identical. */ + d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); + break; + case 'f': + /* This is cheating. */ + Newx(d->data[i], 1, regnode_ssc); + StructCopy(ri->data->data[i], d->data[i], regnode_ssc); + reti->regstclass = (regnode*)d->data[i]; + break; + case 'T': + /* Trie stclasses are readonly and can thus be shared + * without duplication. We free the stclass in pregfree + * when the corresponding reg_ac_data struct is freed. + */ + reti->regstclass= ri->regstclass; + /* FALLTHROUGH */ + case 't': + OP_REFCNT_LOCK; + ((reg_trie_data*)ri->data->data[i])->refcount++; + OP_REFCNT_UNLOCK; + /* FALLTHROUGH */ + case 'l': + case 'L': + d->data[i] = ri->data->data[i]; + break; + default: + Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", + ri->data->what[i]); + } + } + + reti->data = d; + } + else + reti->data = NULL; + + reti->name_list_idx = ri->name_list_idx; + +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) { + Newx(reti->u.offsets, 2*len+1, U32); + Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); + } +#else + SetProgLen(reti,len); +#endif + + return (void*)reti; +} + +#endif /* USE_ITHREADS */ + +#ifndef PERL_IN_XSUB_RE + +/* + - regnext - dig the "next" pointer out of a node + */ +regnode * +Perl_regnext(pTHX_ regnode *p) +{ + I32 offset; + + if (!p) + return(NULL); + + if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */ + Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", + (int)OP(p), (int)REGNODE_MAX); + } + + offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); + if (offset == 0) + return(NULL); + + return(p+offset); +} +#endif + +STATIC void +S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) +{ + va_list args; + STRLEN l1 = strlen(pat1); + STRLEN l2 = strlen(pat2); + char buf[512]; + SV *msv; + const char *message; + + PERL_ARGS_ASSERT_RE_CROAK2; + + if (l1 > 510) + l1 = 510; + if (l1 + l2 > 510) + l2 = 510 - l1; + Copy(pat1, buf, l1 , char); + Copy(pat2, buf + l1, l2 , char); + buf[l1 + l2] = '\n'; + buf[l1 + l2 + 1] = '\0'; + va_start(args, pat2); + msv = vmess(buf, &args); + va_end(args); + message = SvPV_const(msv,l1); + if (l1 > 512) + l1 = 512; + Copy(message, buf, l1 , char); + /* l1-1 to avoid \n */ + Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); +} + +/* XXX Here's a total kludge. But we need to re-enter for swash routines. */ + +#ifndef PERL_IN_XSUB_RE +void +Perl_save_re_context(pTHX) +{ + /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ + if (PL_curpm) { + const REGEXP * const rx = PM_GETRE(PL_curpm); + if (rx) { + U32 i; + for (i = 1; i <= RX_NPARENS(rx); i++) { + char digits[TYPE_CHARS(long)]; + const STRLEN len = my_snprintf(digits, sizeof(digits), + "%lu", (long)i); + GV *const *const gvp + = (GV**)hv_fetch(PL_defstash, digits, len, 0); + + if (gvp) { + GV * const gv = *gvp; + if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) + save_scalar(gv); + } + } + } + } +} +#endif + +#ifdef DEBUGGING + +STATIC void +S_put_byte(pTHX_ SV *sv, int c) +{ + PERL_ARGS_ASSERT_PUT_BYTE; + + if (!isPRINT(c)) { + switch (c) { + case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; + case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; + case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; + case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; + case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; + + default: + Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); + break; + } + } + else { + const char string = c; + if (c == '-' || c == ']' || c == '\\' || c == '^') + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } +} + +STATIC void +S_put_range(pTHX_ SV *sv, UV start, UV end) +{ + + /* Appends to 'sv' a displayable version of the range of code points from + * 'start' to 'end'. It assumes that only ASCII printables are displayable + * as-is (though some of these will be escaped by put_byte()). For the + * time being, this subroutine only works for latin1 (< 256) code points */ + + assert(start <= end); + + PERL_ARGS_ASSERT_PUT_RANGE; + + while (start <= end) { + if (end - start < 3) { /* Individual chars in short ranges */ + for (; start <= end; start++) { + put_byte(sv, start); + } + break; + } + + /* For small ranges that include printable ASCII characters, it's more + * legible to print those characters rather than hex values. For + * larger ranges that include more than printables, it's probably + * clearer to just give the start and end points of the range in hex, + * and that's all we can do if there aren't any printables within the + * range + * + * On ASCII platforms the range of printables is contiguous. If the + * entire range is printable, we print each character as such. If the + * range is partially printable and partially not, it's less likely + * that the individual printables are meaningful, especially if all or + * almost all of them are in the range. But we err on the side of the + * individual printables being meaningful by using the hex only if the + * range contains all but 2 of the printables. + * + * On EBCDIC platforms, the printables are scattered around so that the + * maximum range length containing only them is about 10. Anything + * longer we treat as hex; otherwise we examine the range character by + * character to see */ +#ifdef EBCDIC + if (start < 256 && (((end < 255) ? end : 255) - start <= 10)) +#else + if ((isPRINT_A(start) && isPRINT_A(end)) + || (end >= 0x7F && (isPRINT_A(start) && start > 0x21)) + || ((end < 0x7D && isPRINT_A(end)) && start < 0x20)) +#endif + { + /* If the range beginning isn't an ASCII printable, we find the + * last such in the range, then split the output, so all the + * non-printables are in one subrange; then process the remaining + * portion as usual. If the entire range isn't printables, we + * don't split, but drop down to print as hex */ + if (! isPRINT_A(start)) { + UV temp_end = start + 1; + while (temp_end <= end && ! isPRINT_A(temp_end)) { + temp_end++; + } + if (temp_end <= end) { + put_range(sv, start, temp_end - 1); + start = temp_end; + continue; + } + } + + /* If the range beginning is a digit, output a subrange of just the + * digits, then process the remaining portion as usual */ + if (isDIGIT_A(start)) { + put_byte(sv, start); + sv_catpvs(sv, "-"); + while (start <= end && isDIGIT_A(start)) start++; + put_byte(sv, start - 1); + continue; + } + + /* Similarly for alphabetics. Because in both ASCII and EBCDIC, + * the code points for upper and lower A-Z and a-z aren't + * intermixed, the resulting subrange will consist solely of either + * upper- or lower- alphabetics */ + if (isALPHA_A(start)) { + put_byte(sv, start); + sv_catpvs(sv, "-"); + while (start <= end && isALPHA_A(start)) start++; + put_byte(sv, start - 1); + continue; + } + + /* We output any remaining printables as individual characters */ + if (isPUNCT_A(start) || isSPACE_A(start)) { + while (start <= end && (isPUNCT_A(start) || isSPACE_A(start))) { + put_byte(sv, start); + start++; + } + continue; + } + } + + /* Here is a control or non-ascii. Output the range or subrange as + * hex. */ + Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", + start, + (end < 256) ? end : 255); + break; + } +} + +STATIC bool +S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually + * output anything */ + + int i; + bool has_output_anything = FALSE; + + PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; + + for (i = 0; i < 256; i++) { + if (BITMAP_TEST((U8 *) bitmap,i)) { + + /* The character at index i should be output. Find the next + * character that should NOT be output */ + int j; + for (j = i + 1; j < 256; j++) { + if (! BITMAP_TEST((U8 *) bitmap, j)) { + break; + } + } + + /* Everything between them is a single range that should be output + * */ + put_range(sv, i, j - 1); + has_output_anything = TRUE; + i = j; + } + } + + return has_output_anything; +} + +#define CLEAR_OPTSTART \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ + " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ + } STMT_END + +#define DUMPUNTIL(b,e) \ + CLEAR_OPTSTART; \ + node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); + +STATIC const regnode * +S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, + const regnode *last, const regnode *plast, + SV* sv, I32 indent, U32 depth) +{ + U8 op = PSEUDO; /* Arbitrary non-END op. */ + const regnode *next; + const regnode *optstart= NULL; + + RXi_GET_DECL(r,ri); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMPUNTIL; + +#ifdef DEBUG_DUMPUNTIL + PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, + last ? last-start : 0,plast ? plast-start : 0); +#endif + + if (plast && plast < last) + last= plast; + + while (PL_regkind[op] != END && (!last || node < last)) { + assert(node); + /* While that wasn't END last time... */ + NODE_ALIGN(node); + op = OP(node); + if (op == CLOSE || op == WHILEM) + indent--; + next = regnext((regnode *)node); + + /* Where, what. */ + if (OP(node) == OPTIMIZED) { + if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) + optstart = node; + else + goto after_print; + } else + CLEAR_OPTSTART; + + regprop(r, sv, node, NULL); + PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), + (int)(2*indent + 1), "", SvPVX_const(sv)); + + if (OP(node) != OPTIMIZED) { + if (next == NULL) /* Next ptr. */ + PerlIO_printf(Perl_debug_log, " (0)"); + else if (PL_regkind[(U8)op] == BRANCH + && PL_regkind[OP(next)] != BRANCH ) + PerlIO_printf(Perl_debug_log, " (FAIL)"); + else + PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); + (void)PerlIO_putc(Perl_debug_log, '\n'); + } + + after_print: + if (PL_regkind[(U8)op] == BRANCHJ) { + assert(next); + { + const regnode *nnode = (OP(next) == LONGJMP + ? regnext((regnode *)next) + : next); + if (last && nnode > last) + nnode = last; + DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode); + } + } + else if (PL_regkind[(U8)op] == BRANCH) { + assert(next); + DUMPUNTIL(NEXTOPER(node), next); + } + else if ( PL_regkind[(U8)op] == TRIE ) { + const regnode *this_trie = node; + const char op = OP(node); + const U32 n = ARG(node); + const reg_ac_data * const ac = op>=AHOCORASICK ? + (reg_ac_data *)ri->data->data[n] : + NULL; + const reg_trie_data * const trie = + (reg_trie_data*)ri->data->data[optrie]; +#ifdef DEBUGGING + AV *const trie_words + = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); +#endif + const regnode *nextbranch= NULL; + I32 word_idx; + sv_setpvs(sv, ""); + for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { + SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); + + PerlIO_printf(Perl_debug_log, "%*s%s ", + (int)(2*(indent+3)), "", + elem_ptr + ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), + SvCUR(*elem_ptr), 60, + PL_colors[0], PL_colors[1], + (SvUTF8(*elem_ptr) + ? PERL_PV_ESCAPE_UNI + : 0) + | PERL_PV_PRETTY_ELLIPSES + | PERL_PV_PRETTY_LTGT + ) + : "???" + ); + if (trie->jump) { + U16 dist= trie->jump[word_idx+1]; + PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", + (UV)((dist ? this_trie + dist : next) - start)); + if (dist) { + if (!nextbranch) + nextbranch= this_trie + trie->jump[0]; + DUMPUNTIL(this_trie + dist, nextbranch); + } + if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) + nextbranch= regnext((regnode *)nextbranch); + } else { + PerlIO_printf(Perl_debug_log, "\n"); + } + } + if (last && next > last) + node= last; + else + node= next; + } + else if ( op == CURLY ) { /* "next" might be very big: optimizer */ + DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, + NEXTOPER(node) + EXTRA_STEP_2ARGS + 1); + } + else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) { + assert(next); + DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next); + } + else if ( op == PLUS || op == STAR) { + DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1); + } + else if (PL_regkind[(U8)op] == ANYOF) { + /* arglen 1 + class block */ + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); + node = NEXTOPER(node); + } + else if (PL_regkind[(U8)op] == EXACT) { + /* Literal string, where present. */ + node += NODE_SZ_STR(node) - 1; + node = NEXTOPER(node); + } + else { + node = NEXTOPER(node); + node += regarglen[(U8)op]; + } + if (op == CURLYX || op == OPEN) + indent++; + } + CLEAR_OPTSTART; +#ifdef DEBUG_DUMPUNTIL + PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); +#endif + return node; +} + +#endif /* DEBUGGING */ + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * ex: set ts=8 sts=4 sw=4 et: + */ diff --git a/src/5021003/regexec.c b/src/5021003/regexec.c new file mode 100644 index 0000000..15fb9e5 --- /dev/null +++ b/src/5021003/regexec.c @@ -0,0 +1,8213 @@ +/* regexec.c + */ + +/* + * One Ring to rule them all, One Ring to find them + & + * [p.v of _The Lord of the Rings_, opening poem] + * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] + * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"] + */ + +/* This file contains functions for executing a regular expression. See + * also regcomp.c which funnily enough, contains functions for compiling + * a regular expression. + * + * This file is also copied at build time to ext/re/re_exec.c, where + * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. + * This causes the main functions to be compiled under new names and with + * debugging support added, which makes "use re 'debug'" work. + */ + +/* NOTE: this is derived from Henry Spencer's regexp code, and should not + * confused with the original package (see point 3 below). Thanks, Henry! + */ + +/* Additional note: this code is very heavily munged from Henry's version + * in places. In some spots I've traded clarity for efficiency, so don't + * blame Henry for some of the lack of readability. + */ + +/* The names of the functions have been changed from regcomp and + * regexec to pregcomp and pregexec in order to avoid conflicts + * with the POSIX routines of the same names. +*/ + +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +/* + * pregcomp and pregexec -- regsub and regerror are not used in perl + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. + * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * + **** Alterations to Henry's code are... + **** + **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + **** by Larry Wall and others + **** + **** You may distribute under the terms of either the GNU General Public + **** License or the Artistic License, as specified in the README file. + * + * Beware that some of this code is subtly aware of the way operator + * precedence is structured in regular expressions. Serious changes in + * regular-expression syntax might require a total rethink. + */ +#include "EXTERN.h" +#define PERL_IN_REGEXEC_C +#include "perl.h" +#include "re_defs.h" + +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +#else +# include "regcomp.h" +#endif + +#include "inline_invlist.c" +#include "unicode_constants.h" + +#ifdef DEBUGGING +/* At least one required character in the target string is expressible only in + * UTF-8. */ +static const char* const non_utf8_target_but_utf8_required + = "Can't match, because target string needs to be in UTF-8\n"; +#endif + +#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ + goto target; \ +} STMT_END + +#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) + +#ifndef STATIC +#define STATIC static +#endif + +/* Valid only for non-utf8 strings: avoids the reginclass + * call if there are no complications: i.e., if everything matchable is + * straight forward in the bitmap */ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \ + : ANYOF_BITMAP_TEST(p,*(c))) + +/* + * Forwards. + */ + +#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) +#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b) + +#define HOPc(pos,off) \ + (char *)(reginfo->is_utf8_target \ + ? reghop3((U8*)pos, off, \ + (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ + : (U8*)(pos + off)) + +#define HOPBACKc(pos, off) \ + (char*)(reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \ + : (pos - off >= reginfo->strbeg) \ + ? (U8*)pos - off \ + : NULL) + +#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) +#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) + +/* lim must be +ve. Returns NULL on overshoot */ +#define HOPMAYBE3(pos,off,lim) \ + (reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \ + : ((U8*)pos + off <= lim) \ + ? (U8*)pos + off \ + : NULL) + +/* like HOP3, but limits the result to <= lim even for the non-utf8 case. + * off must be >=0; args should be vars rather than expressions */ +#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ + ? reghop3((U8*)(pos), off, (U8*)(lim)) \ + : (U8*)((pos + off) > lim ? lim : (pos + off))) + +#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ + ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ + : (U8*)(pos + off)) +#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) + +#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ +#define NEXTCHR_IS_EOS (nextchr < 0) + +#define SET_nextchr \ + nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) + +#define SET_locinput(p) \ + locinput = (p); \ + SET_nextchr + + +#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \ + if (!swash_ptr) { \ + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \ + swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \ + 1, 0, invlist, &flags); \ + assert(swash_ptr); \ + } \ + } STMT_END + +/* If in debug mode, we test that a known character properly matches */ +#ifdef DEBUGGING +# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ + property_name, \ + invlist, \ + utf8_char_in_property) \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \ + assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE)); +#else +# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \ + property_name, \ + invlist, \ + utf8_char_in_property) \ + LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) +#endif + +#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ + PL_utf8_swash_ptrs[_CC_WORDCHAR], \ + "", \ + PL_XPosix_ptrs[_CC_WORDCHAR], \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); + +#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ + STMT_START { \ + LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ + "_X_regular_begin", \ + NULL, \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ + LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ + "_X_extend", \ + NULL, \ + COMBINING_GRAVE_ACCENT_UTF8); \ + } STMT_END + +#define PLACEHOLDER /* Something for the preprocessor to grab onto */ +/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ + +/* for use after a quantifier and before an EXACT-like node -- japhy */ +/* it would be nice to rework regcomp.sym to generate this stuff. sigh + * + * NOTE that *nothing* that affects backtracking should be in here, specifically + * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a + * node that is in between two EXACT like nodes when ascertaining what the required + * "follow" character is. This should probably be moved to regex compile time + * although it may be done at run time beause of the REF possibility - more + * investigation required. -- demerphq +*/ +#define JUMPABLE(rn) ( \ + OP(rn) == OPEN || \ + (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ + OP(rn) == EVAL || \ + OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ + OP(rn) == PLUS || OP(rn) == MINMOD || \ + OP(rn) == KEEPS || \ + (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ +) +#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) + +#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF ) + +#if 0 +/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so + we don't need this definition. */ +#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) + +#else +/* ... so we use this as its faster. */ +#define IS_TEXT(rn) ( OP(rn)==EXACT ) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) +#define IS_TEXTF(rn) ( OP(rn)==EXACTF ) +#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) + +#endif + +/* + Search for mandatory following text node; for lookahead, the text must + follow but for lookbehind (rn->flags != 0) we skip to the next step. +*/ +#define FIND_NEXT_IMPT(rn) STMT_START { \ + while (JUMPABLE(rn)) { \ + const OPCODE type = OP(rn); \ + if (type == SUSPEND || PL_regkind[type] == CURLY) \ + rn = NEXTOPER(NEXTOPER(rn)); \ + else if (type == PLUS) \ + rn = NEXTOPER(rn); \ + else if (type == IFMATCH) \ + rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ + else rn += NEXT_OFF(rn); \ + } \ +} STMT_END + +/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode. + * These are for the pre-composed Hangul syllables, which are all in a + * contiguous block and arranged there in such a way so as to facilitate + * alorithmic determination of their characteristics. As such, they don't need + * a swash, but can be determined by simple arithmetic. Almost all are + * GCB=LVT, but every 28th one is a GCB=LV */ +#define SBASE 0xAC00 /* Start of block */ +#define SCount 11172 /* Length of block */ +#define TCount 28 + +#define SLAB_FIRST(s) (&(s)->states[0]) +#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) + +static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo); +static void S_cleanup_regmatch_info_aux(pTHX_ void *arg); +static regmatch_state * S_push_slab(pTHX); + +#define REGCP_PAREN_ELEMS 3 +#define REGCP_OTHER_ELEMS 3 +#define REGCP_FRAME_ELEMS 1 +/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and + * are needed for the regexp context stack bookkeeping. */ + +STATIC CHECKPOINT +S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) +{ + const int retval = PL_savestack_ix; + const int paren_elems_to_push = + (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS; + const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; + const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; + I32 p; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCPPUSH; + + if (paren_elems_to_push < 0) + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u", + (int)paren_elems_to_push, (int)maxopenparen, + (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS); + + if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) + Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf + " out of range (%lu-%ld)", + total_elems, + (unsigned long)maxopenparen, + (long)parenfloor); + + SSGROW(total_elems + REGCP_FRAME_ELEMS); + + DEBUG_BUFFERS_r( + if ((int)maxopenparen > (int)parenfloor) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { +/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ + SSPUSHIV(rex->offs[p].end); + SSPUSHIV(rex->offs[p].start); + SSPUSHINT(rex->offs[p].start_tmp); + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", + (UV)p, + (IV)rex->offs[p].start, + (IV)rex->offs[p].start_tmp, + (IV)rex->offs[p].end + )); + } +/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ + SSPUSHINT(maxopenparen); + SSPUSHINT(rex->lastparen); + SSPUSHINT(rex->lastcloseparen); + SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ + + return retval; +} + +/* These are needed since we do not localize EVAL nodes: */ +#define REGCP_SET(cp) \ + DEBUG_STATE_r( \ + PerlIO_printf(Perl_debug_log, \ + " Setting an EVAL scope, savestack=%"IVdf"\n", \ + (IV)PL_savestack_ix)); \ + cp = PL_savestack_ix + +#define REGCP_UNWIND(cp) \ + DEBUG_STATE_r( \ + if (cp != PL_savestack_ix) \ + PerlIO_printf(Perl_debug_log, \ + " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \ + (IV)(cp), (IV)PL_savestack_ix)); \ + regcpblow(cp) + +#define UNWIND_PAREN(lp, lcp) \ + for (n = rex->lastparen; n > lp; n--) \ + rex->offs[n].end = -1; \ + rex->lastparen = n; \ + rex->lastcloseparen = lcp; + + +STATIC void +S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) +{ + UV i; + U32 paren; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCPPOP; + + /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */ + i = SSPOPUV; + assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ + i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ + rex->lastcloseparen = SSPOPINT; + rex->lastparen = SSPOPINT; + *maxopenparen_p = SSPOPINT; + + i -= REGCP_OTHER_ELEMS; + /* Now restore the parentheses context. */ + DEBUG_BUFFERS_r( + if (i || rex->lastparen + 1 <= rex->nparens) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + paren = *maxopenparen_p; + for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { + SSize_t tmps; + rex->offs[paren].start_tmp = SSPOPINT; + rex->offs[paren].start = SSPOPIV; + tmps = SSPOPIV; + if (paren <= rex->lastparen) + rex->offs[paren].end = tmps; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n", + (UV)paren, + (IV)rex->offs[paren].start, + (IV)rex->offs[paren].start_tmp, + (IV)rex->offs[paren].end, + (paren > rex->lastparen ? "(skipped)" : "")); + ); + paren--; + } +#if 1 + /* It would seem that the similar code in regtry() + * already takes care of this, and in fact it is in + * a better location to since this code can #if 0-ed out + * but the code in regtry() is needed or otherwise tests + * requiring null fields (pat.t#187 and split.t#{13,14} + * (as of patchlevel 7877) will fail. Then again, + * this code seems to be necessary or otherwise + * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ + * --jhi updated by dapm */ + for (i = rex->lastparen + 1; i <= rex->nparens; i++) { + if (i > *maxopenparen_p) + rex->offs[i].start = -1; + rex->offs[i].end = -1; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %s ..-1 undeffing\n", + (UV)i, + (i > *maxopenparen_p) ? "-1" : " " + )); + } +#endif +} + +/* restore the parens and associated vars at savestack position ix, + * but without popping the stack */ + +STATIC void +S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p) +{ + I32 tmpix = PL_savestack_ix; + PL_savestack_ix = ix; + regcppop(rex, maxopenparen_p); + PL_savestack_ix = tmpix; +} + +#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ + +STATIC bool +S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) +{ + /* Returns a boolean as to whether or not 'character' is a member of the + * Posix character class given by 'classnum' that should be equivalent to a + * value in the typedef '_char_class_number'. + * + * Ideally this could be replaced by a just an array of function pointers + * to the C library functions that implement the macros this calls. + * However, to compile, the precise function signatures are required, and + * these may vary from platform to to platform. To avoid having to figure + * out what those all are on each platform, I (khw) am using this method, + * which adds an extra layer of function call overhead (unless the C + * optimizer strips it away). But we don't particularly care about + * performance with locales anyway. */ + + switch ((_char_class_number) classnum) { + case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character); + case _CC_ENUM_ALPHA: return isALPHA_LC(character); + case _CC_ENUM_ASCII: return isASCII_LC(character); + case _CC_ENUM_BLANK: return isBLANK_LC(character); + case _CC_ENUM_CASED: return isLOWER_LC(character) + || isUPPER_LC(character); + case _CC_ENUM_CNTRL: return isCNTRL_LC(character); + case _CC_ENUM_DIGIT: return isDIGIT_LC(character); + case _CC_ENUM_GRAPH: return isGRAPH_LC(character); + case _CC_ENUM_LOWER: return isLOWER_LC(character); + case _CC_ENUM_PRINT: return isPRINT_LC(character); + case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character); + case _CC_ENUM_PUNCT: return isPUNCT_LC(character); + case _CC_ENUM_SPACE: return isSPACE_LC(character); + case _CC_ENUM_UPPER: return isUPPER_LC(character); + case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character); + case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character); + default: /* VERTSPACE should never occur in locales */ + Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum); + } + + assert(0); /* NOTREACHED */ + return FALSE; +} + +STATIC bool +S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) +{ + /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded + * 'character' is a member of the Posix character class given by 'classnum' + * that should be equivalent to a value in the typedef + * '_char_class_number'. + * + * This just calls isFOO_lc on the code point for the character if it is in + * the range 0-255. Outside that range, all characters avoid Unicode + * rules, ignoring any locale. So use the Unicode function if this class + * requires a swash, and use the Unicode macro otherwise. */ + + PERL_ARGS_ASSERT_ISFOO_UTF8_LC; + + if (UTF8_IS_INVARIANT(*character)) { + return isFOO_lc(classnum, *character); + } + else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { + return isFOO_lc(classnum, + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); + } + + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Initialize the swash unless done already */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + + return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) + character, + TRUE /* is UTF */ )); + } + + switch ((_char_class_number) classnum) { + case _CC_ENUM_SPACE: + case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character); + + case _CC_ENUM_BLANK: return is_HORIZWS_high(character); + case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character); + case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character); + default: break; + } + + return FALSE; /* Things like CNTRL are always below 256 */ +} + +/* + * pregexec and friends + */ + +#ifndef PERL_IN_XSUB_RE +/* + - pregexec - match a regexp against a string + */ +I32 +Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, + char *strbeg, SSize_t minend, SV *screamer, U32 nosave) +/* stringarg: the point in the string at which to begin matching */ +/* strend: pointer to null at end of string */ +/* strbeg: real beginning of string */ +/* minend: end of match must be >= minend bytes after stringarg. */ +/* screamer: SV being matched: only used for utf8 flag, pos() etc; string + * itself is accessed via the pointers above */ +/* nosave: For optimizations. */ +{ + PERL_ARGS_ASSERT_PREGEXEC; + + return + regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, + nosave ? 0 : REXEC_COPY_STR); +} +#endif + + + +/* re_intuit_start(): + * + * Based on some optimiser hints, try to find the earliest position in the + * string where the regex could match. + * + * rx: the regex to match against + * sv: the SV being matched: only used for utf8 flag; the string + * itself is accessed via the pointers below. Note that on + * something like an overloaded SV, SvPOK(sv) may be false + * and the string pointers may point to something unrelated to + * the SV itself. + * strbeg: real beginning of string + * strpos: the point in the string at which to begin matching + * strend: pointer to the byte following the last char of the string + * flags currently unused; set to 0 + * data: currently unused; set to NULL + * + * The basic idea of re_intuit_start() is to use some known information + * about the pattern, namely: + * + * a) the longest known anchored substring (i.e. one that's at a + * constant offset from the beginning of the pattern; but not + * necessarily at a fixed offset from the beginning of the + * string); + * b) the longest floating substring (i.e. one that's not at a constant + * offset from the beginning of the pattern); + * c) Whether the pattern is anchored to the string; either + * an absolute anchor: /^../, or anchored to \n: /^.../m, + * or anchored to pos(): /\G/; + * d) A start class: a real or synthetic character class which + * represents which characters are legal at the start of the pattern; + * + * to either quickly reject the match, or to find the earliest position + * within the string at which the pattern might match, thus avoiding + * running the full NFA engine at those earlier locations, only to + * eventually fail and retry further along. + * + * Returns NULL if the pattern can't match, or returns the address within + * the string which is the earliest place the match could occur. + * + * The longest of the anchored and floating substrings is called 'check' + * and is checked first. The other is called 'other' and is checked + * second. The 'other' substring may not be present. For example, + * + * /(abc|xyz)ABC\d{0,3}DEFG/ + * + * will have + * + * check substr (float) = "DEFG", offset 6..9 chars + * other substr (anchored) = "ABC", offset 3..3 chars + * stclass = [ax] + * + * Be aware that during the course of this function, sometimes 'anchored' + * refers to a substring being anchored relative to the start of the + * pattern, and sometimes to the pattern itself being anchored relative to + * the string. For example: + * + * /\dabc/: "abc" is anchored to the pattern; + * /^\dabc/: "abc" is anchored to the pattern and the string; + * /\d+abc/: "abc" is anchored to neither the pattern nor the string; + * /^\d+abc/: "abc" is anchored to neither the pattern nor the string, + * but the pattern is anchored to the string. + */ + +char * +Perl_re_intuit_start(pTHX_ + REGEXP * const rx, + SV *sv, + const char * const strbeg, + char *strpos, + char *strend, + const U32 flags, + re_scream_pos_data *data) +{ + struct regexp *const prog = ReANY(rx); + SSize_t start_shift = prog->check_offset_min; + /* Should be nonnegative! */ + SSize_t end_shift = 0; + /* current lowest pos in string where the regex can start matching */ + char *rx_origin = strpos; + SV *check; + const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ + U8 other_ix = 1 - prog->substrs->check_ix; + bool ml_anch = 0; + char *other_last = strpos;/* latest pos 'other' substr already checked to */ + char *check_at = NULL; /* check substr found at this pos */ + const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; + RXi_GET_DECL(prog,progi); + regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ + regmatch_info *const reginfo = ®info_buf; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_START; + PERL_UNUSED_ARG(flags); + PERL_UNUSED_ARG(data); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: trying to determine minimum start position...\n")); + + /* for now, assume that all substr offsets are positive. If at some point + * in the future someone wants to do clever things with look-behind and + * -ve offsets, they'll need to fix up any code in this function + * which uses these offsets. See the thread beginning + * <20140113145929.GF27210@iabyn.com> + */ + assert(prog->substrs->data[0].min_offset >= 0); + assert(prog->substrs->data[0].max_offset >= 0); + assert(prog->substrs->data[1].min_offset >= 0); + assert(prog->substrs->data[1].max_offset >= 0); + assert(prog->substrs->data[2].min_offset >= 0); + assert(prog->substrs->data[2].max_offset >= 0); + + /* for now, assume that if both present, that the floating substring + * doesn't start before the anchored substring. + * If you break this assumption (e.g. doing better optimisations + * with lookahead/behind), then you'll need to audit the code in this + * function carefully first + */ + assert( + ! ( (prog->anchored_utf8 || prog->anchored_substr) + && (prog->float_utf8 || prog->float_substr)) + || (prog->float_min_offset >= prog->anchored_offset)); + + /* byte rather than char calculation for efficiency. It fails + * to quickly reject some cases that can't match, but will reject + * them later after doing full char arithmetic */ + if (prog->minlen > strend - strpos) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too short...\n")); + goto fail; + } + + reginfo->is_utf8_target = cBOOL(utf8_target); + reginfo->info_aux = NULL; + reginfo->strbeg = strbeg; + reginfo->strend = strend; + reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); + reginfo->intuit = 1; + /* not actually used within intuit, but zero for safety anyway */ + reginfo->poscache_maxiter = 0; + + if (utf8_target) { + if (!prog->check_utf8 && prog->check_substr) + to_utf8_substr(prog); + check = prog->check_utf8; + } else { + if (!prog->check_substr && prog->check_utf8) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail); + } + } + check = prog->check_substr; + } + + /* dump the various substring data */ + DEBUG_OPTIMISE_MORE_r({ + int i; + for (i=0; i<=2; i++) { + SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr + : prog->substrs->data[i].substr); + if (!sv) + continue; + + PerlIO_printf(Perl_debug_log, + " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf + " useful=%"IVdf" utf8=%d [%s]\n", + i, + (IV)prog->substrs->data[i].min_offset, + (IV)prog->substrs->data[i].max_offset, + (IV)prog->substrs->data[i].end_shift, + BmUSEFUL(sv), + utf8_target ? 1 : 0, + SvPEEK(sv)); + } + }); + + if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */ + + /* ml_anch: check after \n? + * + * A note about IMPLICIT: on an un-anchored pattern beginning + * with /.*.../, these flags will have been added by the + * compiler: + * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL + * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL + */ + ml_anch = (prog->intflags & PREGf_ANCH_MBOL) + && !(prog->intflags & PREGf_IMPLICIT); + + if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) { + /* we are only allowed to match at BOS or \G */ + + /* trivially reject if there's a BOS anchor and we're not at BOS. + * + * Note that we don't try to do a similar quick reject for + * \G, since generally the caller will have calculated strpos + * based on pos() and gofs, so the string is already correctly + * anchored by definition; and handling the exceptions would + * be too fiddly (e.g. REXEC_IGNOREPOS). + */ + if ( strpos != strbeg + && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Not at start...\n")); + goto fail; + } + + /* in the presence of an anchor, the anchored (relative to the + * start of the regex) substr must also be anchored relative + * to strpos. So quickly reject if substr isn't found there. + * This works for \G too, because the caller will already have + * subtracted gofs from pos, and gofs is the offset from the + * \G to the start of the regex. For example, in /.abc\Gdef/, + * where substr="abcdef", pos()=3, gofs=4, offset_min=1: + * caller will have set strpos=pos()-4; we look for the substr + * at position pos()-4+1, which lines up with the "a" */ + + if (prog->check_offset_min == prog->check_offset_max + && !(prog->intflags & PREGf_CANY_SEEN)) + { + /* Substring at constant offset from beg-of-str... */ + SSize_t slen = SvCUR(check); + char *s = HOP3c(strpos, prog->check_offset_min, strend); + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Looking for check substr at fixed offset %"IVdf"...\n", + (IV)prog->check_offset_min)); + + if (SvTAIL(check)) { + /* In this case, the regex is anchored at the end too. + * Unless it's a multiline match, the lengths must match + * exactly, give or take a \n. NB: slen >= 1 since + * the last char of check is \n */ + if (!multiline + && ( strend - s > slen + || strend - s < slen - 1 + || (strend - s == slen && strend[-1] != '\n'))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String too long...\n")); + goto fail_finish; + } + /* Now should match s[0..slen-2] */ + slen--; + } + if (slen && (*SvPVX_const(check) != *s + || (slen > 1 && memNE(SvPVX_const(check), s, slen)))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " String not equal...\n")); + goto fail_finish; + } + + check_at = s; + goto success_at_start; + } + } + } + + end_shift = prog->check_end_shift; + +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ + if (end_shift < 0) + Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", + (IV)end_shift, RX_PRECOMP(prog)); +#endif + + restart: + + /* This is the (re)entry point of the main loop in this function. + * The goal of this loop is to: + * 1) find the "check" substring in the region rx_origin..strend + * (adjusted by start_shift / end_shift). If not found, reject + * immediately. + * 2) If it exists, look for the "other" substr too if defined; for + * example, if the check substr maps to the anchored substr, then + * check the floating substr, and vice-versa. If not found, go + * back to (1) with rx_origin suitably incremented. + * 3) If we find an rx_origin position that doesn't contradict + * either of the substrings, then check the possible additional + * constraints on rx_origin of /^.../m or a known start class. + * If these fail, then depending on which constraints fail, jump + * back to here, or to various other re-entry points further along + * that skip some of the first steps. + * 4) If we pass all those tests, update the BmUSEFUL() count on the + * substring. If the start position was determined to be at the + * beginning of the string - so, not rejected, but not optimised, + * since we have to run regmatch from position 0 - decrement the + * BmUSEFUL() count. Otherwise increment it. + */ + + + /* first, look for the 'check' substring */ + + { + U8* start_point; + U8* end_point; + + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, + " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf + " Start shift: %"IVdf" End shift %"IVdf + " Real end Shift: %"IVdf"\n", + (IV)(rx_origin - strpos), + (IV)prog->check_offset_min, + (IV)start_shift, + (IV)end_shift, + (IV)prog->check_end_shift); + }); + + if (prog->intflags & PREGf_CANY_SEEN) { + start_point= (U8*)(rx_origin + start_shift); + end_point= (U8*)(strend - end_shift); + if (start_point > end_point) + goto fail_finish; + } else { + end_point = HOP3(strend, -end_shift, strbeg); + start_point = HOPMAYBE3(rx_origin, start_shift, end_point); + if (!start_point) + goto fail_finish; + } + + + /* If the regex is absolutely anchored to either the start of the + * string (BOL,SBOL) or to pos() (ANCH_GPOS), then + * check_offset_max represents an upper bound on the string where + * the substr could start. For the ANCH_GPOS case, we assume that + * the caller of intuit will have already set strpos to + * pos()-gofs, so in this case strpos + offset_max will still be + * an upper bound on the substr. + */ + if (!ml_anch + && prog->intflags & PREGf_ANCH + && prog->check_offset_max != SSize_t_MAX) + { + SSize_t len = SvCUR(check) - !!SvTAIL(check); + const char * const anchor = + (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); + + /* do a bytes rather than chars comparison. It's conservative; + * so it skips doing the HOP if the result can't possibly end + * up earlier than the old value of end_point. + */ + if ((char*)end_point - anchor > prog->check_offset_max) { + end_point = HOP3lim((U8*)anchor, + prog->check_offset_max, + end_point -len) + + len; + } + } + + DEBUG_OPTIMISE_MORE_r({ + PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n", + (int)(end_point - start_point), + (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), + start_point); + }); + + check_at = fbm_instr( start_point, end_point, + check, multiline ? FBMrf_MULTILINE : 0); + + /* Update the count-of-usability, remove useless subpatterns, + unshift s. */ + + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(check), RE_SV_DUMPLEN(check), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s", + (check_at ? "Found" : "Did not find"), + (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) + ? "anchored" : "floating"), + quoted, + RE_SV_TAIL(check), + (check_at ? " at offset " : "...\n") ); + }); + + if (!check_at) + goto fail_finish; + /* Finish the diagnostic message */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) ); + + /* set rx_origin to the minimum position where the regex could start + * matching, given the constraint of the just-matched check substring. + * But don't set it lower than previously. + */ + + if (check_at - rx_origin > prog->check_offset_max) + rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin); + } + + + /* now look for the 'other' substring if defined */ + + if (utf8_target ? prog->substrs->data[other_ix].utf8_substr + : prog->substrs->data[other_ix].substr) + { + /* Take into account the "other" substring. */ + char *last, *last1; + char *s; + SV* must; + struct reg_substr_datum *other; + + do_other_substr: + other = &prog->substrs->data[other_ix]; + + /* if "other" is anchored: + * we've previously found a floating substr starting at check_at. + * This means that the regex origin must lie somewhere + * between min (rx_origin): HOP3(check_at, -check_offset_max) + * and max: HOP3(check_at, -check_offset_min) + * (except that min will be >= strpos) + * So the fixed substr must lie somewhere between + * HOP3(min, anchored_offset) + * HOP3(max, anchored_offset) + SvCUR(substr) + */ + + /* if "other" is floating + * Calculate last1, the absolute latest point where the + * floating substr could start in the string, ignoring any + * constraints from the earlier fixed match. It is calculated + * as follows: + * + * strend - prog->minlen (in chars) is the absolute latest + * position within the string where the origin of the regex + * could appear. The latest start point for the floating + * substr is float_min_offset(*) on from the start of the + * regex. last1 simply combines thee two offsets. + * + * (*) You might think the latest start point should be + * float_max_offset from the regex origin, and technically + * you'd be correct. However, consider + * /a\d{2,4}bcd\w/ + * Here, float min, max are 3,5 and minlen is 7. + * This can match either + * /a\d\dbcd\w/ + * /a\d\d\dbcd\w/ + * /a\d\d\d\dbcd\w/ + * In the first case, the regex matches minlen chars; in the + * second, minlen+1, in the third, minlen+2. + * In the first case, the floating offset is 3 (which equals + * float_min), in the second, 4, and in the third, 5 (which + * equals float_max). In all cases, the floating string bcd + * can never start more than 4 chars from the end of the + * string, which equals minlen - float_min. As the substring + * starts to match more than float_min from the start of the + * regex, it makes the regex match more than minlen chars, + * and the two cancel each other out. So we can always use + * float_min - minlen, rather than float_max - minlen for the + * latest position in the string. + * + * Note that -minlen + float_min_offset is equivalent (AFAIKT) + * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift + */ + + assert(prog->minlen >= other->min_offset); + last1 = HOP3c(strend, + other->min_offset - prog->minlen, strbeg); + + if (other_ix) {/* i.e. if (other-is-float) */ + /* last is the latest point where the floating substr could + * start, *given* any constraints from the earlier fixed + * match. This constraint is that the floating string starts + * <= float_max_offset chars from the regex origin (rx_origin). + * If this value is less than last1, use it instead. + */ + assert(rx_origin <= last1); + last = + /* this condition handles the offset==infinity case, and + * is a short-cut otherwise. Although it's comparing a + * byte offset to a char length, it does so in a safe way, + * since 1 char always occupies 1 or more bytes, + * so if a string range is (last1 - rx_origin) bytes, + * it will be less than or equal to (last1 - rx_origin) + * chars; meaning it errs towards doing the accurate HOP3 + * rather than just using last1 as a short-cut */ + (last1 - rx_origin) < other->max_offset + ? last1 + : (char*)HOP3lim(rx_origin, other->max_offset, last1); + } + else { + assert(strpos + start_shift <= check_at); + last = HOP4c(check_at, other->min_offset - start_shift, + strbeg, strend); + } + + s = HOP3c(rx_origin, other->min_offset, strend); + if (s < other_last) /* These positions already checked */ + s = other_last; + + must = utf8_target ? other->utf8_substr : other->substr; + assert(SvPOK(must)); + s = fbm_instr( + (unsigned char*)s, + (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0), + must, + multiline ? FBMrf_MULTILINE : 0 + ); + DEBUG_EXECUTE_r({ + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, " %s %s substr %s%s", + s ? "Found" : "Contradicts", + other_ix ? "floating" : "anchored", + quoted, RE_SV_TAIL(must)); + }); + + + if (!s) { + /* last1 is latest possible substr location. If we didn't + * find it before there, we never will */ + if (last >= last1) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", giving up...\n")); + goto fail_finish; + } + + /* try to find the check substr again at a later + * position. Maybe next time we'll find the "other" substr + * in range too */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + ", trying %s at offset %ld...\n", + (other_ix ? "floating" : "anchored"), + (long)(HOP3c(check_at, 1, strend) - strpos))); + + other_last = HOP3c(last, 1, strend) /* highest failure */; + rx_origin = + other_ix /* i.e. if other-is-float */ + ? HOP3c(rx_origin, 1, strend) + : HOP4c(last, 1 - other->min_offset, strbeg, strend); + goto restart; + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", + (long)(s - strpos))); + + if (other_ix) { /* if (other-is-float) */ + /* other_last is set to s, not s+1, since its possible for + * a floating substr to fail first time, then succeed + * second time at the same floating position; e.g.: + * "-AB--AABZ" =~ /\wAB\d*Z/ + * The first time round, anchored and float match at + * "-(AB)--AAB(Z)" then fail on the initial \w character + * class. Second time round, they match at "-AB--A(AB)(Z)". + */ + other_last = s; + } + else { + rx_origin = HOP3c(s, -other->min_offset, strbeg); + other_last = HOP3c(s, 1, strend); + } + } + } + else { + DEBUG_OPTIMISE_MORE_r( + PerlIO_printf(Perl_debug_log, + " Check-only match: offset min:%"IVdf" max:%"IVdf + " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf + " strend-strpos:%"IVdf"\n", + (IV)prog->check_offset_min, + (IV)prog->check_offset_max, + (IV)(check_at-strpos), + (IV)(rx_origin-strpos), + (IV)(rx_origin-check_at), + (IV)(strend-strpos) + ) + ); + } + + postprocess_substr_matches: + + /* handle the extra constraint of /^.../m if present */ + + if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') { + char *s; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for /^/m anchor")); + + /* we have failed the constraint of a \n before rx_origin. + * Find the next \n, if any, even if it's beyond the current + * anchored and/or floating substrings. Whether we should be + * scanning ahead for the next \n or the next substr is debatable. + * On the one hand you'd expect rare substrings to appear less + * often than \n's. On the other hand, searching for \n means + * we're effectively flipping been check_substr and "\n" on each + * iteration as the current "rarest" string candidate, which + * means for example that we'll quickly reject the whole string if + * hasn't got a \n, rather than trying every substr position + * first + */ + + s = HOP3c(strend, - prog->minlen, strpos); + if (s <= rx_origin || + ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin))) + { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Did not find /%s^%s/m...\n", + PL_colors[0], PL_colors[1])); + goto fail_finish; + } + + /* earliest possible origin is 1 char after the \n. + * (since *rx_origin == '\n', it's safe to ++ here rather than + * HOP(rx_origin, 1)) */ + rx_origin++; + + if (prog->substrs->check_ix == 0 /* check is anchored */ + || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos)) + { + /* Position contradicts check-string; either because + * check was anchored (and thus has no wiggle room), + * or check was float and rx_origin is above the float range */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + goto restart; + } + + /* if we get here, the check substr must have been float, + * is in range, and we may or may not have had an anchored + * "other" substr which still contradicts */ + assert(prog->substrs->check_ix); /* check is float */ + + if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { + /* whoops, the anchored "other" substr exists, so we still + * contradict. On the other hand, the float "check" substr + * didn't contradict, so just retry the anchored "other" + * substr */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos), + (long)(rx_origin - strpos + prog->anchored_offset))); + goto do_other_substr; + } + + /* success: we don't contradict the found floating substring + * (and there's no anchored substr). */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Found /%s^%s/m at offset %ld...\n", + PL_colors[0], PL_colors[1], (long)(rx_origin - strpos))); + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " (multiline anchor test skipped)\n")); + } + + success_at_start: + + + /* if we have a starting character class, then test that extra constraint. + * (trie stclasses are too expensive to use here, we are better off to + * leave it to regmatch itself) */ + + if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) { + const U8* const str = (U8*)STRING(progi->regstclass); + + /* XXX this value could be pre-computed */ + const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT + ? (reginfo->is_utf8_pat + ? utf8_distance(str + STR_LEN(progi->regstclass), str) + : STR_LEN(progi->regstclass)) + : 1); + char * endpos; + char *s; + /* latest pos that a matching float substr constrains rx start to */ + char *rx_max_float = NULL; + + /* if the current rx_origin is anchored, either by satisfying an + * anchored substring constraint, or a /^.../m constraint, then we + * can reject the current origin if the start class isn't found + * at the current position. If we have a float-only match, then + * rx_origin is constrained to a range; so look for the start class + * in that range. if neither, then look for the start class in the + * whole rest of the string */ + + /* XXX DAPM it's not clear what the minlen test is for, and why + * it's not used in the floating case. Nothing in the test suite + * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>. + * Here are some old comments, which may or may not be correct: + * + * minlen == 0 is possible if regstclass is \b or \B, + * and the fixed substr is ''$. + * Since minlen is already taken into account, rx_origin+1 is + * before strend; accidentally, minlen >= 1 guaranties no false + * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 : + * 0) below assumes that regstclass does not come from lookahead... + * If regstclass takes bytelength more than 1: If charlength==1, OK. + * This leaves EXACTF-ish only, which are dealt with in + * find_byclass(). + */ + + if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) + endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend); + else if (prog->float_substr || prog->float_utf8) { + rx_max_float = HOP3c(check_at, -start_shift, strbeg); + endpos= HOP3c(rx_max_float, cl_l, strend); + } + else + endpos= strend; + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " looking for class: start_shift: %"IVdf" check_at: %"IVdf + " rx_origin: %"IVdf" endpos: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), + (IV)(rx_origin - strbeg), (IV)(endpos - strbeg))); + + s = find_byclass(prog, progi->regstclass, rx_origin, endpos, + reginfo); + if (!s) { + if (endpos == strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); + goto fail; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " This position contradicts STCLASS...\n") ); + if ((prog->intflags & PREGf_ANCH) && !ml_anch + && !(prog->intflags & PREGf_IMPLICIT)) + goto fail; + + /* Contradict one of substrings */ + if (prog->anchored_substr || prog->anchored_utf8) { + if (prog->substrs->check_ix == 1) { /* check is float */ + /* Have both, check_string is floating */ + assert(rx_origin + start_shift <= check_at); + if (rx_origin + start_shift != check_at) { + /* not at latest position float substr could match: + * Recheck anchored substring, but not floating. + * The condition above is in bytes rather than + * chars for efficiency. It's conservative, in + * that it errs on the side of doing 'goto + * do_other_substr', where a more accurate + * char-based calculation will be done */ + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for anchored substr starting at offset %ld...\n", + (long)(other_last - strpos)) ); + goto do_other_substr; + } + } + } + else { + /* float-only */ + + if (ml_anch) { + /* In the presence of ml_anch, we might be able to + * find another \n without breaking the current float + * constraint. */ + + /* strictly speaking this should be HOP3c(..., 1, ...), + * but since we goto a block of code that's going to + * search for the next \n if any, its safe here */ + rx_origin++; + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for /%s^%s/m starting at offset %ld...\n", + PL_colors[0], PL_colors[1], + (long)(rx_origin - strpos)) ); + goto postprocess_substr_matches; + } + + /* strictly speaking this can never be true; but might + * be if we ever allow intuit without substrings */ + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + goto fail; + + rx_origin = rx_max_float; + } + + /* at this point, any matching substrings have been + * contradicted. Start again... */ + + rx_origin = HOP3c(rx_origin, 1, strend); + + /* uses bytes rather than char calculations for efficiency. + * It's conservative: it errs on the side of doing 'goto restart', + * where there is code that does a proper char-based test */ + if (rx_origin + start_shift + end_shift > strend) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Could not match STCLASS...\n") ); + goto fail; + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + " Looking for %s substr starting at offset %ld...\n", + (prog->substrs->check_ix ? "floating" : "anchored"), + (long)(rx_origin + start_shift - strpos)) ); + goto restart; + } + + /* Success !!! */ + + if (rx_origin != s) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " By STCLASS: moving %ld --> %ld\n", + (long)(rx_origin - strpos), (long)(s - strpos)) + ); + } + else { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + " Does not contradict STCLASS...\n"); + ); + } + } + + /* Decide whether using the substrings helped */ + + if (rx_origin != strpos) { + /* Fixed substring is found far enough so that the match + cannot start at strpos. */ + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n")); + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + } + else { + /* The found rx_origin position does not prohibit matching at + * strpos, so calling intuit didn't gain us anything. Decrement + * the BmUSEFUL() count on the check substring, and if we reach + * zero, free it. */ + if (!(prog->intflags & PREGf_NAUGHTY) + && (utf8_target ? ( + prog->check_utf8 /* Could be deleted already */ + && --BmUSEFUL(prog->check_utf8) < 0 + && (prog->check_utf8 == prog->float_utf8) + ) : ( + prog->check_substr /* Could be deleted already */ + && --BmUSEFUL(prog->check_substr) < 0 + && (prog->check_substr == prog->float_substr) + ))) + { + /* If flags & SOMETHING - do not do it many times on the same match */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n")); + /* XXX Does the destruction order has to change with utf8_target? */ + SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); + SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); + prog->check_substr = prog->check_utf8 = NULL; /* disable */ + prog->float_substr = prog->float_utf8 = NULL; /* clear */ + check = NULL; /* abort */ + /* XXXX This is a remnant of the old implementation. It + looks wasteful, since now INTUIT can use many + other heuristics. */ + prog->extflags &= ~RXf_USE_INTUIT; + } + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "Intuit: %sSuccessfully guessed:%s match at offset %ld\n", + PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) ); + + return rx_origin; + + fail_finish: /* Substring not found */ + if (prog->check_substr || prog->check_utf8) /* could be removed already */ + BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ + fail: + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", + PL_colors[4], PL_colors[5])); + return NULL; +} + + +#define DECL_TRIE_TYPE(scan) \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ + trie_type = ((scan->flags == EXACT) \ + ? (utf8_target ? trie_utf8 : trie_plain) \ + : (scan->flags == EXACTFA) \ + ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + +#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ +STMT_START { \ + STRLEN skiplen; \ + U8 flags = FOLD_FLAGS_FULL; \ + switch (trie_type) { \ + case trie_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALLTHROUGH */ \ + case trie_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ + len = UTF8SKIP(uc); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_latin_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALLTHROUGH */ \ + case trie_latin_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_utf8: \ + uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + break; \ + case trie_plain: \ + uvc = (UV)*uc; \ + len = 1; \ + } \ + if (uvc < 256) { \ + charid = trie->charmap[ uvc ]; \ + } \ + else { \ + charid = 0; \ + if (widecharmap) { \ + SV** const svpp = hv_fetch(widecharmap, \ + (char*)&uvc, sizeof(UV), 0); \ + if (svpp) \ + charid = (U16)SvIV(*svpp); \ + } \ + } \ +} STMT_END + +#define DUMP_EXEC_POS(li,s,doutf8) \ + dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ + startpos, doutf8) + +#define REXEC_FBC_EXACTISH_SCAN(COND) \ +STMT_START { \ + while (s <= e) { \ + if ( (COND) \ + && (ln == 1 || folder(s, pat_string, ln)) \ + && (reginfo->intuit || regtry(reginfo, &s)) )\ + goto got_it; \ + s++; \ + } \ +} STMT_END + +#define REXEC_FBC_UTF8_SCAN(CODE) \ +STMT_START { \ + while (s < strend) { \ + CODE \ + s += UTF8SKIP(s); \ + } \ +} STMT_END + +#define REXEC_FBC_SCAN(CODE) \ +STMT_START { \ + while (s < strend) { \ + CODE \ + s++; \ + } \ +} STMT_END + +#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \ +REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \ + if (COND) { \ + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \ + goto got_it; \ + else \ + tmp = doevery; \ + } \ + else \ + tmp = 1; \ +) + +#define REXEC_FBC_CLASS_SCAN(COND) \ +REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ + if (COND) { \ + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \ + goto got_it; \ + else \ + tmp = doevery; \ + } \ + else \ + tmp = 1; \ +) + +#define REXEC_FBC_CSCAN(CONDUTF8,COND) \ + if (utf8_target) { \ + REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \ + } \ + else { \ + REXEC_FBC_CLASS_SCAN(COND); \ + } + +/* The three macros below are slightly different versions of the same logic. + * + * The first is for /a and /aa when the target string is UTF-8. This can only + * match ascii, but it must advance based on UTF-8. The other two handle the + * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking + * for the boundary (or non-boundary) between a word and non-word character. + * The utf8 and non-utf8 cases have the same logic, but the details must be + * different. Find the "wordness" of the character just prior to this one, and + * compare it with the wordness of this one. If they differ, we have a + * boundary. At the beginning of the string, pretend that the previous + * character was a new-line. + * + * All these macros uncleanly have side-effects with each other and outside + * variables. So far it's been too much trouble to clean-up + * + * TEST_NON_UTF8 is the macro or function to call to test if its byte input is + * a word character or not. + * IF_SUCCESS is code to do if it finds that we are at a boundary between + * word/non-word + * IF_FAIL is code to do if we aren't at a boundary between word/non-word + * + * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we + * are looking for a boundary or for a non-boundary. If we are looking for a + * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and + * see if this tentative match actually works, and if so, to quit the loop + * here. And vice-versa if we are looking for a non-boundary. + * + * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and + * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of + * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be + * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal + * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that + * complement. But in that branch we complement tmp, meaning that at the + * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s), + * which means at the top of the loop in the next iteration, it is + * TEST_NON_UTF8(s-1) */ +#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + tmp = !tmp; \ + IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + +/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and + * TEST_UTF8 is a macro that for the same input code points returns identically + * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */ +#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \ + if (s == reginfo->strbeg) { \ + tmp = '\n'; \ + } \ + else { /* Back-up to the start of the previous character */ \ + U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \ + tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \ + 0, UTF8_ALLOW_DEFAULT); \ + } \ + tmp = TEST_UV(tmp); \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ + REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ + if (tmp == ! (TEST_UTF8((U8 *) s))) { \ + tmp = !tmp; \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); + +/* Like the above two macros. UTF8_CODE is the complete code for handling + * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc + * macros below */ +#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + if (utf8_target) { \ + UTF8_CODE \ + } \ + else { /* Not utf8 */ \ + tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_SCAN( /* advances s while s < strend */ \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + IF_SUCCESS; \ + tmp = !tmp; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + } \ + /* Here, things have been set up by the previous code so that tmp is the \ + * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \ + * utf8ness of the target). We also have to check if this matches against \ + * the EOS, which we treat as a \n (which is the same value in both UTF-8 \ + * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \ + * string */ \ + if (tmp == ! TEST_NON_UTF8('\n')) { \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } + +/* This is the macro to use when we want to see if something that looks like it + * could match, actually does, and if so exits the loop */ +#define REXEC_FBC_TRYIT \ + if ((reginfo->intuit || regtry(reginfo, &s))) \ + goto got_it + +/* The only difference between the BOUND and NBOUND cases is that + * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in + * NBOUND. This is accomplished by passing it as either the if or else clause, + * with the other one being empty (PLACEHOLDER is defined as empty). + * + * The TEST_FOO parameters are for operating on different forms of input, but + * all should be ones that return identically for the same underlying code + * points */ +#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ + TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ + TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ + TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + +#define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON( \ + FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ + TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + + +/* We know what class REx starts with. Try to find this position... */ +/* if reginfo->intuit, its a dryrun */ +/* annoyingly all the vars in this routine have different names from their counterparts + in regmatch. /grrr */ +STATIC char * +S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, + const char *strend, regmatch_info *reginfo) +{ + dVAR; + const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; + char *pat_string; /* The pattern's exactish string */ + char *pat_end; /* ptr to end char of pat_string */ + re_fold_t folder; /* Function for computing non-utf8 folds */ + const U8 *fold_array; /* array for folding ords < 256 */ + STRLEN ln; + STRLEN lnc; + U8 c1; + U8 c2; + char *e; + I32 tmp = 1; /* Scratch variable? */ + const bool utf8_target = reginfo->is_utf8_target; + UV utf8_fold_flags = 0; + const bool is_utf8_pat = reginfo->is_utf8_pat; + bool to_complement = FALSE; /* Invert the result? Taking the xor of this + with a result inverts that result, as 0^1 = + 1 and 1^1 = 0 */ + _char_class_number classnum; + + RXi_GET_DECL(prog,progi); + + PERL_ARGS_ASSERT_FIND_BYCLASS; + + /* We know what class it must start with. */ + switch (OP(c)) { + case ANYOF: + if (utf8_target) { + REXEC_FBC_UTF8_CLASS_SCAN( + reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); + } + else { + REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); + } + break; + case CANY: + REXEC_FBC_SCAN( + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) + goto got_it; + else + tmp = doevery; + ); + break; + + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + if (is_utf8_pat || utf8_target) { + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf_utf8; + } + fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */ + folder = foldEQ_latin1; /* /a, except the sharp s one which */ + goto do_exactf_non_utf8; /* isn't dealt with by these */ + + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + if (utf8_target) { + utf8_fold_flags = 0; + goto do_exactf_utf8; + } + fold_array = PL_fold; + folder = foldEQ; + goto do_exactf_non_utf8; + + case EXACTFL: + if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_exactf_utf8; + } + fold_array = PL_fold_locale; + folder = foldEQ_locale; + goto do_exactf_non_utf8; + + case EXACTFU_SS: + if (is_utf8_pat) { + utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; + } + goto do_exactf_utf8; + + case EXACTFU: + if (is_utf8_pat || utf8_target) { + utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; + goto do_exactf_utf8; + } + + /* Any 'ss' in the pattern should have been replaced by regcomp, + * so we don't have to worry here about this single special case + * in the Latin1 range */ + fold_array = PL_fold_latin1; + folder = foldEQ_latin1; + + /* FALLTHROUGH */ + + do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there + are no glitches with fold-length differences + between the target string and pattern */ + + /* The idea in the non-utf8 EXACTF* cases is to first find the + * first character of the EXACTF* node and then, if necessary, + * case-insensitively compare the full text of the node. c1 is the + * first character. c2 is its fold. This logic will not work for + * Unicode semantics and the german sharp ss, which hence should + * not be compiled into a node that gets here. */ + pat_string = STRING(c); + ln = STR_LEN(c); /* length to match in octets/bytes */ + + /* We know that we have to match at least 'ln' bytes (which is the + * same as characters, since not utf8). If we have to match 3 + * characters, and there are only 2 availabe, we know without + * trying that it will fail; so don't start a match past the + * required minimum number from the far end */ + e = HOP3c(strend, -((SSize_t)ln), s); + + if (reginfo->intuit && e < s) { + e = s; /* Due to minlen logic of intuit() */ + } + + c1 = *pat_string; + c2 = fold_array[c1]; + if (c1 == c2) { /* If char and fold are the same */ + REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1); + } + else { + REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2); + } + break; + + do_exactf_utf8: + { + unsigned expansion; + + /* If one of the operands is in utf8, we can't use the simpler folding + * above, due to the fact that many different characters can have the + * same fold, or portion of a fold, or different- length fold */ + pat_string = STRING(c); + ln = STR_LEN(c); /* length to match in octets/bytes */ + pat_end = pat_string + ln; + lnc = is_utf8_pat /* length to match in characters */ + ? utf8_length((U8 *) pat_string, (U8 *) pat_end) + : ln; + + /* We have 'lnc' characters to match in the pattern, but because of + * multi-character folding, each character in the target can match + * up to 3 characters (Unicode guarantees it will never exceed + * this) if it is utf8-encoded; and up to 2 if not (based on the + * fact that the Latin 1 folds are already determined, and the + * only multi-char fold in that range is the sharp-s folding to + * 'ss'. Thus, a pattern character can match as little as 1/3 of a + * string character. Adjust lnc accordingly, rounding up, so that + * if we need to match at least 4+1/3 chars, that really is 5. */ + expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; + lnc = (lnc + expansion - 1) / expansion; + + /* As in the non-UTF8 case, if we have to match 3 characters, and + * only 2 are left, it's guaranteed to fail, so don't start a + * match that would require us to go beyond the end of the string + */ + e = HOP3c(strend, -((SSize_t)lnc), s); + + if (reginfo->intuit && e < s) { + e = s; /* Due to minlen logic of intuit() */ + } + + /* XXX Note that we could recalculate e to stop the loop earlier, + * as the worst case expansion above will rarely be met, and as we + * go along we would usually find that e moves further to the left. + * This would happen only after we reached the point in the loop + * where if there were no expansion we should fail. Unclear if + * worth the expense */ + + while (s <= e) { + char *my_strend= (char *)strend; + if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, + pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags) + && (reginfo->intuit || regtry(reginfo, &s)) ) + { + goto got_it; + } + s += (utf8_target) ? UTF8SKIP(s) : 1; + } + break; + } + + case BOUNDL: + FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); + break; + case NBOUNDL: + FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); + break; + case BOUND: + FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case BOUNDA: + FBC_BOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A); + break; + case NBOUND: + FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case NBOUNDA: + FBC_NBOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A); + break; + case BOUNDU: + FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case NBOUNDU: + FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); + break; + case LNBREAK: + REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend), + is_LNBREAK_latin1_safe(s, strend) + ); + break; + + /* The argument to all the POSIX node types is the class number to pass to + * _generic_isCC() to build a mask for searching in PL_charclass[] */ + + case NPOSIXL: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: + REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), + to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); + break; + + case NPOSIXD: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXD: + if (utf8_target) { + goto posix_utf8; + } + goto posixa; + + case NPOSIXA: + if (utf8_target) { + /* The complement of something that matches only ASCII matches all + * non-ASCII, plus everything in ASCII that isn't in the class. */ + REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s) + || ! _generic_isCC_A(*s, FLAGS(c))); + break; + } + + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXA: + posixa: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character. */ + REXEC_FBC_CLASS_SCAN( + to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c)))); + break; + + case NPOSIXU: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: + if (! utf8_target) { + REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s, + FLAGS(c)))); + } + else { + + posix_utf8: + classnum = (_char_class_number) FLAGS(c); + if (classnum < _FIRST_NON_SWASH_CC) { + while (s < strend) { + + /* We avoid loading in the swash as long as possible, but + * should we have to, we jump to a separate loop. This + * extra 'if' statement is what keeps this code from being + * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */ + if (UTF8_IS_ABOVE_LATIN1(*s)) { + goto found_above_latin1; + } + if ((UTF8_IS_INVARIANT(*s) + && to_complement ^ cBOOL(_generic_isCC((U8) *s, + classnum))) + || (UTF8_IS_DOWNGRADEABLE_START(*s) + && to_complement ^ cBOOL( + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), + classnum)))) + { + if (tmp && (reginfo->intuit || regtry(reginfo, &s))) + goto got_it; + else { + tmp = doevery; + } + } + else { + tmp = 1; + } + s += UTF8SKIP(s); + } + } + else switch (classnum) { /* These classes are implemented as + macros */ + case _CC_ENUM_SPACE: /* XXX would require separate code if we + revert the change of \v matching this */ + /* FALLTHROUGH */ + + case _CC_ENUM_PSXSPC: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isSPACE_utf8(s))); + break; + + case _CC_ENUM_BLANK: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isBLANK_utf8(s))); + break; + + case _CC_ENUM_XDIGIT: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isXDIGIT_utf8(s))); + break; + + case _CC_ENUM_VERTSPACE: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isVERTWS_utf8(s))); + break; + + case _CC_ENUM_CNTRL: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isCNTRL_utf8(s))); + break; + + default: + Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum); + assert(0); /* NOTREACHED */ + } + } + break; + + found_above_latin1: /* Here we have to load a swash to get the result + for the current code point */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = + _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + + /* This is a copy of the loop above for swash classes, though using the + * FBC macro instead of being expanded out. Since we've loaded the + * swash, we don't have to check for that each time through the loop */ + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(_generic_utf8( + classnum, + s, + swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) s, TRUE)))); + break; + + case AHOCORASICKC: + case AHOCORASICK: + { + DECL_TRIE_TYPE(c); + /* what trie are we using right now */ + reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ]; + reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ]; + HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); + + const char *last_start = strend - trie->minlen; +#ifdef DEBUGGING + const char *real_start = s; +#endif + STRLEN maxlen = trie->maxlen; + SV *sv_points; + U8 **points; /* map of where we were in the input string + when reading a given char. For ASCII this + is unnecessary overhead as the relationship + is always 1:1, but for Unicode, especially + case folded Unicode this is not true. */ + U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + U8 *bitmap=NULL; + + + GET_RE_DEBUG_FLAGS_DECL; + + /* We can't just allocate points here. We need to wrap it in + * an SV so it gets freed properly if there is a croak while + * running the match */ + ENTER; + SAVETMPS; + sv_points=newSV(maxlen * sizeof(U8 *)); + SvCUR_set(sv_points, + maxlen * sizeof(U8 *)); + SvPOK_on(sv_points); + sv_2mortal(sv_points); + points=(U8**)SvPV_nolen(sv_points ); + if ( trie_type != trie_utf8_fold + && (trie->bitmap || OP(c)==AHOCORASICKC) ) + { + if (trie->bitmap) + bitmap=(U8*)trie->bitmap; + else + bitmap=(U8*)ANYOF_BITMAP(c); + } + /* this is the Aho-Corasick algorithm modified a touch + to include special handling for long "unknown char" sequences. + The basic idea being that we use AC as long as we are dealing + with a possible matching char, when we encounter an unknown char + (and we have not encountered an accepting state) we scan forward + until we find a legal starting char. + AC matching is basically that of trie matching, except that when + we encounter a failing transition, we fall back to the current + states "fail state", and try the current char again, a process + we repeat until we reach the root state, state 1, or a legal + transition. If we fail on the root state then we can either + terminate if we have reached an accepting state previously, or + restart the entire process from the beginning if we have not. + + */ + while (s <= last_start) { + const U32 uniflags = UTF8_ALLOW_DEFAULT; + U8 *uc = (U8*)s; + U16 charid = 0; + U32 base = 1; + U32 state = 1; + UV uvc = 0; + STRLEN len = 0; + STRLEN foldlen = 0; + U8 *uscan = (U8*)NULL; + U8 *leftmost = NULL; +#ifdef DEBUGGING + U32 accepted_word= 0; +#endif + U32 pointpos = 0; + + while ( state && uc <= (U8*)strend ) { + int failed=0; + U32 word = aho->states[ state ].wordnum; + + if( state==1 ) { + if ( bitmap ) { + DEBUG_TRIE_EXECUTE_r( + if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + dump_exec_pos( (char *)uc, c, strend, real_start, + (char *)uc, utf8_target ); + PerlIO_printf( Perl_debug_log, + " Scanning for legal start char...\n"); + } + ); + if (utf8_target) { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc += UTF8SKIP(uc); + } + } else { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc++; + } + } + s= (char *)uc; + } + if (uc >(U8*)last_start) break; + } + + if ( word ) { + U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ]; + if (!leftmost || lpos < leftmost) { + DEBUG_r(accepted_word=word); + leftmost= lpos; + } + if (base==0) break; + + } + points[pointpos++ % maxlen]= uc; + if (foldlen || uc < (U8*)strend) { + REXEC_TRIE_READ_CHAR(trie_type, trie, + widecharmap, uc, + uscan, len, uvc, charid, foldlen, + foldbuf, uniflags); + DEBUG_TRIE_EXECUTE_r({ + dump_exec_pos( (char *)uc, c, strend, + real_start, s, utf8_target); + PerlIO_printf(Perl_debug_log, + " Charid:%3u CP:%4"UVxf" ", + charid, uvc); + }); + } + else { + len = 0; + charid = 0; + } + + + do { +#ifdef DEBUGGING + word = aho->states[ state ].wordnum; +#endif + base = aho->states[ state ].trans.base; + + DEBUG_TRIE_EXECUTE_r({ + if (failed) + dump_exec_pos( (char *)uc, c, strend, real_start, + s, utf8_target ); + PerlIO_printf( Perl_debug_log, + "%sState: %4"UVxf", word=%"UVxf, + failed ? " Fail transition to " : "", + (UV)state, (UV)word); + }); + if ( base ) { + U32 tmp; + I32 offset; + if (charid && + ( ((offset = base + charid + - 1 - trie->uniquecharcount)) >= 0) + && ((U32)offset < trie->lasttrans) + && trie->trans[offset].check == state + && (tmp=trie->trans[offset].next)) + { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - legal\n")); + state = tmp; + break; + } + else { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - fail\n")); + failed = 1; + state = aho->fail[state]; + } + } + else { + /* we must be accepting here */ + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log," - accepting\n")); + failed = 1; + break; + } + } while(state); + uc += len; + if (failed) { + if (leftmost) + break; + if (!state) state = 1; + } + } + if ( aho->states[ state ].wordnum ) { + U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ]; + if (!leftmost || lpos < leftmost) { + DEBUG_r(accepted_word=aho->states[ state ].wordnum); + leftmost = lpos; + } + } + if (leftmost) { + s = (char*)leftmost; + DEBUG_TRIE_EXECUTE_r({ + PerlIO_printf( + Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n", + (UV)accepted_word, (IV)(s - real_start) + ); + }); + if (reginfo->intuit || regtry(reginfo, &s)) { + FREETMPS; + LEAVE; + goto got_it; + } + s = HOPc(s,1); + DEBUG_TRIE_EXECUTE_r({ + PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n"); + }); + } else { + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log,"No match.\n")); + break; + } + } + FREETMPS; + LEAVE; + } + break; + default: + Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); + } + return 0; + got_it: + return s; +} + +/* set RX_SAVED_COPY, RX_SUBBEG etc. + * flags have same meanings as with regexec_flags() */ + +static void +S_reg_set_capture_string(pTHX_ REGEXP * const rx, + char *strbeg, + char *strend, + SV *sv, + U32 flags, + bool utf8_target) +{ + struct regexp *const prog = ReANY(rx); + + if (flags & REXEC_COPY_STR) { +#ifdef PERL_ANY_COW + if (SvCANCOW(sv)) { + if (DEBUG_C_TEST) { + PerlIO_printf(Perl_debug_log, + "Copy on write: regexp capture, type %d\n", + (int) SvTYPE(sv)); + } + /* Create a new COW SV to share the match string and store + * in saved_copy, unless the current COW SV in saved_copy + * is valid and suitable for our purpose */ + if (( prog->saved_copy + && SvIsCOW(prog->saved_copy) + && SvPOKp(prog->saved_copy) + && SvIsCOW(sv) + && SvPOKp(sv) + && SvPVX(sv) == SvPVX(prog->saved_copy))) + { + /* just reuse saved_copy SV */ + if (RXp_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RXp_MATCH_COPIED_off(prog); + } + } + else { + /* create new COW SV to share string */ + RX_MATCH_COPY_FREE(rx); + prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); + } + prog->subbeg = (char *)SvPVX_const(prog->saved_copy); + assert (SvPOKp(prog->saved_copy)); + prog->sublen = strend - strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + } else +#endif + { + SSize_t min = 0; + SSize_t max = strend - strbeg; + SSize_t sublen; + + if ( (flags & REXEC_COPY_SKIP_POST) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_RIGHT) + ) { /* don't copy $' part of string */ + U32 n = 0; + max = -1; + /* calculate the right-most part of the string covered + * by a capture. Due to look-ahead, this may be to + * the right of $&, so we have to scan all captures */ + while (n <= prog->lastparen) { + if (prog->offs[n].end > max) + max = prog->offs[n].end; + n++; + } + if (max == -1) + max = (PL_sawampersand & SAWAMPERSAND_LEFT) + ? prog->offs[0].start + : 0; + assert(max >= 0 && max <= strend - strbeg); + } + + if ( (flags & REXEC_COPY_SKIP_PRE) + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ + && !(PL_sawampersand & SAWAMPERSAND_LEFT) + ) { /* don't copy $` part of string */ + U32 n = 0; + min = max; + /* calculate the left-most part of the string covered + * by a capture. Due to look-behind, this may be to + * the left of $&, so we have to scan all captures */ + while (min && n <= prog->lastparen) { + if ( prog->offs[n].start != -1 + && prog->offs[n].start < min) + { + min = prog->offs[n].start; + } + n++; + } + if ((PL_sawampersand & SAWAMPERSAND_RIGHT) + && min > prog->offs[0].end + ) + min = prog->offs[0].end; + + } + + assert(min >= 0 && min <= max && min <= strend - strbeg); + sublen = max - min; + + if (RX_MATCH_COPIED(rx)) { + if (sublen > prog->sublen) + prog->subbeg = + (char*)saferealloc(prog->subbeg, sublen+1); + } + else + prog->subbeg = (char*)safemalloc(sublen+1); + Copy(strbeg + min, prog->subbeg, sublen, char); + prog->subbeg[sublen] = '\0'; + prog->suboffset = min; + prog->sublen = sublen; + RX_MATCH_COPIED_on(rx); + } + prog->subcoffset = prog->suboffset; + if (prog->suboffset && utf8_target) { + /* Convert byte offset to chars. + * XXX ideally should only compute this if @-/@+ + * has been seen, a la PL_sawampersand ??? */ + + /* If there's a direct correspondence between the + * string which we're matching and the original SV, + * then we can use the utf8 len cache associated with + * the SV. In particular, it means that under //g, + * sv_pos_b2u() will use the previously cached + * position to speed up working out the new length of + * subcoffset, rather than counting from the start of + * the string each time. This stops + * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; + * from going quadratic */ + if (SvPOKp(sv) && SvPVX(sv) == strbeg) + prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, + SV_GMAGIC|SV_CONST_RETURN); + else + prog->subcoffset = utf8_length((U8*)strbeg, + (U8*)(strbeg+prog->suboffset)); + } + } + else { + RX_MATCH_COPY_FREE(rx); + prog->subbeg = strbeg; + prog->suboffset = 0; + prog->subcoffset = 0; + prog->sublen = strend - strbeg; + } +} + + + + +/* + - regexec_flags - match a regexp against a string + */ +I32 +Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, + char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) +/* stringarg: the point in the string at which to begin matching */ +/* strend: pointer to null at end of string */ +/* strbeg: real beginning of string */ +/* minend: end of match must be >= minend bytes after stringarg. */ +/* sv: SV being matched: only used for utf8 flag, pos() etc; string + * itself is accessed via the pointers above */ +/* data: May be used for some additional optimizations. + Currently unused. */ +/* flags: For optimizations. See REXEC_* in regexp.h */ + +{ + struct regexp *const prog = ReANY(rx); + char *s; + regnode *c; + char *startpos; + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ + const bool utf8_target = cBOOL(DO_UTF8(sv)); + I32 multiline; + RXi_GET_DECL(prog,progi); + regmatch_info reginfo_buf; /* create some info to pass to regtry etc */ + regmatch_info *const reginfo = ®info_buf; + regexp_paren_pair *swap = NULL; + I32 oldsave; + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGEXEC_FLAGS; + PERL_UNUSED_ARG(data); + + /* Be paranoid... */ + if (prog == NULL || stringarg == NULL) { + Perl_croak(aTHX_ "NULL regexp parameter"); + } + + DEBUG_EXECUTE_r( + debug_start_match(rx, utf8_target, stringarg, strend, + "Matching"); + ); + + startpos = stringarg; + + if (prog->intflags & PREGf_GPOS_SEEN) { + MAGIC *mg; + + /* set reginfo->ganch, the position where \G can match */ + + reginfo->ganch = + (flags & REXEC_IGNOREPOS) + ? stringarg /* use start pos rather than pos() */ + : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) + /* Defined pos(): */ + ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) + : strbeg; /* pos() not defined; use start of string */ + + DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, + "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg))); + + /* in the presence of \G, we may need to start looking earlier in + * the string than the suggested start point of stringarg: + * if prog->gofs is set, then that's a known, fixed minimum + * offset, such as + * /..\G/: gofs = 2 + * /ab|c\G/: gofs = 1 + * or if the minimum offset isn't known, then we have to go back + * to the start of the string, e.g. /w+\G/ + */ + + if (prog->intflags & PREGf_ANCH_GPOS) { + startpos = reginfo->ganch - prog->gofs; + if (startpos < + ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) + { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "fail: ganch-gofs before earliest possible start\n")); + return 0; + } + } + else if (prog->gofs) { + if (startpos - prog->gofs < strbeg) + startpos = strbeg; + else + startpos -= prog->gofs; + } + else if (prog->intflags & PREGf_GPOS_FLOAT) + startpos = strbeg; + } + + minlen = prog->minlen; + if ((startpos + minlen) > strend || startpos < strbeg) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "Regex match can't succeed, so not even tried\n")); + return 0; + } + + /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), + * which will call destuctors to reset PL_regmatch_state, free higher + * PL_regmatch_slabs, and clean up regmatch_info_aux and + * regmatch_info_aux_eval */ + + oldsave = PL_savestack_ix; + + s = startpos; + + if ((prog->extflags & RXf_USE_INTUIT) + && !(flags & REXEC_CHECKED)) + { + s = re_intuit_start(rx, sv, strbeg, startpos, strend, + flags, NULL); + if (!s) + return 0; + + if (prog->extflags & RXf_CHECK_ALL) { + /* we can match based purely on the result of INTUIT. + * Set up captures etc just for $& and $-[0] + * (an intuit-only match wont have $1,$2,..) */ + assert(!prog->nparens); + + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (s < stringarg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + + /* match via INTUIT shouldn't have any captures. + * Let @-, @+, $^N know */ + prog->lastparen = prog->lastcloseparen = 0; + RX_MATCH_UTF8_set(rx, utf8_target); + prog->offs[0].start = s - strbeg; + prog->offs[0].end = utf8_target + ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + : s - strbeg + prog->minlenret; + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, strend, + sv, flags, utf8_target); + + return 1; + } + } + + multiline = prog->extflags & RXf_PMf_MULTILINE; + + if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "String too short [regexec_flags]...\n")); + goto phooey; + } + + /* Check validity of program. */ + if (UCHARAT(progi->program) != REG_MAGIC) { + Perl_croak(aTHX_ "corrupted regexp program"); + } + + RX_MATCH_TAINTED_off(rx); + + reginfo->prog = rx; /* Yes, sorry that this is confusing. */ + reginfo->intuit = 0; + reginfo->is_utf8_target = cBOOL(utf8_target); + reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); + reginfo->warned = FALSE; + reginfo->strbeg = strbeg; + reginfo->sv = sv; + reginfo->poscache_maxiter = 0; /* not yet started a countdown */ + reginfo->strend = strend; + /* see how far we have to get to not match where we matched before */ + reginfo->till = stringarg + minend; + + if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) { + /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after + S_cleanup_regmatch_info_aux has executed (registered by + SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies + magic belonging to this SV. + Not newSVsv, either, as it does not COW. + */ + assert(!IS_PADGV(sv)); + reginfo->sv = newSV(0); + SvSetSV_nosteal(reginfo->sv, sv); + SAVEFREESV(reginfo->sv); + } + + /* reserve next 2 or 3 slots in PL_regmatch_state: + * slot N+0: may currently be in use: skip it + * slot N+1: use for regmatch_info_aux struct + * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s + * slot N+3: ready for use by regmatch() + */ + + { + regmatch_state *old_regmatch_state; + regmatch_slab *old_regmatch_slab; + int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1; + + /* on first ever match, allocate first slab */ + if (!PL_regmatch_slab) { + Newx(PL_regmatch_slab, 1, regmatch_slab); + PL_regmatch_slab->prev = NULL; + PL_regmatch_slab->next = NULL; + PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab); + } + + old_regmatch_state = PL_regmatch_state; + old_regmatch_slab = PL_regmatch_slab; + + for (i=0; i <= max; i++) { + if (i == 1) + reginfo->info_aux = &(PL_regmatch_state->u.info_aux); + else if (i ==2) + reginfo->info_aux_eval = + reginfo->info_aux->info_aux_eval = + &(PL_regmatch_state->u.info_aux_eval); + + if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab)) + PL_regmatch_state = S_push_slab(aTHX); + } + + /* note initial PL_regmatch_state position; at end of match we'll + * pop back to there and free any higher slabs */ + + reginfo->info_aux->old_regmatch_state = old_regmatch_state; + reginfo->info_aux->old_regmatch_slab = old_regmatch_slab; + reginfo->info_aux->poscache = NULL; + + SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux); + + if ((prog->extflags & RXf_EVAL_SEEN)) + S_setup_eval_state(aTHX_ reginfo); + else + reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL; + } + + /* If there is a "must appear" string, look for it. */ + + if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { + /* We have to be careful. If the previous successful match + was from this regex we don't want a subsequent partially + successful match to clobber the old results. + So when we detect this possibility we add a swap buffer + to the re, and switch the buffer each match. If we fail, + we switch it back; otherwise we leave it swapped. + */ + swap = prog->offs; + /* do we need a save destructor here for eval dies? */ + Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap), + PTR2UV(prog->offs) + )); + } + + /* Simplest case: anchored match need be tried only once. */ + /* [unless only anchor is BOL and multiline is set] */ + if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { + if (s == startpos && regtry(reginfo, &s)) + goto got_it; + else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */ + { + char *end; + + if (minlen) + dontbother = minlen - 1; + end = HOP3c(strend, -dontbother, strbeg) - 1; + /* for multiline we only have to try after newlines */ + if (prog->check_substr || prog->check_utf8) { + /* because of the goto we can not easily reuse the macros for bifurcating the + unicode/non-unicode match modes here like we do elsewhere - demerphq */ + if (utf8_target) { + if (s == startpos) + goto after_try_utf8; + while (1) { + if (regtry(reginfo, &s)) { + goto got_it; + } + after_try_utf8: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, strbeg, + s + UTF8SKIP(s), strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s += UTF8SKIP(s); + } + } + } /* end search for check string in unicode */ + else { + if (s == startpos) { + goto after_try_latin; + } + while (1) { + if (regtry(reginfo, &s)) { + goto got_it; + } + after_try_latin: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, strbeg, + s + 1, strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s++; + } + } + } /* end search for check string in latin*/ + } /* end search for check string */ + else { /* search for newline */ + if (s > startpos) { + /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/ + s--; + } + /* We can use a more efficient search as newlines are the same in unicode as they are in latin */ + while (s <= end) { /* note it could be possible to match at the end of the string */ + if (*s++ == '\n') { /* don't need PL_utf8skip here */ + if (regtry(reginfo, &s)) + goto got_it; + } + } + } /* end search for newline */ + } /* end anchored/multiline check string search */ + goto phooey; + } else if (prog->intflags & PREGf_ANCH_GPOS) + { + /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */ + assert(prog->intflags & PREGf_GPOS_SEEN); + /* For anchored \G, the only position it can match from is + * (ganch-gofs); we already set startpos to this above; if intuit + * moved us on from there, we can't possibly succeed */ + assert(startpos == reginfo->ganch - prog->gofs); + if (s == startpos && regtry(reginfo, &s)) + goto got_it; + goto phooey; + } + + /* Messy cases: unanchored match. */ + if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { + /* we have /x+whatever/ */ + /* it must be a one character string (XXXX Except is_utf8_pat?) */ + char ch; +#ifdef DEBUGGING + int did_match = 0; +#endif + if (utf8_target) { + if (! prog->anchored_utf8) { + to_utf8_substr(prog); + } + ch = SvPVX_const(prog->anchored_utf8)[0]; + REXEC_FBC_SCAN( + if (*s == ch) { + DEBUG_EXECUTE_r( did_match = 1 ); + if (regtry(reginfo, &s)) goto got_it; + s += UTF8SKIP(s); + while (s < strend && *s == ch) + s += UTF8SKIP(s); + } + ); + + } + else { + if (! prog->anchored_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + ch = SvPVX_const(prog->anchored_substr)[0]; + REXEC_FBC_SCAN( + if (*s == ch) { + DEBUG_EXECUTE_r( did_match = 1 ); + if (regtry(reginfo, &s)) goto got_it; + s++; + while (s < strend && *s == ch) + s++; + } + ); + } + DEBUG_EXECUTE_r(if (!did_match) + PerlIO_printf(Perl_debug_log, + "Did not find anchored character...\n") + ); + } + else if (prog->anchored_substr != NULL + || prog->anchored_utf8 != NULL + || ((prog->float_substr != NULL || prog->float_utf8 != NULL) + && prog->float_max_offset < strend - s)) { + SV *must; + SSize_t back_max; + SSize_t back_min; + char *last; + char *last1; /* Last position checked before */ +#ifdef DEBUGGING + int did_match = 0; +#endif + if (prog->anchored_substr || prog->anchored_utf8) { + if (utf8_target) { + if (! prog->anchored_utf8) { + to_utf8_substr(prog); + } + must = prog->anchored_utf8; + } + else { + if (! prog->anchored_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + must = prog->anchored_substr; + } + back_max = back_min = prog->anchored_offset; + } else { + if (utf8_target) { + if (! prog->float_utf8) { + to_utf8_substr(prog); + } + must = prog->float_utf8; + } + else { + if (! prog->float_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + must = prog->float_substr; + } + back_max = prog->float_max_offset; + back_min = prog->float_min_offset; + } + + if (back_min<0) { + last = strend; + } else { + last = HOP3c(strend, /* Cannot start after this */ + -(SSize_t)(CHR_SVLEN(must) + - (SvTAIL(must) != 0) + back_min), strbeg); + } + if (s > reginfo->strbeg) + last1 = HOPc(s, -1); + else + last1 = s - 1; /* bogus */ + + /* XXXX check_substr already used to find "s", can optimize if + check_substr==must. */ + dontbother = 0; + strend = HOPc(strend, -dontbother); + while ( (s <= last) && + (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend), + (unsigned char*)strend, must, + multiline ? FBMrf_MULTILINE : 0)) ) { + DEBUG_EXECUTE_r( did_match = 1 ); + if (HOPc(s, -back_max) > last1) { + last1 = HOPc(s, -back_min); + s = HOPc(s, -back_max); + } + else { + char * const t = (last1 >= reginfo->strbeg) + ? HOPc(last1, 1) : last1 + 1; + + last1 = HOPc(s, -back_min); + s = t; + } + if (utf8_target) { + while (s <= last1) { + if (regtry(reginfo, &s)) + goto got_it; + if (s >= last1) { + s++; /* to break out of outer loop */ + break; + } + s += UTF8SKIP(s); + } + } + else { + while (s <= last1) { + if (regtry(reginfo, &s)) + goto got_it; + s++; + } + } + } + DEBUG_EXECUTE_r(if (!did_match) { + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), + SvPVX_const(must), RE_SV_DUMPLEN(must), 30); + PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n", + ((must == prog->anchored_substr || must == prog->anchored_utf8) + ? "anchored" : "floating"), + quoted, RE_SV_TAIL(must)); + }); + goto phooey; + } + else if ( (c = progi->regstclass) ) { + if (minlen) { + const OPCODE op = OP(progi->regstclass); + /* don't bother with what can't match */ + if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE) + strend = HOPc(strend, -(minlen - 1)); + } + DEBUG_EXECUTE_r({ + SV * const prop = sv_newmortal(); + regprop(prog, prop, c, reginfo); + { + RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), + s,strend-s,60); + PerlIO_printf(Perl_debug_log, + "Matching stclass %.*s against %s (%d bytes)\n", + (int)SvCUR(prop), SvPVX_const(prop), + quoted, (int)(strend - s)); + } + }); + if (find_byclass(prog, c, s, strend, reginfo)) + goto got_it; + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n")); + } + else { + dontbother = 0; + if (prog->float_substr != NULL || prog->float_utf8 != NULL) { + /* Trim the end. */ + char *last= NULL; + SV* float_real; + STRLEN len; + const char *little; + + if (utf8_target) { + if (! prog->float_utf8) { + to_utf8_substr(prog); + } + float_real = prog->float_utf8; + } + else { + if (! prog->float_substr) { + if (! to_byte_substr(prog)) { + NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey); + } + } + float_real = prog->float_substr; + } + + little = SvPV_const(float_real, len); + if (SvTAIL(float_real)) { + /* This means that float_real contains an artificial \n on + * the end due to the presence of something like this: + * /foo$/ where we can match both "foo" and "foo\n" at the + * end of the string. So we have to compare the end of the + * string first against the float_real without the \n and + * then against the full float_real with the string. We + * have to watch out for cases where the string might be + * smaller than the float_real or the float_real without + * the \n. */ + char *checkpos= strend - len; + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%sChecking for float_real.%s\n", + PL_colors[4], PL_colors[5])); + if (checkpos + 1 < strbeg) { + /* can't match, even if we remove the trailing \n + * string is too short to match */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString shorter than required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } else if (memEQ(checkpos + 1, little, len - 1)) { + /* can match, the end of the string matches without the + * "\n" */ + last = checkpos + 1; + } else if (checkpos < strbeg) { + /* cant match, string is too short when the "\n" is + * included */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString does not contain required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } else if (!multiline) { + /* non multiline match, so compare with the "\n" at the + * end of the string */ + if (memEQ(checkpos, little, len)) { + last= checkpos; + } else { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString does not contain required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } + } else { + /* multiline match, so we have to search for a place + * where the full string is located */ + goto find_last; + } + } else { + find_last: + if (len) + last = rninstr(s, strend, little, little + len); + else + last = strend; /* matching "$" */ + } + if (!last) { + /* at one point this block contained a comment which was + * probably incorrect, which said that this was a "should not + * happen" case. Even if it was true when it was written I am + * pretty sure it is not anymore, so I have removed the comment + * and replaced it with this one. Yves */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "String does not contain required substring, cannot match.\n" + )); + goto phooey; + } + dontbother = strend - last + prog->float_min_offset; + } + if (minlen && (dontbother < minlen)) + dontbother = minlen - 1; + strend -= dontbother; /* this one's always in bytes! */ + /* We don't know much -- general case. */ + if (utf8_target) { + for (;;) { + if (regtry(reginfo, &s)) + goto got_it; + if (s >= strend) + break; + s += UTF8SKIP(s); + }; + } + else { + do { + if (regtry(reginfo, &s)) + goto got_it; + } while (s++ < strend); + } + } + + /* Failure. */ + goto phooey; + +got_it: + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (prog->offs[0].start < stringarg - strbeg)) + { + /* this should only be possible under \G */ + assert(prog->intflags & PREGf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + + DEBUG_BUFFERS_r( + if (swap) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap) + ); + ); + Safefree(swap); + + /* clean up; this will trigger destructors that will free all slabs + * above the current one, and cleanup the regmatch_info_aux + * and regmatch_info_aux_eval sructs */ + + LEAVE_SCOPE(oldsave); + + if (RXp_PAREN_NAMES(prog)) + (void)hv_iterinit(RXp_PAREN_NAMES(prog)); + + RX_MATCH_UTF8_set(rx, utf8_target); + + /* make sure $`, $&, $', and $digit will work later */ + if ( !(flags & REXEC_NOT_FIRST) ) + S_reg_set_capture_string(aTHX_ rx, + strbeg, reginfo->strend, + sv, flags, utf8_target); + + return 1; + +phooey: + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", + PL_colors[4], PL_colors[5])); + + /* clean up; this will trigger destructors that will free all slabs + * above the current one, and cleanup the regmatch_info_aux + * and regmatch_info_aux_eval sructs */ + + LEAVE_SCOPE(oldsave); + + if (swap) { + /* we failed :-( roll it back */ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(prog->offs), + PTR2UV(swap) + )); + Safefree(prog->offs); + prog->offs = swap; + } + return 0; +} + + +/* Set which rex is pointed to by PL_reg_curpm, handling ref counting. + * Do inc before dec, in case old and new rex are the same */ +#define SET_reg_curpm(Re2) \ + if (reginfo->info_aux_eval) { \ + (void)ReREFCNT_inc(Re2); \ + ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ + PM_SETRE((PL_reg_curpm), (Re2)); \ + } + + +/* + - regtry - try match at specific point + */ +STATIC I32 /* 0 failure, 1 success */ +S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) +{ + CHECKPOINT lastcp; + REGEXP *const rx = reginfo->prog; + regexp *const prog = ReANY(rx); + SSize_t result; + RXi_GET_DECL(prog,progi); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTRY; + + reginfo->cutpoint=NULL; + + prog->offs[0].start = *startposp - reginfo->strbeg; + prog->lastparen = 0; + prog->lastcloseparen = 0; + + /* XXXX What this code is doing here?!!! There should be no need + to do this again and again, prog->lastparen should take care of + this! --ilya*/ + + /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. + * Actually, the code in regcppop() (which Ilya may be meaning by + * prog->lastparen), is not needed at all by the test suite + * (op/regexp, op/pat, op/split), but that code is needed otherwise + * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ + * Meanwhile, this code *is* needed for the + * above-mentioned test suite tests to succeed. The common theme + * on those tests seems to be returning null fields from matches. + * --jhi updated by dapm */ +#if 1 + if (prog->nparens) { + regexp_paren_pair *pp = prog->offs; + I32 i; + for (i = prog->nparens; i > (I32)prog->lastparen; i--) { + ++pp; + pp->start = -1; + pp->end = -1; + } + } +#endif + REGCP_SET(lastcp); + result = regmatch(reginfo, *startposp, progi->program + 1); + if (result != -1) { + prog->offs[0].end = result; + return 1; + } + if (reginfo->cutpoint) + *startposp= reginfo->cutpoint; + REGCP_UNWIND(lastcp); + return 0; +} + + +#define sayYES goto yes +#define sayNO goto no +#define sayNO_SILENT goto no_silent + +/* we dont use STMT_START/END here because it leads to + "unreachable code" warnings, which are bogus, but distracting. */ +#define CACHEsayNO \ + if (ST.cache_mask) \ + reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \ + sayNO + +/* this is used to determine how far from the left messages like + 'failed...' are printed. It should be set such that messages + are inline with the regop output that created them. +*/ +#define REPORT_CODE_OFF 32 + + +#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */ +#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */ +#define CHRTEST_NOT_A_CP_1 -999 +#define CHRTEST_NOT_A_CP_2 -998 + +/* grab a new slab and return the first slot in it */ + +STATIC regmatch_state * +S_push_slab(pTHX) +{ +#if PERL_VERSION < 9 && !defined(PERL_CORE) + dMY_CXT; +#endif + regmatch_slab *s = PL_regmatch_slab->next; + if (!s) { + Newx(s, 1, regmatch_slab); + s->prev = PL_regmatch_slab; + s->next = NULL; + PL_regmatch_slab->next = s; + } + PL_regmatch_slab = s; + return SLAB_FIRST(s); +} + + +/* push a new state then goto it */ + +#define PUSH_STATE_GOTO(state, node, input) \ + pushinput = input; \ + scan = node; \ + st->resume_state = state; \ + goto push_state; + +/* push a new state with success backtracking, then goto it */ + +#define PUSH_YES_STATE_GOTO(state, node, input) \ + pushinput = input; \ + scan = node; \ + st->resume_state = state; \ + goto push_yes_state; + + + + +/* + +regmatch() - main matching routine + +This is basically one big switch statement in a loop. We execute an op, +set 'next' to point the next op, and continue. If we come to a point which +we may need to backtrack to on failure such as (A|B|C), we push a +backtrack state onto the backtrack stack. On failure, we pop the top +state, and re-enter the loop at the state indicated. If there are no more +states to pop, we return failure. + +Sometimes we also need to backtrack on success; for example /A+/, where +after successfully matching one A, we need to go back and try to +match another one; similarly for lookahead assertions: if the assertion +completes successfully, we backtrack to the state just before the assertion +and then carry on. In these cases, the pushed state is marked as +'backtrack on success too'. This marking is in fact done by a chain of +pointers, each pointing to the previous 'yes' state. On success, we pop to +the nearest yes state, discarding any intermediate failure-only states. +Sometimes a yes state is pushed just to force some cleanup code to be +called at the end of a successful match or submatch; e.g. (??{$re}) uses +it to free the inner regex. + +Note that failure backtracking rewinds the cursor position, while +success backtracking leaves it alone. + +A pattern is complete when the END op is executed, while a subpattern +such as (?=foo) is complete when the SUCCESS op is executed. Both of these +ops trigger the "pop to last yes state if any, otherwise return true" +behaviour. + +A common convention in this function is to use A and B to refer to the two +subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is +the subpattern to be matched possibly multiple times, while B is the entire +rest of the pattern. Variable and state names reflect this convention. + +The states in the main switch are the union of ops and failure/success of +substates associated with with that op. For example, IFMATCH is the op +that does lookahead assertions /(?=A)B/ and so the IFMATCH state means +'execute IFMATCH'; while IFMATCH_A is a state saying that we have just +successfully matched A and IFMATCH_A_fail is a state saying that we have +just failed to match A. Resume states always come in pairs. The backtrack +state we push is marked as 'IFMATCH_A', but when that is popped, we resume +at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking +on success or failure. + +The struct that holds a backtracking state is actually a big union, with +one variant for each major type of op. The variable st points to the +top-most backtrack struct. To make the code clearer, within each +block of code we #define ST to alias the relevant union. + +Here's a concrete example of a (vastly oversimplified) IFMATCH +implementation: + + switch (state) { + .... + +#define ST st->u.ifmatch + + case IFMATCH: // we are executing the IFMATCH op, (?=A)B + ST.foo = ...; // some state we wish to save + ... + // push a yes backtrack state with a resume value of + // IFMATCH_A/IFMATCH_A_fail, then continue execution at the + // first node of A: + PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput); + // NOTREACHED + + case IFMATCH_A: // we have successfully executed A; now continue with B + next = B; + bar = ST.foo; // do something with the preserved value + break; + + case IFMATCH_A_fail: // A failed, so the assertion failed + ...; // do some housekeeping, then ... + sayNO; // propagate the failure + +#undef ST + + ... + } + +For any old-timers reading this who are familiar with the old recursive +approach, the code above is equivalent to: + + case IFMATCH: // we are executing the IFMATCH op, (?=A)B + { + int foo = ... + ... + if (regmatch(A)) { + next = B; + bar = foo; + break; + } + ...; // do some housekeeping, then ... + sayNO; // propagate the failure + } + +The topmost backtrack state, pointed to by st, is usually free. If you +want to claim it, populate any ST.foo fields in it with values you wish to +save, then do one of + + PUSH_STATE_GOTO(resume_state, node, newinput); + PUSH_YES_STATE_GOTO(resume_state, node, newinput); + +which sets that backtrack state's resume value to 'resume_state', pushes a +new free entry to the top of the backtrack stack, then goes to 'node'. +On backtracking, the free slot is popped, and the saved state becomes the +new free state. An ST.foo field in this new top state can be temporarily +accessed to retrieve values, but once the main loop is re-entered, it +becomes available for reuse. + +Note that the depth of the backtrack stack constantly increases during the +left-to-right execution of the pattern, rather than going up and down with +the pattern nesting. For example the stack is at its maximum at Z at the +end of the pattern, rather than at X in the following: + + /(((X)+)+)+....(Y)+....Z/ + +The only exceptions to this are lookahead/behind assertions and the cut, +(?>A), which pop all the backtrack states associated with A before +continuing. + +Backtrack state structs are allocated in slabs of about 4K in size. +PL_regmatch_state and st always point to the currently active state, +and PL_regmatch_slab points to the slab currently containing +PL_regmatch_state. The first time regmatch() is called, the first slab is +allocated, and is never freed until interpreter destruction. When the slab +is full, a new one is allocated and chained to the end. At exit from +regmatch(), slabs allocated since entry are freed. + +*/ + + +#define DEBUG_STATE_pp(pp) \ + DEBUG_STATE_r({ \ + DUMP_EXEC_POS(locinput, scan, utf8_target); \ + PerlIO_printf(Perl_debug_log, \ + " %*s"pp" %s%s%s%s%s\n", \ + depth*2, "", \ + PL_reg_name[st->resume_state], \ + ((st==yes_state||st==mark_state) ? "[" : ""), \ + ((st==yes_state) ? "Y" : ""), \ + ((st==mark_state) ? "M" : ""), \ + ((st==yes_state||st==mark_state) ? "]" : "") \ + ); \ + }); + + +#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1) + +#ifdef DEBUGGING + +STATIC void +S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, + const char *start, const char *end, const char *blurb) +{ + const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; + + PERL_ARGS_ASSERT_DEBUG_START_MATCH; + + if (!PL_colorset) + reginitcolors(); + { + RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), + RX_PRECOMP_const(prog), RX_PRELEN(prog), 60); + + RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), + start, end - start, 60); + + PerlIO_printf(Perl_debug_log, + "%s%s REx%s %s against %s\n", + PL_colors[4], blurb, PL_colors[5], s0, s1); + + if (utf8_target||utf8_pat) + PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n", + utf8_pat ? "pattern" : "", + utf8_pat && utf8_target ? " and " : "", + utf8_target ? "string" : "" + ); + } +} + +STATIC void +S_dump_exec_pos(pTHX_ const char *locinput, + const regnode *scan, + const char *loc_regeol, + const char *loc_bostr, + const char *loc_reg_starttry, + const bool utf8_target) +{ + const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; + const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ + int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput); + /* The part of the string before starttry has one color + (pref0_len chars), between starttry and current + position another one (pref_len - pref0_len chars), + after the current position the third one. + We assume that pref0_len <= pref_len, otherwise we + decrease pref0_len. */ + int pref_len = (locinput - loc_bostr) > (5 + taill) - l + ? (5 + taill) - l : locinput - loc_bostr; + int pref0_len; + + PERL_ARGS_ASSERT_DUMP_EXEC_POS; + + while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) + pref_len++; + pref0_len = pref_len - (locinput - loc_reg_starttry); + if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) + l = ( loc_regeol - locinput > (5 + taill) - pref_len + ? (5 + taill) - pref_len : loc_regeol - locinput); + while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) + l--; + if (pref0_len < 0) + pref0_len = 0; + if (pref0_len > pref_len) + pref0_len = pref_len; + { + const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0; + + RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), + (locinput - pref_len),pref0_len, 60, 4, 5); + + RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), + (locinput - pref_len + pref0_len), + pref_len - pref0_len, 60, 2, 3); + + RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), + locinput, loc_regeol - locinput, 10, 0, 1); + + const STRLEN tlen=len0+len1+len2; + PerlIO_printf(Perl_debug_log, + "%4"IVdf" <%.*s%.*s%s%.*s>%*s|", + (IV)(locinput - loc_bostr), + len0, s0, + len1, s1, + (docolor ? "" : "> <"), + len2, s2, + (int)(tlen > 19 ? 0 : 19 - tlen), + ""); + } +} + +#endif + +/* reg_check_named_buff_matched() + * Checks to see if a named buffer has matched. The data array of + * buffer numbers corresponding to the buffer is expected to reside + * in the regexp->data->data array in the slot stored in the ARG() of + * node involved. Note that this routine doesn't actually care about the + * name, that information is not preserved from compilation to execution. + * Returns the index of the leftmost defined buffer with the given name + * or 0 if non of the buffers matched. + */ +STATIC I32 +S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan) +{ + I32 n; + RXi_GET_DECL(rex,rexi); + SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + I32 *nums=(I32*)SvPVX(sv_dat); + + PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; + + for ( n=0; nlastparen >= nums[n] && + rex->offs[nums[n]].end != -1) + { + return nums[n]; + } + } + return 0; +} + + +static bool +S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, + U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo) +{ + /* This function determines if there are one or two characters that match + * the first character of the passed-in EXACTish node , and if + * so, returns them in the passed-in pointers. + * + * If it determines that no possible character in the target string can + * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if + * the first character in requires UTF-8 to represent, and the + * target string isn't in UTF-8.) + * + * If there are more than two characters that could match the beginning of + * , or if more context is required to determine a match or not, + * it sets both * and * to CHRTEST_VOID. + * + * The motiviation behind this function is to allow the caller to set up + * tight loops for matching. If is of type EXACT, there is + * only one possible character that can match its first character, and so + * the situation is quite simple. But things get much more complicated if + * folding is involved. It may be that the first character of an EXACTFish + * node doesn't participate in any possible fold, e.g., punctuation, so it + * can be matched only by itself. The vast majority of characters that are + * in folds match just two things, their lower and upper-case equivalents. + * But not all are like that; some have multiple possible matches, or match + * sequences of more than one character. This function sorts all that out. + * + * Consider the patterns A*B or A*?B where A and B are arbitrary. In a + * loop of trying to match A*, we know we can't exit where the thing + * following it isn't a B. And something can't be a B unless it is the + * beginning of B. By putting a quick test for that beginning in a tight + * loop, we can rule out things that can't possibly be B without having to + * break out of the loop, thus avoiding work. Similarly, if A is a single + * character, we can make a tight loop matching A*, using the outputs of + * this function. + * + * If the target string to match isn't in UTF-8, and there aren't + * complications which require CHRTEST_VOID, * and * are set to + * the one or two possible octets (which are characters in this situation) + * that can match. In all cases, if there is only one character that can + * match, * and * will be identical. + * + * If the target string is in UTF-8, the buffers pointed to by + * and will contain the one or two UTF-8 sequences of bytes that + * can match the beginning of . They should be declared with at + * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is + * undefined what these contain.) If one or both of the buffers are + * invariant under UTF-8, *, and * will also be set to the + * corresponding invariant. If variant, the corresponding * and/or + * * will be set to a negative number(s) that shouldn't match any code + * point (unless inappropriately coerced to unsigned). * will equal + * * if and only if and are the same. */ + + const bool utf8_target = reginfo->is_utf8_target; + + UV c1 = (UV)CHRTEST_NOT_A_CP_1; + UV c2 = (UV)CHRTEST_NOT_A_CP_2; + bool use_chrtest_void = FALSE; + const bool is_utf8_pat = reginfo->is_utf8_pat; + + /* Used when we have both utf8 input and utf8 output, to avoid converting + * to/from code points */ + bool utf8_has_been_setup = FALSE; + + dVAR; + + U8 *pat = (U8*)STRING(text_node); + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + + if (OP(text_node) == EXACT) { + + /* In an exact node, only one thing can be matched, that first + * character. If both the pat and the target are UTF-8, we can just + * copy the input to the output, avoiding finding the code point of + * that character */ + if (!is_utf8_pat) { + c2 = c1 = *pat; + } + else if (utf8_target) { + Copy(pat, c1_utf8, UTF8SKIP(pat), U8); + Copy(pat, c2_utf8, UTF8SKIP(pat), U8); + utf8_has_been_setup = TRUE; + } + else { + c2 = c1 = valid_utf8_to_uvchr(pat, NULL); + } + } + else { /* an EXACTFish node */ + U8 *pat_end = pat + STR_LEN(text_node); + + /* An EXACTFL node has at least some characters unfolded, because what + * they match is not known until now. So, now is the time to fold + * the first few of them, as many as are needed to determine 'c1' and + * 'c2' later in the routine. If the pattern isn't UTF-8, we only need + * to fold if in a UTF-8 locale, and then only the Sharp S; everything + * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we + * need to fold as many characters as a single character can fold to, + * so that later we can check if the first ones are such a multi-char + * fold. But, in such a pattern only locale-problematic characters + * aren't folded, so we can skip this completely if the first character + * in the node isn't one of the tricky ones */ + if (OP(text_node) == EXACTFL) { + + if (! is_utf8_pat) { + if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) + { + folded[0] = folded[1] = 's'; + pat = folded; + pat_end = folded + 2; + } + } + else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { + U8 *s = pat; + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD_LC(*s); + s++; + } + else { + STRLEN len; + _to_utf8_fold_flags(s, + d, + &len, + FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); + d += len; + s += UTF8SKIP(s); + } + } + + pat = folded; + pat_end = d; + } + } + + if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) + || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) + { + /* Multi-character folds require more context to sort out. Also + * PL_utf8_foldclosures used below doesn't handle them, so have to + * be handled outside this routine */ + use_chrtest_void = TRUE; + } + else { /* an EXACTFish node which doesn't begin with a multi-char fold */ + c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; + if (c1 > 255) { + /* Load the folds hash, if not already done */ + SV** listp; + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); + } + + /* The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((! (listp = hv_fetch(PL_utf8_foldclosures, + (char *) pat, + UTF8SKIP(pat), + FALSE)))) + { + /* Not found in the hash, therefore there are no folds + * containing it, so there is only a single character that + * could match */ + c2 = c1; + } + else { /* Does participate in folds */ + AV* list = (AV*) *listp; + if (av_tindex(list) != 1) { + + /* If there aren't exactly two folds to this, it is + * outside the scope of this function */ + use_chrtest_void = TRUE; + } + else { /* There are two. Get them */ + SV** c_p = av_fetch(list, 0, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c1 = SvUV(*c_p); + + c_p = av_fetch(list, 1, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c2 = SvUV(*c_p); + + /* Folds that cross the 255/256 boundary are forbidden + * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and + * one is ASCIII. Since the pattern character is above + * 255, and its only other match is below 256, the only + * legal match will be to itself. We have thrown away + * the original, so have to compute which is the one + * above 255. */ + if ((c1 < 256) != (c2 < 256)) { + if ((OP(text_node) == EXACTFL + && ! IN_UTF8_CTYPE_LOCALE) + || ((OP(text_node) == EXACTFA + || OP(text_node) == EXACTFA_NO_TRIE) + && (isASCII(c1) || isASCII(c2)))) + { + if (c1 < 256) { + c1 = c2; + } + else { + c2 = c1; + } + } + } + } + } + } + else /* Here, c1 is <= 255 */ + if (utf8_target + && HAS_NONLATIN1_FOLD_CLOSURE(c1) + && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) + && ((OP(text_node) != EXACTFA + && OP(text_node) != EXACTFA_NO_TRIE) + || ! isASCII(c1))) + { + /* Here, there could be something above Latin1 in the target + * which folds to this character in the pattern. All such + * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more + * than two characters involved in their folds, so are outside + * the scope of this function */ + if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + } + else { + use_chrtest_void = TRUE; + } + } + else { /* Here nothing above Latin1 can fold to the pattern + character */ + switch (OP(text_node)) { + + case EXACTFL: /* /l rules */ + c2 = PL_fold_locale[c1]; + break; + + case EXACTF: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + if (! utf8_target) { /* /d rules */ + c2 = PL_fold[c1]; + break; + } + /* FALLTHROUGH */ + /* /u rules for all these. This happens to work for + * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA_NO_TRIE: /* This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + case EXACTFU_SS: + case EXACTFU: + c2 = PL_fold_latin1[c1]; + break; + + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); + assert(0); /* NOTREACHED */ + } + } + } + } + + /* Here have figured things out. Set up the returns */ + if (use_chrtest_void) { + *c2p = *c1p = CHRTEST_VOID; + } + else if (utf8_target) { + if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */ + uvchr_to_utf8(c1_utf8, c1); + uvchr_to_utf8(c2_utf8, c2); + } + + /* Invariants are stored in both the utf8 and byte outputs; Use + * negative numbers otherwise for the byte ones. Make sure that the + * byte ones are the same iff the utf8 ones are the same */ + *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1; + *c2p = (UTF8_IS_INVARIANT(*c2_utf8)) + ? *c2_utf8 + : (c1 == c2) + ? CHRTEST_NOT_A_CP_1 + : CHRTEST_NOT_A_CP_2; + } + else if (c1 > 255) { + if (c2 > 255) { /* both possibilities are above what a non-utf8 string + can represent */ + return FALSE; + } + + *c1p = *c2p = c2; /* c2 is the only representable value */ + } + else { /* c1 is representable; see about c2 */ + *c1p = c1; + *c2p = (c2 < 256) ? c2 : c1; + } + + return TRUE; +} + +/* returns -1 on failure, $+[0] on success */ +STATIC SSize_t +S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) +{ +#if PERL_VERSION < 9 && !defined(PERL_CORE) + dMY_CXT; +#endif + dVAR; + const bool utf8_target = reginfo->is_utf8_target; + const U32 uniflags = UTF8_ALLOW_DEFAULT; + REGEXP *rex_sv = reginfo->prog; + regexp *rex = ReANY(rex_sv); + RXi_GET_DECL(rex,rexi); + /* the current state. This is a cached copy of PL_regmatch_state */ + regmatch_state *st; + /* cache heavy used fields of st in registers */ + regnode *scan; + regnode *next; + U32 n = 0; /* general value; init to avoid compiler warning */ + SSize_t ln = 0; /* len or last; init to avoid compiler warning */ + char *locinput = startpos; + char *pushinput; /* where to continue after a PUSH */ + I32 nextchr; /* is always set to UCHARAT(locinput) */ + + bool result = 0; /* return value of S_regmatch */ + int depth = 0; /* depth of backtrack stack */ + U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ + const U32 max_nochange_depth = + (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ? + 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; + regmatch_state *yes_state = NULL; /* state to pop to on success of + subpattern */ + /* mark_state piggy backs on the yes_state logic so that when we unwind + the stack on success we can update the mark_state as we go */ + regmatch_state *mark_state = NULL; /* last mark state we have seen */ + regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ + struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ + U32 state_num; + bool no_final = 0; /* prevent failure from backtracking? */ + bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ + char *startpoint = locinput; + SV *popmark = NULL; /* are we looking for a mark? */ + SV *sv_commit = NULL; /* last mark name seen in failure */ + SV *sv_yes_mark = NULL; /* last mark name we have seen + during a successful match */ + U32 lastopen = 0; /* last open we saw */ + bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; + SV* const oreplsv = GvSVn(PL_replgv); + /* these three flags are set by various ops to signal information to + * the very next op. They have a useful lifetime of exactly one loop + * iteration, and are not preserved or restored by state pushes/pops + */ + bool sw = 0; /* the condition value in (?(cond)a|b) */ + bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ + int logical = 0; /* the following EVAL is: + 0: (?{...}) + 1: (?(?{...})X|Y) + 2: (??{...}) + or the following IFMATCH/UNLESSM is: + false: plain (?=foo) + true: used as a condition: (?(?=foo)) + */ + PAD* last_pad = NULL; + dMULTICALL; + I32 gimme = G_SCALAR; + CV *caller_cv = NULL; /* who called us */ + CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ + CHECKPOINT runops_cp; /* savestack position before executing EVAL */ + U32 maxopenparen = 0; /* max '(' index seen so far */ + int to_complement; /* Invert the result? */ + _char_class_number classnum; + bool is_utf8_pat = reginfo->is_utf8_pat; + +#ifdef DEBUGGING + GET_RE_DEBUG_FLAGS_DECL; +#endif + + /* protect against undef(*^R) */ + SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv)); + + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ + multicall_oldcatch = 0; + multicall_cv = NULL; + cx = NULL; + PERL_UNUSED_VAR(multicall_cop); + PERL_UNUSED_VAR(newsp); + + + PERL_ARGS_ASSERT_REGMATCH; + + DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ + PerlIO_printf(Perl_debug_log,"regmatch start\n"); + })); + + st = PL_regmatch_state; + + /* Note that nextchr is a byte even in UTF */ + SET_nextchr; + scan = prog; + while (scan != NULL) { + + DEBUG_EXECUTE_r( { + SV * const prop = sv_newmortal(); + regnode *rnext=regnext(scan); + DUMP_EXEC_POS( locinput, scan, utf8_target ); + regprop(rex, prop, scan, reginfo); + + PerlIO_printf(Perl_debug_log, + "%3"IVdf":%*s%s(%"IVdf")\n", + (IV)(scan - rexi->program), depth*2, "", + SvPVX_const(prop), + (PL_regkind[OP(scan)] == END || !rnext) ? + 0 : (IV)(rnext - rexi->program)); + }); + + next = scan + NEXT_OFF(scan); + if (next == scan) + next = NULL; + state_num = OP(scan); + + REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st); + reenter_switch: + to_complement = 0; + + SET_nextchr; + assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); + + switch (state_num) { + case BOL: /* /^../ */ + case SBOL: /* /^../s */ + if (locinput == reginfo->strbeg) + break; + sayNO; + + case MBOL: /* /^../m */ + if (locinput == reginfo->strbeg || + (!NEXTCHR_IS_EOS && locinput[-1] == '\n')) + { + break; + } + sayNO; + + case GPOS: /* \G */ + if (locinput == reginfo->ganch) + break; + sayNO; + + case KEEPS: /* \K */ + /* update the startpoint */ + st->u.keeper.val = rex->offs[0].start; + rex->offs[0].start = locinput - reginfo->strbeg; + PUSH_STATE_GOTO(KEEPS_next, next, locinput); + /* NOTREACHED */ + assert(0); + + case KEEPS_next_fail: + /* rollback the start point change */ + rex->offs[0].start = st->u.keeper.val; + sayNO_SILENT; + /* NOTREACHED */ + assert(0); + + case MEOL: /* /..$/m */ + if (!NEXTCHR_IS_EOS && nextchr != '\n') + sayNO; + break; + + case EOL: /* /..$/ */ + /* FALLTHROUGH */ + case SEOL: /* /..$/s */ + if (!NEXTCHR_IS_EOS && nextchr != '\n') + sayNO; + if (reginfo->strend - locinput > 1) + sayNO; + break; + + case EOS: /* \z */ + if (!NEXTCHR_IS_EOS) + sayNO; + break; + + case SANY: /* /./s */ + if (NEXTCHR_IS_EOS) + sayNO; + goto increment_locinput; + + case CANY: /* \C */ + if (NEXTCHR_IS_EOS) + sayNO; + locinput++; + break; + + case REG_ANY: /* /./ */ + if ((NEXTCHR_IS_EOS) || nextchr == '\n') + sayNO; + goto increment_locinput; + + +#undef ST +#define ST st->u.trie + case TRIEC: /* (ab|cd) with known charclass */ + /* In this case the charclass data is available inline so + we can fail fast without a lot of extra overhead. + */ + if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + /* NOTREACHED */ + assert(0); + } + /* FALLTHROUGH */ + case TRIE: /* (ab|cd) */ + /* the basic plan of execution of the trie is: + * At the beginning, run though all the states, and + * find the longest-matching word. Also remember the position + * of the shortest matching word. For example, this pattern: + * 1 2 3 4 5 + * ab|a|x|abcd|abc + * when matched against the string "abcde", will generate + * accept states for all words except 3, with the longest + * matching word being 4, and the shortest being 2 (with + * the position being after char 1 of the string). + * + * Then for each matching word, in word order (i.e. 1,2,4,5), + * we run the remainder of the pattern; on each try setting + * the current position to the character following the word, + * returning to try the next word on failure. + * + * We avoid having to build a list of words at runtime by + * using a compile-time structure, wordinfo[].prev, which + * gives, for each word, the previous accepting word (if any). + * In the case above it would contain the mappings 1->2, 2->0, + * 3->0, 4->5, 5->1. We can use this table to generate, from + * the longest word (4 above), a list of all words, by + * following the list of prev pointers; this gives us the + * unordered list 4,5,1,2. Then given the current word we have + * just tried, we can go through the list and find the + * next-biggest word to try (so if we just failed on word 2, + * the next in the list is 4). + * + * Since at runtime we don't record the matching position in + * the string for each word, we have to work that out for + * each word we're about to process. The wordinfo table holds + * the character length of each word; given that we recorded + * at the start: the position of the shortest word and its + * length in chars, we just need to move the pointer the + * difference between the two char lengths. Depending on + * Unicode status and folding, that's cheap or expensive. + * + * This algorithm is optimised for the case where are only a + * small number of accept states, i.e. 0,1, or maybe 2. + * With lots of accepts states, and having to try all of them, + * it becomes quadratic on number of accept states to find all + * the next words. + */ + + { + /* what type of TRIE am I? (utf8 makes this contextual) */ + DECL_TRIE_TYPE(scan); + + /* what trie are we using right now */ + reg_trie_data * const trie + = (reg_trie_data*)rexi->data->data[ ARG( scan ) ]; + HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); + U32 state = trie->startstate; + + if ( trie->bitmap + && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr))) + { + if (trie->states[ state ].wordnum) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %smatched empty string...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + if (!trie->jump) + break; + } else { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + } + } + + { + U8 *uc = ( U8* )locinput; + + STRLEN len = 0; + STRLEN foldlen = 0; + U8 *uscan = (U8*)NULL; + U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + U32 charcount = 0; /* how many input chars we have matched */ + U32 accepted = 0; /* have we seen any accepting states? */ + + ST.jump = trie->jump; + ST.me = scan; + ST.firstpos = NULL; + ST.longfold = FALSE; /* char longer if folded => it's harder */ + ST.nextword = 0; + + /* fully traverse the TRIE; note the position of the + shortest accept state and the wordnum of the longest + accept state */ + + while ( state && uc <= (U8*)(reginfo->strend) ) { + U32 base = trie->states[ state ].trans.base; + UV uvc = 0; + U16 charid = 0; + U16 wordnum; + wordnum = trie->states[ state ].wordnum; + + if (wordnum) { /* it's an accept state */ + if (!accepted) { + accepted = 1; + /* record first match position */ + if (ST.longfold) { + ST.firstpos = (U8*)locinput; + ST.firstchars = 0; + } + else { + ST.firstpos = uc; + ST.firstchars = charcount; + } + } + if (!ST.nextword || wordnum < ST.nextword) + ST.nextword = wordnum; + ST.topword = wordnum; + } + + DEBUG_TRIE_EXECUTE_r({ + DUMP_EXEC_POS( (char *)uc, scan, utf8_target ); + PerlIO_printf( Perl_debug_log, + "%*s %sState: %4"UVxf" Accepted: %c ", + 2+depth * 2, "", PL_colors[4], + (UV)state, (accepted ? 'Y' : 'N')); + }); + + /* read a char and goto next state */ + if ( base && (foldlen || uc < (U8*)(reginfo->strend))) { + I32 offset; + REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, + uscan, len, uvc, charid, foldlen, + foldbuf, uniflags); + charcount++; + if (foldlen>0) + ST.longfold = TRUE; + if (charid && + ( ((offset = + base + charid - 1 - trie->uniquecharcount)) >= 0) + + && ((U32)offset < trie->lasttrans) + && trie->trans[offset].check == state) + { + state = trie->trans[offset].next; + } + else { + state = 0; + } + uc += len; + + } + else { + state = 0; + } + DEBUG_TRIE_EXECUTE_r( + PerlIO_printf( Perl_debug_log, + "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n", + charid, uvc, (UV)state, PL_colors[5] ); + ); + } + if (!accepted) + sayNO; + + /* calculate total number of accept states */ + { + U16 w = ST.topword; + accepted = 0; + while (w) { + w = trie->wordinfo[w].prev; + accepted++; + } + ST.accepted = accepted; + } + + DEBUG_EXECUTE_r( + PerlIO_printf( Perl_debug_log, + "%*s %sgot %"IVdf" possible matches%s\n", + REPORT_CODE_OFF + depth * 2, "", + PL_colors[4], (IV)ST.accepted, PL_colors[5] ); + ); + goto trie_first_try; /* jump into the fail handler */ + }} + /* NOTREACHED */ + assert(0); + + case TRIE_next_fail: /* we failed - try next alternative */ + { + U8 *uc; + if ( ST.jump) { + REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + if (!--ST.accepted) { + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sTRIE failed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + PL_colors[5] ); + }); + sayNO_SILENT; + } + { + /* Find next-highest word to process. Note that this code + * is O(N^2) per trie run (O(N) per branch), so keep tight */ + U16 min = 0; + U16 word; + U16 const nextword = ST.nextword; + reg_trie_wordinfo * const wordinfo + = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo; + for (word=ST.topword; word; word=wordinfo[word].prev) { + if (word > nextword && (!min || word < min)) + min = word; + } + ST.nextword = min; + } + + trie_first_try: + if (do_cutgroup) { + do_cutgroup = 0; + no_final = 0; + } + + if ( ST.jump) { + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + REGCP_SET(ST.cp); + } + + /* find start char of end of current word */ + { + U32 chars; /* how many chars to skip */ + reg_trie_data * const trie + = (reg_trie_data*)rexi->data->data[ARG(ST.me)]; + + assert((trie->wordinfo[ST.nextword].len - trie->prefixlen) + >= ST.firstchars); + chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen) + - ST.firstchars; + uc = ST.firstpos; + + if (ST.longfold) { + /* the hard option - fold each char in turn and find + * its folded length (which may be different */ + U8 foldbuf[UTF8_MAXBYTES_CASE + 1]; + STRLEN foldlen; + STRLEN len; + UV uvc; + U8 *uscan; + + while (chars) { + if (utf8_target) { + uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, + uniflags); + uc += len; + } + else { + uvc = *uc; + uc++; + } + uvc = to_uni_fold(uvc, foldbuf, &foldlen); + uscan = foldbuf; + while (foldlen) { + if (!--chars) + break; + uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, + uniflags); + uscan += len; + foldlen -= len; + } + } + } + else { + if (utf8_target) + while (chars--) + uc += UTF8SKIP(uc); + else + uc += chars; + } + } + + scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) + ? ST.jump[ST.nextword] + : NEXT_OFF(ST.me)); + + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sTRIE matched word #%d, continuing%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + ST.nextword, + PL_colors[5] + ); + }); + + if (ST.accepted > 1 || has_cutgroup) { + PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc); + /* NOTREACHED */ + assert(0); + } + /* only one choice left - just continue */ + DEBUG_EXECUTE_r({ + AV *const trie_words + = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]); + SV ** const tmp = av_fetch( trie_words, + ST.nextword-1, 0 ); + SV *sv= tmp ? sv_newmortal() : NULL; + + PerlIO_printf( Perl_debug_log, + "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], + ST.nextword, + tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII + ) + : "not compiled under -Dr", + PL_colors[5] ); + }); + + locinput = (char*)uc; + continue; /* execute rest of RE */ + /* NOTREACHED */ + assert(0); + } +#undef ST + + case EXACT: { /* /abc/ */ + char *s = STRING(scan); + ln = STR_LEN(scan); + if (utf8_target != is_utf8_pat) { + /* The target and the pattern have differing utf8ness. */ + char *l = locinput; + const char * const e = s + ln; + + if (utf8_target) { + /* The target is utf8, the pattern is not utf8. + * Above-Latin1 code points can't match the pattern; + * invariants match exactly, and the other Latin1 ones need + * to be downgraded to a single byte in order to do the + * comparison. (If we could be confident that the target + * is not malformed, this could be refactored to have fewer + * tests by just assuming that if the first bytes match, it + * is an invariant, but there are tests in the test suite + * dealing with (??{...}) which violate this) */ + while (s < e) { + if (l >= reginfo->strend + || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) + { + sayNO; + } + if (UTF8_IS_INVARIANT(*(U8*)l)) { + if (*l != *s) { + sayNO; + } + l++; + } + else { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { + sayNO; + } + l += 2; + } + s++; + } + } + else { + /* The target is not utf8, the pattern is utf8. */ + while (s < e) { + if (l >= reginfo->strend + || UTF8_IS_ABOVE_LATIN1(* (U8*) s)) + { + sayNO; + } + if (UTF8_IS_INVARIANT(*(U8*)s)) { + if (*s != *l) { + sayNO; + } + s++; + } + else { + if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { + sayNO; + } + s += 2; + } + l++; + } + } + locinput = l; + } + else { + /* The target and the pattern have the same utf8ness. */ + /* Inline the first character, for speed. */ + if (reginfo->strend - locinput < ln + || UCHARAT(s) != nextchr + || (ln > 1 && memNE(s, locinput, ln))) + { + sayNO; + } + locinput += ln; + } + break; + } + + case EXACTFL: { /* /abc/il */ + re_fold_t folder; + const U8 * fold_array; + const char * s; + U32 fold_utf8_flags; + + folder = foldEQ_locale; + fold_array = PL_fold_locale; + fold_utf8_flags = FOLDEQ_LOCALE; + goto do_exactf; + + case EXACTFU_SS: /* /\x{df}/iu */ + case EXACTFU: /* /abc/iu */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; + goto do_exactf; + + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: /* /abc/iaa */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf; + + case EXACTF: /* /abc/i This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + folder = foldEQ; + fold_array = PL_fold; + fold_utf8_flags = 0; + + do_exactf: + s = STRING(scan); + ln = STR_LEN(scan); + + if (utf8_target + || is_utf8_pat + || state_num == EXACTFU_SS + || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE)) + { + /* Either target or the pattern are utf8, or has the issue where + * the fold lengths may differ. */ + const char * const l = locinput; + char *e = reginfo->strend; + + if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat, + l, &e, 0, utf8_target, fold_utf8_flags)) + { + sayNO; + } + locinput = e; + break; + } + + /* Neither the target nor the pattern are utf8 */ + if (UCHARAT(s) != nextchr + && !NEXTCHR_IS_EOS + && UCHARAT(s) != fold_array[nextchr]) + { + sayNO; + } + if (reginfo->strend - locinput < ln) + sayNO; + if (ln > 1 && ! folder(s, locinput, ln)) + sayNO; + locinput += ln; + break; + } + + /* XXX Could improve efficiency by separating these all out using a + * macro or in-line function. At that point regcomp.c would no longer + * have to set the FLAGS fields of these */ + case BOUNDL: /* /\b/l */ + case NBOUNDL: /* /\B/l */ + case BOUND: /* /\b/ */ + case BOUNDU: /* /\b/u */ + case BOUNDA: /* /\b/a */ + case NBOUND: /* /\B/ */ + case NBOUNDU: /* /\B/u */ + case NBOUNDA: /* /\B/a */ + /* was last char in word? */ + if (utf8_target + && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET + && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET) + { + if (locinput == reginfo->strbeg) + ln = '\n'; + else { + const U8 * const r = + reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); + + ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, + 0, uniflags); + } + if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { + ln = isWORDCHAR_uni(ln); + if (NEXTCHR_IS_EOS) + n = 0; + else { + LOAD_UTF8_CHARCLASS_ALNUM(); + n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput, + utf8_target); + } + } + else { + ln = isWORDCHAR_LC_uvchr(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); + } + } + else { + + /* Here the string isn't utf8, or is utf8 and only ascii + * characters are to match \w. In the latter case looking at + * the byte just prior to the current one may be just the final + * byte of a multi-byte character. This is ok. There are two + * cases: + * 1) it is a single byte character, and then the test is doing + * just what it's supposed to. + * 2) it is a multi-byte character, in which case the final + * byte is never mistakable for ASCII, and so the test + * will say it is not a word character, which is the + * correct answer. */ + ln = (locinput != reginfo->strbeg) ? + UCHARAT(locinput - 1) : '\n'; + switch (FLAGS(scan)) { + case REGEX_UNICODE_CHARSET: + ln = isWORDCHAR_L1(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr); + break; + case REGEX_LOCALE_CHARSET: + ln = isWORDCHAR_LC(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr); + break; + case REGEX_DEPENDS_CHARSET: + ln = isWORDCHAR(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr); + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + ln = isWORDCHAR_A(ln); + n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr); + break; + default: + Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan)); + } + } + /* Note requires that all BOUNDs be lower than all NBOUNDs in + * regcomp.sym */ + if (((!ln) == (!n)) == (OP(scan) < NBOUND)) + sayNO; + break; + + case ANYOF: /* /[abc]/ */ + if (NEXTCHR_IS_EOS) + sayNO; + if (utf8_target) { + if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, + utf8_target)) + sayNO; + locinput += UTF8SKIP(locinput); + } + else { + if (!REGINCLASS(rex, scan, (U8*)locinput)) + sayNO; + locinput++; + } + break; + + /* The argument (FLAGS) to all the POSIX node types is the class number + * */ + + case NPOSIXL: /* \W or [:^punct:] etc. under /l */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: /* \w or [:punct:] etc. under /l */ + if (NEXTCHR_IS_EOS) + sayNO; + + /* Use isFOO_lc() for characters within Latin1. (Note that + * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else + * wouldn't be invariant) */ + if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { + if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) { + sayNO; + } + } + else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { + if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, + *(locinput + 1)))))) + { + sayNO; + } + } + else { /* Here, must be an above Latin-1 code point */ + goto utf8_posix_not_eos; + } + + /* Here, must be utf8 */ + locinput += UTF8SKIP(locinput); + break; + + case NPOSIXD: /* \W or [:^punct:] etc. under /d */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXD: /* \w or [:punct:] etc. under /d */ + if (utf8_target) { + goto utf8_posix; + } + goto posixa; + + case NPOSIXA: /* \W or [:^punct:] etc. under /a */ + + if (NEXTCHR_IS_EOS) { + sayNO; + } + + /* All UTF-8 variants match */ + if (! UTF8_IS_INVARIANT(nextchr)) { + goto increment_locinput; + } + + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXA: /* \w or [:punct:] etc. under /a */ + + posixa: + /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in + * UTF-8, and also from NPOSIXA even in UTF-8 when the current + * character is a single byte */ + + if (NEXTCHR_IS_EOS + || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr, + FLAGS(scan))))) + { + sayNO; + } + + /* Here we are either not in utf8, or we matched a utf8-invariant, + * so the next char is the next byte */ + locinput++; + break; + + case NPOSIXU: /* \W or [:^punct:] etc. under /u */ + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: /* \w or [:punct:] etc. under /u */ + utf8_posix: + if (NEXTCHR_IS_EOS) { + sayNO; + } + utf8_posix_not_eos: + + /* Use _generic_isCC() for characters within Latin1. (Note that + * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else + * wouldn't be invariant) */ + if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { + if (! (to_complement ^ cBOOL(_generic_isCC(nextchr, + FLAGS(scan))))) + { + sayNO; + } + locinput++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { + if (! (to_complement + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, + *(locinput + 1)), + FLAGS(scan))))) + { + sayNO; + } + locinput += 2; + } + else { /* Handle above Latin-1 code points */ + classnum = (_char_class_number) FLAGS(scan); + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Here, uses a swash to find such code points. Load if if + * not done already */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] + = _core_swash_init("utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + if (! (to_complement + ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) locinput, TRUE)))) + { + sayNO; + } + } + else { /* Here, uses macros to find above Latin-1 code points */ + switch (classnum) { + case _CC_ENUM_SPACE: /* XXX would require separate + code if we revert the change + of \v matching this */ + case _CC_ENUM_PSXSPC: + if (! (to_complement + ^ cBOOL(is_XPERLSPACE_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_BLANK: + if (! (to_complement + ^ cBOOL(is_HORIZWS_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_XDIGIT: + if (! (to_complement + ^ cBOOL(is_XDIGIT_high(locinput)))) + { + sayNO; + } + break; + case _CC_ENUM_VERTSPACE: + if (! (to_complement + ^ cBOOL(is_VERTWS_high(locinput)))) + { + sayNO; + } + break; + default: /* The rest, e.g. [:cntrl:], can't match + above Latin1 */ + if (! to_complement) { + sayNO; + } + break; + } + } + locinput += UTF8SKIP(locinput); + } + break; + + case CLUMP: /* Match \X: logical Unicode character. This is defined as + a Unicode extended Grapheme Cluster */ + /* From http://www.unicode.org/reports/tr29 (5.2 version). An + extended Grapheme Cluster is: + + CR LF + | Prepend* Begin Extend* + | . + + Begin is: ( Special_Begin | ! Control ) + Special_Begin is: ( Regional-Indicator+ | Hangul-syllable ) + Extend is: ( Grapheme_Extend | Spacing_Mark ) + Control is: [ GCB_Control | CR | LF ] + Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) )) + + If we create a 'Regular_Begin' = Begin - Special_Begin, then + we can rewrite + + Begin is ( Regular_Begin + Special Begin ) + + It turns out that 98.4% of all Unicode code points match + Regular_Begin. Doing it this way eliminates a table match in + the previous implementation for almost all Unicode code points. + + There is a subtlety with Prepend* which showed up in testing. + Note that the Begin, and only the Begin is required in: + | Prepend* Begin Extend* + Also, Begin contains '! Control'. A Prepend must be a + '! Control', which means it must also be a Begin. What it + comes down to is that if we match Prepend* and then find no + suitable Begin afterwards, that if we backtrack the last + Prepend, that one will be a suitable Begin. + */ + + if (NEXTCHR_IS_EOS) + sayNO; + if (! utf8_target) { + + /* Match either CR LF or '.', as all the other possibilities + * require utf8 */ + locinput++; /* Match the . or CR */ + if (nextchr == '\r' /* And if it was CR, and the next is LF, + match the LF */ + && locinput < reginfo->strend + && UCHARAT(locinput) == '\n') + { + locinput++; + } + } + else { + + /* Utf8: See if is ( CR LF ); already know that locinput < + * reginfo->strend, so locinput+1 is in bounds */ + if ( nextchr == '\r' && locinput+1 < reginfo->strend + && UCHARAT(locinput + 1) == '\n') + { + locinput += 2; + } + else { + STRLEN len; + + /* In case have to backtrack to beginning, then match '.' */ + char *starting = locinput; + + /* In case have to backtrack the last prepend */ + char *previous_prepend = NULL; + + LOAD_UTF8_CHARCLASS_GCB(); + + /* Match (prepend)* */ + while (locinput < reginfo->strend + && (len = is_GCB_Prepend_utf8(locinput))) + { + previous_prepend = locinput; + locinput += len; + } + + /* As noted above, if we matched a prepend character, but + * the next thing won't match, back off the last prepend we + * matched, as it is guaranteed to match the begin */ + if (previous_prepend + && (locinput >= reginfo->strend + || (! swash_fetch(PL_utf8_X_regular_begin, + (U8*)locinput, utf8_target) + && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput))) + ) + { + locinput = previous_prepend; + } + + /* Note that here we know reginfo->strend > locinput, as we + * tested that upon input to this switch case, and if we + * moved locinput forward, we tested the result just above + * and it either passed, or we backed off so that it will + * now pass */ + if (swash_fetch(PL_utf8_X_regular_begin, + (U8*)locinput, utf8_target)) { + locinput += UTF8SKIP(locinput); + } + else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) { + + /* Here did not match the required 'Begin' in the + * second term. So just match the very first + * character, the '.' of the final term of the regex */ + locinput = starting + UTF8SKIP(starting); + goto exit_utf8; + } else { + + /* Here is a special begin. It can be composed of + * several individual characters. One possibility is + * RI+ */ + if ((len = is_GCB_RI_utf8(locinput))) { + locinput += len; + while (locinput < reginfo->strend + && (len = is_GCB_RI_utf8(locinput))) + { + locinput += len; + } + } else if ((len = is_GCB_T_utf8(locinput))) { + /* Another possibility is T+ */ + locinput += len; + while (locinput < reginfo->strend + && (len = is_GCB_T_utf8(locinput))) + { + locinput += len; + } + } else { + + /* Here, neither RI+ nor T+; must be some other + * Hangul. That means it is one of the others: L, + * LV, LVT or V, and matches: + * L* (L | LVT T* | V * V* T* | LV V* T*) */ + + /* Match L* */ + while (locinput < reginfo->strend + && (len = is_GCB_L_utf8(locinput))) + { + locinput += len; + } + + /* Here, have exhausted L*. If the next character + * is not an LV, LVT nor V, it means we had to have + * at least one L, so matches L+ in the original + * equation, we have a complete hangul syllable. + * Are done. */ + + if (locinput < reginfo->strend + && is_GCB_LV_LVT_V_utf8(locinput)) + { + /* Otherwise keep going. Must be LV, LVT or V. + * See if LVT, by first ruling out V, then LV */ + if (! is_GCB_V_utf8(locinput) + /* All but every TCount one is LV */ + && (valid_utf8_to_uvchr((U8 *) locinput, + NULL) + - SBASE) + % TCount != 0) + { + locinput += UTF8SKIP(locinput); + } else { + + /* Must be V or LV. Take it, then match + * V* */ + locinput += UTF8SKIP(locinput); + while (locinput < reginfo->strend + && (len = is_GCB_V_utf8(locinput))) + { + locinput += len; + } + } + + /* And any of LV, LVT, or V can be followed + * by T* */ + while (locinput < reginfo->strend + && (len = is_GCB_T_utf8(locinput))) + { + locinput += len; + } + } + } + } + + /* Match any extender */ + while (locinput < reginfo->strend + && swash_fetch(PL_utf8_X_extend, + (U8*)locinput, utf8_target)) + { + locinput += UTF8SKIP(locinput); + } + } + exit_utf8: + if (locinput > reginfo->strend) sayNO; + } + break; + + case NREFFL: /* /\g{name}/il */ + { /* The capture buffer cases. The ones beginning with N for the + named buffers just convert to the equivalent numbered and + pretend they were called as the corresponding numbered buffer + op. */ + /* don't initialize these in the declaration, it makes C++ + unhappy */ + const char *s; + char type; + re_fold_t folder; + const U8 *fold_array; + UV utf8_fold_flags; + + folder = foldEQ_locale; + fold_array = PL_fold_locale; + type = REFFL; + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_nref; + + case NREFFA: /* /\g{name}/iaa */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + type = REFFA; + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_nref; + + case NREFFU: /* /\g{name}/iu */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + type = REFFU; + utf8_fold_flags = 0; + goto do_nref; + + case NREFF: /* /\g{name}/i */ + folder = foldEQ; + fold_array = PL_fold; + type = REFF; + utf8_fold_flags = 0; + goto do_nref; + + case NREF: /* /\g{name}/ */ + type = REF; + folder = NULL; + fold_array = NULL; + utf8_fold_flags = 0; + do_nref: + + /* For the named back references, find the corresponding buffer + * number */ + n = reg_check_named_buff_matched(rex,scan); + + if ( ! n ) { + sayNO; + } + goto do_nref_ref_common; + + case REFFL: /* /\1/il */ + folder = foldEQ_locale; + fold_array = PL_fold_locale; + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_ref; + + case REFFA: /* /\1/iaa */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_ref; + + case REFFU: /* /\1/iu */ + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + utf8_fold_flags = 0; + goto do_ref; + + case REFF: /* /\1/i */ + folder = foldEQ; + fold_array = PL_fold; + utf8_fold_flags = 0; + goto do_ref; + + case REF: /* /\1/ */ + folder = NULL; + fold_array = NULL; + utf8_fold_flags = 0; + + do_ref: + type = OP(scan); + n = ARG(scan); /* which paren pair */ + + do_nref_ref_common: + ln = rex->offs[n].start; + reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ + if (rex->lastparen < n || ln == -1) + sayNO; /* Do not match unless seen CLOSEn. */ + if (ln == rex->offs[n].end) + break; + + s = reginfo->strbeg + ln; + if (type != REF /* REF can do byte comparison */ + && (utf8_target || type == REFFU || type == REFFL)) + { + char * limit = reginfo->strend; + + /* This call case insensitively compares the entire buffer + * at s, with the current input starting at locinput, but + * not going off the end given by reginfo->strend, and + * returns in upon success, how much of the + * current input was matched */ + if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target, + locinput, &limit, 0, utf8_target, utf8_fold_flags)) + { + sayNO; + } + locinput = limit; + break; + } + + /* Not utf8: Inline the first character, for speed. */ + if (!NEXTCHR_IS_EOS && + UCHARAT(s) != nextchr && + (type == REF || + UCHARAT(s) != fold_array[nextchr])) + sayNO; + ln = rex->offs[n].end - ln; + if (locinput + ln > reginfo->strend) + sayNO; + if (ln > 1 && (type == REF + ? memNE(s, locinput, ln) + : ! folder(s, locinput, ln))) + sayNO; + locinput += ln; + break; + } + + case NOTHING: /* null op; e.g. the 'nothing' following + * the '*' in m{(a+|b)*}' */ + break; + case TAIL: /* placeholder while compiling (A|B|C) */ + break; + + case BACK: /* ??? doesn't appear to be used ??? */ + break; + +#undef ST +#define ST st->u.eval + { + SV *ret; + REGEXP *re_sv; + regexp *re; + regexp_internal *rei; + regnode *startpoint; + + case GOSTART: /* (?R) */ + case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */ + if (cur_eval && cur_eval->locinput==locinput) { + if (cur_eval->u.eval.close_paren == (U32)ARG(scan)) + Perl_croak(aTHX_ "Infinite recursion in regex"); + if ( ++nochange_depth > max_nochange_depth ) + Perl_croak(aTHX_ + "Pattern subroutine nesting without pos change" + " exceeded limit in regex"); + } else { + nochange_depth = 0; + } + re_sv = rex_sv; + re = rex; + rei = rexi; + if (OP(scan)==GOSUB) { + startpoint = scan + ARG2L(scan); + ST.close_paren = ARG(scan); + } else { + startpoint = rei->program+1; + ST.close_paren = 0; + } + + /* Save all the positions seen so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + + /* and then jump to the code we share with EVAL */ + goto eval_recurse_doit; + + /* NOTREACHED */ + assert(0); + + case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ + if (cur_eval && cur_eval->locinput==locinput) { + if ( ++nochange_depth > max_nochange_depth ) + Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); + } else { + nochange_depth = 0; + } + { + /* execute the code in the {...} */ + + dSP; + IV before; + OP * const oop = PL_op; + COP * const ocurcop = PL_curcop; + OP *nop; + CV *newcv; + + /* save *all* paren positions */ + regcppush(rex, 0, maxopenparen); + REGCP_SET(runops_cp); + + if (!caller_cv) + caller_cv = find_runcv(NULL); + + n = ARG(scan); + + if (rexi->data->what[n] == 'r') { /* code from an external qr */ + newcv = (ReANY( + (REGEXP*)(rexi->data->data[n]) + ))->qr_anoncv + ; + nop = (OP*)rexi->data->data[n+1]; + } + else if (rexi->data->what[n] == 'l') { /* literal code */ + newcv = caller_cv; + nop = (OP*)rexi->data->data[n]; + assert(CvDEPTH(newcv)); + } + else { + /* literal with own CV */ + assert(rexi->data->what[n] == 'L'); + newcv = rex->qr_anoncv; + nop = (OP*)rexi->data->data[n]; + } + + /* normally if we're about to execute code from the same + * CV that we used previously, we just use the existing + * CX stack entry. However, its possible that in the + * meantime we may have backtracked, popped from the save + * stack, and undone the SAVECOMPPAD(s) associated with + * PUSH_MULTICALL; in which case PL_comppad no longer + * points to newcv's pad. */ + if (newcv != last_pushed_cv || PL_comppad != last_pad) + { + U8 flags = (CXp_SUB_RE | + ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0)); + if (last_pushed_cv) { + CHANGE_MULTICALL_FLAGS(newcv, flags); + } + else { + PUSH_MULTICALL_FLAGS(newcv, flags); + } + last_pushed_cv = newcv; + } + else { + /* these assignments are just to silence compiler + * warnings */ + multicall_cop = NULL; + newsp = NULL; + } + last_pad = PL_comppad; + + /* the initial nextstate you would normally execute + * at the start of an eval (which would cause error + * messages to come from the eval), may be optimised + * away from the execution path in the regex code blocks; + * so manually set PL_curcop to it initially */ + { + OP *o = cUNOPx(nop)->op_first; + assert(o->op_type == OP_NULL); + if (o->op_targ == OP_SCOPE) { + o = cUNOPo->op_first; + } + else { + assert(o->op_targ == OP_LEAVE); + o = cUNOPo->op_first; + assert(o->op_type == OP_ENTER); + o = OP_SIBLING(o); + } + + if (o->op_type != OP_STUB) { + assert( o->op_type == OP_NEXTSTATE + || o->op_type == OP_DBSTATE + || (o->op_type == OP_NULL + && ( o->op_targ == OP_NEXTSTATE + || o->op_targ == OP_DBSTATE + ) + ) + ); + PL_curcop = (COP*)o; + } + } + nop = nop->op_next; + + DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, + " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) ); + + rex->offs[0].end = locinput - reginfo->strbeg; + if (reginfo->info_aux_eval->pos_magic) + MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, + reginfo->sv, reginfo->strbeg, + locinput - reginfo->strbeg); + + if (sv_yes_mark) { + SV *sv_mrk = get_sv("REGMARK", 1); + sv_setsv(sv_mrk, sv_yes_mark); + } + + /* we don't use MULTICALL here as we want to call the + * first op of the block of interest, rather than the + * first op of the sub */ + before = (IV)(SP-PL_stack_base); + PL_op = nop; + CALLRUNOPS(aTHX); /* Scalar context. */ + SPAGAIN; + if ((IV)(SP-PL_stack_base) == before) + ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ + else { + ret = POPs; + PUTBACK; + } + + /* before restoring everything, evaluate the returned + * value, so that 'uninit' warnings don't use the wrong + * PL_op or pad. Also need to process any magic vars + * (e.g. $1) *before* parentheses are restored */ + + PL_op = NULL; + + re_sv = NULL; + if (logical == 0) /* (?{})/ */ + sv_setsv(save_scalar(PL_replgv), ret); /* $^R */ + else if (logical == 1) { /* /(?(?{...})X|Y)/ */ + sw = cBOOL(SvTRUE(ret)); + logical = 0; + } + else { /* /(??{}) */ + /* if its overloaded, let the regex compiler handle + * it; otherwise extract regex, or stringify */ + if (SvGMAGICAL(ret)) + ret = sv_mortalcopy(ret); + if (!SvAMAGIC(ret)) { + SV *sv = ret; + if (SvROK(sv)) + sv = SvRV(sv); + if (SvTYPE(sv) == SVt_REGEXP) + re_sv = (REGEXP*) sv; + else if (SvSMAGICAL(ret)) { + MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); + if (mg) + re_sv = (REGEXP *) mg->mg_obj; + } + + /* force any undef warnings here */ + if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { + ret = sv_mortalcopy(ret); + (void) SvPV_force_nolen(ret); + } + } + + } + + /* *** Note that at this point we don't restore + * PL_comppad, (or pop the CxSUB) on the assumption it may + * be used again soon. This is safe as long as nothing + * in the regexp code uses the pad ! */ + PL_op = oop; + PL_curcop = ocurcop; + S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); + PL_curpm = PL_reg_curpm; + + if (logical != 2) + break; + } + + /* only /(??{})/ from now on */ + logical = 0; + { + /* extract RE object from returned value; compiling if + * necessary */ + + if (re_sv) { + re_sv = reg_temp_copy(NULL, re_sv); + } + else { + U32 pm_flags = 0; + + if (SvUTF8(ret) && IN_BYTES) { + /* In use 'bytes': make a copy of the octet + * sequence, but without the flag on */ + STRLEN len; + const char *const p = SvPV(ret, len); + ret = newSVpvn_flags(p, len, SVs_TEMP); + } + if (rex->intflags & PREGf_USE_RE_EVAL) + pm_flags |= PMf_USE_RE_EVAL; + + /* if we got here, it should be an engine which + * supports compiling code blocks and stuff */ + assert(rex->engine && rex->engine->op_comp); + assert(!(scan->flags & ~RXf_PMf_COMPILETIME)); + re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, + rex->engine, NULL, NULL, + /* copy /msix etc to inner pattern */ + scan->flags, + pm_flags); + + if (!(SvFLAGS(ret) + & (SVs_TEMP | SVs_GMG | SVf_ROK)) + && (!SvPADTMP(ret) || SvREADONLY(ret))) { + /* This isn't a first class regexp. Instead, it's + caching a regexp onto an existing, Perl visible + scalar. */ + sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); + } + } + SAVEFREESV(re_sv); + re = ReANY(re_sv); + } + RXp_MATCH_COPIED_off(re); + re->subbeg = rex->subbeg; + re->sublen = rex->sublen; + re->suboffset = rex->suboffset; + re->subcoffset = rex->subcoffset; + re->lastparen = 0; + re->lastcloseparen = 0; + rei = RXi_GET(re); + DEBUG_EXECUTE_r( + debug_start_match(re_sv, utf8_target, locinput, + reginfo->strend, "Matching embedded"); + ); + startpoint = rei->program + 1; + ST.close_paren = 0; /* only used for GOSUB */ + /* Save all the seen positions so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + /* and set maxopenparen to 0, since we are starting a "fresh" match */ + maxopenparen = 0; + /* run the pattern returned from (??{...}) */ + + eval_recurse_doit: /* Share code with GOSUB below this line + * At this point we expect the stack context to be + * set up correctly */ + + /* invalidate the S-L poscache. We're now executing a + * different set of WHILEM ops (and their associated + * indexes) against the same string, so the bits in the + * cache are meaningless. Setting maxiter to zero forces + * the cache to be invalidated and zeroed before reuse. + * XXX This is too dramatic a measure. Ideally we should + * save the old cache and restore when running the outer + * pattern again */ + reginfo->poscache_maxiter = 0; + + /* the new regexp might have a different is_utf8_pat than we do */ + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); + + ST.prev_rex = rex_sv; + ST.prev_curlyx = cur_curlyx; + rex_sv = re_sv; + SET_reg_curpm(rex_sv); + rex = re; + rexi = rei; + cur_curlyx = NULL; + ST.B = next; + ST.prev_eval = cur_eval; + cur_eval = st; + /* now continue from first node in postoned RE */ + PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput); + /* NOTREACHED */ + assert(0); + } + + case EVAL_AB: /* cleanup after a successful (??{A})B */ + /* note: this is called twice; first after popping B, then A */ + rex_sv = ST.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); + SET_reg_curpm(rex_sv); + rex = ReANY(rex_sv); + rexi = RXi_GET(rex); + { + /* preserve $^R across LEAVE's. See Bug 121070. */ + SV *save_sv= GvSV(PL_replgv); + SvREFCNT_inc(save_sv); + regcpblow(ST.cp); /* LEAVE in disguise */ + sv_setsv(GvSV(PL_replgv), save_sv); + SvREFCNT_dec(save_sv); + } + cur_eval = ST.prev_eval; + cur_curlyx = ST.prev_curlyx; + + /* Invalidate cache. See "invalidate" comment above. */ + reginfo->poscache_maxiter = 0; + if ( nochange_depth ) + nochange_depth--; + sayYES; + + + case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ + /* note: this is called twice; first after popping B, then A */ + rex_sv = ST.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); + SET_reg_curpm(rex_sv); + rex = ReANY(rex_sv); + rexi = RXi_GET(rex); + + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); + cur_eval = ST.prev_eval; + cur_curlyx = ST.prev_curlyx; + /* Invalidate cache. See "invalidate" comment above. */ + reginfo->poscache_maxiter = 0; + if ( nochange_depth ) + nochange_depth--; + sayNO_SILENT; +#undef ST + + case OPEN: /* ( */ + n = ARG(scan); /* which paren pair */ + rex->offs[n].start_tmp = locinput - reginfo->strbeg; + if (n > maxopenparen) + maxopenparen = n; + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n", + PTR2UV(rex), + PTR2UV(rex->offs), + (UV)n, + (IV)rex->offs[n].start_tmp, + (UV)maxopenparen + )); + lastopen = n; + break; + +/* XXX really need to log other places start/end are set too */ +#define CLOSE_CAPTURE \ + rex->offs[n].start = rex->offs[n].start_tmp; \ + rex->offs[n].end = locinput - reginfo->strbeg; \ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \ + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \ + PTR2UV(rex), \ + PTR2UV(rex->offs), \ + (UV)n, \ + (IV)rex->offs[n].start, \ + (IV)rex->offs[n].end \ + )) + + case CLOSE: /* ) */ + n = ARG(scan); /* which paren pair */ + CLOSE_CAPTURE; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; + if (cur_eval && cur_eval->u.eval.close_paren == n) { + goto fake_end; + } + break; + + case ACCEPT: /* (*ACCEPT) */ + if (ARG(scan)){ + regnode *cursor; + for (cursor=scan; + cursor && OP(cursor)!=END; + cursor=regnext(cursor)) + { + if ( OP(cursor)==CLOSE ){ + n = ARG(cursor); + if ( n <= lastopen ) { + CLOSE_CAPTURE; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; + if ( n == ARG(scan) || (cur_eval && + cur_eval->u.eval.close_paren == n)) + break; + } + } + } + } + goto fake_end; + /* NOTREACHED */ + + case GROUPP: /* (?(1)) */ + n = ARG(scan); /* which paren pair */ + sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); + break; + + case NGROUPP: /* (?()) */ + /* reg_check_named_buff_matched returns 0 for no match */ + sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); + break; + + case INSUBP: /* (?(R)) */ + n = ARG(scan); + sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n)); + break; + + case DEFINEP: /* (?(DEFINE)) */ + sw = 0; + break; + + case IFTHEN: /* (?(cond)A|B) */ + reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ + if (sw) + next = NEXTOPER(NEXTOPER(scan)); + else { + next = scan + ARG(scan); + if (OP(next) == IFTHEN) /* Fake one. */ + next = NEXTOPER(NEXTOPER(next)); + } + break; + + case LOGICAL: /* modifier for EVAL and IFMATCH */ + logical = scan->flags; + break; + +/******************************************************************* + +The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/ +pattern, where A and B are subpatterns. (For simple A, CURLYM or +STAR/PLUS/CURLY/CURLYN are used instead.) + +A*B is compiled as + +On entry to the subpattern, CURLYX is called. This pushes a CURLYX +state, which contains the current count, initialised to -1. It also sets +cur_curlyx to point to this state, with any previous value saved in the +state block. + +CURLYX then jumps straight to the WHILEM op, rather than executing A, +since the pattern may possibly match zero times (i.e. it's a while {} loop +rather than a do {} while loop). + +Each entry to WHILEM represents a successful match of A. The count in the +CURLYX block is incremented, another WHILEM state is pushed, and execution +passes to A or B depending on greediness and the current count. + +For example, if matching against the string a1a2a3b (where the aN are +substrings that match /A/), then the match progresses as follows: (the +pushed states are interspersed with the bits of strings matched so far): + + + + a1 + a1 a2 + a1 a2 a3 + a1 a2 a3 b + +(Contrast this with something like CURLYM, which maintains only a single +backtrack state: + + a1 + a1 a2 + a1 a2 a3 + a1 a2 a3 b +) + +Each WHILEM state block marks a point to backtrack to upon partial failure +of A or B, and also contains some minor state data related to that +iteration. The CURLYX block, pointed to by cur_curlyx, contains the +overall state, such as the count, and pointers to the A and B ops. + +This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx +must always point to the *current* CURLYX block, the rules are: + +When executing CURLYX, save the old cur_curlyx in the CURLYX state block, +and set cur_curlyx to point the new block. + +When popping the CURLYX block after a successful or unsuccessful match, +restore the previous cur_curlyx. + +When WHILEM is about to execute B, save the current cur_curlyx, and set it +to the outer one saved in the CURLYX block. + +When popping the WHILEM block after a successful or unsuccessful B match, +restore the previous cur_curlyx. + +Here's an example for the pattern (AI* BI)*BO +I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: + +cur_ +curlyx backtrack stack +------ --------------- +NULL +CO +CI ai +CO ai bi +NULL ai bi bo + +At this point the pattern succeeds, and we work back down the stack to +clean up, restoring as we go: + +CO ai bi +CI ai +CO +NULL + +*******************************************************************/ + +#define ST st->u.curlyx + + case CURLYX: /* start of /A*B/ (for complex A) */ + { + /* No need to save/restore up to this paren */ + I32 parenfloor = scan->flags; + + assert(next); /* keep Coverity happy */ + if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ + next += ARG(next); + + /* XXXX Probably it is better to teach regpush to support + parenfloor > maxopenparen ... */ + if (parenfloor > (I32)rex->lastparen) + parenfloor = rex->lastparen; /* Pessimization... */ + + ST.prev_curlyx= cur_curlyx; + cur_curlyx = st; + ST.cp = PL_savestack_ix; + + /* these fields contain the state of the current curly. + * they are accessed by subsequent WHILEMs */ + ST.parenfloor = parenfloor; + ST.me = scan; + ST.B = next; + ST.minmod = minmod; + minmod = 0; + ST.count = -1; /* this will be updated by WHILEM */ + ST.lastloc = NULL; /* this will be updated by WHILEM */ + + PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput); + /* NOTREACHED */ + assert(0); + } + + case CURLYX_end: /* just finished matching all of A*B */ + cur_curlyx = ST.prev_curlyx; + sayYES; + /* NOTREACHED */ + assert(0); + + case CURLYX_end_fail: /* just failed to match all of A*B */ + regcpblow(ST.cp); + cur_curlyx = ST.prev_curlyx; + sayNO; + /* NOTREACHED */ + assert(0); + + +#undef ST +#define ST st->u.whilem + + case WHILEM: /* just matched an A in /A*B/ (for complex A) */ + { + /* see the discussion above about CURLYX/WHILEM */ + I32 n; + int min, max; + regnode *A; + + assert(cur_curlyx); /* keep Coverity happy */ + + min = ARG1(cur_curlyx->u.curlyx.me); + max = ARG2(cur_curlyx->u.curlyx.me); + A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS; + n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ + ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; + ST.cache_offset = 0; + ST.cache_mask = 0; + + + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%*s whilem: matched %ld out of %d..%d\n", + REPORT_CODE_OFF+depth*2, "", (long)n, min, max) + ); + + /* First just match a string of min A's. */ + + if (n < min) { + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, + maxopenparen); + cur_curlyx->u.curlyx.lastloc = locinput; + REGCP_SET(ST.lastcp); + + PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput); + /* NOTREACHED */ + assert(0); + } + + /* If degenerate A matches "", assume A done. */ + + if (locinput == cur_curlyx->u.curlyx.lastloc) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%*s whilem: empty match detected, trying continuation...\n", + REPORT_CODE_OFF+depth*2, "") + ); + goto do_whilem_B_max; + } + + /* super-linear cache processing. + * + * The idea here is that for certain types of CURLYX/WHILEM - + * principally those whose upper bound is infinity (and + * excluding regexes that have things like \1 and other very + * non-regular expresssiony things), then if a pattern like + * /....A*.../ fails and we backtrack to the WHILEM, then we + * make a note that this particular WHILEM op was at string + * position 47 (say) when the rest of pattern failed. Then, if + * we ever find ourselves back at that WHILEM, and at string + * position 47 again, we can just fail immediately rather than + * running the rest of the pattern again. + * + * This is very handy when patterns start to go + * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up + * with a combinatorial explosion of backtracking. + * + * The cache is implemented as a bit array, with one bit per + * string byte position per WHILEM op (up to 16) - so its + * between 0.25 and 2x the string size. + * + * To avoid allocating a poscache buffer every time, we do an + * initially countdown; only after we have executed a WHILEM + * op (string-length x #WHILEMs) times do we allocate the + * cache. + * + * The top 4 bits of scan->flags byte say how many different + * relevant CURLLYX/WHILEM op pairs there are, while the + * bottom 4-bits is the identifying index number of this + * WHILEM. + */ + + if (scan->flags) { + + if (!reginfo->poscache_maxiter) { + /* start the countdown: Postpone detection until we + * know the match is not *that* much linear. */ + reginfo->poscache_maxiter + = (reginfo->strend - reginfo->strbeg + 1) + * (scan->flags>>4); + /* possible overflow for long strings and many CURLYX's */ + if (reginfo->poscache_maxiter < 0) + reginfo->poscache_maxiter = I32_MAX; + reginfo->poscache_iter = reginfo->poscache_maxiter; + } + + if (reginfo->poscache_iter-- == 0) { + /* initialise cache */ + const SSize_t size = (reginfo->poscache_maxiter + 7)/8; + regmatch_info_aux *const aux = reginfo->info_aux; + if (aux->poscache) { + if ((SSize_t)reginfo->poscache_size < size) { + Renew(aux->poscache, size, char); + reginfo->poscache_size = size; + } + Zero(aux->poscache, size, char); + } + else { + reginfo->poscache_size = size; + Newxz(aux->poscache, size, char); + } + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%swhilem: Detected a super-linear match, switching on caching%s...\n", + PL_colors[4], PL_colors[5]) + ); + } + + if (reginfo->poscache_iter < 0) { + /* have we already failed at this position? */ + SSize_t offset, mask; + + reginfo->poscache_iter = -1; /* stop eventual underflow */ + offset = (scan->flags & 0xf) - 1 + + (locinput - reginfo->strbeg) + * (scan->flags>>4); + mask = 1 << (offset % 8); + offset /= 8; + if (reginfo->info_aux->poscache[offset] & mask) { + DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, + "%*s whilem: (cache) already tried at this position...\n", + REPORT_CODE_OFF+depth*2, "") + ); + sayNO; /* cache records failure */ + } + ST.cache_offset = offset; + ST.cache_mask = mask; + } + } + + /* Prefer B over A for minimal matching. */ + + if (cur_curlyx->u.curlyx.minmod) { + ST.save_curlyx = cur_curlyx; + cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; + ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor, + maxopenparen); + REGCP_SET(ST.lastcp); + PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, + locinput); + /* NOTREACHED */ + assert(0); + } + + /* Prefer A over B for maximal matching. */ + + if (n < max) { /* More greed allowed? */ + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, + maxopenparen); + cur_curlyx->u.curlyx.lastloc = locinput; + REGCP_SET(ST.lastcp); + PUSH_STATE_GOTO(WHILEM_A_max, A, locinput); + /* NOTREACHED */ + assert(0); + } + goto do_whilem_B_max; + } + /* NOTREACHED */ + assert(0); + + case WHILEM_B_min: /* just matched B in a minimal match */ + case WHILEM_B_max: /* just matched B in a maximal match */ + cur_curlyx = ST.save_curlyx; + sayYES; + /* NOTREACHED */ + assert(0); + + case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ + cur_curlyx = ST.save_curlyx; + cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; + cur_curlyx->u.curlyx.count--; + CACHEsayNO; + /* NOTREACHED */ + assert(0); + + case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ + /* FALLTHROUGH */ + case WHILEM_A_pre_fail: /* just failed to match even minimal A */ + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); + cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; + cur_curlyx->u.curlyx.count--; + CACHEsayNO; + /* NOTREACHED */ + assert(0); + + case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); /* Restore some previous $s? */ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "%*s whilem: failed, trying continuation...\n", + REPORT_CODE_OFF+depth*2, "") + ); + do_whilem_B_max: + if (cur_curlyx->u.curlyx.count >= REG_INFTY + && ckWARN(WARN_REGEXP) + && !reginfo->warned) + { + reginfo->warned = TRUE; + Perl_warner(aTHX_ packWARN(WARN_REGEXP), + "Complex regular subexpression recursion limit (%d) " + "exceeded", + REG_INFTY - 1); + } + + /* now try B */ + ST.save_curlyx = cur_curlyx; + cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; + PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, + locinput); + /* NOTREACHED */ + assert(0); + + case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ + cur_curlyx = ST.save_curlyx; + REGCP_UNWIND(ST.lastcp); + regcppop(rex, &maxopenparen); + + if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) { + /* Maximum greed exceeded */ + if (cur_curlyx->u.curlyx.count >= REG_INFTY + && ckWARN(WARN_REGEXP) + && !reginfo->warned) + { + reginfo->warned = TRUE; + Perl_warner(aTHX_ packWARN(WARN_REGEXP), + "Complex regular subexpression recursion " + "limit (%d) exceeded", + REG_INFTY - 1); + } + cur_curlyx->u.curlyx.count--; + CACHEsayNO; + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "") + ); + /* Try grabbing another A and see if it helps. */ + cur_curlyx->u.curlyx.lastloc = locinput; + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, + maxopenparen); + REGCP_SET(ST.lastcp); + PUSH_STATE_GOTO(WHILEM_A_min, + /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, + locinput); + /* NOTREACHED */ + assert(0); + +#undef ST +#define ST st->u.branch + + case BRANCHJ: /* /(...|A|...)/ with long next pointer */ + next = scan + ARG(scan); + if (next == scan) + next = NULL; + scan = NEXTOPER(scan); + /* FALLTHROUGH */ + + case BRANCH: /* /(...|A|...)/ */ + scan = NEXTOPER(scan); /* scan now points to inner node */ + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + ST.next_branch = next; + REGCP_SET(ST.cp); + + /* Now go into the branch */ + if (has_cutgroup) { + PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput); + } else { + PUSH_STATE_GOTO(BRANCH_next, scan, locinput); + } + /* NOTREACHED */ + assert(0); + + case CUTGROUP: /* /(*THEN)/ */ + sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL : + MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + PUSH_STATE_GOTO(CUTGROUP_next, next, locinput); + /* NOTREACHED */ + assert(0); + + case CUTGROUP_next_fail: + do_cutgroup = 1; + no_final = 1; + if (st->u.mark.mark_name) + sv_commit = st->u.mark.mark_name; + sayNO; + /* NOTREACHED */ + assert(0); + + case BRANCH_next: + sayYES; + /* NOTREACHED */ + assert(0); + + case BRANCH_next_fail: /* that branch failed; try the next, if any */ + if (do_cutgroup) { + do_cutgroup = 0; + no_final = 0; + } + REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + scan = ST.next_branch; + /* no more branches? */ + if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sBRANCH failed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + PL_colors[5] ); + }); + sayNO_SILENT; + } + continue; /* execute next BRANCH[J] op */ + /* NOTREACHED */ + assert(0); + + case MINMOD: /* next op will be non-greedy, e.g. A*? */ + minmod = 1; + break; + +#undef ST +#define ST st->u.curlym + + case CURLYM: /* /A{m,n}B/ where A is fixed-length */ + + /* This is an optimisation of CURLYX that enables us to push + * only a single backtracking state, no matter how many matches + * there are in {m,n}. It relies on the pattern being constant + * length, with no parens to influence future backrefs + */ + + ST.me = scan; + scan = NEXTOPER(scan) + NODE_STEP_REGNODE; + + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + + /* if paren positive, emulate an OPEN/CLOSE around A */ + if (ST.me->flags) { + U32 paren = ST.me->flags; + if (paren > maxopenparen) + maxopenparen = paren; + scan += NEXT_OFF(scan); /* Skip former OPEN. */ + } + ST.A = scan; + ST.B = next; + ST.alen = 0; + ST.count = 0; + ST.minmod = minmod; + minmod = 0; + ST.c1 = CHRTEST_UNINIT; + REGCP_SET(ST.cp); + + if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */ + goto curlym_do_B; + + curlym_do_A: /* execute the A in /A{m,n}B/ */ + PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */ + /* NOTREACHED */ + assert(0); + + case CURLYM_A: /* we've just matched an A */ + ST.count++; + /* after first match, determine A's length: u.curlym.alen */ + if (ST.count == 1) { + if (reginfo->is_utf8_target) { + char *s = st->locinput; + while (s < locinput) { + ST.alen++; + s += UTF8SKIP(s); + } + } + else { + ST.alen = locinput - st->locinput; + } + if (ST.alen == 0) + ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me); + } + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n", + (int)(REPORT_CODE_OFF+(depth*2)), "", + (IV) ST.count, (IV)ST.alen) + ); + + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.me->flags) + goto fake_end; + + { + I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)); + if ( max == REG_INFTY || ST.count < max ) + goto curlym_do_A; /* try to match another A */ + } + goto curlym_do_B; /* try to match B */ + + case CURLYM_A_fail: /* just failed to match an A */ + REGCP_UNWIND(ST.cp); + + if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ + || (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.me->flags)) + sayNO; + + curlym_do_B: /* execute the B in /A{m,n}B/ */ + if (ST.c1 == CHRTEST_UNINIT) { + /* calculate c1 and c2 for possible match of 1st char + * following curly */ + ST.c1 = ST.c2 = CHRTEST_VOID; + assert(ST.B); + if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { + regnode *text_node = ST.B; + if (! HAS_TEXT(text_node)) + FIND_NEXT_IMPT(text_node); + /* this used to be + + (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT) + + But the former is redundant in light of the latter. + + if this changes back then the macro for + IS_TEXT and friends need to change. + */ + if (PL_regkind[OP(text_node)] == EXACT) { + if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ + text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, + reginfo)) + { + sayNO; + } + } + } + } + + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM trying tail with matches=%"IVdf"...\n", + (int)(REPORT_CODE_OFF+(depth*2)), + "", (IV)ST.count) + ); + if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) { + if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) { + if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) + && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + { + /* simulate B failing */ + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", + (int)(REPORT_CODE_OFF+(depth*2)),"", + valid_utf8_to_uvchr((U8 *) locinput, NULL), + valid_utf8_to_uvchr(ST.c1_utf8, NULL), + valid_utf8_to_uvchr(ST.c2_utf8, NULL)) + ); + state_num = CURLYM_B_fail; + goto reenter_switch; + } + } + else if (nextchr != ST.c1 && nextchr != ST.c2) { + /* simulate B failing */ + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", + (int)(REPORT_CODE_OFF+(depth*2)),"", + (int) nextchr, ST.c1, ST.c2) + ); + state_num = CURLYM_B_fail; + goto reenter_switch; + } + } + + if (ST.me->flags) { + /* emulate CLOSE: mark current A as captured */ + I32 paren = ST.me->flags; + if (ST.count) { + rex->offs[paren].start + = HOPc(locinput, -ST.alen) - reginfo->strbeg; + rex->offs[paren].end = locinput - reginfo->strbeg; + if ((U32)paren > rex->lastparen) + rex->lastparen = paren; + rex->lastcloseparen = paren; + } + else + rex->offs[paren].end = -1; + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.me->flags) + { + if (ST.count) + goto fake_end; + else + sayNO; + } + } + + PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */ + /* NOTREACHED */ + assert(0); + + case CURLYM_B_fail: /* just failed to match a B */ + REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + if (ST.minmod) { + I32 max = ARG2(ST.me); + if (max != REG_INFTY && ST.count == max) + sayNO; + goto curlym_do_A; /* try to match a further A */ + } + /* backtrack one A */ + if (ST.count == ARG1(ST.me) /* min */) + sayNO; + ST.count--; + SET_locinput(HOPc(locinput, -ST.alen)); + goto curlym_do_B; /* try to match B */ + +#undef ST +#define ST st->u.curly + +#define CURLY_SETPAREN(paren, success) \ + if (paren) { \ + if (success) { \ + rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \ + rex->offs[paren].end = locinput - reginfo->strbeg; \ + if (paren > rex->lastparen) \ + rex->lastparen = paren; \ + rex->lastcloseparen = paren; \ + } \ + else { \ + rex->offs[paren].end = -1; \ + rex->lastparen = ST.lastparen; \ + rex->lastcloseparen = ST.lastcloseparen; \ + } \ + } + + case STAR: /* /A*B/ where A is width 1 char */ + ST.paren = 0; + ST.min = 0; + ST.max = REG_INFTY; + scan = NEXTOPER(scan); + goto repeat; + + case PLUS: /* /A+B/ where A is width 1 char */ + ST.paren = 0; + ST.min = 1; + ST.max = REG_INFTY; + scan = NEXTOPER(scan); + goto repeat; + + case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ + ST.paren = scan->flags; /* Which paren to set */ + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + if (ST.paren > maxopenparen) + maxopenparen = ST.paren; + ST.min = ARG1(scan); /* min to match */ + ST.max = ARG2(scan); /* max to match */ + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + ST.min=1; + ST.max=1; + } + scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); + goto repeat; + + case CURLY: /* /A{m,n}B/ where A is width 1 char */ + ST.paren = 0; + ST.min = ARG1(scan); /* min to match */ + ST.max = ARG2(scan); /* max to match */ + scan = NEXTOPER(scan) + NODE_STEP_REGNODE; + repeat: + /* + * Lookahead to avoid useless match attempts + * when we know what character comes next. + * + * Used to only do .*x and .*?x, but now it allows + * for )'s, ('s and (?{ ... })'s to be in the way + * of the quantifier and the EXACT-like node. -- japhy + */ + + assert(ST.min <= ST.max); + if (! HAS_TEXT(next) && ! JUMPABLE(next)) { + ST.c1 = ST.c2 = CHRTEST_VOID; + } + else { + regnode *text_node = next; + + if (! HAS_TEXT(text_node)) + FIND_NEXT_IMPT(text_node); + + if (! HAS_TEXT(text_node)) + ST.c1 = ST.c2 = CHRTEST_VOID; + else { + if ( PL_regkind[OP(text_node)] != EXACT ) { + ST.c1 = ST.c2 = CHRTEST_VOID; + } + else { + + /* Currently we only get here when + + PL_rekind[OP(text_node)] == EXACT + + if this changes back then the macro for IS_TEXT and + friends need to change. */ + if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ + text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, + reginfo)) + { + sayNO; + } + } + } + } + + ST.A = scan; + ST.B = next; + if (minmod) { + char *li = locinput; + minmod = 0; + if (ST.min && + regrepeat(rex, &li, ST.A, reginfo, ST.min, depth) + < ST.min) + sayNO; + SET_locinput(li); + ST.count = ST.min; + REGCP_SET(ST.cp); + if (ST.c1 == CHRTEST_VOID) + goto curly_try_B_min; + + ST.oldloc = locinput; + + /* set ST.maxpos to the furthest point along the + * string that could possibly match */ + if (ST.max == REG_INFTY) { + ST.maxpos = reginfo->strend - 1; + if (utf8_target) + while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) + ST.maxpos--; + } + else if (utf8_target) { + int m = ST.max - ST.min; + for (ST.maxpos = locinput; + m >0 && ST.maxpos < reginfo->strend; m--) + ST.maxpos += UTF8SKIP(ST.maxpos); + } + else { + ST.maxpos = locinput + ST.max - ST.min; + if (ST.maxpos >= reginfo->strend) + ST.maxpos = reginfo->strend - 1; + } + goto curly_try_B_min_known; + + } + else { + /* avoid taking address of locinput, so it can remain + * a register var */ + char *li = locinput; + ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth); + if (ST.count < ST.min) + sayNO; + SET_locinput(li); + if ((ST.count > ST.min) + && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL)) + { + /* A{m,n} must come at the end of the string, there's + * no point in backing off ... */ + ST.min = ST.count; + /* ...except that $ and \Z can match before *and* after + newline at the end. Consider "\n\n" =~ /\n+\Z\n/. + We may back off by one in this case. */ + if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS) + ST.min--; + } + REGCP_SET(ST.cp); + goto curly_try_B_max; + } + /* NOTREACHED */ + assert(0); + + case CURLY_B_min_known_fail: + /* failed to find B in a non-greedy match where c1,c2 valid */ + + REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + /* Couldn't or didn't -- move forward. */ + ST.oldloc = locinput; + if (utf8_target) + locinput += UTF8SKIP(locinput); + else + locinput++; + ST.count++; + curly_try_B_min_known: + /* find the next place where 'B' could work, then call B */ + { + int n; + if (utf8_target) { + n = (ST.oldloc == locinput) ? 0 : 1; + if (ST.c1 == ST.c2) { + /* set n to utf8_distance(oldloc, locinput) */ + while (locinput <= ST.maxpos + && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))) + { + locinput += UTF8SKIP(locinput); + n++; + } + } + else { + /* set n to utf8_distance(oldloc, locinput) */ + while (locinput <= ST.maxpos + && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) + && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + { + locinput += UTF8SKIP(locinput); + n++; + } + } + } + else { /* Not utf8_target */ + if (ST.c1 == ST.c2) { + while (locinput <= ST.maxpos && + UCHARAT(locinput) != ST.c1) + locinput++; + } + else { + while (locinput <= ST.maxpos + && UCHARAT(locinput) != ST.c1 + && UCHARAT(locinput) != ST.c2) + locinput++; + } + n = locinput - ST.oldloc; + } + if (locinput > ST.maxpos) + sayNO; + if (n) { + /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is + * at b; check that everything between oldloc and + * locinput matches */ + char *li = ST.oldloc; + ST.count += n; + if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n) + sayNO; + assert(n == REG_INFTY || locinput == li); + } + CURLY_SETPAREN(ST.paren, ST.count); + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + goto fake_end; + } + PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput); + } + /* NOTREACHED */ + assert(0); + + case CURLY_B_min_fail: + /* failed to find B in a non-greedy match where c1,c2 invalid */ + + REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + /* failed -- move forward one */ + { + char *li = locinput; + if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) { + sayNO; + } + locinput = li; + } + { + ST.count++; + if (ST.count <= ST.max || (ST.max == REG_INFTY && + ST.count > 0)) /* count overflow ? */ + { + curly_try_B_min: + CURLY_SETPAREN(ST.paren, ST.count); + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + goto fake_end; + } + PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput); + } + } + sayNO; + /* NOTREACHED */ + assert(0); + + curly_try_B_max: + /* a successful greedy match: now try to match B */ + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == (U32)ST.paren) { + goto fake_end; + } + { + bool could_match = locinput < reginfo->strend; + + /* If it could work, try it. */ + if (ST.c1 != CHRTEST_VOID && could_match) { + if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) + { + could_match = memEQ(locinput, + ST.c1_utf8, + UTF8SKIP(locinput)) + || memEQ(locinput, + ST.c2_utf8, + UTF8SKIP(locinput)); + } + else { + could_match = UCHARAT(locinput) == ST.c1 + || UCHARAT(locinput) == ST.c2; + } + } + if (ST.c1 == CHRTEST_VOID || could_match) { + CURLY_SETPAREN(ST.paren, ST.count); + PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput); + /* NOTREACHED */ + assert(0); + } + } + /* FALLTHROUGH */ + + case CURLY_B_max_fail: + /* failed to find B in a greedy match */ + + REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } + /* back up. */ + if (--ST.count < ST.min) + sayNO; + locinput = HOPc(locinput, -1); + goto curly_try_B_max; + +#undef ST + + case END: /* last op of main pattern */ + fake_end: + if (cur_eval) { + /* we've just finished A in /(??{A})B/; now continue with B */ + + st->u.eval.prev_rex = rex_sv; /* inner */ + + /* Save *all* the positions. */ + st->u.eval.cp = regcppush(rex, 0, maxopenparen); + rex_sv = cur_eval->u.eval.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); + SET_reg_curpm(rex_sv); + rex = ReANY(rex_sv); + rexi = RXi_GET(rex); + cur_curlyx = cur_eval->u.eval.prev_curlyx; + + REGCP_SET(st->u.eval.lastcp); + + /* Restore parens of the outer rex without popping the + * savestack */ + S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp, + &maxopenparen); + + st->u.eval.prev_eval = cur_eval; + cur_eval = cur_eval->u.eval.prev_eval; + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n", + REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval));); + if ( nochange_depth ) + nochange_depth--; + + PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B, + locinput); /* match B */ + } + + if (locinput < reginfo->till) { + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n", + PL_colors[4], + (long)(locinput - startpos), + (long)(reginfo->till - startpos), + PL_colors[5])); + + sayNO_SILENT; /* Cannot match: too short. */ + } + sayYES; /* Success! */ + + case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %ssubpattern success...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])); + sayYES; /* Success! */ + +#undef ST +#define ST st->u.ifmatch + + { + char *newstart; + + case SUSPEND: /* (?>A) */ + ST.wanted = 1; + newstart = locinput; + goto do_ifmatch; + + case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?flags) { + char * const s = HOPBACKc(locinput, scan->flags); + if (!s) { + /* trivial fail */ + if (logical) { + logical = 0; + sw = 1 - cBOOL(ST.wanted); + } + else if (ST.wanted) + sayNO; + next = scan + ARG(scan); + if (next == scan) + next = NULL; + break; + } + newstart = s; + } + else + newstart = locinput; + + do_ifmatch: + ST.me = scan; + ST.logical = logical; + logical = 0; /* XXX: reset state of logical once it has been saved into ST */ + + /* execute body of (?...A) */ + PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart); + /* NOTREACHED */ + assert(0); + } + + case IFMATCH_A_fail: /* body of (?...A) failed */ + ST.wanted = !ST.wanted; + /* FALLTHROUGH */ + + case IFMATCH_A: /* body of (?...A) succeeded */ + if (ST.logical) { + sw = cBOOL(ST.wanted); + } + else if (!ST.wanted) + sayNO; + + if (OP(ST.me) != SUSPEND) { + /* restore old position except for (?>...) */ + locinput = st->locinput; + } + scan = ST.me + ARG(ST.me); + if (scan == ST.me) + scan = NULL; + continue; /* execute B */ + +#undef ST + + case LONGJMP: /* alternative with many branches compiles to + * (BRANCHJ; EXACT ...; LONGJMP ) x N */ + next = scan + ARG(scan); + if (next == scan) + next = NULL; + break; + + case COMMIT: /* (*COMMIT) */ + reginfo->cutpoint = reginfo->strend; + /* FALLTHROUGH */ + + case PRUNE: /* (*PRUNE) */ + if (!scan->flags) + sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + PUSH_STATE_GOTO(COMMIT_next, next, locinput); + /* NOTREACHED */ + assert(0); + + case COMMIT_next_fail: + no_final = 1; + /* FALLTHROUGH */ + + case OPFAIL: /* (*FAIL) */ + sayNO; + /* NOTREACHED */ + assert(0); + +#define ST st->u.mark + case MARKPOINT: /* (*MARK:foo) */ + ST.prev_mark = mark_state; + ST.mark_name = sv_commit = sv_yes_mark + = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + mark_state = st; + ST.mark_loc = locinput; + PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput); + /* NOTREACHED */ + assert(0); + + case MARKPOINT_next: + mark_state = ST.prev_mark; + sayYES; + /* NOTREACHED */ + assert(0); + + case MARKPOINT_next_fail: + if (popmark && sv_eq(ST.mark_name,popmark)) + { + if (ST.mark_loc > startpoint) + reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); + popmark = NULL; /* we found our mark */ + sv_commit = ST.mark_name; + + DEBUG_EXECUTE_r({ + PerlIO_printf(Perl_debug_log, + "%*s %ssetting cutpoint to mark:%"SVf"...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], SVfARG(sv_commit), PL_colors[5]); + }); + } + mark_state = ST.prev_mark; + sv_yes_mark = mark_state ? + mark_state->u.mark.mark_name : NULL; + sayNO; + /* NOTREACHED */ + assert(0); + + case SKIP: /* (*SKIP) */ + if (scan->flags) { + /* (*SKIP) : if we fail we cut here*/ + ST.mark_name = NULL; + ST.mark_loc = locinput; + PUSH_STATE_GOTO(SKIP_next,next, locinput); + } else { + /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, + otherwise do nothing. Meaning we need to scan + */ + regmatch_state *cur = mark_state; + SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + + while (cur) { + if ( sv_eq( cur->u.mark.mark_name, + find ) ) + { + ST.mark_name = find; + PUSH_STATE_GOTO( SKIP_next, next, locinput); + } + cur = cur->u.mark.prev_mark; + } + } + /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ + break; + + case SKIP_next_fail: + if (ST.mark_name) { + /* (*CUT:NAME) - Set up to search for the name as we + collapse the stack*/ + popmark = ST.mark_name; + } else { + /* (*CUT) - No name, we cut here.*/ + if (ST.mark_loc > startpoint) + reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); + /* but we set sv_commit to latest mark_name if there + is one so they can test to see how things lead to this + cut */ + if (mark_state) + sv_commit=mark_state->u.mark.mark_name; + } + no_final = 1; + sayNO; + /* NOTREACHED */ + assert(0); +#undef ST + + case LNBREAK: /* \R */ + if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) { + locinput += n; + } else + sayNO; + break; + + default: + PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", + PTR2UV(scan), OP(scan)); + Perl_croak(aTHX_ "regexp memory corruption"); + + /* this is a point to jump to in order to increment + * locinput by one character */ + increment_locinput: + assert(!NEXTCHR_IS_EOS); + if (utf8_target) { + locinput += PL_utf8skip[nextchr]; + /* locinput is allowed to go 1 char off the end, but not 2+ */ + if (locinput > reginfo->strend) + sayNO; + } + else + locinput++; + break; + + } /* end switch */ + + /* switch break jumps here */ + scan = next; /* prepare to execute the next op and ... */ + continue; /* ... jump back to the top, reusing st */ + /* NOTREACHED */ + assert(0); + + push_yes_state: + /* push a state that backtracks on success */ + st->u.yes.prev_yes_state = yes_state; + yes_state = st; + /* FALLTHROUGH */ + push_state: + /* push a new regex state, then continue at scan */ + { + regmatch_state *newst; + + DEBUG_STACK_r({ + regmatch_state *cur = st; + regmatch_state *curyes = yes_state; + int curd = depth; + regmatch_slab *slab = PL_regmatch_slab; + for (;curd > -1;cur--,curd--) { + if (cur < SLAB_FIRST(slab)) { + slab = slab->prev; + cur = SLAB_LAST(slab); + } + PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n", + REPORT_CODE_OFF + 2 + depth * 2,"", + curd, PL_reg_name[cur->resume_state], + (curyes == cur) ? "yes" : "" + ); + if (curyes == cur) + curyes = cur->u.yes.prev_yes_state; + } + } else + DEBUG_STATE_pp("push") + ); + depth++; + st->locinput = locinput; + newst = st+1; + if (newst > SLAB_LAST(PL_regmatch_slab)) + newst = S_push_slab(aTHX); + PL_regmatch_state = newst; + + locinput = pushinput; + st = newst; + continue; + /* NOTREACHED */ + assert(0); + } + } + + /* + * We get here only if there's trouble -- normally "case END" is + * the terminating point. + */ + Perl_croak(aTHX_ "corrupted regexp pointers"); + /* NOTREACHED */ + sayNO; + +yes: + if (yes_state) { + /* we have successfully completed a subexpression, but we must now + * pop to the state marked by yes_state and continue from there */ + assert(st != yes_state); +#ifdef DEBUGGING + while (st != yes_state) { + st--; + if (st < SLAB_FIRST(PL_regmatch_slab)) { + PL_regmatch_slab = PL_regmatch_slab->prev; + st = SLAB_LAST(PL_regmatch_slab); + } + DEBUG_STATE_r({ + if (no_final) { + DEBUG_STATE_pp("pop (no final)"); + } else { + DEBUG_STATE_pp("pop (yes)"); + } + }); + depth--; + } +#else + while (yes_state < SLAB_FIRST(PL_regmatch_slab) + || yes_state > SLAB_LAST(PL_regmatch_slab)) + { + /* not in this slab, pop slab */ + depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1); + PL_regmatch_slab = PL_regmatch_slab->prev; + st = SLAB_LAST(PL_regmatch_slab); + } + depth -= (st - yes_state); +#endif + st = yes_state; + yes_state = st->u.yes.prev_yes_state; + PL_regmatch_state = st; + + if (no_final) + locinput= st->locinput; + state_num = st->resume_state + no_final; + goto reenter_switch; + } + + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", + PL_colors[4], PL_colors[5])); + + if (reginfo->info_aux_eval) { + /* each successfully executed (?{...}) block does the equivalent of + * local $^R = do {...} + * When popping the save stack, all these locals would be undone; + * bypass this by setting the outermost saved $^R to the latest + * value */ + /* I dont know if this is needed or works properly now. + * see code related to PL_replgv elsewhere in this file. + * Yves + */ + if (oreplsv != GvSV(PL_replgv)) + sv_setsv(oreplsv, GvSV(PL_replgv)); + } + result = 1; + goto final_exit; + +no: + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], PL_colors[5]) + ); + +no_silent: + if (no_final) { + if (yes_state) { + goto yes; + } else { + goto final_exit; + } + } + if (depth) { + /* there's a previous state to backtrack to */ + st--; + if (st < SLAB_FIRST(PL_regmatch_slab)) { + PL_regmatch_slab = PL_regmatch_slab->prev; + st = SLAB_LAST(PL_regmatch_slab); + } + PL_regmatch_state = st; + locinput= st->locinput; + + DEBUG_STATE_pp("pop"); + depth--; + if (yes_state == st) + yes_state = st->u.yes.prev_yes_state; + + state_num = st->resume_state + 1; /* failure = success + 1 */ + goto reenter_switch; + } + result = 0; + + final_exit: + if (rex->intflags & PREGf_VERBARG_SEEN) { + SV *sv_err = get_sv("REGERROR", 1); + SV *sv_mrk = get_sv("REGMARK", 1); + if (result) { + sv_commit = &PL_sv_no; + if (!sv_yes_mark) + sv_yes_mark = &PL_sv_yes; + } else { + if (!sv_commit) + sv_commit = &PL_sv_yes; + sv_yes_mark = &PL_sv_no; + } + assert(sv_err); + assert(sv_mrk); + sv_setsv(sv_err, sv_commit); + sv_setsv(sv_mrk, sv_yes_mark); + } + + + if (last_pushed_cv) { + dSP; + POP_MULTICALL; + PERL_UNUSED_VAR(SP); + } + + assert(!result || locinput - reginfo->strbeg >= 0); + return result ? locinput - reginfo->strbeg : -1; +} + +/* + - regrepeat - repeatedly match something simple, report how many + * + * What 'simple' means is a node which can be the operand of a quantifier like + * '+', or {1,3} + * + * startposp - pointer a pointer to the start position. This is updated + * to point to the byte following the highest successful + * match. + * p - the regnode to be repeatedly matched against. + * reginfo - struct holding match state, such as strend + * max - maximum number of things to match. + * depth - (for debugging) backtracking depth. + */ +STATIC I32 +S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, + regmatch_info *const reginfo, I32 max, int depth) +{ + char *scan; /* Pointer to current position in target string */ + I32 c; + char *loceol = reginfo->strend; /* local version */ + I32 hardcount = 0; /* How many matches so far */ + bool utf8_target = reginfo->is_utf8_target; + int to_complement = 0; /* Invert the result? */ + UV utf8_flags; + _char_class_number classnum; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + PERL_ARGS_ASSERT_REGREPEAT; + + scan = *startposp; + if (max == REG_INFTY) + max = I32_MAX; + else if (! utf8_target && loceol - scan > max) + loceol = scan + max; + + /* Here, for the case of a non-UTF-8 target we have adjusted down + * to the maximum of how far we should go in it (leaving it set to the real + * end, if the maximum permissible would take us beyond that). This allows + * us to make the loop exit condition that we haven't gone past to + * also mean that we haven't exceeded the max permissible count, saving a + * test each time through the loop. But it assumes that the OP matches a + * single byte, which is true for most of the OPs below when applied to a + * non-UTF-8 target. Those relatively few OPs that don't have this + * characteristic will have to compensate. + * + * There is no adjustment for UTF-8 targets, as the number of bytes per + * character varies. OPs will have to test both that the count is less + * than the max permissible (using to keep track), and that we + * are still within the bounds of the string (using . A few OPs + * match a single byte no matter what the encoding. They can omit the max + * test if, for the UTF-8 case, they do the adjustment that was skipped + * above. + * + * Thus, the code above sets things up for the common case; and exceptional + * cases need extra work; the common case is to make sure doesn't + * go past , and for UTF-8 to also use to make sure the + * count doesn't exceed the maximum permissible */ + + switch (OP(p)) { + case REG_ANY: + if (utf8_target) { + while (scan < loceol && hardcount < max && *scan != '\n') { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && *scan != '\n') + scan++; + } + break; + case SANY: + if (utf8_target) { + while (scan < loceol && hardcount < max) { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else + scan = loceol; + break; + case CANY: /* Move forward bytes, unless goes off end */ + if (utf8_target && loceol - scan > max) { + + /* hadn't been adjusted in the UTF-8 case */ + scan += max; + } + else { + scan = loceol; + } + break; + case EXACT: + assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + + c = (U8)*STRING(p); + + /* Can use a simple loop if the pattern char to match on is invariant + * under UTF-8, or both target and pattern aren't UTF-8. Note that we + * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's + * true iff it doesn't matter if the argument is in UTF-8 or not */ + if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) { + if (utf8_target && loceol - scan > max) { + /* We didn't adjust because is UTF-8, but ok to do so, + * since here, to match at all, 1 char == 1 byte */ + loceol = scan + max; + } + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } + else if (reginfo->is_utf8_pat) { + if (utf8_target) { + STRLEN scan_char_len; + + /* When both target and pattern are UTF-8, we have to do + * string EQ */ + while (hardcount < max + && scan < loceol + && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p) + && memEQ(scan, STRING(p), scan_char_len)) + { + scan += scan_char_len; + hardcount++; + } + } + else if (! UTF8_IS_ABOVE_LATIN1(c)) { + + /* Target isn't utf8; convert the character in the UTF-8 + * pattern to non-UTF8, and do a simple loop */ + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } /* else pattern char is above Latin1, can't possibly match the + non-UTF-8 target */ + } + else { + + /* Here, the string must be utf8; pattern isn't, and is + * different in utf8 than not, so can't compare them directly. + * Outside the loop, find the two utf8 bytes that represent c, and + * then look for those in sequence in the utf8 string */ + U8 high = UTF8_TWO_BYTE_HI(c); + U8 low = UTF8_TWO_BYTE_LO(c); + + while (hardcount < max + && scan + 1 < loceol + && UCHARAT(scan) == high + && UCHARAT(scan + 1) == low) + { + scan += 2; + hardcount++; + } + } + break; + + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + /* FALLTHROUGH */ + case EXACTFA: + utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf; + + case EXACTFL: + utf8_flags = FOLDEQ_LOCALE; + goto do_exactf; + + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + utf8_flags = 0; + goto do_exactf; + + case EXACTFU_SS: + case EXACTFU: + utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; + + do_exactf: { + int c1, c2; + U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; + + assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + + if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8, + reginfo)) + { + if (c1 == CHRTEST_VOID) { + /* Use full Unicode fold matching */ + char *tmpeol = reginfo->strend; + STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1; + while (hardcount < max + && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, + STRING(p), NULL, pat_len, + reginfo->is_utf8_pat, utf8_flags)) + { + scan = tmpeol; + tmpeol = reginfo->strend; + hardcount++; + } + } + else if (utf8_target) { + if (c1 == c2) { + while (scan < loceol + && hardcount < max + && memEQ(scan, c1_utf8, UTF8SKIP(scan))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + while (scan < loceol + && hardcount < max + && (memEQ(scan, c1_utf8, UTF8SKIP(scan)) + || memEQ(scan, c2_utf8, UTF8SKIP(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + } + else if (c1 == c2) { + while (scan < loceol && UCHARAT(scan) == c1) { + scan++; + } + } + else { + while (scan < loceol && + (UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) + { + scan++; + } + } + } + break; + } + case ANYOF: + if (utf8_target) { + while (hardcount < max + && scan < loceol + && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && REGINCLASS(prog, p, (U8*)scan)) + scan++; + } + break; + + /* The argument (FLAGS) to all the POSIX node types is the class number */ + + case NPOSIXL: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL: + if (! utf8_target) { + while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), + *scan))) + { + scan++; + } + } else { + while (hardcount < max && scan < loceol + && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p), + (U8 *) scan))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case POSIXD: + if (utf8_target) { + goto utf8_posix; + } + /* FALLTHROUGH */ + + case POSIXA: + if (utf8_target && loceol - scan > max) { + + /* We didn't adjust at the beginning of this routine + * because is UTF-8, but it is actually ok to do so, since here, to + * match, 1 char == 1 byte. */ + loceol = scan + max; + } + while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + break; + + case NPOSIXD: + if (utf8_target) { + to_complement = 1; + goto utf8_posix; + } + /* FALLTHROUGH */ + + case NPOSIXA: + if (! utf8_target) { + while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + } + else { + + /* The complement of something that matches only ASCII matches all + * non-ASCII, plus everything in ASCII that isn't in the class. */ + while (hardcount < max && scan < loceol + && (! isASCII_utf8(scan) + || ! _generic_isCC_A((U8) *scan, FLAGS(p)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case NPOSIXU: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXU: + if (! utf8_target) { + while (scan < loceol && to_complement + ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p)))) + { + scan++; + } + } + else { + utf8_posix: + classnum = (_char_class_number) FLAGS(p); + if (classnum < _FIRST_NON_SWASH_CC) { + + /* Here, a swash is needed for above-Latin1 code points. + * Process as many Latin1 code points using the built-in rules. + * Go to another loop to finish processing upon encountering + * the first Latin1 code point. We could do that in this loop + * as well, but the other way saves having to test if the swash + * has been loaded every time through the loop: extra space to + * save a test. */ + while (hardcount < max && scan < loceol) { + if (UTF8_IS_INVARIANT(*scan)) { + if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan, + classnum)))) + { + break; + } + scan++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { + if (! (to_complement + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), + classnum)))) + { + break; + } + scan += 2; + } + else { + goto found_above_latin1; + } + + hardcount++; + } + } + else { + /* For these character classes, the knowledge of how to handle + * every code point is compiled in to Perl via a macro. This + * code is written for making the loops as tight as possible. + * It could be refactored to save space instead */ + switch (classnum) { + case _CC_ENUM_SPACE: /* XXX would require separate code + if we revert the change of \v + matching this */ + /* FALLTHROUGH */ + case _CC_ENUM_PSXSPC: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isSPACE_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_BLANK: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isBLANK_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_XDIGIT: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isXDIGIT_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_VERTSPACE: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isVERTWS_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + case _CC_ENUM_CNTRL: + while (hardcount < max + && scan < loceol + && (to_complement ^ cBOOL(isCNTRL_utf8(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + default: + Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum); + } + } + } + break; + + found_above_latin1: /* Continuation of POSIXU and NPOSIXU */ + + /* Load the swash if not already present */ + if (! PL_utf8_swash_ptrs[classnum]) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_swash_ptrs[classnum] = _core_swash_init( + "utf8", + "", + &PL_sv_undef, 1, 0, + PL_XPosix_ptrs[classnum], &flags); + } + + while (hardcount < max && scan < loceol + && to_complement ^ cBOOL(_generic_utf8( + classnum, + scan, + swash_fetch(PL_utf8_swash_ptrs[classnum], + (U8 *) scan, + TRUE)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + break; + + case LNBREAK: + if (utf8_target) { + while (hardcount < max && scan < loceol && + (c=is_LNBREAK_utf8_safe(scan, loceol))) { + scan += c; + hardcount++; + } + } else { + /* LNBREAK can match one or two latin chars, which is ok, but we + * have to use hardcount in this situation, and throw away the + * adjustment to done before the switch statement */ + loceol = reginfo->strend; + while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) { + scan+=c; + hardcount++; + } + } + break; + + case BOUND: + case BOUNDA: + case BOUNDL: + case BOUNDU: + case EOS: + case GPOS: + case KEEPS: + case NBOUND: + case NBOUNDA: + case NBOUNDL: + case NBOUNDU: + case OPFAIL: + case SBOL: + case SEOL: + /* These are all 0 width, so match right here or not at all. */ + break; + + default: + Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]); + /* NOTREACHED */ + assert(0); + + } + + if (hardcount) + c = hardcount; + else + c = scan - *startposp; + *startposp = scan; + + DEBUG_r({ + GET_RE_DEBUG_FLAGS_DECL; + DEBUG_EXECUTE_r({ + SV * const prop = sv_newmortal(); + regprop(prog, prop, p, reginfo); + PerlIO_printf(Perl_debug_log, + "%*s %s can match %"IVdf" times out of %"IVdf"...\n", + REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); + }); + }); + + return(c); +} + + +#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) +/* +- regclass_swash - prepare the utf8 swash. Wraps the shared core version to +create a copy so that changes the caller makes won't change the shared one. +If is non-null, will return NULL in it, for back-compat. + */ +SV * +Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp) +{ + PERL_ARGS_ASSERT_REGCLASS_SWASH; + + if (altsvp) { + *altsvp = NULL; + } + + return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL)); +} + +SV * +Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, + const regnode* node, + bool doinit, + SV** listsvp, + SV** only_utf8_locale_ptr) +{ + /* For internal core use only. + * Returns the swash for the input 'node' in the regex 'prog'. + * If is 'true', will attempt to create the swash if not already + * done. + * If is non-null, will return the printable contents of the + * swash. This can be used to get debugging information even before the + * swash exists, by calling this function with 'doinit' set to false, in + * which case the components that will be used to eventually create the + * swash are returned (in a printable form). + * Tied intimately to how regcomp.c sets up the data structure */ + + SV *sw = NULL; + SV *si = NULL; /* Input swash initialization string */ + SV* invlist = NULL; + + RXi_GET_DECL(prog,progi); + const struct reg_data * const data = prog ? progi->data : NULL; + + PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; + + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); + + if (data && data->count) { + const U32 n = ARG(node); + + if (data->what[n] == 's') { + SV * const rv = MUTABLE_SV(data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + + si = *ary; /* ary[0] = the string to initialize the swash with */ + + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if + * that inversion list has any user-defined properties in it. */ + if (av_tindex(av) >= 2) { + if (only_utf8_locale_ptr + && ary[2] + && ary[2] != &PL_sv_undef) + { + *only_utf8_locale_ptr = ary[2]; + } + else { + assert(only_utf8_locale_ptr); + *only_utf8_locale_ptr = NULL; + } + + if (av_tindex(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } + } + else { + invlist = NULL; + } + } + + /* Element [1] is reserved for the set-up swash. If already there, + * return it; if not, create it and store it there */ + if (ary[1] && SvROK(ary[1])) { + sw = ary[1]; + } + else if (doinit && ((si && si != &PL_sv_undef) + || (invlist && invlist != &PL_sv_undef))) { + assert(si); + sw = _core_swash_init("utf8", /* the utf8 package */ + "", /* nameless */ + si, + 1, /* binary */ + 0, /* not from tr/// */ + invlist, + &swash_init_flags); + (void)av_store(av, 1, sw); + } + } + } + + /* If requested, return a printable version of what this swash matches */ + if (listsvp) { + SV* matches_string = newSVpvs(""); + + /* The swash should be used, if possible, to get the data, as it + * contains the resolved data. But this function can be called at + * compile-time, before everything gets resolved, in which case we + * return the currently best available information, which is the string + * that will eventually be used to do that resolving, 'si' */ + if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) + && (si && si != &PL_sv_undef)) + { + sv_catsv(matches_string, si); + } + + /* Add the inversion list to whatever we have. This may have come from + * the swash, or from an input parameter */ + if (invlist) { + sv_catsv(matches_string, _invlist_contents(invlist)); + } + *listsvp = matches_string; + } + + return sw; +} +#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ + +/* + - reginclass - determine if a character falls into a character class + + n is the ANYOF regnode + p is the target string + p_end points to one byte beyond the end of the target string + utf8_target tells whether p is in UTF-8. + + Returns true if matched; false otherwise. + + Note that this can be a synthetic start class, a combination of various + nodes, so things you think might be mutually exclusive, such as locale, + aren't. It can match both locale and non-locale + + */ + +STATIC bool +S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) +{ + dVAR; + const char flags = ANYOF_FLAGS(n); + bool match = FALSE; + UV c = *p; + + PERL_ARGS_ASSERT_REGINCLASS; + + /* If c is not already the code point, get it. Note that + * UTF8_IS_INVARIANT() works even if not in UTF-8 */ + if (! UTF8_IS_INVARIANT(c) && utf8_target) { + STRLEN c_len = 0; + c = utf8n_to_uvchr(p, p_end - p, &c_len, + (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) + | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); + /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for + * UTF8_ALLOW_FFFF */ + if (c_len == (STRLEN)-1) + Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); + } + + /* If this character is potentially in the bitmap, check it */ + if (c < 256) { + if (ANYOF_BITMAP_TEST(n, c)) + match = TRUE; + else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL + && ! utf8_target + && ! isASCII(c)) + { + match = TRUE; + } + else if (flags & ANYOF_LOCALE_FLAGS) { + if (flags & ANYOF_LOC_FOLD) { + if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { + match = TRUE; + } + } + if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { + + /* The data structure is arranged so bits 0, 2, 4, ... are set + * if the class includes the Posix character class given by + * bit/2; and 1, 3, 5, ... are set if the class includes the + * complemented Posix class given by int(bit/2). So we loop + * through the bits, each time changing whether we complement + * the result or not. Suppose for the sake of illustration + * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0 + * is set, it means there is a match for this ANYOF node if the + * character is in the class given by the expression (0 / 2 = 0 + * = \w). If it is in that class, isFOO_lc() will return 1, + * and since 'to_complement' is 0, the result will stay TRUE, + * and we exit the loop. Suppose instead that bit 0 is 0, but + * bit 1 is 1. That means there is a match if the character + * matches \W. We won't bother to call isFOO_lc() on bit 0, + * but will on bit 1. On the second iteration 'to_complement' + * will be 1, so the exclusive or will reverse things, so we + * are testing for \W. On the third iteration, 'to_complement' + * will be 0, and we would be testing for \s; the fourth + * iteration would test for \S, etc. + * + * Note that this code assumes that all the classes are closed + * under folding. For example, if a character matches \w, then + * its fold does too; and vice versa. This should be true for + * any well-behaved locale for all the currently defined Posix + * classes, except for :lower: and :upper:, which are handled + * by the pseudo-class :cased: which matches if either of the + * other two does. To get rid of this assumption, an outer + * loop could be used below to iterate over both the source + * character, and its fold (if different) */ + + int count = 0; + int to_complement = 0; + + while (count < ANYOF_MAX) { + if (ANYOF_POSIXL_TEST(n, count) + && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) + { + match = TRUE; + break; + } + count++; + to_complement ^= 1; + } + } + } + } + + + /* If the bitmap didn't (or couldn't) match, and something outside the + * bitmap could match, try that. */ + if (!match) { + if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { + match = TRUE; /* Everything above 255 matches */ + } + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && (flags & ANYOF_UTF8)) + || ((flags & ANYOF_LOC_FOLD) + && IN_UTF8_CTYPE_LOCALE + && ARG(n) != ANYOF_NONBITMAP_EMPTY)) + { + SV* only_utf8_locale = NULL; + SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale); + if (sw) { + U8 utf8_buffer[2]; + U8 * utf8_p; + if (utf8_target) { + utf8_p = (U8 *) p; + } else { /* Convert to utf8 */ + utf8_p = utf8_buffer; + append_utf8_from_native_byte(*p, &utf8_p); + utf8_p = utf8_buffer; + } + + if (swash_fetch(sw, utf8_p, TRUE)) { + match = TRUE; + } + } + if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { + match = _invlist_contains_cp(only_utf8_locale, c); + } + } + + if (UNICODE_IS_SUPER(c) + && (flags & ANYOF_WARN_SUPER) + && ckWARN_d(WARN_NON_UNICODE)) + { + Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), + "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c); + } + } + +#if ANYOF_INVERT != 1 + /* Depending on compiler optimization cBOOL takes time, so if don't have to + * use it, don't */ +# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below, +#endif + + /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ + return (flags & ANYOF_INVERT) ^ match; +} + +STATIC U8 * +S_reghop3(U8 *s, SSize_t off, const U8* lim) +{ + /* return the position 'off' UTF-8 characters away from 's', forward if + * 'off' >= 0, backwards if negative. But don't go outside of position + * 'lim', which better be < s if off < 0 */ + + PERL_ARGS_ASSERT_REGHOP3; + + if (off >= 0) { + while (off-- && s < lim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + } + else { + while (off++ && s > lim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > lim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + } + return s; +} + +STATIC U8 * +S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) +{ + PERL_ARGS_ASSERT_REGHOP4; + + if (off >= 0) { + while (off-- && s < rlim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + } + else { + while (off++ && s > llim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > llim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + } + return s; +} + +/* like reghop3, but returns NULL on overrun, rather than returning last + * char pos */ + +STATIC U8 * +S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) +{ + PERL_ARGS_ASSERT_REGHOPMAYBE3; + + if (off >= 0) { + while (off-- && s < lim) { + /* XXX could check well-formedness here */ + s += UTF8SKIP(s); + } + if (off >= 0) + return NULL; + } + else { + while (off++ && s > lim) { + s--; + if (UTF8_IS_CONTINUED(*s)) { + while (s > lim && UTF8_IS_CONTINUATION(*s)) + s--; + } + /* XXX could check well-formedness here */ + } + if (off <= 0) + return NULL; + } + return s; +} + + +/* when executing a regex that may have (?{}), extra stuff needs setting + up that will be visible to the called code, even before the current + match has finished. In particular: + + * $_ is localised to the SV currently being matched; + * pos($_) is created if necessary, ready to be updated on each call-out + to code; + * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm + isn't set until the current pattern is successfully finished), so that + $1 etc of the match-so-far can be seen; + * save the old values of subbeg etc of the current regex, and set then + to the current string (again, this is normally only done at the end + of execution) +*/ + +static void +S_setup_eval_state(pTHX_ regmatch_info *const reginfo) +{ + MAGIC *mg; + regexp *const rex = ReANY(reginfo->prog); + regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; + + eval_state->rex = rex; + + if (reginfo->sv) { + /* Make $_ available to executed code. */ + if (reginfo->sv != DEFSV) { + SAVE_DEFSV; + DEFSV_set(reginfo->sv); + } + + if (!(mg = mg_find_mglob(reginfo->sv))) { + /* prepare for quick setting of pos */ + mg = sv_magicext_mglob(reginfo->sv); + mg->mg_len = -1; + } + eval_state->pos_magic = mg; + eval_state->pos = mg->mg_len; + eval_state->pos_flags = mg->mg_flags; + } + else + eval_state->pos_magic = NULL; + + if (!PL_reg_curpm) { + /* PL_reg_curpm is a fake PMOP that we can attach the current + * regex to and point PL_curpm at, so that $1 et al are visible + * within a /(?{})/. It's just allocated once per interpreter the + * first time its needed */ + Newxz(PL_reg_curpm, 1, PMOP); +#ifdef USE_ITHREADS + { + SV* const repointer = &PL_sv_undef; + /* this regexp is also owned by the new PL_reg_curpm, which + will try to free it. */ + av_push(PL_regex_padav, repointer); + PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); + PL_regex_pad = AvARRAY(PL_regex_padav); + } +#endif + } + SET_reg_curpm(reginfo->prog); + eval_state->curpm = PL_curpm; + PL_curpm = PL_reg_curpm; + if (RXp_MATCH_COPIED(rex)) { + /* Here is a serious problem: we cannot rewrite subbeg, + since it may be needed if this match fails. Thus + $` inside (?{}) could fail... */ + eval_state->subbeg = rex->subbeg; + eval_state->sublen = rex->sublen; + eval_state->suboffset = rex->suboffset; + eval_state->subcoffset = rex->subcoffset; +#ifdef PERL_ANY_COW + eval_state->saved_copy = rex->saved_copy; +#endif + RXp_MATCH_COPIED_off(rex); + } + else + eval_state->subbeg = NULL; + rex->subbeg = (char *)reginfo->strbeg; + rex->suboffset = 0; + rex->subcoffset = 0; + rex->sublen = reginfo->strend - reginfo->strbeg; +} + + +/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */ + +static void +S_cleanup_regmatch_info_aux(pTHX_ void *arg) +{ + regmatch_info_aux *aux = (regmatch_info_aux *) arg; + regmatch_info_aux_eval *eval_state = aux->info_aux_eval; + regmatch_slab *s; + + Safefree(aux->poscache); + + if (eval_state) { + + /* undo the effects of S_setup_eval_state() */ + + if (eval_state->subbeg) { + regexp * const rex = eval_state->rex; + rex->subbeg = eval_state->subbeg; + rex->sublen = eval_state->sublen; + rex->suboffset = eval_state->suboffset; + rex->subcoffset = eval_state->subcoffset; +#ifdef PERL_ANY_COW + rex->saved_copy = eval_state->saved_copy; +#endif + RXp_MATCH_COPIED_on(rex); + } + if (eval_state->pos_magic) + { + eval_state->pos_magic->mg_len = eval_state->pos; + eval_state->pos_magic->mg_flags = + (eval_state->pos_magic->mg_flags & ~MGf_BYTES) + | (eval_state->pos_flags & MGf_BYTES); + } + + PL_curpm = eval_state->curpm; + } + + PL_regmatch_state = aux->old_regmatch_state; + PL_regmatch_slab = aux->old_regmatch_slab; + + /* free all slabs above current one - this must be the last action + * of this function, as aux and eval_state are allocated within + * slabs and may be freed here */ + + s = PL_regmatch_slab->next; + if (s) { + PL_regmatch_slab->next = NULL; + while (s) { + regmatch_slab * const osl = s; + s = s->next; + Safefree(osl); + } + } +} + + +STATIC void +S_to_utf8_substr(pTHX_ regexp *prog) +{ + /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile + * on the converted value */ + + int i = 1; + + PERL_ARGS_ASSERT_TO_UTF8_SUBSTR; + + do { + if (prog->substrs->data[i].substr + && !prog->substrs->data[i].utf8_substr) { + SV* const sv = newSVsv(prog->substrs->data[i].substr); + prog->substrs->data[i].utf8_substr = sv; + sv_utf8_upgrade(sv); + if (SvVALID(prog->substrs->data[i].substr)) { + if (SvTAIL(prog->substrs->data[i].substr)) { + /* Trim the trailing \n that fbm_compile added last + time. */ + SvCUR_set(sv, SvCUR(sv) - 1); + /* Whilst this makes the SV technically "invalid" (as its + buffer is no longer followed by "\0") when fbm_compile() + adds the "\n" back, a "\0" is restored. */ + fbm_compile(sv, FBMcf_TAIL); + } else + fbm_compile(sv, 0); + } + if (prog->substrs->data[i].substr == prog->check_substr) + prog->check_utf8 = sv; + } + } while (i--); +} + +STATIC bool +S_to_byte_substr(pTHX_ regexp *prog) +{ + /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile + * on the converted value; returns FALSE if can't be converted. */ + + int i = 1; + + PERL_ARGS_ASSERT_TO_BYTE_SUBSTR; + + do { + if (prog->substrs->data[i].utf8_substr + && !prog->substrs->data[i].substr) { + SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); + if (! sv_utf8_downgrade(sv, TRUE)) { + return FALSE; + } + if (SvVALID(prog->substrs->data[i].utf8_substr)) { + if (SvTAIL(prog->substrs->data[i].utf8_substr)) { + /* Trim the trailing \n that fbm_compile added last + time. */ + SvCUR_set(sv, SvCUR(sv) - 1); + fbm_compile(sv, FBMcf_TAIL); + } else + fbm_compile(sv, 0); + } + prog->substrs->data[i].substr = sv; + if (prog->substrs->data[i].utf8_substr == prog->check_utf8) + prog->check_substr = sv; + } + } while (i--); + + return TRUE; +} + +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * + * ex: set ts=8 sts=4 sw=4 et: + */ diff --git a/src/update.pl b/src/update.pl index b3c9984..d3c6b67 100644 --- a/src/update.pl +++ b/src/update.pl @@ -96,7 +96,7 @@ sub key_version { "$int$frac" => [ $num_version, $pretty_version ]; } -my $latest_dev_rev = 19; +my $latest_dev_rev = 21; sub perl_is_supported { my $v = "$_[0]"; diff --git a/t/re-engine-Hooks-TestDist/TestDist.xs b/t/re-engine-Hooks-TestDist/TestDist.xs index ef36ccb..0b701f7 100644 --- a/t/re-engine-Hooks-TestDist/TestDist.xs +++ b/t/re-engine-Hooks-TestDist/TestDist.xs @@ -11,6 +11,10 @@ #define __PACKAGE__ "re::engine::Hooks::TestDist" #define __PACKAGE_LEN__ (sizeof(__PACKAGE__)-1) +#ifndef REHT_HAS_PERL +# define REHT_HAS_PERL(R, V, S) (PERL_REVISION > (R) || (PERL_REVISION == (R) && (PERL_VERSION > (V) || (PERL_VERSION == (V) && (PERL_SUBVERSION >= (S)))))) +#endif + #include "regcomp.h" STATIC SV *reht_foo_var; @@ -48,8 +52,12 @@ STATIC void reht_custom_comp_node(pTHX_ regexp *rx, regnode *node) { node_name = PL_reg_name[OP(node)]; } +#if !REHT_HAS_PERL(5, 19, 1) + STATIC struct re_save_state reht_state_bak; +#endif + STATIC void reht_custom_exec_node(pTHX_ regexp *rx, regnode *node, regmatch_info *reginfo, regmatch_state *st) { STRLEN node_namelen; const char *node_name; @@ -57,9 +65,11 @@ STATIC void reht_custom_exec_node(pTHX_ regexp *rx, regnode *node, regmatch_info node_name = PL_reg_name[OP(node)]; node_namelen = strlen(node_name); +#if !REHT_HAS_PERL(5, 19, 1) /* The global regexp state may be overwritten if the Perl callback does a * regexp match. */ reht_state_bak = PL_reg_state; +#endif dSP; @@ -76,7 +86,9 @@ STATIC void reht_custom_exec_node(pTHX_ regexp *rx, regnode *node, regmatch_info FREETMPS; LEAVE; +#if !REHT_HAS_PERL(5, 19, 1) PL_reg_state = reht_state_bak; +#endif } /* --- XS ------------------------------------------------------------------ */