- case TRIEC: /* (ab|cd) with known charclass */
- /* In this case the charclass data is available inline so
- we can fail fast without a lot of extra overhead.
- */
- if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
- DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s %sfailed to match trie start class...%s\n",
- REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
- );
- sayNO_SILENT;
- assert(0); /* NOTREACHED */
- }
- /* FALL THROUGH */
- case TRIE: /* (ab|cd) */
- /* the basic plan of execution of the trie is:
- * At the beginning, run though all the states, and
- * find the longest-matching word. Also remember the position
- * of the shortest matching word. For example, this pattern:
- * 1 2 3 4 5
- * ab|a|x|abcd|abc
- * when matched against the string "abcde", will generate
- * accept states for all words except 3, with the longest
- * matching word being 4, and the shortest being 2 (with
- * the position being after char 1 of the string).
- *
- * Then for each matching word, in word order (i.e. 1,2,4,5),
- * we run the remainder of the pattern; on each try setting
- * the current position to the character following the word,
- * returning to try the next word on failure.
- *
- * We avoid having to build a list of words at runtime by
- * using a compile-time structure, wordinfo[].prev, which
- * gives, for each word, the previous accepting word (if any).
- * In the case above it would contain the mappings 1->2, 2->0,
- * 3->0, 4->5, 5->1. We can use this table to generate, from
- * the longest word (4 above), a list of all words, by
- * following the list of prev pointers; this gives us the
- * unordered list 4,5,1,2. Then given the current word we have
- * just tried, we can go through the list and find the
- * next-biggest word to try (so if we just failed on word 2,
- * the next in the list is 4).
- *
- * Since at runtime we don't record the matching position in
- * the string for each word, we have to work that out for
- * each word we're about to process. The wordinfo table holds
- * the character length of each word; given that we recorded
- * at the start: the position of the shortest word and its
- * length in chars, we just need to move the pointer the
- * difference between the two char lengths. Depending on
- * Unicode status and folding, that's cheap or expensive.
- *
- * This algorithm is optimised for the case where are only a
- * small number of accept states, i.e. 0,1, or maybe 2.
- * With lots of accepts states, and having to try all of them,
- * it becomes quadratic on number of accept states to find all
- * the next words.
- */
-
- {
- /* what type of TRIE am I? (utf8 makes this contextual) */
- DECL_TRIE_TYPE(scan);
-
- /* what trie are we using right now */
- reg_trie_data * const trie
- = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
- HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
- U32 state = trie->startstate;
-
- if ( trie->bitmap
- && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
- {
- if (trie->states[ state ].wordnum) {
- DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s %smatched empty string...%s\n",
- REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
- );
- if (!trie->jump)
- break;
- } else {
- DEBUG_EXECUTE_r(
- PerlIO_printf(Perl_debug_log,
- "%*s %sfailed to match trie start class...%s\n",
- REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
- );
- sayNO_SILENT;
- }
- }
-
- {
- U8 *uc = ( U8* )locinput;
-
- STRLEN len = 0;
- STRLEN foldlen = 0;
- U8 *uscan = (U8*)NULL;
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
- U32 charcount = 0; /* how many input chars we have matched */
- U32 accepted = 0; /* have we seen any accepting states? */
-
- ST.jump = trie->jump;
- ST.me = scan;
- ST.firstpos = NULL;
- ST.longfold = FALSE; /* char longer if folded => it's harder */
- ST.nextword = 0;
-
- /* fully traverse the TRIE; note the position of the
- shortest accept state and the wordnum of the longest
- accept state */
-
- while ( state && uc <= (U8*)(reginfo->strend) ) {
- U32 base = trie->states[ state ].trans.base;
- UV uvc = 0;
- U16 charid = 0;
- U16 wordnum;
- wordnum = trie->states[ state ].wordnum;
-
- if (wordnum) { /* it's an accept state */
- if (!accepted) {
- accepted = 1;
- /* record first match position */
- if (ST.longfold) {
- ST.firstpos = (U8*)locinput;
- ST.firstchars = 0;
- }
- else {
- ST.firstpos = uc;
- ST.firstchars = charcount;
- }
- }
- if (!ST.nextword || wordnum < ST.nextword)
- ST.nextword = wordnum;
- ST.topword = wordnum;
- }
-
- DEBUG_TRIE_EXECUTE_r({
- DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
- PerlIO_printf( Perl_debug_log,
- "%*s %sState: %4"UVxf" Accepted: %c ",
- 2+depth * 2, "", PL_colors[4],
- (UV)state, (accepted ? 'Y' : 'N'));
- });
-
- /* read a char and goto next state */
- if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
- I32 offset;
- REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
- uscan, len, uvc, charid, foldlen,
- foldbuf, uniflags);
- charcount++;
- if (foldlen>0)
- ST.longfold = TRUE;
- if (charid &&
- ( ((offset =
- base + charid - 1 - trie->uniquecharcount)) >= 0)
-
- && ((U32)offset < trie->lasttrans)
- && trie->trans[offset].check == state)
- {
- state = trie->trans[offset].next;
- }
- else {
- state = 0;
- }
- uc += len;
-
- }
- else {
- state = 0;
- }
- DEBUG_TRIE_EXECUTE_r(
- PerlIO_printf( Perl_debug_log,
- "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
- charid, uvc, (UV)state, PL_colors[5] );
- );
- }
- if (!accepted)
- sayNO;
-
- /* calculate total number of accept states */
- {
- U16 w = ST.topword;
- accepted = 0;
- while (w) {
- w = trie->wordinfo[w].prev;
- accepted++;
- }
- ST.accepted = accepted;
- }
-
- DEBUG_EXECUTE_r(
- PerlIO_printf( Perl_debug_log,
- "%*s %sgot %"IVdf" possible matches%s\n",
- REPORT_CODE_OFF + depth * 2, "",
- PL_colors[4], (IV)ST.accepted, PL_colors[5] );
- );
- goto trie_first_try; /* jump into the fail handler */
- }}
- assert(0); /* NOTREACHED */
-
- case TRIE_next_fail: /* we failed - try next alternative */
+ case TRIEC: /* (ab|cd) with known charclass */
+ /* In this case the charclass data is available inline so
+ we can fail fast without a lot of extra overhead.
+ */
+ if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
+ DEBUG_EXECUTE_r(
+ PerlIO_printf(Perl_debug_log,
+ "%*s %sfailed to match trie start class...%s\n",
+ REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
+ );
+ sayNO_SILENT;
+ assert(0); /* NOTREACHED */
+ }
+ /* FALL THROUGH */
+ case TRIE: /* (ab|cd) */
+ /* the basic plan of execution of the trie is:
+ * At the beginning, run though all the states, and
+ * find the longest-matching word. Also remember the position
+ * of the shortest matching word. For example, this pattern:
+ * 1 2 3 4 5
+ * ab|a|x|abcd|abc
+ * when matched against the string "abcde", will generate
+ * accept states for all words except 3, with the longest
+ * matching word being 4, and the shortest being 2 (with
+ * the position being after char 1 of the string).
+ *
+ * Then for each matching word, in word order (i.e. 1,2,4,5),
+ * we run the remainder of the pattern; on each try setting
+ * the current position to the character following the word,
+ * returning to try the next word on failure.
+ *
+ * We avoid having to build a list of words at runtime by
+ * using a compile-time structure, wordinfo[].prev, which
+ * gives, for each word, the previous accepting word (if any).
+ * In the case above it would contain the mappings 1->2, 2->0,
+ * 3->0, 4->5, 5->1. We can use this table to generate, from
+ * the longest word (4 above), a list of all words, by
+ * following the list of prev pointers; this gives us the
+ * unordered list 4,5,1,2. Then given the current word we have
+ * just tried, we can go through the list and find the
+ * next-biggest word to try (so if we just failed on word 2,
+ * the next in the list is 4).
+ *
+ * Since at runtime we don't record the matching position in
+ * the string for each word, we have to work that out for
+ * each word we're about to process. The wordinfo table holds
+ * the character length of each word; given that we recorded
+ * at the start: the position of the shortest word and its
+ * length in chars, we just need to move the pointer the
+ * difference between the two char lengths. Depending on
+ * Unicode status and folding, that's cheap or expensive.
+ *
+ * This algorithm is optimised for the case where are only a
+ * small number of accept states, i.e. 0,1, or maybe 2.
+ * With lots of accepts states, and having to try all of them,
+ * it becomes quadratic on number of accept states to find all
+ * the next words.
+ */
+
+ {
+ /* what type of TRIE am I? (utf8 makes this contextual) */
+ DECL_TRIE_TYPE(scan);
+
+ /* what trie are we using right now */
+ reg_trie_data * const trie
+ = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
+ HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
+ U32 state = trie->startstate;
+
+ if ( trie->bitmap
+ && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
+ {
+ if (trie->states[ state ].wordnum) {
+ DEBUG_EXECUTE_r(
+ PerlIO_printf(Perl_debug_log,
+ "%*s %smatched empty string...%s\n",
+ REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
+ );
+ if (!trie->jump)
+ break;
+ } else {
+ DEBUG_EXECUTE_r(
+ PerlIO_printf(Perl_debug_log,
+ "%*s %sfailed to match trie start class...%s\n",
+ REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
+ );
+ sayNO_SILENT;
+ }
+ }
+
+ {
+ U8 *uc = ( U8* )locinput;
+
+ STRLEN len = 0;
+ STRLEN foldlen = 0;
+ U8 *uscan = (U8*)NULL;
+ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ U32 charcount = 0; /* how many input chars we have matched */
+ U32 accepted = 0; /* have we seen any accepting states? */
+
+ ST.jump = trie->jump;
+ ST.me = scan;
+ ST.firstpos = NULL;
+ ST.longfold = FALSE; /* char longer if folded => it's harder */
+ ST.nextword = 0;
+
+ /* fully traverse the TRIE; note the position of the
+ shortest accept state and the wordnum of the longest
+ accept state */
+
+ while ( state && uc <= (U8*)(reginfo->strend) ) {
+ U32 base = trie->states[ state ].trans.base;
+ UV uvc = 0;
+ U16 charid = 0;
+ U16 wordnum;
+ wordnum = trie->states[ state ].wordnum;
+
+ if (wordnum) { /* it's an accept state */
+ if (!accepted) {
+ accepted = 1;
+ /* record first match position */
+ if (ST.longfold) {
+ ST.firstpos = (U8*)locinput;
+ ST.firstchars = 0;
+ }
+ else {
+ ST.firstpos = uc;
+ ST.firstchars = charcount;
+ }
+ }
+ if (!ST.nextword || wordnum < ST.nextword)
+ ST.nextword = wordnum;
+ ST.topword = wordnum;
+ }
+
+ DEBUG_TRIE_EXECUTE_r({
+ DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
+ PerlIO_printf( Perl_debug_log,
+ "%*s %sState: %4"UVxf" Accepted: %c ",
+ 2+depth * 2, "", PL_colors[4],
+ (UV)state, (accepted ? 'Y' : 'N'));
+ });
+
+ /* read a char and goto next state */
+ if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
+ I32 offset;
+ REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
+ uscan, len, uvc, charid, foldlen,
+ foldbuf, uniflags);
+ charcount++;
+ if (foldlen>0)
+ ST.longfold = TRUE;
+ if (charid &&
+ ( ((offset =
+ base + charid - 1 - trie->uniquecharcount)) >= 0)
+
+ && ((U32)offset < trie->lasttrans)
+ && trie->trans[offset].check == state)
+ {
+ state = trie->trans[offset].next;
+ }
+ else {
+ state = 0;
+ }
+ uc += len;
+
+ }
+ else {
+ state = 0;
+ }
+ DEBUG_TRIE_EXECUTE_r(
+ PerlIO_printf( Perl_debug_log,
+ "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
+ charid, uvc, (UV)state, PL_colors[5] );
+ );
+ }
+ if (!accepted)
+ sayNO;
+
+ /* calculate total number of accept states */
+ {
+ U16 w = ST.topword;
+ accepted = 0;
+ while (w) {
+ w = trie->wordinfo[w].prev;
+ accepted++;
+ }
+ ST.accepted = accepted;
+ }
+
+ DEBUG_EXECUTE_r(
+ PerlIO_printf( Perl_debug_log,
+ "%*s %sgot %"IVdf" possible matches%s\n",
+ REPORT_CODE_OFF + depth * 2, "",
+ PL_colors[4], (IV)ST.accepted, PL_colors[5] );
+ );
+ goto trie_first_try; /* jump into the fail handler */
+ }}
+ assert(0); /* NOTREACHED */
+
+ case TRIE_next_fail: /* we failed - try next alternative */
+ {
+ U8 *uc;
+ if ( ST.jump) {
+ REGCP_UNWIND(ST.cp);
+ UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
+ }
+ if (!--ST.accepted) {
+ DEBUG_EXECUTE_r({
+ PerlIO_printf( Perl_debug_log,
+ "%*s %sTRIE failed...%s\n",
+ REPORT_CODE_OFF+depth*2, "",
+ PL_colors[4],
+ PL_colors[5] );
+ });
+ sayNO_SILENT;
+ }
+ {
+ /* Find next-highest word to process. Note that this code
+ * is O(N^2) per trie run (O(N) per branch), so keep tight */
+ U16 min = 0;
+ U16 word;
+ U16 const nextword = ST.nextword;
+ reg_trie_wordinfo * const wordinfo
+ = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
+ for (word=ST.topword; word; word=wordinfo[word].prev) {
+ if (word > nextword && (!min || word < min))
+ min = word;
+ }
+ ST.nextword = min;
+ }
+
+ trie_first_try:
+ if (do_cutgroup) {
+ do_cutgroup = 0;
+ no_final = 0;
+ }
+
+ if ( ST.jump) {
+ ST.lastparen = rex->lastparen;
+ ST.lastcloseparen = rex->lastcloseparen;
+ REGCP_SET(ST.cp);
+ }
+
+ /* find start char of end of current word */
+ {
+ U32 chars; /* how many chars to skip */
+ reg_trie_data * const trie
+ = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
+
+ assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
+ >= ST.firstchars);
+ chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
+ - ST.firstchars;
+ uc = ST.firstpos;
+
+ if (ST.longfold) {
+ /* the hard option - fold each char in turn and find
+ * its folded length (which may be different */
+ U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
+ STRLEN foldlen;
+ STRLEN len;
+ UV uvc;
+ U8 *uscan;
+
+ while (chars) {
+ if (utf8_target) {
+ uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
+ uniflags);
+ uc += len;
+ }
+ else {
+ uvc = *uc;
+ uc++;
+ }
+ uvc = to_uni_fold(uvc, foldbuf, &foldlen);
+ uscan = foldbuf;
+ while (foldlen) {
+ if (!--chars)
+ break;
+ uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
+ uniflags);
+ uscan += len;
+ foldlen -= len;
+ }
+ }
+ }
+ else {
+ if (utf8_target)
+ while (chars--)
+ uc += UTF8SKIP(uc);
+ else
+ uc += chars;
+ }
+ }
+
+ scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
+ ? ST.jump[ST.nextword]
+ : NEXT_OFF(ST.me));
+
+ DEBUG_EXECUTE_r({
+ PerlIO_printf( Perl_debug_log,
+ "%*s %sTRIE matched word #%d, continuing%s\n",
+ REPORT_CODE_OFF+depth*2, "",
+ PL_colors[4],
+ ST.nextword,
+ PL_colors[5]
+ );
+ });
+
+ if (ST.accepted > 1 || has_cutgroup) {
+ PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
+ assert(0); /* NOTREACHED */
+ }
+ /* only one choice left - just continue */
+ DEBUG_EXECUTE_r({
+ AV *const trie_words
+ = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
+ SV ** const tmp = av_fetch( trie_words,
+ ST.nextword-1, 0 );
+ SV *sv= tmp ? sv_newmortal() : NULL;
+
+ PerlIO_printf( Perl_debug_log,
+ "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
+ REPORT_CODE_OFF+depth*2, "", PL_colors[4],
+ ST.nextword,
+ tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
+ PL_colors[0], PL_colors[1],
+ (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
+ )
+ : "not compiled under -Dr",
+ PL_colors[5] );
+ });
+
+ locinput = (char*)uc;
+ continue; /* execute rest of RE */
+ assert(0); /* NOTREACHED */
+ }
+#undef ST
+
+ case EXACT: { /* /abc/ */
+ char *s = STRING(scan);
+ ln = STR_LEN(scan);
+ if (utf8_target != is_utf8_pat) {
+ /* The target and the pattern have differing utf8ness. */
+ char *l = locinput;
+ const char * const e = s + ln;
+
+ if (utf8_target) {
+ /* The target is utf8, the pattern is not utf8.
+ * Above-Latin1 code points can't match the pattern;
+ * invariants match exactly, and the other Latin1 ones need
+ * to be downgraded to a single byte in order to do the
+ * comparison. (If we could be confident that the target
+ * is not malformed, this could be refactored to have fewer
+ * tests by just assuming that if the first bytes match, it
+ * is an invariant, but there are tests in the test suite
+ * dealing with (??{...}) which violate this) */
+ while (s < e) {
+ if (l >= reginfo->strend
+ || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
+ {
+ sayNO;
+ }
+ if (UTF8_IS_INVARIANT(*(U8*)l)) {
+ if (*l != *s) {
+ sayNO;
+ }
+ l++;
+ }
+ else {
+ if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
+ sayNO;
+ }
+ l += 2;
+ }
+ s++;
+ }
+ }
+ else {
+ /* The target is not utf8, the pattern is utf8. */
+ while (s < e) {
+ if (l >= reginfo->strend
+ || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
+ {
+ sayNO;
+ }
+ if (UTF8_IS_INVARIANT(*(U8*)s)) {
+ if (*s != *l) {
+ sayNO;
+ }
+ s++;
+ }
+ else {
+ if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
+ sayNO;
+ }
+ s += 2;
+ }
+ l++;
+ }
+ }
+ locinput = l;
+ }
+ else {
+ /* The target and the pattern have the same utf8ness. */
+ /* Inline the first character, for speed. */
+ if (reginfo->strend - locinput < ln
+ || UCHARAT(s) != nextchr
+ || (ln > 1 && memNE(s, locinput, ln)))
+ {
+ sayNO;
+ }
+ locinput += ln;
+ }
+ break;
+ }
+
+ case EXACTFL: { /* /abc/il */
+ re_fold_t folder;
+ const U8 * fold_array;
+ const char * s;
+ U32 fold_utf8_flags;
+
+ RX_MATCH_TAINTED_on(reginfo->prog);
+ folder = foldEQ_locale;
+ fold_array = PL_fold_locale;
+ fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
+ goto do_exactf;
+
+ case EXACTFU_SS: /* /\x{df}/iu */
+ case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
+ case EXACTFU: /* /abc/iu */
+ folder = foldEQ_latin1;
+ fold_array = PL_fold_latin1;
+ fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
+ goto do_exactf;
+
+ case EXACTFA: /* /abc/iaa */
+ folder = foldEQ_latin1;
+ fold_array = PL_fold_latin1;
+ fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
+ goto do_exactf;
+
+ case EXACTF: /* /abc/i */
+ folder = foldEQ;
+ fold_array = PL_fold;
+ fold_utf8_flags = 0;
+
+ do_exactf:
+ s = STRING(scan);
+ ln = STR_LEN(scan);
+
+ if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
+ /* Either target or the pattern are utf8, or has the issue where
+ * the fold lengths may differ. */
+ const char * const l = locinput;
+ char *e = reginfo->strend;
+
+ if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
+ l, &e, 0, utf8_target, fold_utf8_flags))
+ {
+ sayNO;
+ }
+ locinput = e;
+ break;
+ }
+
+ /* Neither the target nor the pattern are utf8 */
+ if (UCHARAT(s) != nextchr
+ && !NEXTCHR_IS_EOS
+ && UCHARAT(s) != fold_array[nextchr])
+ {
+ sayNO;
+ }
+ if (reginfo->strend - locinput < ln)
+ sayNO;
+ if (ln > 1 && ! folder(s, locinput, ln))
+ sayNO;
+ locinput += ln;
+ break;
+ }
+
+ /* XXX Could improve efficiency by separating these all out using a
+ * macro or in-line function. At that point regcomp.c would no longer
+ * have to set the FLAGS fields of these */
+ case BOUNDL: /* /\b/l */
+ case NBOUNDL: /* /\B/l */
+ RX_MATCH_TAINTED_on(reginfo->prog);
+ /* FALL THROUGH */
+ case BOUND: /* /\b/ */
+ case BOUNDU: /* /\b/u */
+ case BOUNDA: /* /\b/a */
+ case NBOUND: /* /\B/ */
+ case NBOUNDU: /* /\B/u */
+ case NBOUNDA: /* /\B/a */
+ /* was last char in word? */
+ if (utf8_target
+ && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
+ && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
+ {
+ if (locinput == reginfo->strbeg)
+ ln = '\n';
+ else {
+ const U8 * const r =
+ reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
+
+ ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
+ }
+ if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
+ ln = isWORDCHAR_uni(ln);
+ if (NEXTCHR_IS_EOS)
+ n = 0;
+ else {
+ LOAD_UTF8_CHARCLASS_ALNUM();
+ n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
+ utf8_target);
+ }
+ }
+ else {
+ ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
+ n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
+ }
+ }
+ else {
+
+ /* Here the string isn't utf8, or is utf8 and only ascii
+ * characters are to match \w. In the latter case looking at
+ * the byte just prior to the current one may be just the final
+ * byte of a multi-byte character. This is ok. There are two
+ * cases:
+ * 1) it is a single byte character, and then the test is doing
+ * just what it's supposed to.
+ * 2) it is a multi-byte character, in which case the final
+ * byte is never mistakable for ASCII, and so the test
+ * will say it is not a word character, which is the
+ * correct answer. */
+ ln = (locinput != reginfo->strbeg) ?
+ UCHARAT(locinput - 1) : '\n';
+ switch (FLAGS(scan)) {
+ case REGEX_UNICODE_CHARSET:
+ ln = isWORDCHAR_L1(ln);
+ n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
+ break;
+ case REGEX_LOCALE_CHARSET:
+ ln = isWORDCHAR_LC(ln);
+ n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ ln = isWORDCHAR(ln);
+ n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ ln = isWORDCHAR_A(ln);
+ n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
+ break;
+ default:
+ Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
+ break;
+ }
+ }
+ /* Note requires that all BOUNDs be lower than all NBOUNDs in
+ * regcomp.sym */
+ if (((!ln) == (!n)) == (OP(scan) < NBOUND))
+ sayNO;
+ break;
+
+ case ANYOF: /* /[abc]/ */
+ case ANYOF_WARN_SUPER:
+ if (NEXTCHR_IS_EOS)
+ sayNO;
+ if (utf8_target) {
+ if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
+ sayNO;
+ locinput += UTF8SKIP(locinput);
+ }
+ else {
+ if (!REGINCLASS(rex, scan, (U8*)locinput))
+ sayNO;
+ locinput++;
+ }
+ break;
+
+ /* The argument (FLAGS) to all the POSIX node types is the class number
+ * */
+
+ case NPOSIXL: /* \W or [:^punct:] etc. under /l */
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXL: /* \w or [:punct:] etc. under /l */
+ if (NEXTCHR_IS_EOS)
+ sayNO;
+
+ /* The locale hasn't influenced the outcome before this, so defer
+ * tainting until now */
+ RX_MATCH_TAINTED_on(reginfo->prog);
+
+ /* Use isFOO_lc() for characters within Latin1. (Note that
+ * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
+ * wouldn't be invariant) */
+ if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
+ if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
+ sayNO;
+ }
+ }
+ else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
+ if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
+ (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
+ *(locinput + 1))))))
+ {
+ sayNO;
+ }
+ }
+ else { /* Here, must be an above Latin-1 code point */
+ goto utf8_posix_not_eos;
+ }
+
+ /* Here, must be utf8 */
+ locinput += UTF8SKIP(locinput);
+ break;
+
+ case NPOSIXD: /* \W or [:^punct:] etc. under /d */
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXD: /* \w or [:punct:] etc. under /d */
+ if (utf8_target) {
+ goto utf8_posix;
+ }
+ goto posixa;
+
+ case NPOSIXA: /* \W or [:^punct:] etc. under /a */
+
+ if (NEXTCHR_IS_EOS) {
+ sayNO;
+ }
+
+ /* All UTF-8 variants match */
+ if (! UTF8_IS_INVARIANT(nextchr)) {
+ goto increment_locinput;
+ }
+
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXA: /* \w or [:punct:] etc. under /a */
+
+ posixa:
+ /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
+ * UTF-8, and also from NPOSIXA even in UTF-8 when the current
+ * character is a single byte */
+
+ if (NEXTCHR_IS_EOS
+ || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
+ FLAGS(scan)))))
+ {
+ sayNO;
+ }
+
+ /* Here we are either not in utf8, or we matched a utf8-invariant,
+ * so the next char is the next byte */
+ locinput++;
+ break;
+
+ case NPOSIXU: /* \W or [:^punct:] etc. under /u */
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case POSIXU: /* \w or [:punct:] etc. under /u */
+ utf8_posix:
+ if (NEXTCHR_IS_EOS) {
+ sayNO;
+ }
+ utf8_posix_not_eos:
+
+ /* Use _generic_isCC() for characters within Latin1. (Note that
+ * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
+ * wouldn't be invariant) */
+ if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
+ if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
+ FLAGS(scan)))))
+ {
+ sayNO;
+ }
+ locinput++;
+ }
+ else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
+ if (! (to_complement
+ ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
+ *(locinput + 1)),
+ FLAGS(scan)))))
+ {
+ sayNO;
+ }
+ locinput += 2;
+ }
+ else { /* Handle above Latin-1 code points */
+ classnum = (_char_class_number) FLAGS(scan);
+ if (classnum < _FIRST_NON_SWASH_CC) {
+
+ /* Here, uses a swash to find such code points. Load if if
+ * not done already */
+ if (! PL_utf8_swash_ptrs[classnum]) {
+ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+ PL_utf8_swash_ptrs[classnum]
+ = _core_swash_init("utf8",
+ swash_property_names[classnum],
+ &PL_sv_undef, 1, 0, NULL, &flags);
+ }
+ if (! (to_complement
+ ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
+ (U8 *) locinput, TRUE))))
+ {
+ sayNO;
+ }
+ }
+ else { /* Here, uses macros to find above Latin-1 code points */
+ switch (classnum) {
+ case _CC_ENUM_SPACE: /* XXX would require separate
+ code if we revert the change
+ of \v matching this */
+ case _CC_ENUM_PSXSPC:
+ if (! (to_complement
+ ^ cBOOL(is_XPERLSPACE_high(locinput))))
+ {
+ sayNO;
+ }
+ break;
+ case _CC_ENUM_BLANK:
+ if (! (to_complement
+ ^ cBOOL(is_HORIZWS_high(locinput))))
+ {
+ sayNO;
+ }
+ break;
+ case _CC_ENUM_XDIGIT:
+ if (! (to_complement
+ ^ cBOOL(is_XDIGIT_high(locinput))))
+ {
+ sayNO;
+ }
+ break;
+ case _CC_ENUM_VERTSPACE:
+ if (! (to_complement
+ ^ cBOOL(is_VERTWS_high(locinput))))
+ {
+ sayNO;
+ }
+ break;
+ default: /* The rest, e.g. [:cntrl:], can't match
+ above Latin1 */
+ if (! to_complement) {
+ sayNO;
+ }
+ break;
+ }
+ }
+ locinput += UTF8SKIP(locinput);
+ }
+ break;
+
+ case CLUMP: /* Match \X: logical Unicode character. This is defined as
+ a Unicode extended Grapheme Cluster */
+ /* From http://www.unicode.org/reports/tr29 (5.2 version). An
+ extended Grapheme Cluster is:
+
+ CR LF
+ | Prepend* Begin Extend*
+ | .
+
+ Begin is: ( Special_Begin | ! Control )
+ Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
+ Extend is: ( Grapheme_Extend | Spacing_Mark )
+ Control is: [ GCB_Control | CR | LF ]
+ Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
+
+ If we create a 'Regular_Begin' = Begin - Special_Begin, then
+ we can rewrite
+
+ Begin is ( Regular_Begin + Special Begin )
+
+ It turns out that 98.4% of all Unicode code points match
+ Regular_Begin. Doing it this way eliminates a table match in
+ the previous implementation for almost all Unicode code points.
+
+ There is a subtlety with Prepend* which showed up in testing.
+ Note that the Begin, and only the Begin is required in:
+ | Prepend* Begin Extend*
+ Also, Begin contains '! Control'. A Prepend must be a
+ '! Control', which means it must also be a Begin. What it
+ comes down to is that if we match Prepend* and then find no
+ suitable Begin afterwards, that if we backtrack the last
+ Prepend, that one will be a suitable Begin.
+ */
+
+ if (NEXTCHR_IS_EOS)
+ sayNO;
+ if (! utf8_target) {
+
+ /* Match either CR LF or '.', as all the other possibilities
+ * require utf8 */
+ locinput++; /* Match the . or CR */
+ if (nextchr == '\r' /* And if it was CR, and the next is LF,
+ match the LF */
+ && locinput < reginfo->strend
+ && UCHARAT(locinput) == '\n')
+ {
+ locinput++;
+ }
+ }
+ else {
+
+ /* Utf8: See if is ( CR LF ); already know that locinput <
+ * reginfo->strend, so locinput+1 is in bounds */
+ if ( nextchr == '\r' && locinput+1 < reginfo->strend
+ && UCHARAT(locinput + 1) == '\n')
+ {
+ locinput += 2;
+ }
+ else {
+ STRLEN len;
+
+ /* In case have to backtrack to beginning, then match '.' */
+ char *starting = locinput;
+
+ /* In case have to backtrack the last prepend */
+ char *previous_prepend = NULL;
+
+ LOAD_UTF8_CHARCLASS_GCB();
+
+ /* Match (prepend)* */
+ while (locinput < reginfo->strend
+ && (len = is_GCB_Prepend_utf8(locinput)))
+ {
+ previous_prepend = locinput;
+ locinput += len;
+ }
+
+ /* As noted above, if we matched a prepend character, but
+ * the next thing won't match, back off the last prepend we
+ * matched, as it is guaranteed to match the begin */
+ if (previous_prepend
+ && (locinput >= reginfo->strend
+ || (! swash_fetch(PL_utf8_X_regular_begin,
+ (U8*)locinput, utf8_target)
+ && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
+ )
+ {
+ locinput = previous_prepend;
+ }
+
+ /* Note that here we know reginfo->strend > locinput, as we
+ * tested that upon input to this switch case, and if we
+ * moved locinput forward, we tested the result just above
+ * and it either passed, or we backed off so that it will
+ * now pass */
+ if (swash_fetch(PL_utf8_X_regular_begin,
+ (U8*)locinput, utf8_target)) {
+ locinput += UTF8SKIP(locinput);
+ }
+ else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
+
+ /* Here did not match the required 'Begin' in the
+ * second term. So just match the very first
+ * character, the '.' of the final term of the regex */
+ locinput = starting + UTF8SKIP(starting);
+ goto exit_utf8;
+ } else {
+
+ /* Here is a special begin. It can be composed of
+ * several individual characters. One possibility is
+ * RI+ */
+ if ((len = is_GCB_RI_utf8(locinput))) {
+ locinput += len;
+ while (locinput < reginfo->strend
+ && (len = is_GCB_RI_utf8(locinput)))
+ {
+ locinput += len;
+ }
+ } else if ((len = is_GCB_T_utf8(locinput))) {
+ /* Another possibility is T+ */
+ locinput += len;
+ while (locinput < reginfo->strend
+ && (len = is_GCB_T_utf8(locinput)))
+ {
+ locinput += len;
+ }
+ } else {
+
+ /* Here, neither RI+ nor T+; must be some other
+ * Hangul. That means it is one of the others: L,
+ * LV, LVT or V, and matches:
+ * L* (L | LVT T* | V * V* T* | LV V* T*) */
+
+ /* Match L* */
+ while (locinput < reginfo->strend
+ && (len = is_GCB_L_utf8(locinput)))
+ {
+ locinput += len;
+ }
+
+ /* Here, have exhausted L*. If the next character
+ * is not an LV, LVT nor V, it means we had to have
+ * at least one L, so matches L+ in the original
+ * equation, we have a complete hangul syllable.
+ * Are done. */
+
+ if (locinput < reginfo->strend
+ && is_GCB_LV_LVT_V_utf8(locinput))
+ {
+ /* Otherwise keep going. Must be LV, LVT or V.
+ * See if LVT, by first ruling out V, then LV */
+ if (! is_GCB_V_utf8(locinput)
+ /* All but every TCount one is LV */
+ && (valid_utf8_to_uvchr((U8 *) locinput,
+ NULL)
+ - SBASE)
+ % TCount != 0)