- defchar: {
- STRLEN len = 0;
- UV ender;
- char *p;
- char *s;
-#define MAX_NODE_STRING_SIZE 127
- char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
- char *s0;
- U8 upper_parse = MAX_NODE_STRING_SIZE;
- STRLEN foldlen;
- U8 node_type;
- bool next_is_quantifier;
- char * oldp = NULL;
-
- /* If a folding node contains only code points that don't
- * participate in folds, it can be changed into an EXACT node,
- * which allows the optimizer more things to look for */
- bool maybe_exact;
-
- ender = 0;
- node_type = compute_EXACTish(pRExC_state);
- ret = reg_node(pRExC_state, node_type);
-
- /* In pass1, folded, we use a temporary buffer instead of the
- * actual node, as the node doesn't exist yet */
- s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
-
- s0 = s;
-
- reparse:
-
- /* We do the EXACTFish to EXACT node only if folding, and not if in
- * locale, as whether a character folds or not isn't known until
- * runtime */
- maybe_exact = FOLD && ! LOC;
-
- /* XXX The node can hold up to 255 bytes, yet this only goes to
- * 127. I (khw) do not know why. Keeping it somewhat less than
- * 255 allows us to not have to worry about overflow due to
- * converting to utf8 and fold expansion, but that value is
- * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
- * split up by this limit into a single one using the real max of
- * 255. Even at 127, this breaks under rare circumstances. If
- * folding, we do not want to split a node at a character that is a
- * non-final in a multi-char fold, as an input string could just
- * happen to want to match across the node boundary. The join
- * would solve that problem if the join actually happens. But a
- * series of more than two nodes in a row each of 127 would cause
- * the first join to succeed to get to 254, but then there wouldn't
- * be room for the next one, which could at be one of those split
- * multi-char folds. I don't know of any fool-proof solution. One
- * could back off to end with only a code point that isn't such a
- * non-final, but it is possible for there not to be any in the
- * entire node. */
- for (p = RExC_parse - 1;
- len < upper_parse && p < RExC_end;
- len++)
- {
- oldp = p;
-
- if (RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite( pRExC_state, p );
- switch ((U8)*p) {
- case '^':
- case '$':
- case '.':
- case '[':
- case '(':
- case ')':
- case '|':
- goto loopdone;
- case '\\':
- /* Literal Escapes Switch
-
- This switch is meant to handle escape sequences that
- resolve to a literal character.
-
- Every escape sequence that represents something
- else, like an assertion or a char class, is handled
- in the switch marked 'Special Escapes' above in this
- routine, but also has an entry here as anything that
- isn't explicitly mentioned here will be treated as
- an unescaped equivalent literal.
- */
-
- switch ((U8)*++p) {
- /* These are all the special escapes. */
- case 'A': /* Start assertion */
- case 'b': case 'B': /* Word-boundary assertion*/
- case 'C': /* Single char !DANGEROUS! */
- case 'd': case 'D': /* digit class */
- case 'g': case 'G': /* generic-backref, pos assertion */
- case 'h': case 'H': /* HORIZWS */
- case 'k': case 'K': /* named backref, keep marker */
- case 'p': case 'P': /* Unicode property */
- case 'R': /* LNBREAK */
- case 's': case 'S': /* space class */
- case 'v': case 'V': /* VERTWS */
- case 'w': case 'W': /* word class */
- case 'X': /* eXtended Unicode "combining character sequence" */
- case 'z': case 'Z': /* End of line/string assertion */
- --p;
- goto loopdone;
-
- /* Anything after here is an escape that resolves to a
- literal. (Except digits, which may or may not)
- */
- case 'n':
- ender = '\n';
- p++;
- break;
- case 'N': /* Handle a single-code point named character. */
- /* The options cause it to fail if a multiple code
- * point sequence. Handle those in the switch() above
- * */
- RExC_parse = p + 1;
- if (! grok_bslash_N(pRExC_state, NULL, &ender,
- flagp, depth, FALSE,
- FALSE /* not strict */ ))
- {
- if (*flagp & RESTART_UTF8)
- FAIL("panic: grok_bslash_N set RESTART_UTF8");
- RExC_parse = p = oldp;
- goto loopdone;
- }
- p = RExC_parse;
- if (ender > 0xff) {
- REQUIRE_UTF8;
- }
- break;
- case 'r':
- ender = '\r';
- p++;
- break;
- case 't':
- ender = '\t';
- p++;
- break;
- case 'f':
- ender = '\f';
- p++;
- break;
- case 'e':
- ender = ASCII_TO_NATIVE('\033');
- p++;
- break;
- case 'a':
- ender = ASCII_TO_NATIVE('\007');
- p++;
- break;
- case 'o':
- {
- UV result;
- const char* error_msg;
-
- bool valid = grok_bslash_o(&p,
- &result,
- &error_msg,
- TRUE, /* out warnings */
- FALSE, /* not strict */
- TRUE, /* Output warnings
- for non-
- portables */
- UTF);
- if (! valid) {
- RExC_parse = p; /* going to die anyway; point
- to exact spot of failure */
- vFAIL(error_msg);
- }
- ender = result;
- if (PL_encoding && ender < 0x100) {
- goto recode_encoding;
- }
- if (ender > 0xff) {
- REQUIRE_UTF8;
- }
- break;
- }
- case 'x':
- {
- UV result = UV_MAX; /* initialize to erroneous
- value */
- const char* error_msg;
-
- bool valid = grok_bslash_x(&p,
- &result,
- &error_msg,
- TRUE, /* out warnings */
- FALSE, /* not strict */
- TRUE, /* Output warnings
- for non-
- portables */
- UTF);
- if (! valid) {
- RExC_parse = p; /* going to die anyway; point
- to exact spot of failure */
- vFAIL(error_msg);
- }
- ender = result;
-
- if (PL_encoding && ender < 0x100) {
- goto recode_encoding;
- }
- if (ender > 0xff) {
- REQUIRE_UTF8;
- }
- break;
- }
- case 'c':
- p++;
- ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
- break;
- case '0': case '1': case '2': case '3':case '4':
- case '5': case '6': case '7':
- if (*p == '0' ||
- (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
- {
- I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
- STRLEN numlen = 3;
- ender = grok_oct(p, &numlen, &flags, NULL);
- if (ender > 0xff) {
- REQUIRE_UTF8;
- }
- p += numlen;
- if (SIZE_ONLY /* like \08, \178 */
- && numlen < 3
- && p < RExC_end
- && isDIGIT(*p) && ckWARN(WARN_REGEXP))
- {
- reg_warn_non_literal_string(
- p + 1,
- form_short_octal_warning(p, numlen));
- }
- }
- else { /* Not to be treated as an octal constant, go
- find backref */
- --p;
- goto loopdone;
- }
- if (PL_encoding && ender < 0x100)
- goto recode_encoding;
- break;
- recode_encoding:
- if (! RExC_override_recoding) {
- SV* enc = PL_encoding;
- ender = reg_recode((const char)(U8)ender, &enc);
- if (!enc && SIZE_ONLY)
- ckWARNreg(p, "Invalid escape in the specified encoding");
- REQUIRE_UTF8;
- }
- break;
- case '\0':
- if (p >= RExC_end)
- FAIL("Trailing \\");
- /* FALL THROUGH */
- default:
- if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
- /* Include any { following the alpha to emphasize
- * that it could be part of an escape at some point
- * in the future */
- int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
- ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
- }
- goto normal_default;
- } /* End of switch on '\' */
- break;
- default: /* A literal character */
-
- if (! SIZE_ONLY
- && RExC_flags & RXf_PMf_EXTENDED
- && ckWARN(WARN_DEPRECATED)
- && is_PATWS_non_low(p, UTF))
- {
- vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
- "Escape literal pattern white space under /x");
- }
-
- normal_default:
- if (UTF8_IS_START(*p) && UTF) {
- STRLEN numlen;
- ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
- &numlen, UTF8_ALLOW_DEFAULT);
- p += numlen;
- }
- else
- ender = (U8) *p++;
- break;
- } /* End of switch on the literal */
-
- /* Here, have looked at the literal character and <ender>
- * contains its ordinal, <p> points to the character after it
- */
-
- if ( RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite( pRExC_state, p );
-
- /* If the next thing is a quantifier, it applies to this
- * character only, which means that this character has to be in
- * its own node and can't just be appended to the string in an
- * existing node, so if there are already other characters in
- * the node, close the node with just them, and set up to do
- * this character again next time through, when it will be the
- * only thing in its new node */
- if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
- {
- p = oldp;
- goto loopdone;
- }
-
- if (FOLD) {
- if (UTF
- /* See comments for join_exact() as to why we fold
- * this non-UTF at compile time */
- || (node_type == EXACTFU
- && ender == LATIN_SMALL_LETTER_SHARP_S))
- {
-
-
- /* Prime the casefolded buffer. Locale rules, which
- * apply only to code points < 256, aren't known until
- * execution, so for them, just output the original
- * character using utf8. If we start to fold non-UTF
- * patterns, be sure to update join_exact() */
- if (LOC && ender < 256) {
- if (UNI_IS_INVARIANT(ender)) {
- *s = (U8) ender;
- foldlen = 1;
- } else {
- *s = UTF8_TWO_BYTE_HI(ender);
- *(s + 1) = UTF8_TWO_BYTE_LO(ender);
- foldlen = 2;
- }
- }
- else {
- UV folded = _to_uni_fold_flags(
- ender,
- (U8 *) s,
- &foldlen,
- FOLD_FLAGS_FULL
- | ((LOC) ? FOLD_FLAGS_LOCALE
- : (ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0)
- );
-
- /* If this node only contains non-folding code
- * points so far, see if this new one is also
- * non-folding */
- if (maybe_exact) {
- if (folded != ender) {
- maybe_exact = FALSE;
- }
- else {
- /* Here the fold is the original; we have
- * to check further to see if anything
- * folds to it */
- if (! PL_utf8_foldable) {
- SV* swash = swash_init("utf8",
- "_Perl_Any_Folds",
- &PL_sv_undef, 1, 0);
- PL_utf8_foldable =
- _get_swash_invlist(swash);
- SvREFCNT_dec_NN(swash);
- }
- if (_invlist_contains_cp(PL_utf8_foldable,
- ender))
- {
- maybe_exact = FALSE;
- }
- }
- }
- ender = folded;
- }
- s += foldlen;
-
- /* The loop increments <len> each time, as all but this
- * path (and the one just below for UTF) through it add
- * a single byte to the EXACTish node. But this one
- * has changed len to be the correct final value, so
- * subtract one to cancel out the increment that
- * follows */
- len += foldlen - 1;
- }
- else {
- *(s++) = (char) ender;
- maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
- }
- }
- else if (UTF) {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
-
- /* See comment just above for - 1 */
- len--;
- }
- else {
- REGC((char)ender, s++);
- }
-
- if (next_is_quantifier) {
-
- /* Here, the next input is a quantifier, and to get here,
- * the current character is the only one in the node.
- * Also, here <len> doesn't include the final byte for this
- * character */
- len++;
- goto loopdone;
- }
-
- } /* End of loop through literal characters */
-
- /* Here we have either exhausted the input or ran out of room in
- * the node. (If we encountered a character that can't be in the
- * node, transfer is made directly to <loopdone>, and so we
- * wouldn't have fallen off the end of the loop.) In the latter
- * case, we artificially have to split the node into two, because
- * we just don't have enough space to hold everything. This
- * creates a problem if the final character participates in a
- * multi-character fold in the non-final position, as a match that
- * should have occurred won't, due to the way nodes are matched,
- * and our artificial boundary. So back off until we find a non-
- * problematic character -- one that isn't at the beginning or
- * middle of such a fold. (Either it doesn't participate in any
- * folds, or appears only in the final position of all the folds it
- * does participate in.) A better solution with far fewer false
- * positives, and that would fill the nodes more completely, would
- * be to actually have available all the multi-character folds to
- * test against, and to back-off only far enough to be sure that
- * this node isn't ending with a partial one. <upper_parse> is set
- * further below (if we need to reparse the node) to include just
- * up through that final non-problematic character that this code
- * identifies, so when it is set to less than the full node, we can
- * skip the rest of this */
- if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
-
- const STRLEN full_len = len;
-
- assert(len >= MAX_NODE_STRING_SIZE);
-
- /* Here, <s> points to the final byte of the final character.
- * Look backwards through the string until find a non-
- * problematic character */
-
- if (! UTF) {
-
- /* These two have no multi-char folds to non-UTF characters
- */
- if (ASCII_FOLD_RESTRICTED || LOC) {
- goto loopdone;
- }
-
- while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
- len = s - s0 + 1;
- }
- else {
- if (! PL_NonL1NonFinalFold) {
- PL_NonL1NonFinalFold = _new_invlist_C_array(
- NonL1_Perl_Non_Final_Folds_invlist);
- }
-
- /* Point to the first byte of the final character */
- s = (char *) utf8_hop((U8 *) s, -1);
-
- while (s >= s0) { /* Search backwards until find
- non-problematic char */
- if (UTF8_IS_INVARIANT(*s)) {
-
- /* There are no ascii characters that participate
- * in multi-char folds under /aa. In EBCDIC, the
- * non-ascii invariants are all control characters,
- * so don't ever participate in any folds. */
- if (ASCII_FOLD_RESTRICTED
- || ! IS_NON_FINAL_FOLD(*s))
- {
- break;
- }
- }
- else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-
- /* No Latin1 characters participate in multi-char
- * folds under /l */
- if (LOC
- || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
- *s, *(s+1))))
- {
- break;
- }
- }
- else if (! _invlist_contains_cp(
- PL_NonL1NonFinalFold,
- valid_utf8_to_uvchr((U8 *) s, NULL)))
- {
- break;
- }
-
- /* Here, the current character is problematic in that
- * it does occur in the non-final position of some
- * fold, so try the character before it, but have to
- * special case the very first byte in the string, so
- * we don't read outside the string */
- s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
- } /* End of loop backwards through the string */
-
- /* If there were only problematic characters in the string,
- * <s> will point to before s0, in which case the length
- * should be 0, otherwise include the length of the
- * non-problematic character just found */
- len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
- }
-
- /* Here, have found the final character, if any, that is
- * non-problematic as far as ending the node without splitting
- * it across a potential multi-char fold. <len> contains the
- * number of bytes in the node up-to and including that
- * character, or is 0 if there is no such character, meaning
- * the whole node contains only problematic characters. In
- * this case, give up and just take the node as-is. We can't
- * do any better */
- if (len == 0) {
- len = full_len;
- } else {
-
- /* Here, the node does contain some characters that aren't
- * problematic. If one such is the final character in the
- * node, we are done */
- if (len == full_len) {
- goto loopdone;
- }
- else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
-
- /* If the final character is problematic, but the
- * penultimate is not, back-off that last character to
- * later start a new node with it */
- p = oldp;
- goto loopdone;
- }
-
- /* Here, the final non-problematic character is earlier
- * in the input than the penultimate character. What we do
- * is reparse from the beginning, going up only as far as
- * this final ok one, thus guaranteeing that the node ends
- * in an acceptable character. The reason we reparse is
- * that we know how far in the character is, but we don't
- * know how to correlate its position with the input parse.
- * An alternate implementation would be to build that
- * correlation as we go along during the original parse,
- * but that would entail extra work for every node, whereas
- * this code gets executed only when the string is too
- * large for the node, and the final two characters are
- * problematic, an infrequent occurrence. Yet another
- * possible strategy would be to save the tail of the
- * string, and the next time regatom is called, initialize
- * with that. The problem with this is that unless you
- * back off one more character, you won't be guaranteed
- * regatom will get called again, unless regbranch,
- * regpiece ... are also changed. If you do back off that
- * extra character, so that there is input guaranteed to
- * force calling regatom, you can't handle the case where
- * just the first character in the node is acceptable. I
- * (khw) decided to try this method which doesn't have that
- * pitfall; if performance issues are found, we can do a
- * combination of the current approach plus that one */
- upper_parse = len;
- len = 0;
- s = s0;
- goto reparse;
- }
- } /* End of verifying node ends with an appropriate char */
-
- loopdone: /* Jumped to when encounters something that shouldn't be in
- the node */
-
- /* If 'maybe_exact' is still set here, means there are no
- * code points in the node that participate in folds */
- if (FOLD && maybe_exact) {
- OP(ret) = EXACT;
- }
-
- /* I (khw) don't know if you can get here with zero length, but the
- * old code handled this situation by creating a zero-length EXACT
- * node. Might as well be NOTHING instead */
- if (len == 0) {
- OP(ret) = NOTHING;
- }
- else{
- alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
- }
-
- RExC_parse = p - 1;
- Set_Node_Cur_Length(ret); /* MJD */
- nextchar(pRExC_state);
- {
- /* len is STRLEN which is unsigned, need to copy to signed */
- IV iv = len;
- if (iv < 0)
- vFAIL("Internal disaster");
- }
-
- } /* End of label 'defchar:' */
- break;
- } /* End of giant switch on input character */
-
- return(ret);