- nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
-
- if (!SIZE_ONLY && POSIXCC(nextvalue))
- checkposixcc(pRExC_state);
-
- /* allow 1st char to be ] (allowing it to be - is dealt with later) */
- if (UCHARAT(RExC_parse) == ']')
- goto charclassloop;
-
-parseit:
- while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
-
- charclassloop:
-
- namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
-
- if (!range) {
- rangebegin = RExC_parse;
- element_count++;
- }
- if (UTF) {
- value = utf8n_to_uvchr((U8*)RExC_parse,
- RExC_end - RExC_parse,
- &numlen, UTF8_ALLOW_DEFAULT);
- RExC_parse += numlen;
- }
- else
- value = UCHARAT(RExC_parse++);
-
- nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
- if (value == '[' && POSIXCC(nextvalue))
- namedclass = regpposixcc(pRExC_state, value);
- else if (value == '\\') {
- if (UTF) {
- value = utf8n_to_uvchr((U8*)RExC_parse,
- RExC_end - RExC_parse,
- &numlen, UTF8_ALLOW_DEFAULT);
- RExC_parse += numlen;
- }
- else
- value = UCHARAT(RExC_parse++);
- /* Some compilers cannot handle switching on 64-bit integer
- * values, therefore value cannot be an UV. Yes, this will
- * be a problem later if we want switch on Unicode.
- * A similar issue a little bit later when switching on
- * namedclass. --jhi */
- switch ((I32)value) {
- case 'w': namedclass = ANYOF_ALNUM; break;
- case 'W': namedclass = ANYOF_NALNUM; break;
- case 's': namedclass = ANYOF_SPACE; break;
- case 'S': namedclass = ANYOF_NSPACE; break;
- case 'd': namedclass = ANYOF_DIGIT; break;
- case 'D': namedclass = ANYOF_NDIGIT; break;
- case 'v': namedclass = ANYOF_VERTWS; break;
- case 'V': namedclass = ANYOF_NVERTWS; break;
- case 'h': namedclass = ANYOF_HORIZWS; break;
- case 'H': namedclass = ANYOF_NHORIZWS; break;
- case 'N': /* Handle \N{NAME} in class */
- {
- /* We only pay attention to the first char of
- multichar strings being returned. I kinda wonder
- if this makes sense as it does change the behaviour
- from earlier versions, OTOH that behaviour was broken
- as well. */
- UV v; /* value is register so we cant & it /grrr */
- if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
- goto parseit;
- }
- value= v;
- }
- break;
- case 'p':
- case 'P':
- {
- char *e;
- if (RExC_parse >= RExC_end)
- vFAIL2("Empty \\%c{}", (U8)value);
- if (*RExC_parse == '{') {
- const U8 c = (U8)value;
- e = strchr(RExC_parse++, '}');
- if (!e)
- vFAIL2("Missing right brace on \\%c{}", c);
- while (isSPACE(UCHARAT(RExC_parse)))
- RExC_parse++;
- if (e == RExC_parse)
- vFAIL2("Empty \\%c{}", c);
- n = e - RExC_parse;
- while (isSPACE(UCHARAT(RExC_parse + n - 1)))
- n--;
- }
- else {
- e = RExC_parse;
- n = 1;
- }
- if (!SIZE_ONLY) {
- SV** invlistsvp;
- SV* invlist;
- char* name;
- if (UCHARAT(RExC_parse) == '^') {
- RExC_parse++;
- n--;
- value = value == 'p' ? 'P' : 'p'; /* toggle */
- while (isSPACE(UCHARAT(RExC_parse))) {
- RExC_parse++;
- n--;
- }
- }
- /* Try to get the definition of the property into
- * <invlist>. If /i is in effect, the effective property
- * will have its name be <__NAME_i>. The design is
- * discussed in commit
- * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
- Newx(name, n + sizeof("_i__\n"), char);
-
- sprintf(name, "%s%.*s%s\n",
- (FOLD) ? "__" : "",
- (int)n,
- RExC_parse,
- (FOLD) ? "_i" : ""
- );
-
- /* Look up the property name, and get its swash and
- * inversion list, if the property is found */
- if (swash) {
- SvREFCNT_dec(swash);
- }
- swash = _core_swash_init("utf8", name, &PL_sv_undef,
- 1, /* binary */
- 0, /* not tr/// */
- TRUE, /* this routine will handle
- undefined properties */
- NULL, FALSE /* No inversion list */
- );
- if ( ! swash
- || ! SvROK(swash)
- || ! SvTYPE(SvRV(swash)) == SVt_PVHV
- || ! (invlistsvp =
- hv_fetchs(MUTABLE_HV(SvRV(swash)),
- "INVLIST", FALSE))
- || ! (invlist = *invlistsvp))
- {
- if (swash) {
- SvREFCNT_dec(swash);
- swash = NULL;
- }
-
- /* Here didn't find it. It could be a user-defined
- * property that will be available at run-time. Add it
- * to the list to look up then */
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
- (value == 'p' ? '+' : '!'),
- name);
- has_user_defined_property = 1;
-
- /* We don't know yet, so have to assume that the
- * property could match something in the Latin1 range,
- * hence something that isn't utf8 */
- ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
- }
- else {
-
- /* Here, did get the swash and its inversion list. If
- * the swash is from a user-defined property, then this
- * whole character class should be regarded as such */
- SV** user_defined_svp =
- hv_fetchs(MUTABLE_HV(SvRV(swash)),
- "USER_DEFINED", FALSE);
- if (user_defined_svp) {
- has_user_defined_property
- |= SvUV(*user_defined_svp);
- }
-
- /* Invert if asking for the complement */
- if (value == 'P') {
- _invlist_union_complement_2nd(properties, invlist, &properties);
-
- /* The swash can't be used as-is, because we've
- * inverted things; delay removing it to here after
- * have copied its invlist above */
- SvREFCNT_dec(swash);
- swash = NULL;
- }
- else {
- _invlist_union(properties, invlist, &properties);
- }
- }
- Safefree(name);
- }
- RExC_parse = e + 1;
- namedclass = ANYOF_MAX; /* no official name, but it's named */
-
- /* \p means they want Unicode semantics */
- RExC_uni_semantics = 1;
- }
- break;
- case 'n': value = '\n'; break;
- case 'r': value = '\r'; break;
- case 't': value = '\t'; break;
- case 'f': value = '\f'; break;
- case 'b': value = '\b'; break;
- case 'e': value = ASCII_TO_NATIVE('\033');break;
- case 'a': value = ASCII_TO_NATIVE('\007');break;
- case 'o':
- RExC_parse--; /* function expects to be pointed at the 'o' */
- {
- const char* error_msg;
- bool valid = grok_bslash_o(RExC_parse,
- &value,
- &numlen,
- &error_msg,
- SIZE_ONLY);
- RExC_parse += numlen;
- if (! valid) {
- vFAIL(error_msg);
- }
- }
- if (PL_encoding && value < 0x100) {
- goto recode_encoding;
- }
- break;
- case 'x':
- if (*RExC_parse == '{') {
- I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX;
- char * const e = strchr(RExC_parse++, '}');
- if (!e)
- vFAIL("Missing right brace on \\x{}");
-
- numlen = e - RExC_parse;
- value = grok_hex(RExC_parse, &numlen, &flags, NULL);
- RExC_parse = e + 1;
- }
- else {
- I32 flags = PERL_SCAN_DISALLOW_PREFIX;
- numlen = 2;
- value = grok_hex(RExC_parse, &numlen, &flags, NULL);
- RExC_parse += numlen;
- }
- if (PL_encoding && value < 0x100)
- goto recode_encoding;
- break;
- case 'c':
- value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
- break;
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7':
- {
- /* Take 1-3 octal digits */
- I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
- numlen = 3;
- value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
- RExC_parse += numlen;
- if (PL_encoding && value < 0x100)
- goto recode_encoding;
- break;
- }
- recode_encoding:
- if (! RExC_override_recoding) {
- SV* enc = PL_encoding;
- value = reg_recode((const char)(U8)value, &enc);
- if (!enc && SIZE_ONLY)
- ckWARNreg(RExC_parse,
- "Invalid escape in the specified encoding");
- break;
- }
- default:
- /* Allow \_ to not give an error */
- if (!SIZE_ONLY && isALNUM(value) && value != '_') {
- ckWARN2reg(RExC_parse,
- "Unrecognized escape \\%c in character class passed through",
- (int)value);
- }
- break;
- }
- } /* end of \blah */
-#ifdef EBCDIC
- else
- literal_endpoint++;
-#endif
-
- if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
-
- /* What matches in a locale is not known until runtime, so need to
- * (one time per class) allocate extra space to pass to regexec.
- * The space will contain a bit for each named class that is to be
- * matched against. This isn't needed for \p{} and pseudo-classes,
- * as they are not affected by locale, and hence are dealt with
- * separately */
- if (LOC && namedclass < ANYOF_MAX && ! need_class) {
- need_class = 1;
- if (SIZE_ONLY) {
- RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
- }
- else {
- RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
- ANYOF_CLASS_ZERO(ret);
- }
- ANYOF_FLAGS(ret) |= ANYOF_CLASS;
- }
-
- /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
- * literal, as is the character that began the false range, i.e.
- * the 'a' in the examples */
- if (range) {
- if (!SIZE_ONLY) {
- const int w =
- RExC_parse >= rangebegin ?
- RExC_parse - rangebegin : 0;
- ckWARN4reg(RExC_parse,
- "False [] range \"%*.*s\"",
- w, w, rangebegin);
-
- stored +=
- set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
- if (prevvalue < 256) {
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
- }
- else {
- nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
- }
- }
-
- range = 0; /* this was not a true range */
- }
-
- if (!SIZE_ONLY) {
-
- /* Possible truncation here but in some 64-bit environments
- * the compiler gets heartburn about switch on 64-bit values.
- * A similar issue a little earlier when switching on value.
- * --jhi */
- switch ((I32)namedclass) {
-
- case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
- break;
- case ANYOF_NALNUMC:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
- break;
- case ANYOF_ALPHA:
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
- break;
- case ANYOF_NALPHA:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
- break;
- case ANYOF_ASCII:
- if (LOC) {
- ANYOF_CLASS_SET(ret, namedclass);
- }
- else {
- _invlist_union(properties, PL_ASCII, &properties);
- }
- break;
- case ANYOF_NASCII:
- if (LOC) {
- ANYOF_CLASS_SET(ret, namedclass);
- }
- else {
- _invlist_union_complement_2nd(properties,
- PL_ASCII, &properties);
- if (DEPENDS_SEMANTICS) {
- ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
- }
- }
- break;
- case ANYOF_BLANK:
- DO_POSIX(ret, namedclass, properties,
- PL_PosixBlank, PL_XPosixBlank);
- break;
- case ANYOF_NBLANK:
- DO_N_POSIX(ret, namedclass, properties,
- PL_PosixBlank, PL_XPosixBlank);
- break;
- case ANYOF_CNTRL:
- DO_POSIX(ret, namedclass, properties,
- PL_PosixCntrl, PL_XPosixCntrl);
- break;
- case ANYOF_NCNTRL:
- DO_N_POSIX(ret, namedclass, properties,
- PL_PosixCntrl, PL_XPosixCntrl);
- break;
- case ANYOF_DIGIT:
- /* Ignore the compiler warning for this macro, planned to
- * be eliminated later */
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
- break;
- case ANYOF_NDIGIT:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
- break;
- case ANYOF_GRAPH:
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
- break;
- case ANYOF_NGRAPH:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
- break;
- case ANYOF_HORIZWS:
- /* For these, we use the nonbitmap, as /d doesn't make a
- * difference in what these match. There would be problems
- * if these characters had folds other than themselves, as
- * nonbitmap is subject to folding. It turns out that \h
- * is just a synonym for XPosixBlank */
- _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
- break;
- case ANYOF_NHORIZWS:
- _invlist_union_complement_2nd(nonbitmap,
- PL_XPosixBlank, &nonbitmap);
- break;
- case ANYOF_LOWER:
- case ANYOF_NLOWER:
- { /* These require special handling, as they differ under
- folding, matching Cased there (which in the ASCII range
- is the same as Alpha */
-
- SV* ascii_source;
- SV* l1_source;
- const char *Xname;
-
- if (FOLD && ! LOC) {
- ascii_source = PL_PosixAlpha;
- l1_source = PL_L1Cased;
- Xname = "Cased";
- }
- else {
- ascii_source = PL_PosixLower;
- l1_source = PL_L1PosixLower;
- Xname = "XPosixLower";
- }
- if (namedclass == ANYOF_LOWER) {
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- ascii_source, l1_source, Xname, listsv);
- }
- else {
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
- properties, ascii_source, l1_source, Xname, listsv);
- }
- break;
- }
- case ANYOF_PRINT:
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
- break;
- case ANYOF_NPRINT:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
- break;
- case ANYOF_PUNCT:
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
- break;
- case ANYOF_NPUNCT:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
- break;
- case ANYOF_PSXSPC:
- DO_POSIX(ret, namedclass, properties,
- PL_PosixSpace, PL_XPosixSpace);
- break;
- case ANYOF_NPSXSPC:
- DO_N_POSIX(ret, namedclass, properties,
- PL_PosixSpace, PL_XPosixSpace);
- break;
- case ANYOF_SPACE:
- DO_POSIX(ret, namedclass, properties,
- PL_PerlSpace, PL_XPerlSpace);
- break;
- case ANYOF_NSPACE:
- DO_N_POSIX(ret, namedclass, properties,
- PL_PerlSpace, PL_XPerlSpace);
- break;
- case ANYOF_UPPER: /* Same as LOWER, above */
- case ANYOF_NUPPER:
- {
- SV* ascii_source;
- SV* l1_source;
- const char *Xname;
-
- if (FOLD && ! LOC) {
- ascii_source = PL_PosixAlpha;
- l1_source = PL_L1Cased;
- Xname = "Cased";
- }
- else {
- ascii_source = PL_PosixUpper;
- l1_source = PL_L1PosixUpper;
- Xname = "XPosixUpper";
- }
- if (namedclass == ANYOF_UPPER) {
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- ascii_source, l1_source, Xname, listsv);
- }
- else {
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
- properties, ascii_source, l1_source, Xname, listsv);
- }
- break;
- }
- case ANYOF_ALNUM: /* Really is 'Word' */
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
- break;
- case ANYOF_NALNUM:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
- break;
- case ANYOF_VERTWS:
- /* For these, we use the nonbitmap, as /d doesn't make a
- * difference in what these match. There would be problems
- * if these characters had folds other than themselves, as
- * nonbitmap is subject to folding */
- _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
- break;
- case ANYOF_NVERTWS:
- _invlist_union_complement_2nd(nonbitmap,
- PL_VertSpace, &nonbitmap);
- break;
- case ANYOF_XDIGIT:
- DO_POSIX(ret, namedclass, properties,
- PL_PosixXDigit, PL_XPosixXDigit);
- break;
- case ANYOF_NXDIGIT:
- DO_N_POSIX(ret, namedclass, properties,
- PL_PosixXDigit, PL_XPosixXDigit);
- break;
- case ANYOF_MAX:
- /* this is to handle \p and \P */
- break;
- default:
- vFAIL("Invalid [::] class");
- break;
- }
-
- continue;
- }
- } /* end of namedclass \blah */
-
- if (range) {
- if (prevvalue > (IV)value) /* b-a */ {
- const int w = RExC_parse - rangebegin;
- Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
- range = 0; /* not a valid range */
- }
- }
- else {
- prevvalue = value; /* save the beginning of the range */
- if (RExC_parse+1 < RExC_end
- && *RExC_parse == '-'
- && RExC_parse[1] != ']')
- {
- RExC_parse++;
-
- /* a bad range like \w-, [:word:]- ? */
- if (namedclass > OOB_NAMEDCLASS) {
- if (ckWARN(WARN_REGEXP)) {
- const int w =
- RExC_parse >= rangebegin ?
- RExC_parse - rangebegin : 0;
- vWARN4(RExC_parse,
- "False [] range \"%*.*s\"",
- w, w, rangebegin);
- }
- if (!SIZE_ONLY)
- stored +=
- set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
- } else
- range = 1; /* yeah, it's a range! */
- continue; /* but do it the next time */
- }
- }
-
- /* non-Latin1 code point implies unicode semantics. Must be set in
- * pass1 so is there for the whole of pass 2 */
- if (value > 255) {
- RExC_uni_semantics = 1;
- }
-
- /* now is the next time */
- if (!SIZE_ONLY) {
- if (prevvalue < 256) {
- const IV ceilvalue = value < 256 ? value : 255;
- IV i;
+ /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
+ * literal, as is the character that began the false range, i.e.
+ * the 'a' in the examples */
+ if (range) {
+ if (!SIZE_ONLY) {
+ const int w =
+ RExC_parse >= rangebegin ?
+ RExC_parse - rangebegin : 0;
+ ckWARN4reg(RExC_parse,
+ "False [] range \"%*.*s\"",
+ w, w, rangebegin);
+
+ stored +=
+ set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
+ if (prevvalue < 256) {
+ stored +=
+ set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
+ }
+ else {
+ nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
+ }
+ }
+
+ range = 0; /* this was not a true range */
+ }
+
+ if (!SIZE_ONLY) {
+
+ /* Possible truncation here but in some 64-bit environments
+ * the compiler gets heartburn about switch on 64-bit values.
+ * A similar issue a little earlier when switching on value.
+ * --jhi */
+ switch ((I32)namedclass) {
+
+ case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+ break;
+ case ANYOF_NALNUMC:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+ break;
+ case ANYOF_ALPHA:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+ break;
+ case ANYOF_NALPHA:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+ break;
+ case ANYOF_ASCII:
+ if (LOC) {
+ ANYOF_CLASS_SET(ret, namedclass);
+ }
+ else {
+ _invlist_union(properties, PL_ASCII, &properties);
+ }
+ break;
+ case ANYOF_NASCII:
+ if (LOC) {
+ ANYOF_CLASS_SET(ret, namedclass);
+ }
+ else {
+ _invlist_union_complement_2nd(properties,
+ PL_ASCII, &properties);
+ if (DEPENDS_SEMANTICS) {
+ ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
+ }
+ }
+ break;
+ case ANYOF_BLANK:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixBlank, PL_XPosixBlank);
+ break;
+ case ANYOF_NBLANK:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixBlank, PL_XPosixBlank);
+ break;
+ case ANYOF_CNTRL:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixCntrl, PL_XPosixCntrl);
+ break;
+ case ANYOF_NCNTRL:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixCntrl, PL_XPosixCntrl);
+ break;
+ case ANYOF_DIGIT:
+ /* Ignore the compiler warning for this macro, planned to
+ * be eliminated later */
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+ break;
+ case ANYOF_NDIGIT:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+ break;
+ case ANYOF_GRAPH:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+ break;
+ case ANYOF_NGRAPH:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+ break;
+ case ANYOF_HORIZWS:
+ /* For these, we use the nonbitmap, as /d doesn't make a
+ * difference in what these match. There would be problems
+ * if these characters had folds other than themselves, as
+ * nonbitmap is subject to folding. It turns out that \h
+ * is just a synonym for XPosixBlank */
+ _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
+ break;
+ case ANYOF_NHORIZWS:
+ _invlist_union_complement_2nd(nonbitmap,
+ PL_XPosixBlank, &nonbitmap);
+ break;
+ case ANYOF_LOWER:
+ case ANYOF_NLOWER:
+ { /* These require special handling, as they differ under
+ folding, matching Cased there (which in the ASCII range
+ is the same as Alpha */
+
+ SV* ascii_source;
+ SV* l1_source;
+ const char *Xname;
+
+ if (FOLD && ! LOC) {
+ ascii_source = PL_PosixAlpha;
+ l1_source = PL_L1Cased;
+ Xname = "Cased";
+ }
+ else {
+ ascii_source = PL_PosixLower;
+ l1_source = PL_L1PosixLower;
+ Xname = "XPosixLower";
+ }
+ if (namedclass == ANYOF_LOWER) {
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ ascii_source, l1_source, Xname, listsv);
+ }
+ else {
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
+ properties, ascii_source, l1_source, Xname, listsv);
+ }
+ break;
+ }
+ case ANYOF_PRINT:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+ break;
+ case ANYOF_NPRINT:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+ break;
+ case ANYOF_PUNCT:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+ break;
+ case ANYOF_NPUNCT:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+ break;
+ case ANYOF_PSXSPC:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixSpace, PL_XPosixSpace);
+ break;
+ case ANYOF_NPSXSPC:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixSpace, PL_XPosixSpace);
+ break;
+ case ANYOF_SPACE:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PerlSpace, PL_XPerlSpace);
+ break;
+ case ANYOF_NSPACE:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PerlSpace, PL_XPerlSpace);
+ break;
+ case ANYOF_UPPER: /* Same as LOWER, above */
+ case ANYOF_NUPPER:
+ {
+ SV* ascii_source;
+ SV* l1_source;
+ const char *Xname;
+
+ if (FOLD && ! LOC) {
+ ascii_source = PL_PosixAlpha;
+ l1_source = PL_L1Cased;
+ Xname = "Cased";
+ }
+ else {
+ ascii_source = PL_PosixUpper;
+ l1_source = PL_L1PosixUpper;
+ Xname = "XPosixUpper";
+ }
+ if (namedclass == ANYOF_UPPER) {
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ ascii_source, l1_source, Xname, listsv);
+ }
+ else {
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
+ properties, ascii_source, l1_source, Xname, listsv);
+ }
+ break;
+ }
+ case ANYOF_ALNUM: /* Really is 'Word' */
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+ break;
+ case ANYOF_NALNUM:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+ break;
+ case ANYOF_VERTWS:
+ /* For these, we use the nonbitmap, as /d doesn't make a
+ * difference in what these match. There would be problems
+ * if these characters had folds other than themselves, as
+ * nonbitmap is subject to folding */
+ _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
+ break;
+ case ANYOF_NVERTWS:
+ _invlist_union_complement_2nd(nonbitmap,
+ PL_VertSpace, &nonbitmap);
+ break;
+ case ANYOF_XDIGIT:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixXDigit, PL_XPosixXDigit);
+ break;
+ case ANYOF_NXDIGIT:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixXDigit, PL_XPosixXDigit);
+ break;
+ case ANYOF_MAX:
+ /* this is to handle \p and \P */
+ break;
+ default:
+ vFAIL("Invalid [::] class");
+ break;
+ }
+
+ continue;
+ }
+ } /* end of namedclass \blah */
+
+ if (range) {
+ if (prevvalue > (IV)value) /* b-a */ {
+ const int w = RExC_parse - rangebegin;
+ Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
+ range = 0; /* not a valid range */
+ }
+ }
+ else {
+ prevvalue = value; /* save the beginning of the range */
+ if (RExC_parse+1 < RExC_end
+ && *RExC_parse == '-'
+ && RExC_parse[1] != ']')
+ {
+ RExC_parse++;
+
+ /* a bad range like \w-, [:word:]- ? */
+ if (namedclass > OOB_NAMEDCLASS) {
+ if (ckWARN(WARN_REGEXP)) {
+ const int w =
+ RExC_parse >= rangebegin ?
+ RExC_parse - rangebegin : 0;
+ vWARN4(RExC_parse,
+ "False [] range \"%*.*s\"",
+ w, w, rangebegin);
+ }
+ if (!SIZE_ONLY)
+ stored +=
+ set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
+ } else
+ range = 1; /* yeah, it's a range! */
+ continue; /* but do it the next time */
+ }
+ }
+
+ /* non-Latin1 code point implies unicode semantics. Must be set in
+ * pass1 so is there for the whole of pass 2 */
+ if (value > 255) {
+ RExC_uni_semantics = 1;
+ }
+
+ /* now is the next time */
+ if (!SIZE_ONLY) {
+ if (prevvalue < 256) {
+ const IV ceilvalue = value < 256 ? value : 255;
+ IV i;