regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91
  92 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  93 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  94
  95 #ifdef op
  96 #undef op
  97 #endif /* op */
  98
  99 #ifdef MSDOS
 100 #  if defined(BUGGY_MSC6)
 101  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 102 #    pragma optimize("a",off)
 103  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 104 #    pragma optimize("w",on )
 105 #  endif /* BUGGY_MSC6 */
 106 #endif /* MSDOS */
 107
 108 #ifndef STATIC
 109 #define STATIC  static
 110 #endif
 111
 112
 113 typedef struct RExC_state_t {
 114     U32         flags;                  /* RXf_* are we folding, multilining? */
 115     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 116     char        *precomp;               /* uncompiled string. */
 117     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 118     regexp      *rx;                    /* perl core regexp structure */
 119     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 120     char        *start;                 /* Start of input for compile */
 121     char        *end;                   /* End of input for compile */
 122     char        *parse;                 /* Input-scan pointer. */
 123     I32         whilem_seen;            /* number of WHILEM in this expr */
 124     regnode     *emit_start;            /* Start of emitted-code area */
 125     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 126     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 127     I32         naughty;                /* How bad is this pattern? */
 128     I32         sawback;                /* Did we see \1, ...? */
 129     U32         seen;
 130     I32         size;                   /* Code size. */
 131     I32         npar;                   /* Capture buffer count, (OPEN). */
 132     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 133     I32         nestroot;               /* root parens we are in - used by accept */
 134     I32         extralen;
 135     I32         seen_zerolen;
 136     regnode     **open_parens;          /* pointers to open parens */
 137     regnode     **close_parens;         /* pointers to close parens */
 138     regnode     *opend;                 /* END node in program */
 139     I32         utf8;           /* whether the pattern is utf8 or not */
 140     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 141                                 /* XXX use this for future optimisation of case
 142                                  * where pattern must be upgraded to utf8. */
 143     I32         uni_semantics;  /* If a d charset modifier should use unicode
 144                                    rules, even if the pattern is not in
 145                                    utf8 */
 146     HV          *paren_names;           /* Paren names */
 147
 148     regnode     **recurse;              /* Recurse regops */
 149     I32         recurse_count;          /* Number of recurse regops */
 150     I32         in_lookbehind;
 151     I32         contains_locale;
 152     I32         override_recoding;
 153     struct reg_code_block *code_blocks; /* positions of literal (?{})
 154                                             within pattern */
 155     int         num_code_blocks;        /* size of code_blocks[] */
 156     int         code_index;             /* next code_blocks[] slot */
 157 #if ADD_TO_REGEXEC
 158     char        *starttry;              /* -Dr: where regtry was called. */
 159 #define RExC_starttry   (pRExC_state->starttry)
 160 #endif
 161     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 162 #ifdef DEBUGGING
 163     const char  *lastparse;
 164     I32         lastnum;
 165     AV          *paren_name_list;       /* idx -> name */
 166 #define RExC_lastparse  (pRExC_state->lastparse)
 167 #define RExC_lastnum    (pRExC_state->lastnum)
 168 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 169 #endif
 170 } RExC_state_t;
 171
 172 #define RExC_flags      (pRExC_state->flags)
 173 #define RExC_pm_flags   (pRExC_state->pm_flags)
 174 #define RExC_precomp    (pRExC_state->precomp)
 175 #define RExC_rx_sv      (pRExC_state->rx_sv)
 176 #define RExC_rx         (pRExC_state->rx)
 177 #define RExC_rxi        (pRExC_state->rxi)
 178 #define RExC_start      (pRExC_state->start)
 179 #define RExC_end        (pRExC_state->end)
 180 #define RExC_parse      (pRExC_state->parse)
 181 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 182 #ifdef RE_TRACK_PATTERN_OFFSETS
 183 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 184 #endif
 185 #define RExC_emit       (pRExC_state->emit)
 186 #define RExC_emit_start (pRExC_state->emit_start)
 187 #define RExC_emit_bound (pRExC_state->emit_bound)
 188 #define RExC_naughty    (pRExC_state->naughty)
 189 #define RExC_sawback    (pRExC_state->sawback)
 190 #define RExC_seen       (pRExC_state->seen)
 191 #define RExC_size       (pRExC_state->size)
 192 #define RExC_npar       (pRExC_state->npar)
 193 #define RExC_nestroot   (pRExC_state->nestroot)
 194 #define RExC_extralen   (pRExC_state->extralen)
 195 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 196 #define RExC_utf8       (pRExC_state->utf8)
 197 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 198 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 199 #define RExC_open_parens        (pRExC_state->open_parens)
 200 #define RExC_close_parens       (pRExC_state->close_parens)
 201 #define RExC_opend      (pRExC_state->opend)
 202 #define RExC_paren_names        (pRExC_state->paren_names)
 203 #define RExC_recurse    (pRExC_state->recurse)
 204 #define RExC_recurse_count      (pRExC_state->recurse_count)
 205 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 206 #define RExC_contains_locale    (pRExC_state->contains_locale)
 207 #define RExC_override_recoding  (pRExC_state->override_recoding)
 208
 209
 210 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 211 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 212         ((*s) == '{' && regcurly(s)))
 213
 214 #ifdef SPSTART
 215 #undef SPSTART          /* dratted cpp namespace... */
 216 #endif
 217 /*
 218  * Flags to be passed up and down.
 219  */
 220 #define WORST           0       /* Worst case. */
 221 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 222
 223 /* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
 224  * character, and if utf8, must be invariant.  Note that this is not the same
 225  * thing as REGNODE_SIMPLE */
 226 #define SIMPLE          0x02
 227 #define SPSTART         0x04    /* Starts with * or +. */
 228 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 229 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 230
 231 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 232
 233 /* whether trie related optimizations are enabled */
 234 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 235 #define TRIE_STUDY_OPT
 236 #define FULL_TRIE_STUDY
 237 #define TRIE_STCLASS
 238 #endif
 239
 240
 241
 242 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 243 #define PBITVAL(paren) (1 << ((paren) & 7))
 244 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 245 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 246 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 247
 248 /* If not already in utf8, do a longjmp back to the beginning */
 249 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 250 #define REQUIRE_UTF8    STMT_START {                                       \
 251                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 252                         } STMT_END
 253
 254 /* About scan_data_t.
 255
 256   During optimisation we recurse through the regexp program performing
 257   various inplace (keyhole style) optimisations. In addition study_chunk
 258   and scan_commit populate this data structure with information about
 259   what strings MUST appear in the pattern. We look for the longest
 260   string that must appear at a fixed location, and we look for the
 261   longest string that may appear at a floating location. So for instance
 262   in the pattern:
 263
 264     /FOO[xX]A.*B[xX]BAR/
 265
 266   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 267   strings (because they follow a .* construct). study_chunk will identify
 268   both FOO and BAR as being the longest fixed and floating strings respectively.
 269
 270   The strings can be composites, for instance
 271
 272      /(f)(o)(o)/
 273
 274   will result in a composite fixed substring 'foo'.
 275
 276   For each string some basic information is maintained:
 277
 278   - offset or min_offset
 279     This is the position the string must appear at, or not before.
 280     It also implicitly (when combined with minlenp) tells us how many
 281     characters must match before the string we are searching for.
 282     Likewise when combined with minlenp and the length of the string it
 283     tells us how many characters must appear after the string we have
 284     found.
 285
 286   - max_offset
 287     Only used for floating strings. This is the rightmost point that
 288     the string can appear at. If set to I32 max it indicates that the
 289     string can occur infinitely far to the right.
 290
 291   - minlenp
 292     A pointer to the minimum length of the pattern that the string
 293     was found inside. This is important as in the case of positive
 294     lookahead or positive lookbehind we can have multiple patterns
 295     involved. Consider
 296
 297     /(?=FOO).*F/
 298
 299     The minimum length of the pattern overall is 3, the minimum length
 300     of the lookahead part is 3, but the minimum length of the part that
 301     will actually match is 1. So 'FOO's minimum length is 3, but the
 302     minimum length for the F is 1. This is important as the minimum length
 303     is used to determine offsets in front of and behind the string being
 304     looked for.  Since strings can be composites this is the length of the
 305     pattern at the time it was committed with a scan_commit. Note that
 306     the length is calculated by study_chunk, so that the minimum lengths
 307     are not known until the full pattern has been compiled, thus the
 308     pointer to the value.
 309
 310   - lookbehind
 311
 312     In the case of lookbehind the string being searched for can be
 313     offset past the start point of the final matching string.
 314     If this value was just blithely removed from the min_offset it would
 315     invalidate some of the calculations for how many chars must match
 316     before or after (as they are derived from min_offset and minlen and
 317     the length of the string being searched for).
 318     When the final pattern is compiled and the data is moved from the
 319     scan_data_t structure into the regexp structure the information
 320     about lookbehind is factored in, with the information that would
 321     have been lost precalculated in the end_shift field for the
 322     associated string.
 323
 324   The fields pos_min and pos_delta are used to store the minimum offset
 325   and the delta to the maximum offset at the current point in the pattern.
 326
 327 */
 328
 329 typedef struct scan_data_t {
 330     /*I32 len_min;      unused */
 331     /*I32 len_delta;    unused */
 332     I32 pos_min;
 333     I32 pos_delta;
 334     SV *last_found;
 335     I32 last_end;           /* min value, <0 unless valid. */
 336     I32 last_start_min;
 337     I32 last_start_max;
 338     SV **longest;           /* Either &l_fixed, or &l_float. */
 339     SV *longest_fixed;      /* longest fixed string found in pattern */
 340     I32 offset_fixed;       /* offset where it starts */
 341     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 342     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 343     SV *longest_float;      /* longest floating string found in pattern */
 344     I32 offset_float_min;   /* earliest point in string it can appear */
 345     I32 offset_float_max;   /* latest point in string it can appear */
 346     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 347     I32 lookbehind_float;   /* is the position of the string modified by LB */
 348     I32 flags;
 349     I32 whilem_c;
 350     I32 *last_closep;
 351     struct regnode_charclass_class *start_class;
 352 } scan_data_t;
 353
 354 /*
 355  * Forward declarations for pregcomp()'s friends.
 356  */
 357
 358 static const scan_data_t zero_scan_data =
 359   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 360
 361 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 362 #define SF_BEFORE_SEOL          0x0001
 363 #define SF_BEFORE_MEOL          0x0002
 364 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 365 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 366
 367 #ifdef NO_UNARY_PLUS
 368 #  define SF_FIX_SHIFT_EOL      (0+2)
 369 #  define SF_FL_SHIFT_EOL               (0+4)
 370 #else
 371 #  define SF_FIX_SHIFT_EOL      (+2)
 372 #  define SF_FL_SHIFT_EOL               (+4)
 373 #endif
 374
 375 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 376 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 377
 378 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 379 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 380 #define SF_IS_INF               0x0040
 381 #define SF_HAS_PAR              0x0080
 382 #define SF_IN_PAR               0x0100
 383 #define SF_HAS_EVAL             0x0200
 384 #define SCF_DO_SUBSTR           0x0400
 385 #define SCF_DO_STCLASS_AND      0x0800
 386 #define SCF_DO_STCLASS_OR       0x1000
 387 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 388 #define SCF_WHILEM_VISITED_POS  0x2000
 389
 390 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 391 #define SCF_SEEN_ACCEPT         0x8000
 392
 393 #define UTF cBOOL(RExC_utf8)
 394
 395 /* The enums for all these are ordered so things work out correctly */
 396 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 397 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 398 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 399 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 400 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 401 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 402 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 403
 404 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 405
 406 #define OOB_NAMEDCLASS          -1
 407
 408 /* There is no code point that is out-of-bounds, so this is problematic.  But
 409  * its only current use is to initialize a variable that is always set before
 410  * looked at. */
 411 #define OOB_UNICODE             0xDEADBEEF
 412
 413 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 414 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 415
 416
 417 /* length of regex to show in messages that don't mark a position within */
 418 #define RegexLengthToShowInErrorMessages 127
 419
 420 /*
 421  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 422  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 423  * op/pragma/warn/regcomp.
 424  */
 425 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 426 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 427
 428 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 429
 430 /*
 431  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 432  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 433  * "...".
 434  */
 435 #define _FAIL(code) STMT_START {                                        \
 436     const char *ellipses = "";                                          \
 437     IV len = RExC_end - RExC_precomp;                                   \
 438                                                                         \
 439     if (!SIZE_ONLY)                                                     \
 440         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 441     if (len > RegexLengthToShowInErrorMessages) {                       \
 442         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 443         len = RegexLengthToShowInErrorMessages - 10;                    \
 444         ellipses = "...";                                               \
 445     }                                                                   \
 446     code;                                                               \
 447 } STMT_END
 448
 449 #define FAIL(msg) _FAIL(                            \
 450     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 451             msg, (int)len, RExC_precomp, ellipses))
 452
 453 #define FAIL2(msg,arg) _FAIL(                       \
 454     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 455             arg, (int)len, RExC_precomp, ellipses))
 456
 457 /*
 458  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 459  */
 460 #define Simple_vFAIL(m) STMT_START {                                    \
 461     const IV offset = RExC_parse - RExC_precomp;                        \
 462     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 463             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 464 } STMT_END
 465
 466 /*
 467  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 468  */
 469 #define vFAIL(m) STMT_START {                           \
 470     if (!SIZE_ONLY)                                     \
 471         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 472     Simple_vFAIL(m);                                    \
 473 } STMT_END
 474
 475 /*
 476  * Like Simple_vFAIL(), but accepts two arguments.
 477  */
 478 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 479     const IV offset = RExC_parse - RExC_precomp;                        \
 480     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 481             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 482 } STMT_END
 483
 484 /*
 485  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 486  */
 487 #define vFAIL2(m,a1) STMT_START {                       \
 488     if (!SIZE_ONLY)                                     \
 489         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 490     Simple_vFAIL2(m, a1);                               \
 491 } STMT_END
 492
 493
 494 /*
 495  * Like Simple_vFAIL(), but accepts three arguments.
 496  */
 497 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 498     const IV offset = RExC_parse - RExC_precomp;                \
 499     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 500             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 501 } STMT_END
 502
 503 /*
 504  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 505  */
 506 #define vFAIL3(m,a1,a2) STMT_START {                    \
 507     if (!SIZE_ONLY)                                     \
 508         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 509     Simple_vFAIL3(m, a1, a2);                           \
 510 } STMT_END
 511
 512 /*
 513  * Like Simple_vFAIL(), but accepts four arguments.
 514  */
 515 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 516     const IV offset = RExC_parse - RExC_precomp;                \
 517     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 518             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 519 } STMT_END
 520
 521 #define ckWARNreg(loc,m) STMT_START {                                   \
 522     const IV offset = loc - RExC_precomp;                               \
 523     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 524             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 525 } STMT_END
 526
 527 #define ckWARNregdep(loc,m) STMT_START {                                \
 528     const IV offset = loc - RExC_precomp;                               \
 529     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 530             m REPORT_LOCATION,                                          \
 531             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 532 } STMT_END
 533
 534 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 535     const IV offset = loc - RExC_precomp;                               \
 536     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 537             m REPORT_LOCATION,                                          \
 538             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 539 } STMT_END
 540
 541 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 542     const IV offset = loc - RExC_precomp;                               \
 543     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 544             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 545 } STMT_END
 546
 547 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 548     const IV offset = loc - RExC_precomp;                               \
 549     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 550             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 551 } STMT_END
 552
 553 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 554     const IV offset = loc - RExC_precomp;                               \
 555     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 556             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 557 } STMT_END
 558
 559 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 560     const IV offset = loc - RExC_precomp;                               \
 561     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 562             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 563 } STMT_END
 564
 565 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 566     const IV offset = loc - RExC_precomp;                               \
 567     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 568             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 569 } STMT_END
 570
 571 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 572     const IV offset = loc - RExC_precomp;                               \
 573     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 574             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 575 } STMT_END
 576
 577
 578 /* Allow for side effects in s */
 579 #define REGC(c,s) STMT_START {                  \
 580     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 581 } STMT_END
 582
 583 /* Macros for recording node offsets.   20001227 mjd@plover.com
 584  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 585  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 586  * Element 0 holds the number n.
 587  * Position is 1 indexed.
 588  */
 589 #ifndef RE_TRACK_PATTERN_OFFSETS
 590 #define Set_Node_Offset_To_R(node,byte)
 591 #define Set_Node_Offset(node,byte)
 592 #define Set_Cur_Node_Offset
 593 #define Set_Node_Length_To_R(node,len)
 594 #define Set_Node_Length(node,len)
 595 #define Set_Node_Cur_Length(node)
 596 #define Node_Offset(n)
 597 #define Node_Length(n)
 598 #define Set_Node_Offset_Length(node,offset,len)
 599 #define ProgLen(ri) ri->u.proglen
 600 #define SetProgLen(ri,x) ri->u.proglen = x
 601 #else
 602 #define ProgLen(ri) ri->u.offsets[0]
 603 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 604 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 605     if (! SIZE_ONLY) {                                                  \
 606         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 607                     __LINE__, (int)(node), (int)(byte)));               \
 608         if((node) < 0) {                                                \
 609             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 610         } else {                                                        \
 611             RExC_offsets[2*(node)-1] = (byte);                          \
 612         }                                                               \
 613     }                                                                   \
 614 } STMT_END
 615
 616 #define Set_Node_Offset(node,byte) \
 617     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 618 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 619
 620 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 621     if (! SIZE_ONLY) {                                                  \
 622         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 623                 __LINE__, (int)(node), (int)(len)));                    \
 624         if((node) < 0) {                                                \
 625             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 626         } else {                                                        \
 627             RExC_offsets[2*(node)] = (len);                             \
 628         }                                                               \
 629     }                                                                   \
 630 } STMT_END
 631
 632 #define Set_Node_Length(node,len) \
 633     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 634 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 635 #define Set_Node_Cur_Length(node) \
 636     Set_Node_Length(node, RExC_parse - parse_start)
 637
 638 /* Get offsets and lengths */
 639 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 640 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 641
 642 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 643     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 644     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 645 } STMT_END
 646 #endif
 647
 648 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 649 #define EXPERIMENTAL_INPLACESCAN
 650 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 651
 652 #define DEBUG_STUDYDATA(str,data,depth)                              \
 653 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 654     PerlIO_printf(Perl_debug_log,                                    \
 655         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 656         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 657         (int)(depth)*2, "",                                          \
 658         (IV)((data)->pos_min),                                       \
 659         (IV)((data)->pos_delta),                                     \
 660         (UV)((data)->flags),                                         \
 661         (IV)((data)->whilem_c),                                      \
 662         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 663         is_inf ? "INF " : ""                                         \
 664     );                                                               \
 665     if ((data)->last_found)                                          \
 666         PerlIO_printf(Perl_debug_log,                                \
 667             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 668             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 669             SvPVX_const((data)->last_found),                         \
 670             (IV)((data)->last_end),                                  \
 671             (IV)((data)->last_start_min),                            \
 672             (IV)((data)->last_start_max),                            \
 673             ((data)->longest &&                                      \
 674              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 675             SvPVX_const((data)->longest_fixed),                      \
 676             (IV)((data)->offset_fixed),                              \
 677             ((data)->longest &&                                      \
 678              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 679             SvPVX_const((data)->longest_float),                      \
 680             (IV)((data)->offset_float_min),                          \
 681             (IV)((data)->offset_float_max)                           \
 682         );                                                           \
 683     PerlIO_printf(Perl_debug_log,"\n");                              \
 684 });
 685
 686 static void clear_re(pTHX_ void *r);
 687
 688 /* Mark that we cannot extend a found fixed substring at this point.
 689    Update the longest found anchored substring and the longest found
 690    floating substrings if needed. */
 691
 692 STATIC void
 693 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 694 {
 695     const STRLEN l = CHR_SVLEN(data->last_found);
 696     const STRLEN old_l = CHR_SVLEN(*data->longest);
 697     GET_RE_DEBUG_FLAGS_DECL;
 698
 699     PERL_ARGS_ASSERT_SCAN_COMMIT;
 700
 701     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 702         SvSetMagicSV(*data->longest, data->last_found);
 703         if (*data->longest == data->longest_fixed) {
 704             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 705             if (data->flags & SF_BEFORE_EOL)
 706                 data->flags
 707                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 708             else
 709                 data->flags &= ~SF_FIX_BEFORE_EOL;
 710             data->minlen_fixed=minlenp;
 711             data->lookbehind_fixed=0;
 712         }
 713         else { /* *data->longest == data->longest_float */
 714             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 715             data->offset_float_max = (l
 716                                       ? data->last_start_max
 717                                       : data->pos_min + data->pos_delta);
 718             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 719                 data->offset_float_max = I32_MAX;
 720             if (data->flags & SF_BEFORE_EOL)
 721                 data->flags
 722                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 723             else
 724                 data->flags &= ~SF_FL_BEFORE_EOL;
 725             data->minlen_float=minlenp;
 726             data->lookbehind_float=0;
 727         }
 728     }
 729     SvCUR_set(data->last_found, 0);
 730     {
 731         SV * const sv = data->last_found;
 732         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 733             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 734             if (mg)
 735                 mg->mg_len = 0;
 736         }
 737     }
 738     data->last_end = -1;
 739     data->flags &= ~SF_BEFORE_EOL;
 740     DEBUG_STUDYDATA("commit: ",data,0);
 741 }
 742
 743 /* Can match anything (initialization) */
 744 STATIC void
 745 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 746 {
 747     PERL_ARGS_ASSERT_CL_ANYTHING;
 748
 749     ANYOF_BITMAP_SETALL(cl);
 750     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 751                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 752
 753     /* If any portion of the regex is to operate under locale rules,
 754      * initialization includes it.  The reason this isn't done for all regexes
 755      * is that the optimizer was written under the assumption that locale was
 756      * all-or-nothing.  Given the complexity and lack of documentation in the
 757      * optimizer, and that there are inadequate test cases for locale, so many
 758      * parts of it may not work properly, it is safest to avoid locale unless
 759      * necessary. */
 760     if (RExC_contains_locale) {
 761         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 762         cl->flags |= ANYOF_LOCALE;
 763     }
 764     else {
 765         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 766     }
 767 }
 768
 769 /* Can match anything (initialization) */
 770 STATIC int
 771 S_cl_is_anything(const struct regnode_charclass_class *cl)
 772 {
 773     int value;
 774
 775     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 776
 777     for (value = 0; value <= ANYOF_MAX; value += 2)
 778         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 779             return 1;
 780     if (!(cl->flags & ANYOF_UNICODE_ALL))
 781         return 0;
 782     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 783         return 0;
 784     return 1;
 785 }
 786
 787 /* Can match anything (initialization) */
 788 STATIC void
 789 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 790 {
 791     PERL_ARGS_ASSERT_CL_INIT;
 792
 793     Zero(cl, 1, struct regnode_charclass_class);
 794     cl->type = ANYOF;
 795     cl_anything(pRExC_state, cl);
 796     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 797 }
 798
 799 /* These two functions currently do the exact same thing */
 800 #define cl_init_zero            S_cl_init
 801
 802 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 803  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 804  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 805 STATIC void
 806 S_cl_and(struct regnode_charclass_class *cl,
 807         const struct regnode_charclass_class *and_with)
 808 {
 809     PERL_ARGS_ASSERT_CL_AND;
 810
 811     assert(and_with->type == ANYOF);
 812
 813     /* I (khw) am not sure all these restrictions are necessary XXX */
 814     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 815         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 816         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 817         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 818         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 819         int i;
 820
 821         if (and_with->flags & ANYOF_INVERT)
 822             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 823                 cl->bitmap[i] &= ~and_with->bitmap[i];
 824         else
 825             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 826                 cl->bitmap[i] &= and_with->bitmap[i];
 827     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 828
 829     if (and_with->flags & ANYOF_INVERT) {
 830
 831         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 832          * aren't affected by the inversion.  Those that are affected are
 833          * handled individually below */
 834         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 835         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 836         cl->flags |= affected_flags;
 837
 838         /* We currently don't know how to deal with things that aren't in the
 839          * bitmap, but we know that the intersection is no greater than what
 840          * is already in cl, so let there be false positives that get sorted
 841          * out after the synthetic start class succeeds, and the node is
 842          * matched for real. */
 843
 844         /* The inversion of these two flags indicate that the resulting
 845          * intersection doesn't have them */
 846         if (and_with->flags & ANYOF_UNICODE_ALL) {
 847             cl->flags &= ~ANYOF_UNICODE_ALL;
 848         }
 849         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 850             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 851         }
 852     }
 853     else {   /* and'd node is not inverted */
 854         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 855
 856         if (! ANYOF_NONBITMAP(and_with)) {
 857
 858             /* Here 'and_with' doesn't match anything outside the bitmap
 859              * (except possibly ANYOF_UNICODE_ALL), which means the
 860              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 861              * which case we don't know what the intersection is, but it's no
 862              * greater than what cl already has, so can just leave it alone,
 863              * with possible false positives */
 864             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 865                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 866                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 867             }
 868         }
 869         else if (! ANYOF_NONBITMAP(cl)) {
 870
 871             /* Here, 'and_with' does match something outside the bitmap, and cl
 872              * doesn't have a list of things to match outside the bitmap.  If
 873              * cl can match all code points above 255, the intersection will
 874              * be those above-255 code points that 'and_with' matches.  If cl
 875              * can't match all Unicode code points, it means that it can't
 876              * match anything outside the bitmap (since the 'if' that got us
 877              * into this block tested for that), so we leave the bitmap empty.
 878              */
 879             if (cl->flags & ANYOF_UNICODE_ALL) {
 880                 ARG_SET(cl, ARG(and_with));
 881
 882                 /* and_with's ARG may match things that don't require UTF8.
 883                  * And now cl's will too, in spite of this being an 'and'.  See
 884                  * the comments below about the kludge */
 885                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 886             }
 887         }
 888         else {
 889             /* Here, both 'and_with' and cl match something outside the
 890              * bitmap.  Currently we do not do the intersection, so just match
 891              * whatever cl had at the beginning.  */
 892         }
 893
 894
 895         /* Take the intersection of the two sets of flags.  However, the
 896          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 897          * kludge around the fact that this flag is not treated like the others
 898          * which are initialized in cl_anything().  The way the optimizer works
 899          * is that the synthetic start class (SSC) is initialized to match
 900          * anything, and then the first time a real node is encountered, its
 901          * values are AND'd with the SSC's with the result being the values of
 902          * the real node.  However, there are paths through the optimizer where
 903          * the AND never gets called, so those initialized bits are set
 904          * inappropriately, which is not usually a big deal, as they just cause
 905          * false positives in the SSC, which will just mean a probably
 906          * imperceptible slow down in execution.  However this bit has a
 907          * higher false positive consequence in that it can cause utf8.pm,
 908          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 909          * bigger slowdown and also causes significant extra memory to be used.
 910          * In order to prevent this, the code now takes a different tack.  The
 911          * bit isn't set unless some part of the regular expression needs it,
 912          * but once set it won't get cleared.  This means that these extra
 913          * modules won't get loaded unless there was some path through the
 914          * pattern that would have required them anyway, and  so any false
 915          * positives that occur by not ANDing them out when they could be
 916          * aren't as severe as they would be if we treated this bit like all
 917          * the others */
 918         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 919                                       & ANYOF_NONBITMAP_NON_UTF8;
 920         cl->flags &= and_with->flags;
 921         cl->flags |= outside_bitmap_but_not_utf8;
 922     }
 923 }
 924
 925 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 926  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 927  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 928 STATIC void
 929 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 930 {
 931     PERL_ARGS_ASSERT_CL_OR;
 932
 933     if (or_with->flags & ANYOF_INVERT) {
 934
 935         /* Here, the or'd node is to be inverted.  This means we take the
 936          * complement of everything not in the bitmap, but currently we don't
 937          * know what that is, so give up and match anything */
 938         if (ANYOF_NONBITMAP(or_with)) {
 939             cl_anything(pRExC_state, cl);
 940         }
 941         /* We do not use
 942          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 943          *   <= (B1 | !B2) | (CL1 | !CL2)
 944          * which is wasteful if CL2 is small, but we ignore CL2:
 945          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 946          * XXXX Can we handle case-fold?  Unclear:
 947          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 948          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 949          */
 950         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 951              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 952              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 953             int i;
 954
 955             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 956                 cl->bitmap[i] |= ~or_with->bitmap[i];
 957         } /* XXXX: logic is complicated otherwise */
 958         else {
 959             cl_anything(pRExC_state, cl);
 960         }
 961
 962         /* And, we can just take the union of the flags that aren't affected
 963          * by the inversion */
 964         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 965
 966         /* For the remaining flags:
 967             ANYOF_UNICODE_ALL and inverted means to not match anything above
 968                     255, which means that the union with cl should just be
 969                     what cl has in it, so can ignore this flag
 970             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 971                     is 127-255 to match them, but then invert that, so the
 972                     union with cl should just be what cl has in it, so can
 973                     ignore this flag
 974          */
 975     } else {    /* 'or_with' is not inverted */
 976         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 977         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 978              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 979                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 980             int i;
 981
 982             /* OR char bitmap and class bitmap separately */
 983             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 984                 cl->bitmap[i] |= or_with->bitmap[i];
 985             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 986                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 987                     cl->classflags[i] |= or_with->classflags[i];
 988                 cl->flags |= ANYOF_CLASS;
 989             }
 990         }
 991         else { /* XXXX: logic is complicated, leave it along for a moment. */
 992             cl_anything(pRExC_state, cl);
 993         }
 994
 995         if (ANYOF_NONBITMAP(or_with)) {
 996
 997             /* Use the added node's outside-the-bit-map match if there isn't a
 998              * conflict.  If there is a conflict (both nodes match something
 999              * outside the bitmap, but what they match outside is not the same
1000              * pointer, and hence not easily compared until XXX we extend
1001              * inversion lists this far), give up and allow the start class to
1002              * match everything outside the bitmap.  If that stuff is all above
1003              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1004             if (! ANYOF_NONBITMAP(cl)) {
1005                 ARG_SET(cl, ARG(or_with));
1006             }
1007             else if (ARG(cl) != ARG(or_with)) {
1008
1009                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1010                     cl_anything(pRExC_state, cl);
1011                 }
1012                 else {
1013                     cl->flags |= ANYOF_UNICODE_ALL;
1014                 }
1015             }
1016         }
1017
1018         /* Take the union */
1019         cl->flags |= or_with->flags;
1020     }
1021 }
1022
1023 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1024 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1025 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1026 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1027
1028
1029 #ifdef DEBUGGING
1030 /*
1031    dump_trie(trie,widecharmap,revcharmap)
1032    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1033    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1034
1035    These routines dump out a trie in a somewhat readable format.
1036    The _interim_ variants are used for debugging the interim
1037    tables that are used to generate the final compressed
1038    representation which is what dump_trie expects.
1039
1040    Part of the reason for their existence is to provide a form
1041    of documentation as to how the different representations function.
1042
1043 */
1044
1045 /*
1046   Dumps the final compressed table form of the trie to Perl_debug_log.
1047   Used for debugging make_trie().
1048 */
1049
1050 STATIC void
1051 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1052             AV *revcharmap, U32 depth)
1053 {
1054     U32 state;
1055     SV *sv=sv_newmortal();
1056     int colwidth= widecharmap ? 6 : 4;
1057     U16 word;
1058     GET_RE_DEBUG_FLAGS_DECL;
1059
1060     PERL_ARGS_ASSERT_DUMP_TRIE;
1061
1062     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1063         (int)depth * 2 + 2,"",
1064         "Match","Base","Ofs" );
1065
1066     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1067         SV ** const tmp = av_fetch( revcharmap, state, 0);
1068         if ( tmp ) {
1069             PerlIO_printf( Perl_debug_log, "%*s",
1070                 colwidth,
1071                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1072                             PL_colors[0], PL_colors[1],
1073                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1074                             PERL_PV_ESCAPE_FIRSTCHAR
1075                 )
1076             );
1077         }
1078     }
1079     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1080         (int)depth * 2 + 2,"");
1081
1082     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1083         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1084     PerlIO_printf( Perl_debug_log, "\n");
1085
1086     for( state = 1 ; state < trie->statecount ; state++ ) {
1087         const U32 base = trie->states[ state ].trans.base;
1088
1089         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1090
1091         if ( trie->states[ state ].wordnum ) {
1092             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1093         } else {
1094             PerlIO_printf( Perl_debug_log, "%6s", "" );
1095         }
1096
1097         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1098
1099         if ( base ) {
1100             U32 ofs = 0;
1101
1102             while( ( base + ofs  < trie->uniquecharcount ) ||
1103                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1104                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1105                     ofs++;
1106
1107             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1108
1109             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1110                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1111                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1112                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1113                 {
1114                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1115                     colwidth,
1116                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1117                 } else {
1118                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1119                 }
1120             }
1121
1122             PerlIO_printf( Perl_debug_log, "]");
1123
1124         }
1125         PerlIO_printf( Perl_debug_log, "\n" );
1126     }
1127     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1128     for (word=1; word <= trie->wordcount; word++) {
1129         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1130             (int)word, (int)(trie->wordinfo[word].prev),
1131             (int)(trie->wordinfo[word].len));
1132     }
1133     PerlIO_printf(Perl_debug_log, "\n" );
1134 }
1135 /*
1136   Dumps a fully constructed but uncompressed trie in list form.
1137   List tries normally only are used for construction when the number of
1138   possible chars (trie->uniquecharcount) is very high.
1139   Used for debugging make_trie().
1140 */
1141 STATIC void
1142 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1143                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1144                          U32 depth)
1145 {
1146     U32 state;
1147     SV *sv=sv_newmortal();
1148     int colwidth= widecharmap ? 6 : 4;
1149     GET_RE_DEBUG_FLAGS_DECL;
1150
1151     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1152
1153     /* print out the table precompression.  */
1154     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1155         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1156         "------:-----+-----------------\n" );
1157
1158     for( state=1 ; state < next_alloc ; state ++ ) {
1159         U16 charid;
1160
1161         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1162             (int)depth * 2 + 2,"", (UV)state  );
1163         if ( ! trie->states[ state ].wordnum ) {
1164             PerlIO_printf( Perl_debug_log, "%5s| ","");
1165         } else {
1166             PerlIO_printf( Perl_debug_log, "W%4x| ",
1167                 trie->states[ state ].wordnum
1168             );
1169         }
1170         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1171             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1172             if ( tmp ) {
1173                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1174                     colwidth,
1175                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1176                             PL_colors[0], PL_colors[1],
1177                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1178                             PERL_PV_ESCAPE_FIRSTCHAR
1179                     ) ,
1180                     TRIE_LIST_ITEM(state,charid).forid,
1181                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1182                 );
1183                 if (!(charid % 10))
1184                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1185                         (int)((depth * 2) + 14), "");
1186             }
1187         }
1188         PerlIO_printf( Perl_debug_log, "\n");
1189     }
1190 }
1191
1192 /*
1193   Dumps a fully constructed but uncompressed trie in table form.
1194   This is the normal DFA style state transition table, with a few
1195   twists to facilitate compression later.
1196   Used for debugging make_trie().
1197 */
1198 STATIC void
1199 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1200                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1201                           U32 depth)
1202 {
1203     U32 state;
1204     U16 charid;
1205     SV *sv=sv_newmortal();
1206     int colwidth= widecharmap ? 6 : 4;
1207     GET_RE_DEBUG_FLAGS_DECL;
1208
1209     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1210
1211     /*
1212        print out the table precompression so that we can do a visual check
1213        that they are identical.
1214      */
1215
1216     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1217
1218     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1219         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1220         if ( tmp ) {
1221             PerlIO_printf( Perl_debug_log, "%*s",
1222                 colwidth,
1223                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1224                             PL_colors[0], PL_colors[1],
1225                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1226                             PERL_PV_ESCAPE_FIRSTCHAR
1227                 )
1228             );
1229         }
1230     }
1231
1232     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1233
1234     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1235         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1236     }
1237
1238     PerlIO_printf( Perl_debug_log, "\n" );
1239
1240     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1241
1242         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1243             (int)depth * 2 + 2,"",
1244             (UV)TRIE_NODENUM( state ) );
1245
1246         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1247             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1248             if (v)
1249                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1250             else
1251                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1252         }
1253         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1254             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1255         } else {
1256             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1257             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1258         }
1259     }
1260 }
1261
1262 #endif
1263
1264
1265 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1266   startbranch: the first branch in the whole branch sequence
1267   first      : start branch of sequence of branch-exact nodes.
1268                May be the same as startbranch
1269   last       : Thing following the last branch.
1270                May be the same as tail.
1271   tail       : item following the branch sequence
1272   count      : words in the sequence
1273   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1274   depth      : indent depth
1275
1276 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1277
1278 A trie is an N'ary tree where the branches are determined by digital
1279 decomposition of the key. IE, at the root node you look up the 1st character and
1280 follow that branch repeat until you find the end of the branches. Nodes can be
1281 marked as "accepting" meaning they represent a complete word. Eg:
1282
1283   /he|she|his|hers/
1284
1285 would convert into the following structure. Numbers represent states, letters
1286 following numbers represent valid transitions on the letter from that state, if
1287 the number is in square brackets it represents an accepting state, otherwise it
1288 will be in parenthesis.
1289
1290       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1291       |    |
1292       |   (2)
1293       |    |
1294      (1)   +-i->(6)-+-s->[7]
1295       |
1296       +-s->(3)-+-h->(4)-+-e->[5]
1297
1298       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1299
1300 This shows that when matching against the string 'hers' we will begin at state 1
1301 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1302 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1303 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1304 single traverse. We store a mapping from accepting to state to which word was
1305 matched, and then when we have multiple possibilities we try to complete the
1306 rest of the regex in the order in which they occured in the alternation.
1307
1308 The only prior NFA like behaviour that would be changed by the TRIE support is
1309 the silent ignoring of duplicate alternations which are of the form:
1310
1311  / (DUPE|DUPE) X? (?{ ... }) Y /x
1312
1313 Thus EVAL blocks following a trie may be called a different number of times with
1314 and without the optimisation. With the optimisations dupes will be silently
1315 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1316 the following demonstrates:
1317
1318  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1319
1320 which prints out 'word' three times, but
1321
1322  'words'=~/(word|word|word)(?{ print $1 })S/
1323
1324 which doesnt print it out at all. This is due to other optimisations kicking in.
1325
1326 Example of what happens on a structural level:
1327
1328 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1329
1330    1: CURLYM[1] {1,32767}(18)
1331    5:   BRANCH(8)
1332    6:     EXACT <ac>(16)
1333    8:   BRANCH(11)
1334    9:     EXACT <ad>(16)
1335   11:   BRANCH(14)
1336   12:     EXACT <ab>(16)
1337   16:   SUCCEED(0)
1338   17:   NOTHING(18)
1339   18: END(0)
1340
1341 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1342 and should turn into:
1343
1344    1: CURLYM[1] {1,32767}(18)
1345    5:   TRIE(16)
1346         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1347           <ac>
1348           <ad>
1349           <ab>
1350   16:   SUCCEED(0)
1351   17:   NOTHING(18)
1352   18: END(0)
1353
1354 Cases where tail != last would be like /(?foo|bar)baz/:
1355
1356    1: BRANCH(4)
1357    2:   EXACT <foo>(8)
1358    4: BRANCH(7)
1359    5:   EXACT <bar>(8)
1360    7: TAIL(8)
1361    8: EXACT <baz>(10)
1362   10: END(0)
1363
1364 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1365 and would end up looking like:
1366
1367     1: TRIE(8)
1368       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1369         <foo>
1370         <bar>
1371    7: TAIL(8)
1372    8: EXACT <baz>(10)
1373   10: END(0)
1374
1375     d = uvuni_to_utf8_flags(d, uv, 0);
1376
1377 is the recommended Unicode-aware way of saying
1378
1379     *(d++) = uv;
1380 */
1381
1382 #define TRIE_STORE_REVCHAR(val)                                            \
1383     STMT_START {                                                           \
1384         if (UTF) {                                                         \
1385             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1386             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1387             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1388             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1389             SvPOK_on(zlopp);                                               \
1390             SvUTF8_on(zlopp);                                              \
1391             av_push(revcharmap, zlopp);                                    \
1392         } else {                                                           \
1393             char ooooff = (char)val;                                           \
1394             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1395         }                                                                  \
1396         } STMT_END
1397
1398 #define TRIE_READ_CHAR STMT_START {                                                     \
1399     wordlen++;                                                                          \
1400     if ( UTF ) {                                                                        \
1401         /* if it is UTF then it is either already folded, or does not need folding */   \
1402         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1403     }                                                                                   \
1404     else if (folder == PL_fold_latin1) {                                                \
1405         /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1406         if ( foldlen > 0 ) {                                                            \
1407            uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1408            foldlen -= len;                                                              \
1409            scan += len;                                                                 \
1410            len = 0;                                                                     \
1411         } else {                                                                        \
1412             len = 1;                                                                    \
1413             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1414             skiplen = UNISKIP(uvc);                                                     \
1415             foldlen -= skiplen;                                                         \
1416             scan = foldbuf + skiplen;                                                   \
1417         }                                                                               \
1418     } else {                                                                            \
1419         /* raw data, will be folded later if needed */                                  \
1420         uvc = (U32)*uc;                                                                 \
1421         len = 1;                                                                        \
1422     }                                                                                   \
1423 } STMT_END
1424
1425
1426
1427 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1428     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1429         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1430         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1431     }                                                           \
1432     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1433     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1434     TRIE_LIST_CUR( state )++;                                   \
1435 } STMT_END
1436
1437 #define TRIE_LIST_NEW(state) STMT_START {                       \
1438     Newxz( trie->states[ state ].trans.list,               \
1439         4, reg_trie_trans_le );                                 \
1440      TRIE_LIST_CUR( state ) = 1;                                \
1441      TRIE_LIST_LEN( state ) = 4;                                \
1442 } STMT_END
1443
1444 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1445     U16 dupe= trie->states[ state ].wordnum;                    \
1446     regnode * const noper_next = regnext( noper );              \
1447                                                                 \
1448     DEBUG_r({                                                   \
1449         /* store the word for dumping */                        \
1450         SV* tmp;                                                \
1451         if (OP(noper) != NOTHING)                               \
1452             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1453         else                                                    \
1454             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1455         av_push( trie_words, tmp );                             \
1456     });                                                         \
1457                                                                 \
1458     curword++;                                                  \
1459     trie->wordinfo[curword].prev   = 0;                         \
1460     trie->wordinfo[curword].len    = wordlen;                   \
1461     trie->wordinfo[curword].accept = state;                     \
1462                                                                 \
1463     if ( noper_next < tail ) {                                  \
1464         if (!trie->jump)                                        \
1465             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1466         trie->jump[curword] = (U16)(noper_next - convert);      \
1467         if (!jumper)                                            \
1468             jumper = noper_next;                                \
1469         if (!nextbranch)                                        \
1470             nextbranch= regnext(cur);                           \
1471     }                                                           \
1472                                                                 \
1473     if ( dupe ) {                                               \
1474         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1475         /* chain, so that when the bits of chain are later    */\
1476         /* linked together, the dups appear in the chain      */\
1477         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1478         trie->wordinfo[dupe].prev = curword;                    \
1479     } else {                                                    \
1480         /* we haven't inserted this word yet.                */ \
1481         trie->states[ state ].wordnum = curword;                \
1482     }                                                           \
1483 } STMT_END
1484
1485
1486 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1487      ( ( base + charid >=  ucharcount                                   \
1488          && base + charid < ubound                                      \
1489          && state == trie->trans[ base - ucharcount + charid ].check    \
1490          && trie->trans[ base - ucharcount + charid ].next )            \
1491            ? trie->trans[ base - ucharcount + charid ].next             \
1492            : ( state==1 ? special : 0 )                                 \
1493       )
1494
1495 #define MADE_TRIE       1
1496 #define MADE_JUMP_TRIE  2
1497 #define MADE_EXACT_TRIE 4
1498
1499 STATIC I32
1500 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1501 {
1502     dVAR;
1503     /* first pass, loop through and scan words */
1504     reg_trie_data *trie;
1505     HV *widecharmap = NULL;
1506     AV *revcharmap = newAV();
1507     regnode *cur;
1508     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1509     STRLEN len = 0;
1510     UV uvc = 0;
1511     U16 curword = 0;
1512     U32 next_alloc = 0;
1513     regnode *jumper = NULL;
1514     regnode *nextbranch = NULL;
1515     regnode *convert = NULL;
1516     U32 *prev_states; /* temp array mapping each state to previous one */
1517     /* we just use folder as a flag in utf8 */
1518     const U8 * folder = NULL;
1519
1520 #ifdef DEBUGGING
1521     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1522     AV *trie_words = NULL;
1523     /* along with revcharmap, this only used during construction but both are
1524      * useful during debugging so we store them in the struct when debugging.
1525      */
1526 #else
1527     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1528     STRLEN trie_charcount=0;
1529 #endif
1530     SV *re_trie_maxbuff;
1531     GET_RE_DEBUG_FLAGS_DECL;
1532
1533     PERL_ARGS_ASSERT_MAKE_TRIE;
1534 #ifndef DEBUGGING
1535     PERL_UNUSED_ARG(depth);
1536 #endif
1537
1538     switch (flags) {
1539         case EXACT: break;
1540         case EXACTFA:
1541         case EXACTFU_SS:
1542         case EXACTFU_TRICKYFOLD:
1543         case EXACTFU: folder = PL_fold_latin1; break;
1544         case EXACTF:  folder = PL_fold; break;
1545         case EXACTFL: folder = PL_fold_locale; break;
1546         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1547     }
1548
1549     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1550     trie->refcount = 1;
1551     trie->startstate = 1;
1552     trie->wordcount = word_count;
1553     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1554     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1555     if (flags == EXACT)
1556         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1557     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1558                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1559
1560     DEBUG_r({
1561         trie_words = newAV();
1562     });
1563
1564     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1565     if (!SvIOK(re_trie_maxbuff)) {
1566         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1567     }
1568     DEBUG_TRIE_COMPILE_r({
1569                 PerlIO_printf( Perl_debug_log,
1570                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1571                   (int)depth * 2 + 2, "",
1572                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1573                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1574                   (int)depth);
1575     });
1576
1577    /* Find the node we are going to overwrite */
1578     if ( first == startbranch && OP( last ) != BRANCH ) {
1579         /* whole branch chain */
1580         convert = first;
1581     } else {
1582         /* branch sub-chain */
1583         convert = NEXTOPER( first );
1584     }
1585
1586     /*  -- First loop and Setup --
1587
1588        We first traverse the branches and scan each word to determine if it
1589        contains widechars, and how many unique chars there are, this is
1590        important as we have to build a table with at least as many columns as we
1591        have unique chars.
1592
1593        We use an array of integers to represent the character codes 0..255
1594        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1595        native representation of the character value as the key and IV's for the
1596        coded index.
1597
1598        *TODO* If we keep track of how many times each character is used we can
1599        remap the columns so that the table compression later on is more
1600        efficient in terms of memory by ensuring the most common value is in the
1601        middle and the least common are on the outside.  IMO this would be better
1602        than a most to least common mapping as theres a decent chance the most
1603        common letter will share a node with the least common, meaning the node
1604        will not be compressible. With a middle is most common approach the worst
1605        case is when we have the least common nodes twice.
1606
1607      */
1608
1609     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1610         regnode *noper = NEXTOPER( cur );
1611         const U8 *uc = (U8*)STRING( noper );
1612         const U8 *e  = uc + STR_LEN( noper );
1613         STRLEN foldlen = 0;
1614         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1615         STRLEN skiplen = 0;
1616         const U8 *scan = (U8*)NULL;
1617         U32 wordlen      = 0;         /* required init */
1618         STRLEN chars = 0;
1619         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1620
1621         if (OP(noper) == NOTHING) {
1622             regnode *noper_next= regnext(noper);
1623             if (noper_next != tail && OP(noper_next) == flags) {
1624                 noper = noper_next;
1625                 uc= (U8*)STRING(noper);
1626                 e= uc + STR_LEN(noper);
1627                 trie->minlen= STR_LEN(noper);
1628             } else {
1629                 trie->minlen= 0;
1630                 continue;
1631             }
1632         }
1633
1634         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1635             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1636                                           regardless of encoding */
1637             if (OP( noper ) == EXACTFU_SS) {
1638                 /* false positives are ok, so just set this */
1639                 TRIE_BITMAP_SET(trie,0xDF);
1640             }
1641         }
1642         for ( ; uc < e ; uc += len ) {
1643             TRIE_CHARCOUNT(trie)++;
1644             TRIE_READ_CHAR;
1645             chars++;
1646             if ( uvc < 256 ) {
1647                 if ( folder ) {
1648                     U8 folded= folder[ (U8) uvc ];
1649                     if ( !trie->charmap[ folded ] ) {
1650                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
1651                         TRIE_STORE_REVCHAR( folded );
1652                     }
1653                 }
1654                 if ( !trie->charmap[ uvc ] ) {
1655                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1656                     TRIE_STORE_REVCHAR( uvc );
1657                 }
1658                 if ( set_bit ) {
1659                     /* store the codepoint in the bitmap, and its folded
1660                      * equivalent. */
1661                     TRIE_BITMAP_SET(trie, uvc);
1662
1663                     /* store the folded codepoint */
1664                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1665
1666                     if ( !UTF ) {
1667                         /* store first byte of utf8 representation of
1668                            variant codepoints */
1669                         if (! UNI_IS_INVARIANT(uvc)) {
1670                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1671                         }
1672                     }
1673                     set_bit = 0; /* We've done our bit :-) */
1674                 }
1675             } else {
1676                 SV** svpp;
1677                 if ( !widecharmap )
1678                     widecharmap = newHV();
1679
1680                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1681
1682                 if ( !svpp )
1683                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1684
1685                 if ( !SvTRUE( *svpp ) ) {
1686                     sv_setiv( *svpp, ++trie->uniquecharcount );
1687                     TRIE_STORE_REVCHAR(uvc);
1688                 }
1689             }
1690         }
1691         if( cur == first ) {
1692             trie->minlen = chars;
1693             trie->maxlen = chars;
1694         } else if (chars < trie->minlen) {
1695             trie->minlen = chars;
1696         } else if (chars > trie->maxlen) {
1697             trie->maxlen = chars;
1698         }
1699         if (OP( noper ) == EXACTFU_SS) {
1700             /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1701             if (trie->minlen > 1)
1702                 trie->minlen= 1;
1703         }
1704         if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1705             /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1706              *                - We assume that any such sequence might match a 2 byte string */
1707             if (trie->minlen > 2 )
1708                 trie->minlen= 2;
1709         }
1710
1711     } /* end first pass */
1712     DEBUG_TRIE_COMPILE_r(
1713         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1714                 (int)depth * 2 + 2,"",
1715                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1716                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1717                 (int)trie->minlen, (int)trie->maxlen )
1718     );
1719
1720     /*
1721         We now know what we are dealing with in terms of unique chars and
1722         string sizes so we can calculate how much memory a naive
1723         representation using a flat table  will take. If it's over a reasonable
1724         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1725         conservative but potentially much slower representation using an array
1726         of lists.
1727
1728         At the end we convert both representations into the same compressed
1729         form that will be used in regexec.c for matching with. The latter
1730         is a form that cannot be used to construct with but has memory
1731         properties similar to the list form and access properties similar
1732         to the table form making it both suitable for fast searches and
1733         small enough that its feasable to store for the duration of a program.
1734
1735         See the comment in the code where the compressed table is produced
1736         inplace from the flat tabe representation for an explanation of how
1737         the compression works.
1738
1739     */
1740
1741
1742     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1743     prev_states[1] = 0;
1744
1745     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1746         /*
1747             Second Pass -- Array Of Lists Representation
1748
1749             Each state will be represented by a list of charid:state records
1750             (reg_trie_trans_le) the first such element holds the CUR and LEN
1751             points of the allocated array. (See defines above).
1752
1753             We build the initial structure using the lists, and then convert
1754             it into the compressed table form which allows faster lookups
1755             (but cant be modified once converted).
1756         */
1757
1758         STRLEN transcount = 1;
1759
1760         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1761             "%*sCompiling trie using list compiler\n",
1762             (int)depth * 2 + 2, ""));
1763
1764         trie->states = (reg_trie_state *)
1765             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1766                                   sizeof(reg_trie_state) );
1767         TRIE_LIST_NEW(1);
1768         next_alloc = 2;
1769
1770         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1771
1772             regnode *noper   = NEXTOPER( cur );
1773             U8 *uc           = (U8*)STRING( noper );
1774             const U8 *e      = uc + STR_LEN( noper );
1775             U32 state        = 1;         /* required init */
1776             U16 charid       = 0;         /* sanity init */
1777             U8 *scan         = (U8*)NULL; /* sanity init */
1778             STRLEN foldlen   = 0;         /* required init */
1779             U32 wordlen      = 0;         /* required init */
1780             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1781             STRLEN skiplen   = 0;
1782
1783             if (OP(noper) == NOTHING) {
1784                 regnode *noper_next= regnext(noper);
1785                 if (noper_next != tail && OP(noper_next) == flags) {
1786                     noper = noper_next;
1787                     uc= (U8*)STRING(noper);
1788                     e= uc + STR_LEN(noper);
1789                 }
1790             }
1791
1792             if (OP(noper) != NOTHING) {
1793                 for ( ; uc < e ; uc += len ) {
1794
1795                     TRIE_READ_CHAR;
1796
1797                     if ( uvc < 256 ) {
1798                         charid = trie->charmap[ uvc ];
1799                     } else {
1800                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1801                         if ( !svpp ) {
1802                             charid = 0;
1803                         } else {
1804                             charid=(U16)SvIV( *svpp );
1805                         }
1806                     }
1807                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1808                     if ( charid ) {
1809
1810                         U16 check;
1811                         U32 newstate = 0;
1812
1813                         charid--;
1814                         if ( !trie->states[ state ].trans.list ) {
1815                             TRIE_LIST_NEW( state );
1816                         }
1817                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1818                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1819                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1820                                 break;
1821                             }
1822                         }
1823                         if ( ! newstate ) {
1824                             newstate = next_alloc++;
1825                             prev_states[newstate] = state;
1826                             TRIE_LIST_PUSH( state, charid, newstate );
1827                             transcount++;
1828                         }
1829                         state = newstate;
1830                     } else {
1831                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1832                     }
1833                 }
1834             }
1835             TRIE_HANDLE_WORD(state);
1836
1837         } /* end second pass */
1838
1839         /* next alloc is the NEXT state to be allocated */
1840         trie->statecount = next_alloc;
1841         trie->states = (reg_trie_state *)
1842             PerlMemShared_realloc( trie->states,
1843                                    next_alloc
1844                                    * sizeof(reg_trie_state) );
1845
1846         /* and now dump it out before we compress it */
1847         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1848                                                          revcharmap, next_alloc,
1849                                                          depth+1)
1850         );
1851
1852         trie->trans = (reg_trie_trans *)
1853             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1854         {
1855             U32 state;
1856             U32 tp = 0;
1857             U32 zp = 0;
1858
1859
1860             for( state=1 ; state < next_alloc ; state ++ ) {
1861                 U32 base=0;
1862
1863                 /*
1864                 DEBUG_TRIE_COMPILE_MORE_r(
1865                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1866                 );
1867                 */
1868
1869                 if (trie->states[state].trans.list) {
1870                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1871                     U16 maxid=minid;
1872                     U16 idx;
1873
1874                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1875                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1876                         if ( forid < minid ) {
1877                             minid=forid;
1878                         } else if ( forid > maxid ) {
1879                             maxid=forid;
1880                         }
1881                     }
1882                     if ( transcount < tp + maxid - minid + 1) {
1883                         transcount *= 2;
1884                         trie->trans = (reg_trie_trans *)
1885                             PerlMemShared_realloc( trie->trans,
1886                                                      transcount
1887                                                      * sizeof(reg_trie_trans) );
1888                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1889                     }
1890                     base = trie->uniquecharcount + tp - minid;
1891                     if ( maxid == minid ) {
1892                         U32 set = 0;
1893                         for ( ; zp < tp ; zp++ ) {
1894                             if ( ! trie->trans[ zp ].next ) {
1895                                 base = trie->uniquecharcount + zp - minid;
1896                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1897                                 trie->trans[ zp ].check = state;
1898                                 set = 1;
1899                                 break;
1900                             }
1901                         }
1902                         if ( !set ) {
1903                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1904                             trie->trans[ tp ].check = state;
1905                             tp++;
1906                             zp = tp;
1907                         }
1908                     } else {
1909                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1910                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1911                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1912                             trie->trans[ tid ].check = state;
1913                         }
1914                         tp += ( maxid - minid + 1 );
1915                     }
1916                     Safefree(trie->states[ state ].trans.list);
1917                 }
1918                 /*
1919                 DEBUG_TRIE_COMPILE_MORE_r(
1920                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1921                 );
1922                 */
1923                 trie->states[ state ].trans.base=base;
1924             }
1925             trie->lasttrans = tp + 1;
1926         }
1927     } else {
1928         /*
1929            Second Pass -- Flat Table Representation.
1930
1931            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1932            We know that we will need Charcount+1 trans at most to store the data
1933            (one row per char at worst case) So we preallocate both structures
1934            assuming worst case.
1935
1936            We then construct the trie using only the .next slots of the entry
1937            structs.
1938
1939            We use the .check field of the first entry of the node temporarily to
1940            make compression both faster and easier by keeping track of how many non
1941            zero fields are in the node.
1942
1943            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1944            transition.
1945
1946            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1947            number representing the first entry of the node, and state as a
1948            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1949            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1950            are 2 entrys per node. eg:
1951
1952              A B       A B
1953           1. 2 4    1. 3 7
1954           2. 0 3    3. 0 5
1955           3. 0 0    5. 0 0
1956           4. 0 0    7. 0 0
1957
1958            The table is internally in the right hand, idx form. However as we also
1959            have to deal with the states array which is indexed by nodenum we have to
1960            use TRIE_NODENUM() to convert.
1961
1962         */
1963         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1964             "%*sCompiling trie using table compiler\n",
1965             (int)depth * 2 + 2, ""));
1966
1967         trie->trans = (reg_trie_trans *)
1968             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1969                                   * trie->uniquecharcount + 1,
1970                                   sizeof(reg_trie_trans) );
1971         trie->states = (reg_trie_state *)
1972             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1973                                   sizeof(reg_trie_state) );
1974         next_alloc = trie->uniquecharcount + 1;
1975
1976
1977         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1978
1979             regnode *noper   = NEXTOPER( cur );
1980             const U8 *uc     = (U8*)STRING( noper );
1981             const U8 *e      = uc + STR_LEN( noper );
1982
1983             U32 state        = 1;         /* required init */
1984
1985             U16 charid       = 0;         /* sanity init */
1986             U32 accept_state = 0;         /* sanity init */
1987             U8 *scan         = (U8*)NULL; /* sanity init */
1988
1989             STRLEN foldlen   = 0;         /* required init */
1990             U32 wordlen      = 0;         /* required init */
1991             STRLEN skiplen   = 0;
1992             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1993
1994             if (OP(noper) == NOTHING) {
1995                 regnode *noper_next= regnext(noper);
1996                 if (noper_next != tail && OP(noper_next) == flags) {
1997                     noper = noper_next;
1998                     uc= (U8*)STRING(noper);
1999                     e= uc + STR_LEN(noper);
2000                 }
2001             }
2002
2003             if ( OP(noper) != NOTHING ) {
2004                 for ( ; uc < e ; uc += len ) {
2005
2006                     TRIE_READ_CHAR;
2007
2008                     if ( uvc < 256 ) {
2009                         charid = trie->charmap[ uvc ];
2010                     } else {
2011                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2012                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2013                     }
2014                     if ( charid ) {
2015                         charid--;
2016                         if ( !trie->trans[ state + charid ].next ) {
2017                             trie->trans[ state + charid ].next = next_alloc;
2018                             trie->trans[ state ].check++;
2019                             prev_states[TRIE_NODENUM(next_alloc)]
2020                                     = TRIE_NODENUM(state);
2021                             next_alloc += trie->uniquecharcount;
2022                         }
2023                         state = trie->trans[ state + charid ].next;
2024                     } else {
2025                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2026                     }
2027                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
2028                 }
2029             }
2030             accept_state = TRIE_NODENUM( state );
2031             TRIE_HANDLE_WORD(accept_state);
2032
2033         } /* end second pass */
2034
2035         /* and now dump it out before we compress it */
2036         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2037                                                           revcharmap,
2038                                                           next_alloc, depth+1));
2039
2040         {
2041         /*
2042            * Inplace compress the table.*
2043
2044            For sparse data sets the table constructed by the trie algorithm will
2045            be mostly 0/FAIL transitions or to put it another way mostly empty.
2046            (Note that leaf nodes will not contain any transitions.)
2047
2048            This algorithm compresses the tables by eliminating most such
2049            transitions, at the cost of a modest bit of extra work during lookup:
2050
2051            - Each states[] entry contains a .base field which indicates the
2052            index in the state[] array wheres its transition data is stored.
2053
2054            - If .base is 0 there are no valid transitions from that node.
2055
2056            - If .base is nonzero then charid is added to it to find an entry in
2057            the trans array.
2058
2059            -If trans[states[state].base+charid].check!=state then the
2060            transition is taken to be a 0/Fail transition. Thus if there are fail
2061            transitions at the front of the node then the .base offset will point
2062            somewhere inside the previous nodes data (or maybe even into a node
2063            even earlier), but the .check field determines if the transition is
2064            valid.
2065
2066            XXX - wrong maybe?
2067            The following process inplace converts the table to the compressed
2068            table: We first do not compress the root node 1,and mark all its
2069            .check pointers as 1 and set its .base pointer as 1 as well. This
2070            allows us to do a DFA construction from the compressed table later,
2071            and ensures that any .base pointers we calculate later are greater
2072            than 0.
2073
2074            - We set 'pos' to indicate the first entry of the second node.
2075
2076            - We then iterate over the columns of the node, finding the first and
2077            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2078            and set the .check pointers accordingly, and advance pos
2079            appropriately and repreat for the next node. Note that when we copy
2080            the next pointers we have to convert them from the original
2081            NODEIDX form to NODENUM form as the former is not valid post
2082            compression.
2083
2084            - If a node has no transitions used we mark its base as 0 and do not
2085            advance the pos pointer.
2086
2087            - If a node only has one transition we use a second pointer into the
2088            structure to fill in allocated fail transitions from other states.
2089            This pointer is independent of the main pointer and scans forward
2090            looking for null transitions that are allocated to a state. When it
2091            finds one it writes the single transition into the "hole".  If the
2092            pointer doesnt find one the single transition is appended as normal.
2093
2094            - Once compressed we can Renew/realloc the structures to release the
2095            excess space.
2096
2097            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2098            specifically Fig 3.47 and the associated pseudocode.
2099
2100            demq
2101         */
2102         const U32 laststate = TRIE_NODENUM( next_alloc );
2103         U32 state, charid;
2104         U32 pos = 0, zp=0;
2105         trie->statecount = laststate;
2106
2107         for ( state = 1 ; state < laststate ; state++ ) {
2108             U8 flag = 0;
2109             const U32 stateidx = TRIE_NODEIDX( state );
2110             const U32 o_used = trie->trans[ stateidx ].check;
2111             U32 used = trie->trans[ stateidx ].check;
2112             trie->trans[ stateidx ].check = 0;
2113
2114             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2115                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2116                     if ( trie->trans[ stateidx + charid ].next ) {
2117                         if (o_used == 1) {
2118                             for ( ; zp < pos ; zp++ ) {
2119                                 if ( ! trie->trans[ zp ].next ) {
2120                                     break;
2121                                 }
2122                             }
2123                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2124                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2125                             trie->trans[ zp ].check = state;
2126                             if ( ++zp > pos ) pos = zp;
2127                             break;
2128                         }
2129                         used--;
2130                     }
2131                     if ( !flag ) {
2132                         flag = 1;
2133                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2134                     }
2135                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2136                     trie->trans[ pos ].check = state;
2137                     pos++;
2138                 }
2139             }
2140         }
2141         trie->lasttrans = pos + 1;
2142         trie->states = (reg_trie_state *)
2143             PerlMemShared_realloc( trie->states, laststate
2144                                    * sizeof(reg_trie_state) );
2145         DEBUG_TRIE_COMPILE_MORE_r(
2146                 PerlIO_printf( Perl_debug_log,
2147                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2148                     (int)depth * 2 + 2,"",
2149                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2150                     (IV)next_alloc,
2151                     (IV)pos,
2152                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2153             );
2154
2155         } /* end table compress */
2156     }
2157     DEBUG_TRIE_COMPILE_MORE_r(
2158             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2159                 (int)depth * 2 + 2, "",
2160                 (UV)trie->statecount,
2161                 (UV)trie->lasttrans)
2162     );
2163     /* resize the trans array to remove unused space */
2164     trie->trans = (reg_trie_trans *)
2165         PerlMemShared_realloc( trie->trans, trie->lasttrans
2166                                * sizeof(reg_trie_trans) );
2167
2168     {   /* Modify the program and insert the new TRIE node */
2169         U8 nodetype =(U8)(flags & 0xFF);
2170         char *str=NULL;
2171
2172 #ifdef DEBUGGING
2173         regnode *optimize = NULL;
2174 #ifdef RE_TRACK_PATTERN_OFFSETS
2175
2176         U32 mjd_offset = 0;
2177         U32 mjd_nodelen = 0;
2178 #endif /* RE_TRACK_PATTERN_OFFSETS */
2179 #endif /* DEBUGGING */
2180         /*
2181            This means we convert either the first branch or the first Exact,
2182            depending on whether the thing following (in 'last') is a branch
2183            or not and whther first is the startbranch (ie is it a sub part of
2184            the alternation or is it the whole thing.)
2185            Assuming its a sub part we convert the EXACT otherwise we convert
2186            the whole branch sequence, including the first.
2187          */
2188         /* Find the node we are going to overwrite */
2189         if ( first != startbranch || OP( last ) == BRANCH ) {
2190             /* branch sub-chain */
2191             NEXT_OFF( first ) = (U16)(last - first);
2192 #ifdef RE_TRACK_PATTERN_OFFSETS
2193             DEBUG_r({
2194                 mjd_offset= Node_Offset((convert));
2195                 mjd_nodelen= Node_Length((convert));
2196             });
2197 #endif
2198             /* whole branch chain */
2199         }
2200 #ifdef RE_TRACK_PATTERN_OFFSETS
2201         else {
2202             DEBUG_r({
2203                 const  regnode *nop = NEXTOPER( convert );
2204                 mjd_offset= Node_Offset((nop));
2205                 mjd_nodelen= Node_Length((nop));
2206             });
2207         }
2208         DEBUG_OPTIMISE_r(
2209             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2210                 (int)depth * 2 + 2, "",
2211                 (UV)mjd_offset, (UV)mjd_nodelen)
2212         );
2213 #endif
2214         /* But first we check to see if there is a common prefix we can
2215            split out as an EXACT and put in front of the TRIE node.  */
2216         trie->startstate= 1;
2217         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2218             U32 state;
2219             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2220                 U32 ofs = 0;
2221                 I32 idx = -1;
2222                 U32 count = 0;
2223                 const U32 base = trie->states[ state ].trans.base;
2224
2225                 if ( trie->states[state].wordnum )
2226                         count = 1;
2227
2228                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2229                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2230                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2231                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2232                     {
2233                         if ( ++count > 1 ) {
2234                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2235                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2236                             if ( state == 1 ) break;
2237                             if ( count == 2 ) {
2238                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2239                                 DEBUG_OPTIMISE_r(
2240                                     PerlIO_printf(Perl_debug_log,
2241                                         "%*sNew Start State=%"UVuf" Class: [",
2242                                         (int)depth * 2 + 2, "",
2243                                         (UV)state));
2244                                 if (idx >= 0) {
2245                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2246                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2247
2248                                     TRIE_BITMAP_SET(trie,*ch);
2249                                     if ( folder )
2250                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2251                                     DEBUG_OPTIMISE_r(
2252                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2253                                     );
2254                                 }
2255                             }
2256                             TRIE_BITMAP_SET(trie,*ch);
2257                             if ( folder )
2258                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2259                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2260                         }
2261                         idx = ofs;
2262                     }
2263                 }
2264                 if ( count == 1 ) {
2265                     SV **tmp = av_fetch( revcharmap, idx, 0);
2266                     STRLEN len;
2267                     char *ch = SvPV( *tmp, len );
2268                     DEBUG_OPTIMISE_r({
2269                         SV *sv=sv_newmortal();
2270                         PerlIO_printf( Perl_debug_log,
2271                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2272                             (int)depth * 2 + 2, "",
2273                             (UV)state, (UV)idx,
2274                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2275                                 PL_colors[0], PL_colors[1],
2276                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2277                                 PERL_PV_ESCAPE_FIRSTCHAR
2278                             )
2279                         );
2280                     });
2281                     if ( state==1 ) {
2282                         OP( convert ) = nodetype;
2283                         str=STRING(convert);
2284                         STR_LEN(convert)=0;
2285                     }
2286                     STR_LEN(convert) += len;
2287                     while (len--)
2288                         *str++ = *ch++;
2289                 } else {
2290 #ifdef DEBUGGING
2291                     if (state>1)
2292                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2293 #endif
2294                     break;
2295                 }
2296             }
2297             trie->prefixlen = (state-1);
2298             if (str) {
2299                 regnode *n = convert+NODE_SZ_STR(convert);
2300                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2301                 trie->startstate = state;
2302                 trie->minlen -= (state - 1);
2303                 trie->maxlen -= (state - 1);
2304 #ifdef DEBUGGING
2305                /* At least the UNICOS C compiler choked on this
2306                 * being argument to DEBUG_r(), so let's just have
2307                 * it right here. */
2308                if (
2309 #ifdef PERL_EXT_RE_BUILD
2310                    1
2311 #else
2312                    DEBUG_r_TEST
2313 #endif
2314                    ) {
2315                    regnode *fix = convert;
2316                    U32 word = trie->wordcount;
2317                    mjd_nodelen++;
2318                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2319                    while( ++fix < n ) {
2320                        Set_Node_Offset_Length(fix, 0, 0);
2321                    }
2322                    while (word--) {
2323                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2324                        if (tmp) {
2325                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2326                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2327                            else
2328                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2329                        }
2330                    }
2331                }
2332 #endif
2333                 if (trie->maxlen) {
2334                     convert = n;
2335                 } else {
2336                     NEXT_OFF(convert) = (U16)(tail - convert);
2337                     DEBUG_r(optimize= n);
2338                 }
2339             }
2340         }
2341         if (!jumper)
2342             jumper = last;
2343         if ( trie->maxlen ) {
2344             NEXT_OFF( convert ) = (U16)(tail - convert);
2345             ARG_SET( convert, data_slot );
2346             /* Store the offset to the first unabsorbed branch in
2347                jump[0], which is otherwise unused by the jump logic.
2348                We use this when dumping a trie and during optimisation. */
2349             if (trie->jump)
2350                 trie->jump[0] = (U16)(nextbranch - convert);
2351
2352             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2353              *   and there is a bitmap
2354              *   and the first "jump target" node we found leaves enough room
2355              * then convert the TRIE node into a TRIEC node, with the bitmap
2356              * embedded inline in the opcode - this is hypothetically faster.
2357              */
2358             if ( !trie->states[trie->startstate].wordnum
2359                  && trie->bitmap
2360                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2361             {
2362                 OP( convert ) = TRIEC;
2363                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2364                 PerlMemShared_free(trie->bitmap);
2365                 trie->bitmap= NULL;
2366             } else
2367                 OP( convert ) = TRIE;
2368
2369             /* store the type in the flags */
2370             convert->flags = nodetype;
2371             DEBUG_r({
2372             optimize = convert
2373                       + NODE_STEP_REGNODE
2374                       + regarglen[ OP( convert ) ];
2375             });
2376             /* XXX We really should free up the resource in trie now,
2377                    as we won't use them - (which resources?) dmq */
2378         }
2379         /* needed for dumping*/
2380         DEBUG_r(if (optimize) {
2381             regnode *opt = convert;
2382
2383             while ( ++opt < optimize) {
2384                 Set_Node_Offset_Length(opt,0,0);
2385             }
2386             /*
2387                 Try to clean up some of the debris left after the
2388                 optimisation.
2389              */
2390             while( optimize < jumper ) {
2391                 mjd_nodelen += Node_Length((optimize));
2392                 OP( optimize ) = OPTIMIZED;
2393                 Set_Node_Offset_Length(optimize,0,0);
2394                 optimize++;
2395             }
2396             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2397         });
2398     } /* end node insert */
2399     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2400
2401     /*  Finish populating the prev field of the wordinfo array.  Walk back
2402      *  from each accept state until we find another accept state, and if
2403      *  so, point the first word's .prev field at the second word. If the
2404      *  second already has a .prev field set, stop now. This will be the
2405      *  case either if we've already processed that word's accept state,
2406      *  or that state had multiple words, and the overspill words were
2407      *  already linked up earlier.
2408      */
2409     {
2410         U16 word;
2411         U32 state;
2412         U16 prev;
2413
2414         for (word=1; word <= trie->wordcount; word++) {
2415             prev = 0;
2416             if (trie->wordinfo[word].prev)
2417                 continue;
2418             state = trie->wordinfo[word].accept;
2419             while (state) {
2420                 state = prev_states[state];
2421                 if (!state)
2422                     break;
2423                 prev = trie->states[state].wordnum;
2424                 if (prev)
2425                     break;
2426             }
2427             trie->wordinfo[word].prev = prev;
2428         }
2429         Safefree(prev_states);
2430     }
2431
2432
2433     /* and now dump out the compressed format */
2434     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2435
2436     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2437 #ifdef DEBUGGING
2438     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2439     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2440 #else
2441     SvREFCNT_dec(revcharmap);
2442 #endif
2443     return trie->jump
2444            ? MADE_JUMP_TRIE
2445            : trie->startstate>1
2446              ? MADE_EXACT_TRIE
2447              : MADE_TRIE;
2448 }
2449
2450 STATIC void
2451 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2452 {
2453 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2454
2455    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2456    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2457    ISBN 0-201-10088-6
2458
2459    We find the fail state for each state in the trie, this state is the longest proper
2460    suffix of the current state's 'word' that is also a proper prefix of another word in our
2461    trie. State 1 represents the word '' and is thus the default fail state. This allows
2462    the DFA not to have to restart after its tried and failed a word at a given point, it
2463    simply continues as though it had been matching the other word in the first place.
2464    Consider
2465       'abcdgu'=~/abcdefg|cdgu/
2466    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2467    fail, which would bring us to the state representing 'd' in the second word where we would
2468    try 'g' and succeed, proceeding to match 'cdgu'.
2469  */
2470  /* add a fail transition */
2471     const U32 trie_offset = ARG(source);
2472     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2473     U32 *q;
2474     const U32 ucharcount = trie->uniquecharcount;
2475     const U32 numstates = trie->statecount;
2476     const U32 ubound = trie->lasttrans + ucharcount;
2477     U32 q_read = 0;
2478     U32 q_write = 0;
2479     U32 charid;
2480     U32 base = trie->states[ 1 ].trans.base;
2481     U32 *fail;
2482     reg_ac_data *aho;
2483     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2484     GET_RE_DEBUG_FLAGS_DECL;
2485
2486     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2487 #ifndef DEBUGGING
2488     PERL_UNUSED_ARG(depth);
2489 #endif
2490
2491
2492     ARG_SET( stclass, data_slot );
2493     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2494     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2495     aho->trie=trie_offset;
2496     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2497     Copy( trie->states, aho->states, numstates, reg_trie_state );
2498     Newxz( q, numstates, U32);
2499     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2500     aho->refcount = 1;
2501     fail = aho->fail;
2502     /* initialize fail[0..1] to be 1 so that we always have
2503        a valid final fail state */
2504     fail[ 0 ] = fail[ 1 ] = 1;
2505
2506     for ( charid = 0; charid < ucharcount ; charid++ ) {
2507         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2508         if ( newstate ) {
2509             q[ q_write ] = newstate;
2510             /* set to point at the root */
2511             fail[ q[ q_write++ ] ]=1;
2512         }
2513     }
2514     while ( q_read < q_write) {
2515         const U32 cur = q[ q_read++ % numstates ];
2516         base = trie->states[ cur ].trans.base;
2517
2518         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2519             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2520             if (ch_state) {
2521                 U32 fail_state = cur;
2522                 U32 fail_base;
2523                 do {
2524                     fail_state = fail[ fail_state ];
2525                     fail_base = aho->states[ fail_state ].trans.base;
2526                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2527
2528                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2529                 fail[ ch_state ] = fail_state;
2530                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2531                 {
2532                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2533                 }
2534                 q[ q_write++ % numstates] = ch_state;
2535             }
2536         }
2537     }
2538     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2539        when we fail in state 1, this allows us to use the
2540        charclass scan to find a valid start char. This is based on the principle
2541        that theres a good chance the string being searched contains lots of stuff
2542        that cant be a start char.
2543      */
2544     fail[ 0 ] = fail[ 1 ] = 0;
2545     DEBUG_TRIE_COMPILE_r({
2546         PerlIO_printf(Perl_debug_log,
2547                       "%*sStclass Failtable (%"UVuf" states): 0",
2548                       (int)(depth * 2), "", (UV)numstates
2549         );
2550         for( q_read=1; q_read<numstates; q_read++ ) {
2551             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2552         }
2553         PerlIO_printf(Perl_debug_log, "\n");
2554     });
2555     Safefree(q);
2556     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2557 }
2558
2559
2560 /*
2561  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2562  * These need to be revisited when a newer toolchain becomes available.
2563  */
2564 #if defined(__sparc64__) && defined(__GNUC__)
2565 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2566 #       undef  SPARC64_GCC_WORKAROUND
2567 #       define SPARC64_GCC_WORKAROUND 1
2568 #   endif
2569 #endif
2570
2571 #define DEBUG_PEEP(str,scan,depth) \
2572     DEBUG_OPTIMISE_r({if (scan){ \
2573        SV * const mysv=sv_newmortal(); \
2574        regnode *Next = regnext(scan); \
2575        regprop(RExC_rx, mysv, scan); \
2576        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2577        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2578        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2579    }});
2580
2581
2582 /* The below joins as many adjacent EXACTish nodes as possible into a single
2583  * one, and looks for problematic sequences of characters whose folds vs.
2584  * non-folds have sufficiently different lengths, that the optimizer would be
2585  * fooled into rejecting legitimate matches of them, and the trie construction
2586  * code needs to handle specially.  The joining is only done if:
2587  * 1) there is room in the current conglomerated node to entirely contain the
2588  *    next one.
2589  * 2) they are the exact same node type
2590  *
2591  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2592  * these get optimized out
2593  *
2594  * If there are problematic code sequences, *min_subtract is set to the delta
2595  * that the minimum size of the node can be less than its actual size.  And,
2596  * the node type of the result is changed to reflect that it contains these
2597  * sequences.
2598  *
2599  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2600  * and contains LATIN SMALL LETTER SHARP S
2601  *
2602  * This is as good a place as any to discuss the design of handling these
2603  * problematic sequences.  It's been wrong in Perl for a very long time.  There
2604  * are three code points currently in Unicode whose folded lengths differ so
2605  * much from the un-folded lengths that it causes problems for the optimizer
2606  * and trie construction.  Why only these are problematic, and not others where
2607  * lengths also differ is something I (khw) do not understand.  New versions of
2608  * Unicode might add more such code points.  Hopefully the logic in
2609  * fold_grind.t that figures out what to test (in part by verifying that each
2610  * size-combination gets tested) will catch any that do come along, so they can
2611  * be added to the special handling below.  The chances of new ones are
2612  * actually rather small, as most, if not all, of the world's scripts that have
2613  * casefolding have already been encoded by Unicode.  Also, a number of
2614  * Unicode's decisions were made to allow compatibility with pre-existing
2615  * standards, and almost all of those have already been dealt with.  These
2616  * would otherwise be the most likely candidates for generating further tricky
2617  * sequences.  In other words, Unicode by itself is unlikely to add new ones
2618  * unless it is for compatibility with pre-existing standards, and there aren't
2619  * many of those left.
2620  *
2621  * The previous designs for dealing with these involved assigning a special
2622  * node for them.  This approach doesn't work, as evidenced by this example:
2623  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2624  * Both these fold to "sss", but if the pattern is parsed to create a node
2625  * that would match just the \xDF, it won't be able to handle the case where a
2626  * successful match would have to cross the node's boundary.  The new approach
2627  * that hopefully generally solves the problem generates an EXACTFU_SS node
2628  * that is "sss".
2629  *
2630  * There are a number of components to the approach (a lot of work for just
2631  * three code points!):
2632  * 1)   This routine examines each EXACTFish node that could contain the
2633  *      problematic sequences.  It returns in *min_subtract how much to
2634  *      subtract from the the actual length of the string to get a real minimum
2635  *      for one that could match it.  This number is usually 0 except for the
2636  *      problematic sequences.  This delta is used by the caller to adjust the
2637  *      min length of the match, and the delta between min and max, so that the
2638  *      optimizer doesn't reject these possibilities based on size constraints.
2639  * 2)   These sequences require special handling by the trie code, so this code
2640  *      changes the joined node type to special ops: EXACTFU_TRICKYFOLD and
2641  *      EXACTFU_SS.
2642  * 3)   This is sufficient for the two Greek sequences (described below), but
2643  *      the one involving the Sharp s (\xDF) needs more.  The node type
2644  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2645  *      sequence in it.  For non-UTF-8 patterns and strings, this is the only
2646  *      case where there is a possible fold length change.  That means that a
2647  *      regular EXACTFU node without UTF-8 involvement doesn't have to concern
2648  *      itself with length changes, and so can be processed faster.  regexec.c
2649  *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
2650  *      is pre-folded by regcomp.c.  This saves effort in regex matching.
2651  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2652  *      fold of the MICRO SIGN requires UTF-8.  Also what EXACTF and EXACTFL
2653  *      nodes fold to isn't known until runtime.  The fold possibilities for
2654  *      the non-UTF8 patterns are quite simple, except for the sharp s.  All
2655  *      the ones that don't involve a UTF-8 target string are members of a
2656  *      fold-pair, and arrays are set up for all of them so that the other
2657  *      member of the pair can be found quickly.  Code elsewhere in this file
2658  *      makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even
2659  *      if the pattern isn't UTF-8.  This avoids the issues described in the
2660  *      next item.
2661  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2662  *      'ss' or not is not knowable at compile time.  It will match iff the
2663  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2664  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2665  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2666  *      described in item 3).  An assumption that the optimizer part of
2667  *      regexec.c (probably unwittingly) makes is that a character in the
2668  *      pattern corresponds to at most a single character in the target string.
2669  *      (And I do mean character, and not byte here, unlike other parts of the
2670  *      documentation that have never been updated to account for multibyte
2671  *      Unicode.)  This assumption is wrong only in this case, as all other
2672  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2673  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2674  *      reluctant to try to change this assumption, so instead the code punts.
2675  *      This routine examines EXACTF nodes for the sharp s, and returns a
2676  *      boolean indicating whether or not the node is an EXACTF node that
2677  *      contains a sharp s.  When it is true, the caller sets a flag that later
2678  *      causes the optimizer in this file to not set values for the floating
2679  *      and fixed string lengths, and thus avoids the optimizer code in
2680  *      regexec.c that makes the invalid assumption.  Thus, there is no
2681  *      optimization based on string lengths for EXACTF nodes that contain the
2682  *      sharp s.  This only happens for /id rules (which means the pattern
2683  *      isn't in UTF-8).
2684  */
2685
2686 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2687     if (PL_regkind[OP(scan)] == EXACT) \
2688         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2689
2690 STATIC U32
2691 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2692     /* Merge several consecutive EXACTish nodes into one. */
2693     regnode *n = regnext(scan);
2694     U32 stringok = 1;
2695     regnode *next = scan + NODE_SZ_STR(scan);
2696     U32 merged = 0;
2697     U32 stopnow = 0;
2698 #ifdef DEBUGGING
2699     regnode *stop = scan;
2700     GET_RE_DEBUG_FLAGS_DECL;
2701 #else
2702     PERL_UNUSED_ARG(depth);
2703 #endif
2704
2705     PERL_ARGS_ASSERT_JOIN_EXACT;
2706 #ifndef EXPERIMENTAL_INPLACESCAN
2707     PERL_UNUSED_ARG(flags);
2708     PERL_UNUSED_ARG(val);
2709 #endif
2710     DEBUG_PEEP("join",scan,depth);
2711
2712     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2713      * EXACT ones that are mergeable to the current one. */
2714     while (n
2715            && (PL_regkind[OP(n)] == NOTHING
2716                || (stringok && OP(n) == OP(scan)))
2717            && NEXT_OFF(n)
2718            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2719     {
2720
2721         if (OP(n) == TAIL || n > next)
2722             stringok = 0;
2723         if (PL_regkind[OP(n)] == NOTHING) {
2724             DEBUG_PEEP("skip:",n,depth);
2725             NEXT_OFF(scan) += NEXT_OFF(n);
2726             next = n + NODE_STEP_REGNODE;
2727 #ifdef DEBUGGING
2728             if (stringok)
2729                 stop = n;
2730 #endif
2731             n = regnext(n);
2732         }
2733         else if (stringok) {
2734             const unsigned int oldl = STR_LEN(scan);
2735             regnode * const nnext = regnext(n);
2736
2737             /* XXX I (khw) kind of doubt that this works on platforms where
2738              * U8_MAX is above 255 because of lots of other assumptions */
2739             if (oldl + STR_LEN(n) > U8_MAX)
2740                 break;
2741
2742             DEBUG_PEEP("merg",n,depth);
2743             merged++;
2744
2745             NEXT_OFF(scan) += NEXT_OFF(n);
2746             STR_LEN(scan) += STR_LEN(n);
2747             next = n + NODE_SZ_STR(n);
2748             /* Now we can overwrite *n : */
2749             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2750 #ifdef DEBUGGING
2751             stop = next - 1;
2752 #endif
2753             n = nnext;
2754             if (stopnow) break;
2755         }
2756
2757 #ifdef EXPERIMENTAL_INPLACESCAN
2758         if (flags && !NEXT_OFF(n)) {
2759             DEBUG_PEEP("atch", val, depth);
2760             if (reg_off_by_arg[OP(n)]) {
2761                 ARG_SET(n, val - n);
2762             }
2763             else {
2764                 NEXT_OFF(n) = val - n;
2765             }
2766             stopnow = 1;
2767         }
2768 #endif
2769     }
2770
2771     *min_subtract = 0;
2772     *has_exactf_sharp_s = FALSE;
2773
2774     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2775      * can now analyze for sequences of problematic code points.  (Prior to
2776      * this final joining, sequences could have been split over boundaries, and
2777      * hence missed).  The sequences only happen in folding, hence for any
2778      * non-EXACT EXACTish node */
2779     if (OP(scan) != EXACT) {
2780         U8 *s;
2781         U8 * s0 = (U8*) STRING(scan);
2782         U8 * const s_end = s0 + STR_LEN(scan);
2783
2784         /* The below is perhaps overboard, but this allows us to save a test
2785          * each time through the loop at the expense of a mask.  This is
2786          * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2787          * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
2788          * This uses an exclusive 'or' to find that bit and then inverts it to
2789          * form a mask, with just a single 0, in the bit position where 'S' and
2790          * 's' differ. */
2791         const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2792         const U8 s_masked = 's' & S_or_s_mask;
2793
2794         /* One pass is made over the node's string looking for all the
2795          * possibilities.  to avoid some tests in the loop, there are two main
2796          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2797          * non-UTF-8 */
2798         if (UTF) {
2799
2800             /* There are two problematic Greek code points in Unicode
2801              * casefolding
2802              *
2803              * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2804              * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2805              *
2806              * which casefold to
2807              *
2808              * Unicode                      UTF-8
2809              *
2810              * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2811              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2812              *
2813              * This means that in case-insensitive matching (or "loose
2814              * matching", as Unicode calls it), an EXACTF of length six (the
2815              * UTF-8 encoded byte length of the above casefolded versions) can
2816              * match a target string of length two (the byte length of UTF-8
2817              * encoded U+0390 or U+03B0).  This would rather mess up the
2818              * minimum length computation.  (there are other code points that
2819              * also fold to these two sequences, but the delta is smaller)
2820              *
2821              * If these sequences are found, the minimum length is decreased by
2822              * four (six minus two).
2823              *
2824              * Similarly, 'ss' may match the single char and byte LATIN SMALL
2825              * LETTER SHARP S.  We decrease the min length by 1 for each
2826              * occurrence of 'ss' found */
2827
2828 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2829 #           define U390_first_byte 0xb4
2830             const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2831 #           define U3B0_first_byte 0xb5
2832             const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2833 #else
2834 #           define U390_first_byte 0xce
2835             const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2836 #           define U3B0_first_byte 0xcf
2837             const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2838 #endif
2839             const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2840                                                  yields a net of 0 */
2841             /* Examine the string for one of the problematic sequences */
2842             for (s = s0;
2843                  s < s_end - 1; /* Can stop 1 before the end, as minimum length
2844                                  * sequence we are looking for is 2 */
2845                  s += UTF8SKIP(s))
2846             {
2847
2848                 /* Look for the first byte in each problematic sequence */
2849                 switch (*s) {
2850                     /* We don't have to worry about other things that fold to
2851                      * 's' (such as the long s, U+017F), as all above-latin1
2852                      * code points have been pre-folded */
2853                     case 's':
2854                     case 'S':
2855
2856                         /* Current character is an 's' or 'S'.  If next one is
2857                          * as well, we have the dreaded sequence */
2858                         if (((*(s+1) & S_or_s_mask) == s_masked)
2859                             /* These two node types don't have special handling
2860                              * for 'ss' */
2861                             && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2862                         {
2863                             *min_subtract += 1;
2864                             OP(scan) = EXACTFU_SS;
2865                             s++;    /* No need to look at this character again */
2866                         }
2867                         break;
2868
2869                     case U390_first_byte:
2870                         if (s_end - s >= len
2871
2872                             /* The 1's are because are skipping comparing the
2873                              * first byte */
2874                             && memEQ(s + 1, U390_tail, len - 1))
2875                         {
2876                             goto greek_sequence;
2877                         }
2878                         break;
2879
2880                     case U3B0_first_byte:
2881                         if (! (s_end - s >= len
2882                                && memEQ(s + 1, U3B0_tail, len - 1)))
2883                         {
2884                             break;
2885                         }
2886                       greek_sequence:
2887                         *min_subtract += 4;
2888
2889                         /* This requires special handling by trie's, so change
2890                          * the node type to indicate this.  If EXACTFA and
2891                          * EXACTFL were ever to be handled by trie's, this
2892                          * would have to be changed.  If this node has already
2893                          * been changed to EXACTFU_SS in this loop, leave it as
2894                          * is.  (I (khw) think it doesn't matter in regexec.c
2895                          * for UTF patterns, but no need to change it */
2896                         if (OP(scan) == EXACTFU) {
2897                             OP(scan) = EXACTFU_TRICKYFOLD;
2898                         }
2899                         s += 6; /* We already know what this sequence is.  Skip
2900                                    the rest of it */
2901                         break;
2902                 }
2903             }
2904         }
2905         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2906
2907             /* Here, the pattern is not UTF-8.  We need to look only for the
2908              * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2909              * in the final position.  Otherwise we can stop looking 1 byte
2910              * earlier because have to find both the first and second 's' */
2911             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2912
2913             for (s = s0; s < upper; s++) {
2914                 switch (*s) {
2915                     case 'S':
2916                     case 's':
2917                         if (s_end - s > 1
2918                             && ((*(s+1) & S_or_s_mask) == s_masked))
2919                         {
2920                             *min_subtract += 1;
2921
2922                             /* EXACTF nodes need to know that the minimum
2923                              * length changed so that a sharp s in the string
2924                              * can match this ss in the pattern, but they
2925                              * remain EXACTF nodes, as they won't match this
2926                              * unless the target string is is UTF-8, which we
2927                              * don't know until runtime */
2928                             if (OP(scan) != EXACTF) {
2929                                 OP(scan) = EXACTFU_SS;
2930                             }
2931                             s++;
2932                         }
2933                         break;
2934                     case LATIN_SMALL_LETTER_SHARP_S:
2935                         if (OP(scan) == EXACTF) {
2936                             *has_exactf_sharp_s = TRUE;
2937                         }
2938                         break;
2939                 }
2940             }
2941         }
2942     }
2943
2944 #ifdef DEBUGGING
2945     /* Allow dumping but overwriting the collection of skipped
2946      * ops and/or strings with fake optimized ops */
2947     n = scan + NODE_SZ_STR(scan);
2948     while (n <= stop) {
2949         OP(n) = OPTIMIZED;
2950         FLAGS(n) = 0;
2951         NEXT_OFF(n) = 0;
2952         n++;
2953     }
2954 #endif
2955     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2956     return stopnow;
2957 }
2958
2959 /* REx optimizer.  Converts nodes into quicker variants "in place".
2960    Finds fixed substrings.  */
2961
2962 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2963    to the position after last scanned or to NULL. */
2964
2965 #define INIT_AND_WITHP \
2966     assert(!and_withp); \
2967     Newx(and_withp,1,struct regnode_charclass_class); \
2968     SAVEFREEPV(and_withp)
2969
2970 /* this is a chain of data about sub patterns we are processing that
2971    need to be handled separately/specially in study_chunk. Its so
2972    we can simulate recursion without losing state.  */
2973 struct scan_frame;
2974 typedef struct scan_frame {
2975     regnode *last;  /* last node to process in this frame */
2976     regnode *next;  /* next node to process when last is reached */
2977     struct scan_frame *prev; /*previous frame*/
2978     I32 stop; /* what stopparen do we use */
2979 } scan_frame;
2980
2981
2982 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2983
2984 #define CASE_SYNST_FNC(nAmE)                                       \
2985 case nAmE:                                                         \
2986     if (flags & SCF_DO_STCLASS_AND) {                              \
2987             for (value = 0; value < 256; value++)                  \
2988                 if (!is_ ## nAmE ## _cp(value))                       \
2989                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2990     }                                                              \
2991     else {                                                         \
2992             for (value = 0; value < 256; value++)                  \
2993                 if (is_ ## nAmE ## _cp(value))                        \
2994                     ANYOF_BITMAP_SET(data->start_class, value);    \
2995     }                                                              \
2996     break;                                                         \
2997 case N ## nAmE:                                                    \
2998     if (flags & SCF_DO_STCLASS_AND) {                              \
2999             for (value = 0; value < 256; value++)                   \
3000                 if (is_ ## nAmE ## _cp(value))                         \
3001                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
3002     }                                                               \
3003     else {                                                          \
3004             for (value = 0; value < 256; value++)                   \
3005                 if (!is_ ## nAmE ## _cp(value))                        \
3006                     ANYOF_BITMAP_SET(data->start_class, value);     \
3007     }                                                               \
3008     break
3009
3010
3011
3012 STATIC I32
3013 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3014                         I32 *minlenp, I32 *deltap,
3015                         regnode *last,
3016                         scan_data_t *data,
3017                         I32 stopparen,
3018                         U8* recursed,
3019                         struct regnode_charclass_class *and_withp,
3020                         U32 flags, U32 depth)
3021                         /* scanp: Start here (read-write). */
3022                         /* deltap: Write maxlen-minlen here. */
3023                         /* last: Stop before this one. */
3024                         /* data: string data about the pattern */
3025                         /* stopparen: treat close N as END */
3026                         /* recursed: which subroutines have we recursed into */
3027                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3028 {
3029     dVAR;
3030     I32 min = 0, pars = 0, code;
3031     regnode *scan = *scanp, *next;
3032     I32 delta = 0;
3033     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3034     int is_inf_internal = 0;            /* The studied chunk is infinite */
3035     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3036     scan_data_t data_fake;
3037     SV *re_trie_maxbuff = NULL;
3038     regnode *first_non_open = scan;
3039     I32 stopmin = I32_MAX;
3040     scan_frame *frame = NULL;
3041     GET_RE_DEBUG_FLAGS_DECL;
3042
3043     PERL_ARGS_ASSERT_STUDY_CHUNK;
3044
3045 #ifdef DEBUGGING
3046     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3047 #endif
3048
3049     if ( depth == 0 ) {
3050         while (first_non_open && OP(first_non_open) == OPEN)
3051             first_non_open=regnext(first_non_open);
3052     }
3053
3054
3055   fake_study_recurse:
3056     while ( scan && OP(scan) != END && scan < last ){
3057         UV min_subtract = 0;    /* How much to subtract from the minimum node
3058                                    length to get a real minimum (because the
3059                                    folded version may be shorter) */
3060         bool has_exactf_sharp_s = FALSE;
3061         /* Peephole optimizer: */
3062         DEBUG_STUDYDATA("Peep:", data,depth);
3063         DEBUG_PEEP("Peep",scan,depth);
3064
3065         /* Its not clear to khw or hv why this is done here, and not in the
3066          * clauses that deal with EXACT nodes.  khw's guess is that it's
3067          * because of a previous design */
3068         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3069
3070         /* Follow the next-chain of the current node and optimize
3071            away all the NOTHINGs from it.  */
3072         if (OP(scan) != CURLYX) {
3073             const int max = (reg_off_by_arg[OP(scan)]
3074                        ? I32_MAX
3075                        /* I32 may be smaller than U16 on CRAYs! */
3076                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3077             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3078             int noff;
3079             regnode *n = scan;
3080
3081             /* Skip NOTHING and LONGJMP. */
3082             while ((n = regnext(n))
3083                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3084                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3085                    && off + noff < max)
3086                 off += noff;
3087             if (reg_off_by_arg[OP(scan)])
3088                 ARG(scan) = off;
3089             else
3090                 NEXT_OFF(scan) = off;
3091         }
3092
3093
3094
3095         /* The principal pseudo-switch.  Cannot be a switch, since we
3096            look into several different things.  */
3097         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3098                    || OP(scan) == IFTHEN) {
3099             next = regnext(scan);
3100             code = OP(scan);
3101             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3102
3103             if (OP(next) == code || code == IFTHEN) {
3104                 /* NOTE - There is similar code to this block below for handling
3105                    TRIE nodes on a re-study.  If you change stuff here check there
3106                    too. */
3107                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3108                 struct regnode_charclass_class accum;
3109                 regnode * const startbranch=scan;
3110
3111                 if (flags & SCF_DO_SUBSTR)
3112                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3113                 if (flags & SCF_DO_STCLASS)
3114                     cl_init_zero(pRExC_state, &accum);
3115
3116                 while (OP(scan) == code) {
3117                     I32 deltanext, minnext, f = 0, fake;
3118                     struct regnode_charclass_class this_class;
3119
3120                     num++;
3121                     data_fake.flags = 0;
3122                     if (data) {
3123                         data_fake.whilem_c = data->whilem_c;
3124                         data_fake.last_closep = data->last_closep;
3125                     }
3126                     else
3127                         data_fake.last_closep = &fake;
3128
3129                     data_fake.pos_delta = delta;
3130                     next = regnext(scan);
3131                     scan = NEXTOPER(scan);
3132                     if (code != BRANCH)
3133                         scan = NEXTOPER(scan);
3134                     if (flags & SCF_DO_STCLASS) {
3135                         cl_init(pRExC_state, &this_class);
3136                         data_fake.start_class = &this_class;
3137                         f = SCF_DO_STCLASS_AND;
3138                     }
3139                     if (flags & SCF_WHILEM_VISITED_POS)
3140                         f |= SCF_WHILEM_VISITED_POS;
3141
3142                     /* we suppose the run is continuous, last=next...*/
3143                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3144                                           next, &data_fake,
3145                                           stopparen, recursed, NULL, f,depth+1);
3146                     if (min1 > minnext)
3147                         min1 = minnext;
3148                     if (max1 < minnext + deltanext)
3149                         max1 = minnext + deltanext;
3150                     if (deltanext == I32_MAX)
3151                         is_inf = is_inf_internal = 1;
3152                     scan = next;
3153                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3154                         pars++;
3155                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3156                         if ( stopmin > minnext)
3157                             stopmin = min + min1;
3158                         flags &= ~SCF_DO_SUBSTR;
3159                         if (data)
3160                             data->flags |= SCF_SEEN_ACCEPT;
3161                     }
3162                     if (data) {
3163                         if (data_fake.flags & SF_HAS_EVAL)
3164                             data->flags |= SF_HAS_EVAL;
3165                         data->whilem_c = data_fake.whilem_c;
3166                     }
3167                     if (flags & SCF_DO_STCLASS)
3168                         cl_or(pRExC_state, &accum, &this_class);
3169                 }
3170                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3171                     min1 = 0;
3172                 if (flags & SCF_DO_SUBSTR) {
3173                     data->pos_min += min1;
3174                     data->pos_delta += max1 - min1;
3175                     if (max1 != min1 || is_inf)
3176                         data->longest = &(data->longest_float);
3177                 }
3178                 min += min1;
3179                 delta += max1 - min1;
3180                 if (flags & SCF_DO_STCLASS_OR) {
3181                     cl_or(pRExC_state, data->start_class, &accum);
3182                     if (min1) {
3183                         cl_and(data->start_class, and_withp);
3184                         flags &= ~SCF_DO_STCLASS;
3185                     }
3186                 }
3187                 else if (flags & SCF_DO_STCLASS_AND) {
3188                     if (min1) {
3189                         cl_and(data->start_class, &accum);
3190                         flags &= ~SCF_DO_STCLASS;
3191                     }
3192                     else {
3193                         /* Switch to OR mode: cache the old value of
3194                          * data->start_class */
3195                         INIT_AND_WITHP;
3196                         StructCopy(data->start_class, and_withp,
3197                                    struct regnode_charclass_class);
3198                         flags &= ~SCF_DO_STCLASS_AND;
3199                         StructCopy(&accum, data->start_class,
3200                                    struct regnode_charclass_class);
3201                         flags |= SCF_DO_STCLASS_OR;
3202                         data->start_class->flags |= ANYOF_EOS;
3203                     }
3204                 }
3205
3206                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3207                 /* demq.
3208
3209                    Assuming this was/is a branch we are dealing with: 'scan' now
3210                    points at the item that follows the branch sequence, whatever
3211                    it is. We now start at the beginning of the sequence and look
3212                    for subsequences of
3213
3214                    BRANCH->EXACT=>x1
3215                    BRANCH->EXACT=>x2
3216                    tail
3217
3218                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3219
3220                    If we can find such a subsequence we need to turn the first
3221                    element into a trie and then add the subsequent branch exact
3222                    strings to the trie.
3223
3224                    We have two cases
3225
3226                      1. patterns where the whole set of branches can be converted.
3227
3228                      2. patterns where only a subset can be converted.
3229
3230                    In case 1 we can replace the whole set with a single regop
3231                    for the trie. In case 2 we need to keep the start and end
3232                    branches so
3233
3234                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3235                      becomes BRANCH TRIE; BRANCH X;
3236
3237                   There is an additional case, that being where there is a
3238                   common prefix, which gets split out into an EXACT like node
3239                   preceding the TRIE node.
3240
3241                   If x(1..n)==tail then we can do a simple trie, if not we make
3242                   a "jump" trie, such that when we match the appropriate word
3243                   we "jump" to the appropriate tail node. Essentially we turn
3244                   a nested if into a case structure of sorts.
3245
3246                 */
3247
3248                     int made=0;
3249                     if (!re_trie_maxbuff) {
3250                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3251                         if (!SvIOK(re_trie_maxbuff))
3252                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3253                     }
3254                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3255                         regnode *cur;
3256                         regnode *first = (regnode *)NULL;
3257                         regnode *last = (regnode *)NULL;
3258                         regnode *tail = scan;
3259                         U8 trietype = 0;
3260                         U32 count=0;
3261
3262 #ifdef DEBUGGING
3263                         SV * const mysv = sv_newmortal();       /* for dumping */
3264 #endif
3265                         /* var tail is used because there may be a TAIL
3266                            regop in the way. Ie, the exacts will point to the
3267                            thing following the TAIL, but the last branch will
3268                            point at the TAIL. So we advance tail. If we
3269                            have nested (?:) we may have to move through several
3270                            tails.
3271                          */
3272
3273                         while ( OP( tail ) == TAIL ) {
3274                             /* this is the TAIL generated by (?:) */
3275                             tail = regnext( tail );
3276                         }
3277
3278
3279                         DEBUG_TRIE_COMPILE_r({
3280                             regprop(RExC_rx, mysv, tail );
3281                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3282                                 (int)depth * 2 + 2, "",
3283                                 "Looking for TRIE'able sequences. Tail node is: ",
3284                                 SvPV_nolen_const( mysv )
3285                             );
3286                         });
3287
3288                         /*
3289
3290                             Step through the branches
3291                                 cur represents each branch,
3292                                 noper is the first thing to be matched as part of that branch
3293                                 noper_next is the regnext() of that node.
3294
3295                             We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3296                             via a "jump trie" but we also support building with NOJUMPTRIE,
3297                             which restricts the trie logic to structures like /FOO|BAR/.
3298
3299                             If noper is a trieable nodetype then the branch is a possible optimization
3300                             target. If we are building under NOJUMPTRIE then we require that noper_next
3301                             is the same as scan (our current position in the regex program).
3302
3303                             Once we have two or more consecutive such branches we can create a
3304                             trie of the EXACT's contents and stitch it in place into the program.
3305
3306                             If the sequence represents all of the branches in the alternation we
3307                             replace the entire thing with a single TRIE node.
3308
3309                             Otherwise when it is a subsequence we need to stitch it in place and
3310                             replace only the relevant branches. This means the first branch has
3311                             to remain as it is used by the alternation logic, and its next pointer,
3312                             and needs to be repointed at the item on the branch chain following
3313                             the last branch we have optimized away.
3314
3315                             This could be either a BRANCH, in which case the subsequence is internal,
3316                             or it could be the item following the branch sequence in which case the
3317                             subsequence is at the end (which does not necessarily mean the first node
3318                             is the start of the alternation).
3319
3320                             TRIE_TYPE(X) is a define which maps the optype to a trietype.
3321
3322                                 optype          |  trietype
3323                                 ----------------+-----------
3324                                 NOTHING         | NOTHING
3325                                 EXACT           | EXACT
3326                                 EXACTFU         | EXACTFU
3327                                 EXACTFU_SS      | EXACTFU
3328                                 EXACTFU_TRICKYFOLD | EXACTFU
3329                                 EXACTFA         | 0
3330
3331
3332                         */
3333 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3334                        ( EXACT == (X) )   ? EXACT :        \
3335                        ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3336                        0 )
3337
3338                         /* dont use tail as the end marker for this traverse */
3339                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3340                             regnode * const noper = NEXTOPER( cur );
3341                             U8 noper_type = OP( noper );
3342                             U8 noper_trietype = TRIE_TYPE( noper_type );
3343 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3344                             regnode * const noper_next = regnext( noper );
3345                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3346                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3347 #endif
3348
3349                             DEBUG_TRIE_COMPILE_r({
3350                                 regprop(RExC_rx, mysv, cur);
3351                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3352                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3353
3354                                 regprop(RExC_rx, mysv, noper);
3355                                 PerlIO_printf( Perl_debug_log, " -> %s",
3356                                     SvPV_nolen_const(mysv));
3357
3358                                 if ( noper_next ) {
3359                                   regprop(RExC_rx, mysv, noper_next );
3360                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3361                                     SvPV_nolen_const(mysv));
3362                                 }
3363                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3364                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3365                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3366                                 );
3367                             });
3368
3369                             /* Is noper a trieable nodetype that can be merged with the
3370                              * current trie (if there is one)? */
3371                             if ( noper_trietype
3372                                   &&
3373                                   (
3374                                         ( noper_trietype == NOTHING)
3375                                         || ( trietype == NOTHING )
3376                                         || ( trietype == noper_trietype )
3377                                   )
3378 #ifdef NOJUMPTRIE
3379                                   && noper_next == tail
3380 #endif
3381                                   && count < U16_MAX)
3382                             {
3383                                 /* Handle mergable triable node
3384                                  * Either we are the first node in a new trieable sequence,
3385                                  * in which case we do some bookkeeping, otherwise we update
3386                                  * the end pointer. */
3387                                 if ( !first ) {
3388                                     first = cur;
3389                                     if ( noper_trietype == NOTHING ) {
3390 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3391                                         regnode * const noper_next = regnext( noper );
3392                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3393                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3394 #endif
3395
3396                                         if ( noper_next_trietype ) {
3397                                             trietype = noper_next_trietype;
3398                                         } else if (noper_next_type)  {
3399                                             /* a NOTHING regop is 1 regop wide. We need at least two
3400                                              * for a trie so we can't merge this in */
3401                                             first = NULL;
3402                                         }
3403                                     } else {
3404                                         trietype = noper_trietype;
3405                                     }
3406                                 } else {
3407                                     if ( trietype == NOTHING )
3408                                         trietype = noper_trietype;
3409                                     last = cur;
3410                                 }
3411                                 if (first)
3412                                     count++;
3413                             } /* end handle mergable triable node */
3414                             else {
3415                                 /* handle unmergable node -
3416                                  * noper may either be a triable node which can not be tried
3417                                  * together with the current trie, or a non triable node */
3418                                 if ( last ) {
3419                                     /* If last is set and trietype is not NOTHING then we have found
3420                                      * at least two triable branch sequences in a row of a similar
3421                                      * trietype so we can turn them into a trie. If/when we
3422                                      * allow NOTHING to start a trie sequence this condition will be
3423                                      * required, and it isn't expensive so we leave it in for now. */
3424                                     if ( trietype != NOTHING )
3425                                         make_trie( pRExC_state,
3426                                                 startbranch, first, cur, tail, count,
3427                                                 trietype, depth+1 );
3428                                     last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3429                                 }
3430                                 if ( noper_trietype
3431 #ifdef NOJUMPTRIE
3432                                      && noper_next == tail
3433 #endif
3434                                 ){
3435                                     /* noper is triable, so we can start a new trie sequence */
3436                                     count = 1;
3437                                     first = cur;
3438                                     trietype = noper_trietype;
3439                                 } else if (first) {
3440                                     /* if we already saw a first but the current node is not triable then we have
3441                                      * to reset the first information. */
3442                                     count = 0;
3443                                     first = NULL;
3444                                     trietype = 0;
3445                                 }
3446                             } /* end handle unmergable node */
3447                         } /* loop over branches */
3448                         DEBUG_TRIE_COMPILE_r({
3449                             regprop(RExC_rx, mysv, cur);
3450                             PerlIO_printf( Perl_debug_log,
3451                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3452                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3453
3454                         });
3455                         if ( last ) {
3456                             if ( trietype != NOTHING ) {
3457                                 /* the last branch of the sequence was part of a trie,
3458                                  * so we have to construct it here outside of the loop
3459                                  */
3460                                 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3461 #ifdef TRIE_STUDY_OPT
3462                                 if ( ((made == MADE_EXACT_TRIE &&
3463                                      startbranch == first)
3464                                      || ( first_non_open == first )) &&
3465                                      depth==0 ) {
3466                                     flags |= SCF_TRIE_RESTUDY;
3467                                     if ( startbranch == first
3468                                          && scan == tail )
3469                                     {
3470                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3471                                     }
3472                                 }
3473 #endif
3474                             } else {
3475                                 /* at this point we know whatever we have is a NOTHING sequence/branch
3476                                  * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3477                                  */
3478                                 if ( startbranch == first ) {
3479                                     regnode *opt;
3480                                     /* the entire thing is a NOTHING sequence, something like this:
3481                                      * (?:|) So we can turn it into a plain NOTHING op. */
3482                                     DEBUG_TRIE_COMPILE_r({
3483                                         regprop(RExC_rx, mysv, cur);
3484                                         PerlIO_printf( Perl_debug_log,
3485                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3486                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3487
3488                                     });
3489                                     OP(startbranch)= NOTHING;
3490                                     NEXT_OFF(startbranch)= tail - startbranch;
3491                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
3492                                         OP(opt)= OPTIMIZED;
3493                                 }
3494                             }
3495                         } /* end if ( last) */
3496                     } /* TRIE_MAXBUF is non zero */
3497
3498                 } /* do trie */
3499
3500             }
3501             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3502                 scan = NEXTOPER(NEXTOPER(scan));
3503             } else                      /* single branch is optimized. */
3504                 scan = NEXTOPER(scan);
3505             continue;
3506         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3507             scan_frame *newframe = NULL;
3508             I32 paren;
3509             regnode *start;
3510             regnode *end;
3511
3512             if (OP(scan) != SUSPEND) {
3513             /* set the pointer */
3514                 if (OP(scan) == GOSUB) {
3515                     paren = ARG(scan);
3516                     RExC_recurse[ARG2L(scan)] = scan;
3517                     start = RExC_open_parens[paren-1];
3518                     end   = RExC_close_parens[paren-1];
3519                 } else {
3520                     paren = 0;
3521                     start = RExC_rxi->program + 1;
3522                     end   = RExC_opend;
3523                 }
3524                 if (!recursed) {
3525                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3526                     SAVEFREEPV(recursed);
3527                 }
3528                 if (!PAREN_TEST(recursed,paren+1)) {
3529                     PAREN_SET(recursed,paren+1);
3530                     Newx(newframe,1,scan_frame);
3531                 } else {
3532                     if (flags & SCF_DO_SUBSTR) {
3533                         SCAN_COMMIT(pRExC_state,data,minlenp);
3534                         data->longest = &(data->longest_float);
3535                     }
3536                     is_inf = is_inf_internal = 1;
3537                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3538                         cl_anything(pRExC_state, data->start_class);
3539                     flags &= ~SCF_DO_STCLASS;
3540                 }
3541             } else {
3542                 Newx(newframe,1,scan_frame);
3543                 paren = stopparen;
3544                 start = scan+2;
3545                 end = regnext(scan);
3546             }
3547             if (newframe) {
3548                 assert(start);
3549                 assert(end);
3550                 SAVEFREEPV(newframe);
3551                 newframe->next = regnext(scan);
3552                 newframe->last = last;
3553                 newframe->stop = stopparen;
3554                 newframe->prev = frame;
3555
3556                 frame = newframe;
3557                 scan =  start;
3558                 stopparen = paren;
3559                 last = end;
3560
3561                 continue;
3562             }
3563         }
3564         else if (OP(scan) == EXACT) {
3565             I32 l = STR_LEN(scan);
3566             UV uc;
3567             if (UTF) {
3568                 const U8 * const s = (U8*)STRING(scan);
3569                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3570                 l = utf8_length(s, s + l);
3571             } else {
3572                 uc = *((U8*)STRING(scan));
3573             }
3574             min += l;
3575             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3576                 /* The code below prefers earlier match for fixed
3577                    offset, later match for variable offset.  */
3578                 if (data->last_end == -1) { /* Update the start info. */
3579                     data->last_start_min = data->pos_min;
3580                     data->last_start_max = is_inf
3581                         ? I32_MAX : data->pos_min + data->pos_delta;
3582                 }
3583                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3584                 if (UTF)
3585                     SvUTF8_on(data->last_found);
3586                 {
3587                     SV * const sv = data->last_found;
3588                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3589                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3590                     if (mg && mg->mg_len >= 0)
3591                         mg->mg_len += utf8_length((U8*)STRING(scan),
3592                                                   (U8*)STRING(scan)+STR_LEN(scan));
3593                 }
3594                 data->last_end = data->pos_min + l;
3595                 data->pos_min += l; /* As in the first entry. */
3596                 data->flags &= ~SF_BEFORE_EOL;
3597             }
3598             if (flags & SCF_DO_STCLASS_AND) {
3599                 /* Check whether it is compatible with what we know already! */
3600                 int compat = 1;
3601
3602
3603                 /* If compatible, we or it in below.  It is compatible if is
3604                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3605                  * it's for a locale.  Even if there isn't unicode semantics
3606                  * here, at runtime there may be because of matching against a
3607                  * utf8 string, so accept a possible false positive for
3608                  * latin1-range folds */
3609                 if (uc >= 0x100 ||
3610                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3611                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3612                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3613                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3614                     )
3615                 {
3616                     compat = 0;
3617                 }
3618                 ANYOF_CLASS_ZERO(data->start_class);
3619                 ANYOF_BITMAP_ZERO(data->start_class);
3620                 if (compat)
3621                     ANYOF_BITMAP_SET(data->start_class, uc);
3622                 else if (uc >= 0x100) {
3623                     int i;
3624
3625                     /* Some Unicode code points fold to the Latin1 range; as
3626                      * XXX temporary code, instead of figuring out if this is
3627                      * one, just assume it is and set all the start class bits
3628                      * that could be some such above 255 code point's fold
3629                      * which will generate fals positives.  As the code
3630                      * elsewhere that does compute the fold settles down, it
3631                      * can be extracted out and re-used here */
3632                     for (i = 0; i < 256; i++){
3633                         if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3634                             ANYOF_BITMAP_SET(data->start_class, i);
3635                         }
3636                     }
3637                 }
3638                 data->start_class->flags &= ~ANYOF_EOS;
3639                 if (uc < 0x100)
3640                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3641             }
3642             else if (flags & SCF_DO_STCLASS_OR) {
3643                 /* false positive possible if the class is case-folded */
3644                 if (uc < 0x100)
3645                     ANYOF_BITMAP_SET(data->start_class, uc);
3646                 else
3647                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3648                 data->start_class->flags &= ~ANYOF_EOS;
3649                 cl_and(data->start_class, and_withp);
3650             }
3651             flags &= ~SCF_DO_STCLASS;
3652         }
3653         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3654             I32 l = STR_LEN(scan);
3655             UV uc = *((U8*)STRING(scan));
3656
3657             /* Search for fixed substrings supports EXACT only. */
3658             if (flags & SCF_DO_SUBSTR) {
3659                 assert(data);
3660                 SCAN_COMMIT(pRExC_state, data, minlenp);
3661             }
3662             if (UTF) {
3663                 const U8 * const s = (U8 *)STRING(scan);
3664                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3665                 l = utf8_length(s, s + l);
3666             }
3667             if (has_exactf_sharp_s) {
3668                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3669             }
3670             min += l - min_subtract;
3671             if (min < 0) {
3672                 min = 0;
3673             }
3674             delta += min_subtract;
3675             if (flags & SCF_DO_SUBSTR) {
3676                 data->pos_min += l - min_subtract;
3677                 if (data->pos_min < 0) {
3678                     data->pos_min = 0;
3679                 }
3680                 data->pos_delta += min_subtract;
3681                 if (min_subtract) {
3682                     data->longest = &(data->longest_float);
3683                 }
3684             }
3685             if (flags & SCF_DO_STCLASS_AND) {
3686                 /* Check whether it is compatible with what we know already! */
3687                 int compat = 1;
3688                 if (uc >= 0x100 ||
3689                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3690                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3691                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3692                 {
3693                     compat = 0;
3694                 }
3695                 ANYOF_CLASS_ZERO(data->start_class);
3696                 ANYOF_BITMAP_ZERO(data->start_class);
3697                 if (compat) {
3698                     ANYOF_BITMAP_SET(data->start_class, uc);
3699                     data->start_class->flags &= ~ANYOF_EOS;
3700                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3701                     if (OP(scan) == EXACTFL) {
3702                         /* XXX This set is probably no longer necessary, and
3703                          * probably wrong as LOCALE now is on in the initial
3704                          * state */
3705                         data->start_class->flags |= ANYOF_LOCALE;
3706                     }
3707                     else {
3708
3709                         /* Also set the other member of the fold pair.  In case
3710                          * that unicode semantics is called for at runtime, use
3711                          * the full latin1 fold.  (Can't do this for locale,
3712                          * because not known until runtime) */
3713                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3714
3715                         /* All other (EXACTFL handled above) folds except under
3716                          * /iaa that include s, S, and sharp_s also may include
3717                          * the others */
3718                         if (OP(scan) != EXACTFA) {
3719                             if (uc == 's' || uc == 'S') {
3720                                 ANYOF_BITMAP_SET(data->start_class,
3721                                                  LATIN_SMALL_LETTER_SHARP_S);
3722                             }
3723                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3724                                 ANYOF_BITMAP_SET(data->start_class, 's');
3725                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3726                             }
3727                         }
3728                     }
3729                 }
3730                 else if (uc >= 0x100) {
3731                     int i;
3732                     for (i = 0; i < 256; i++){
3733                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3734                             ANYOF_BITMAP_SET(data->start_class, i);
3735                         }
3736                     }
3737                 }
3738             }
3739             else if (flags & SCF_DO_STCLASS_OR) {
3740                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3741                     /* false positive possible if the class is case-folded.
3742                        Assume that the locale settings are the same... */
3743                     if (uc < 0x100) {
3744                         ANYOF_BITMAP_SET(data->start_class, uc);
3745                         if (OP(scan) != EXACTFL) {
3746
3747                             /* And set the other member of the fold pair, but
3748                              * can't do that in locale because not known until
3749                              * run-time */
3750                             ANYOF_BITMAP_SET(data->start_class,
3751                                              PL_fold_latin1[uc]);
3752
3753                             /* All folds except under /iaa that include s, S,
3754                              * and sharp_s also may include the others */
3755                             if (OP(scan) != EXACTFA) {
3756                                 if (uc == 's' || uc == 'S') {
3757                                     ANYOF_BITMAP_SET(data->start_class,
3758                                                    LATIN_SMALL_LETTER_SHARP_S);
3759                                 }
3760                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3761                                     ANYOF_BITMAP_SET(data->start_class, 's');
3762                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3763                                 }
3764                             }
3765                         }
3766                     }
3767                     data->start_class->flags &= ~ANYOF_EOS;
3768                 }
3769                 cl_and(data->start_class, and_withp);
3770             }
3771             flags &= ~SCF_DO_STCLASS;
3772         }
3773         else if (REGNODE_VARIES(OP(scan))) {
3774             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3775             I32 f = flags, pos_before = 0;
3776             regnode * const oscan = scan;
3777             struct regnode_charclass_class this_class;
3778             struct regnode_charclass_class *oclass = NULL;
3779             I32 next_is_eval = 0;
3780
3781             switch (PL_regkind[OP(scan)]) {
3782             case WHILEM:                /* End of (?:...)* . */
3783                 scan = NEXTOPER(scan);
3784                 goto finish;
3785             case PLUS:
3786                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3787                     next = NEXTOPER(scan);
3788                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3789                         mincount = 1;
3790                         maxcount = REG_INFTY;
3791                         next = regnext(scan);
3792                         scan = NEXTOPER(scan);
3793                         goto do_curly;
3794                     }
3795                 }
3796                 if (flags & SCF_DO_SUBSTR)
3797                     data->pos_min++;
3798                 min++;
3799                 /* Fall through. */
3800             case STAR:
3801                 if (flags & SCF_DO_STCLASS) {
3802                     mincount = 0;
3803                     maxcount = REG_INFTY;
3804                     next = regnext(scan);
3805                     scan = NEXTOPER(scan);
3806                     goto do_curly;
3807                 }
3808                 is_inf = is_inf_internal = 1;
3809                 scan = regnext(scan);
3810                 if (flags & SCF_DO_SUBSTR) {
3811                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3812                     data->longest = &(data->longest_float);
3813                 }
3814                 goto optimize_curly_tail;
3815             case CURLY:
3816                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3817                     && (scan->flags == stopparen))
3818                 {
3819                     mincount = 1;
3820                     maxcount = 1;
3821                 } else {
3822                     mincount = ARG1(scan);
3823                     maxcount = ARG2(scan);
3824                 }
3825                 next = regnext(scan);
3826                 if (OP(scan) == CURLYX) {
3827                     I32 lp = (data ? *(data->last_closep) : 0);
3828                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3829                 }
3830                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3831                 next_is_eval = (OP(scan) == EVAL);
3832               do_curly:
3833                 if (flags & SCF_DO_SUBSTR) {
3834                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3835                     pos_before = data->pos_min;
3836                 }
3837                 if (data) {
3838                     fl = data->flags;
3839                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3840                     if (is_inf)
3841                         data->flags |= SF_IS_INF;
3842                 }
3843                 if (flags & SCF_DO_STCLASS) {
3844                     cl_init(pRExC_state, &this_class);
3845                     oclass = data->start_class;
3846                     data->start_class = &this_class;
3847                     f |= SCF_DO_STCLASS_AND;
3848                     f &= ~SCF_DO_STCLASS_OR;
3849                 }
3850                 /* Exclude from super-linear cache processing any {n,m}
3851                    regops for which the combination of input pos and regex
3852                    pos is not enough information to determine if a match
3853                    will be possible.
3854
3855                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3856                    regex pos at the \s*, the prospects for a match depend not
3857                    only on the input position but also on how many (bar\s*)
3858                    repeats into the {4,8} we are. */
3859                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3860                     f &= ~SCF_WHILEM_VISITED_POS;
3861
3862                 /* This will finish on WHILEM, setting scan, or on NULL: */
3863                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3864                                       last, data, stopparen, recursed, NULL,
3865                                       (mincount == 0
3866                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3867
3868                 if (flags & SCF_DO_STCLASS)
3869                     data->start_class = oclass;
3870                 if (mincount == 0 || minnext == 0) {
3871                     if (flags & SCF_DO_STCLASS_OR) {
3872                         cl_or(pRExC_state, data->start_class, &this_class);
3873                     }
3874                     else if (flags & SCF_DO_STCLASS_AND) {
3875                         /* Switch to OR mode: cache the old value of
3876                          * data->start_class */
3877                         INIT_AND_WITHP;
3878                         StructCopy(data->start_class, and_withp,
3879                                    struct regnode_charclass_class);
3880                         flags &= ~SCF_DO_STCLASS_AND;
3881                         StructCopy(&this_class, data->start_class,
3882                                    struct regnode_charclass_class);
3883                         flags |= SCF_DO_STCLASS_OR;
3884                         data->start_class->flags |= ANYOF_EOS;
3885                     }
3886                 } else {                /* Non-zero len */
3887                     if (flags & SCF_DO_STCLASS_OR) {
3888                         cl_or(pRExC_state, data->start_class, &this_class);
3889                         cl_and(data->start_class, and_withp);
3890                     }
3891                     else if (flags & SCF_DO_STCLASS_AND)
3892                         cl_and(data->start_class, &this_class);
3893                     flags &= ~SCF_DO_STCLASS;
3894                 }
3895                 if (!scan)              /* It was not CURLYX, but CURLY. */
3896                     scan = next;
3897                 if ( /* ? quantifier ok, except for (?{ ... }) */
3898                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3899                     && (minnext == 0) && (deltanext == 0)
3900                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3901                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3902                 {
3903                     ckWARNreg(RExC_parse,
3904                               "Quantifier unexpected on zero-length expression");
3905                 }
3906
3907                 min += minnext * mincount;
3908                 is_inf_internal |= ((maxcount == REG_INFTY
3909                                      && (minnext + deltanext) > 0)
3910                                     || deltanext == I32_MAX);
3911                 is_inf |= is_inf_internal;
3912                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3913
3914                 /* Try powerful optimization CURLYX => CURLYN. */
3915                 if (  OP(oscan) == CURLYX && data
3916                       && data->flags & SF_IN_PAR
3917                       && !(data->flags & SF_HAS_EVAL)
3918                       && !deltanext && minnext == 1 ) {
3919                     /* Try to optimize to CURLYN.  */
3920                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3921                     regnode * const nxt1 = nxt;
3922 #ifdef DEBUGGING
3923                     regnode *nxt2;
3924 #endif
3925
3926                     /* Skip open. */
3927                     nxt = regnext(nxt);
3928                     if (!REGNODE_SIMPLE(OP(nxt))
3929                         && !(PL_regkind[OP(nxt)] == EXACT
3930                              && STR_LEN(nxt) == 1))
3931                         goto nogo;
3932 #ifdef DEBUGGING
3933                     nxt2 = nxt;
3934 #endif
3935                     nxt = regnext(nxt);
3936                     if (OP(nxt) != CLOSE)
3937                         goto nogo;
3938                     if (RExC_open_parens) {
3939                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3940                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3941                     }
3942                     /* Now we know that nxt2 is the only contents: */
3943                     oscan->flags = (U8)ARG(nxt);
3944                     OP(oscan) = CURLYN;
3945                     OP(nxt1) = NOTHING; /* was OPEN. */
3946
3947 #ifdef DEBUGGING
3948                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3949                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3950                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3951                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3952                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3953                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3954 #endif
3955                 }
3956               nogo:
3957
3958                 /* Try optimization CURLYX => CURLYM. */
3959                 if (  OP(oscan) == CURLYX && data
3960                       && !(data->flags & SF_HAS_PAR)
3961                       && !(data->flags & SF_HAS_EVAL)
3962                       && !deltanext     /* atom is fixed width */
3963                       && minnext != 0   /* CURLYM can't handle zero width */
3964                       && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3965                 ) {
3966                     /* XXXX How to optimize if data == 0? */
3967                     /* Optimize to a simpler form.  */
3968                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3969                     regnode *nxt2;
3970
3971                     OP(oscan) = CURLYM;
3972                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3973                             && (OP(nxt2) != WHILEM))
3974                         nxt = nxt2;
3975                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3976                     /* Need to optimize away parenths. */
3977                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3978                         /* Set the parenth number.  */
3979                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3980
3981                         oscan->flags = (U8)ARG(nxt);
3982                         if (RExC_open_parens) {
3983                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3984                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3985                         }
3986                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3987                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3988
3989 #ifdef DEBUGGING
3990                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3991                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3992                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3993                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3994 #endif
3995 #if 0
3996                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3997                             regnode *nnxt = regnext(nxt1);
3998                             if (nnxt == nxt) {
3999                                 if (reg_off_by_arg[OP(nxt1)])
4000                                     ARG_SET(nxt1, nxt2 - nxt1);
4001                                 else if (nxt2 - nxt1 < U16_MAX)
4002                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4003                                 else
4004                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4005                             }
4006                             nxt1 = nnxt;
4007                         }
4008 #endif
4009                         /* Optimize again: */
4010                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4011                                     NULL, stopparen, recursed, NULL, 0,depth+1);
4012                     }
4013                     else
4014                         oscan->flags = 0;
4015                 }
4016                 else if ((OP(oscan) == CURLYX)
4017                          && (flags & SCF_WHILEM_VISITED_POS)
4018                          /* See the comment on a similar expression above.
4019                             However, this time it's not a subexpression
4020                             we care about, but the expression itself. */
4021                          && (maxcount == REG_INFTY)
4022                          && data && ++data->whilem_c < 16) {
4023                     /* This stays as CURLYX, we can put the count/of pair. */
4024                     /* Find WHILEM (as in regexec.c) */
4025                     regnode *nxt = oscan + NEXT_OFF(oscan);
4026
4027                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4028                         nxt += ARG(nxt);
4029                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4030                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4031                 }
4032                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4033                     pars++;
4034                 if (flags & SCF_DO_SUBSTR) {
4035                     SV *last_str = NULL;
4036                     int counted = mincount != 0;
4037
4038                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4039 #if defined(SPARC64_GCC_WORKAROUND)
4040                         I32 b = 0;
4041                         STRLEN l = 0;
4042                         const char *s = NULL;
4043                         I32 old = 0;
4044
4045                         if (pos_before >= data->last_start_min)
4046                             b = pos_before;
4047                         else
4048                             b = data->last_start_min;
4049
4050                         l = 0;
4051                         s = SvPV_const(data->last_found, l);
4052                         old = b - data->last_start_min;
4053
4054 #else
4055                         I32 b = pos_before >= data->last_start_min
4056                             ? pos_before : data->last_start_min;
4057                         STRLEN l;
4058                         const char * const s = SvPV_const(data->last_found, l);
4059                         I32 old = b - data->last_start_min;
4060 #endif
4061
4062                         if (UTF)
4063                             old = utf8_hop((U8*)s, old) - (U8*)s;
4064                         l -= old;
4065                         /* Get the added string: */
4066                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4067                         if (deltanext == 0 && pos_before == b) {
4068                             /* What was added is a constant string */
4069                             if (mincount > 1) {
4070                                 SvGROW(last_str, (mincount * l) + 1);
4071                                 repeatcpy(SvPVX(last_str) + l,
4072                                           SvPVX_const(last_str), l, mincount - 1);
4073                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4074                                 /* Add additional parts. */
4075                                 SvCUR_set(data->last_found,
4076                                           SvCUR(data->last_found) - l);
4077                                 sv_catsv(data->last_found, last_str);
4078                                 {
4079                                     SV * sv = data->last_found;
4080                                     MAGIC *mg =
4081                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4082                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4083                                     if (mg && mg->mg_len >= 0)
4084                                         mg->mg_len += CHR_SVLEN(last_str) - l;
4085                                 }
4086                                 data->last_end += l * (mincount - 1);
4087                             }
4088                         } else {
4089                             /* start offset must point into the last copy */
4090                             data->last_start_min += minnext * (mincount - 1);
4091                             data->last_start_max += is_inf ? I32_MAX
4092                                 : (maxcount - 1) * (minnext + data->pos_delta);
4093                         }
4094                     }
4095                     /* It is counted once already... */
4096                     data->pos_min += minnext * (mincount - counted);
4097                     data->pos_delta += - counted * deltanext +
4098                         (minnext + deltanext) * maxcount - minnext * mincount;
4099                     if (mincount != maxcount) {
4100                          /* Cannot extend fixed substrings found inside
4101                             the group.  */
4102                         SCAN_COMMIT(pRExC_state,data,minlenp);
4103                         if (mincount && last_str) {
4104                             SV * const sv = data->last_found;
4105                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4106                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4107
4108                             if (mg)
4109                                 mg->mg_len = -1;
4110                             sv_setsv(sv, last_str);
4111                             data->last_end = data->pos_min;
4112                             data->last_start_min =
4113                                 data->pos_min - CHR_SVLEN(last_str);
4114                             data->last_start_max = is_inf
4115                                 ? I32_MAX
4116                                 : data->pos_min + data->pos_delta
4117                                 - CHR_SVLEN(last_str);
4118                         }
4119                         data->longest = &(data->longest_float);
4120                     }
4121                     SvREFCNT_dec(last_str);
4122                 }
4123                 if (data && (fl & SF_HAS_EVAL))
4124                     data->flags |= SF_HAS_EVAL;
4125               optimize_curly_tail:
4126                 if (OP(oscan) != CURLYX) {
4127                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4128                            && NEXT_OFF(next))
4129                         NEXT_OFF(oscan) += NEXT_OFF(next);
4130                 }
4131                 continue;
4132             default:                    /* REF, ANYOFV, and CLUMP only? */
4133                 if (flags & SCF_DO_SUBSTR) {
4134                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
4135                     data->longest = &(data->longest_float);
4136                 }
4137                 is_inf = is_inf_internal = 1;
4138                 if (flags & SCF_DO_STCLASS_OR)
4139                     cl_anything(pRExC_state, data->start_class);
4140                 flags &= ~SCF_DO_STCLASS;
4141                 break;
4142             }
4143         }
4144         else if (OP(scan) == LNBREAK) {
4145             if (flags & SCF_DO_STCLASS) {
4146                 int value = 0;
4147                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4148                 if (flags & SCF_DO_STCLASS_AND) {
4149                     for (value = 0; value < 256; value++)
4150                         if (!is_VERTWS_cp(value))
4151                             ANYOF_BITMAP_CLEAR(data->start_class, value);
4152                 }
4153                 else {
4154                     for (value = 0; value < 256; value++)
4155                         if (is_VERTWS_cp(value))
4156                             ANYOF_BITMAP_SET(data->start_class, value);
4157                 }
4158                 if (flags & SCF_DO_STCLASS_OR)
4159                     cl_and(data->start_class, and_withp);
4160                 flags &= ~SCF_DO_STCLASS;
4161             }
4162             min += 1;
4163             delta += 1;
4164             if (flags & SCF_DO_SUBSTR) {
4165                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4166                 data->pos_min += 1;
4167                 data->pos_delta += 1;
4168                 data->longest = &(data->longest_float);
4169             }
4170         }
4171         else if (REGNODE_SIMPLE(OP(scan))) {
4172             int value = 0;
4173
4174             if (flags & SCF_DO_SUBSTR) {
4175                 SCAN_COMMIT(pRExC_state,data,minlenp);
4176                 data->pos_min++;
4177             }
4178             min++;
4179             if (flags & SCF_DO_STCLASS) {
4180                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4181
4182                 /* Some of the logic below assumes that switching
4183                    locale on will only add false positives. */
4184                 switch (PL_regkind[OP(scan)]) {
4185                 case SANY:
4186                 default:
4187                   do_default:
4188                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4189                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4190                         cl_anything(pRExC_state, data->start_class);
4191                     break;
4192                 case REG_ANY:
4193                     if (OP(scan) == SANY)
4194                         goto do_default;
4195                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4196                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4197                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4198                         cl_anything(pRExC_state, data->start_class);
4199                     }
4200                     if (flags & SCF_DO_STCLASS_AND || !value)
4201                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4202                     break;
4203                 case ANYOF:
4204                     if (flags & SCF_DO_STCLASS_AND)
4205                         cl_and(data->start_class,
4206                                (struct regnode_charclass_class*)scan);
4207                     else
4208                         cl_or(pRExC_state, data->start_class,
4209                               (struct regnode_charclass_class*)scan);
4210                     break;
4211                 case ALNUM:
4212                     if (flags & SCF_DO_STCLASS_AND) {
4213                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4214                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4215                             if (OP(scan) == ALNUMU) {
4216                                 for (value = 0; value < 256; value++) {
4217                                     if (!isWORDCHAR_L1(value)) {
4218                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4219                                     }
4220                                 }
4221                             } else {
4222                                 for (value = 0; value < 256; value++) {
4223                                     if (!isALNUM(value)) {
4224                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4225                                     }
4226                                 }
4227                             }
4228                         }
4229                     }
4230                     else {
4231                         if (data->start_class->flags & ANYOF_LOCALE)
4232                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4233
4234                         /* Even if under locale, set the bits for non-locale
4235                          * in case it isn't a true locale-node.  This will
4236                          * create false positives if it truly is locale */
4237                         if (OP(scan) == ALNUMU) {
4238                             for (value = 0; value < 256; value++) {
4239                                 if (isWORDCHAR_L1(value)) {
4240                                     ANYOF_BITMAP_SET(data->start_class, value);
4241                                 }
4242                             }
4243                         } else {
4244                             for (value = 0; value < 256; value++) {
4245                                 if (isALNUM(value)) {
4246                                     ANYOF_BITMAP_SET(data->start_class, value);
4247                                 }
4248                             }
4249                         }
4250                     }
4251                     break;
4252                 case NALNUM:
4253                     if (flags & SCF_DO_STCLASS_AND) {
4254                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4255                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4256                             if (OP(scan) == NALNUMU) {
4257                                 for (value = 0; value < 256; value++) {
4258                                     if (isWORDCHAR_L1(value)) {
4259                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4260                                     }
4261                                 }
4262                             } else {
4263                                 for (value = 0; value < 256; value++) {
4264                                     if (isALNUM(value)) {
4265                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4266                                     }
4267                                 }
4268                             }
4269                         }
4270                     }
4271                     else {
4272                         if (data->start_class->flags & ANYOF_LOCALE)
4273                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4274
4275                         /* Even if under locale, set the bits for non-locale in
4276                          * case it isn't a true locale-node.  This will create
4277                          * false positives if it truly is locale */
4278                         if (OP(scan) == NALNUMU) {
4279                             for (value = 0; value < 256; value++) {
4280                                 if (! isWORDCHAR_L1(value)) {
4281                                     ANYOF_BITMAP_SET(data->start_class, value);
4282                                 }
4283                             }
4284                         } else {
4285                             for (value = 0; value < 256; value++) {
4286                                 if (! isALNUM(value)) {
4287                                     ANYOF_BITMAP_SET(data->start_class, value);
4288                                 }
4289                             }
4290                         }
4291                     }
4292                     break;
4293                 case SPACE:
4294                     if (flags & SCF_DO_STCLASS_AND) {
4295                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4296                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4297                             if (OP(scan) == SPACEU) {
4298                                 for (value = 0; value < 256; value++) {
4299                                     if (!isSPACE_L1(value)) {
4300                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4301                                     }
4302                                 }
4303                             } else {
4304                                 for (value = 0; value < 256; value++) {
4305                                     if (!isSPACE(value)) {
4306                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4307                                     }
4308                                 }
4309                             }
4310                         }
4311                     }
4312                     else {
4313                         if (data->start_class->flags & ANYOF_LOCALE) {
4314                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4315                         }
4316                         if (OP(scan) == SPACEU) {
4317                             for (value = 0; value < 256; value++) {
4318                                 if (isSPACE_L1(value)) {
4319                                     ANYOF_BITMAP_SET(data->start_class, value);
4320                                 }
4321                             }
4322                         } else {
4323                             for (value = 0; value < 256; value++) {
4324                                 if (isSPACE(value)) {
4325                                     ANYOF_BITMAP_SET(data->start_class, value);
4326                                 }
4327                             }
4328                         }
4329                     }
4330                     break;
4331                 case NSPACE:
4332                     if (flags & SCF_DO_STCLASS_AND) {
4333                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4334                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4335                             if (OP(scan) == NSPACEU) {
4336                                 for (value = 0; value < 256; value++) {
4337                                     if (isSPACE_L1(value)) {
4338                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4339                                     }
4340                                 }
4341                             } else {
4342                                 for (value = 0; value < 256; value++) {
4343                                     if (isSPACE(value)) {
4344                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4345                                     }
4346                                 }
4347                             }
4348                         }
4349                     }
4350                     else {
4351                         if (data->start_class->flags & ANYOF_LOCALE)
4352                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4353                         if (OP(scan) == NSPACEU) {
4354                             for (value = 0; value < 256; value++) {
4355                                 if (!isSPACE_L1(value)) {
4356                                     ANYOF_BITMAP_SET(data->start_class, value);
4357                                 }
4358                             }
4359                         }
4360                         else {
4361                             for (value = 0; value < 256; value++) {
4362                                 if (!isSPACE(value)) {
4363                                     ANYOF_BITMAP_SET(data->start_class, value);
4364                                 }
4365                             }
4366                         }
4367                     }
4368                     break;
4369                 case DIGIT:
4370                     if (flags & SCF_DO_STCLASS_AND) {
4371                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4372                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4373                             for (value = 0; value < 256; value++)
4374                                 if (!isDIGIT(value))
4375                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4376                         }
4377                     }
4378                     else {
4379                         if (data->start_class->flags & ANYOF_LOCALE)
4380                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4381                         for (value = 0; value < 256; value++)
4382                             if (isDIGIT(value))
4383                                 ANYOF_BITMAP_SET(data->start_class, value);
4384                     }
4385                     break;
4386                 case NDIGIT:
4387                     if (flags & SCF_DO_STCLASS_AND) {
4388                         if (!(data->start_class->flags & ANYOF_LOCALE))
4389                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4390                         for (value = 0; value < 256; value++)
4391                             if (isDIGIT(value))
4392                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4393                     }
4394                     else {
4395                         if (data->start_class->flags & ANYOF_LOCALE)
4396                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4397                         for (value = 0; value < 256; value++)
4398                             if (!isDIGIT(value))
4399                                 ANYOF_BITMAP_SET(data->start_class, value);
4400                     }
4401                     break;
4402                 CASE_SYNST_FNC(VERTWS);
4403                 CASE_SYNST_FNC(HORIZWS);
4404
4405                 }
4406                 if (flags & SCF_DO_STCLASS_OR)
4407                     cl_and(data->start_class, and_withp);
4408                 flags &= ~SCF_DO_STCLASS;
4409             }
4410         }
4411         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4412             data->flags |= (OP(scan) == MEOL
4413                             ? SF_BEFORE_MEOL
4414                             : SF_BEFORE_SEOL);
4415             SCAN_COMMIT(pRExC_state, data, minlenp);
4416
4417         }
4418         else if (  PL_regkind[OP(scan)] == BRANCHJ
4419                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4420                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4421                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4422             if ( OP(scan) == UNLESSM &&
4423                  scan->flags == 0 &&
4424                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4425                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4426             ) {
4427                 regnode *opt;
4428                 regnode *upto= regnext(scan);
4429                 DEBUG_PARSE_r({
4430                     SV * const mysv_val=sv_newmortal();
4431                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4432
4433                     /*DEBUG_PARSE_MSG("opfail");*/
4434                     regprop(RExC_rx, mysv_val, upto);
4435                     PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4436                                   SvPV_nolen_const(mysv_val),
4437                                   (IV)REG_NODE_NUM(upto),
4438                                   (IV)(upto - scan)
4439                     );
4440                 });
4441                 OP(scan) = OPFAIL;
4442                 NEXT_OFF(scan) = upto - scan;
4443                 for (opt= scan + 1; opt < upto ; opt++)
4444                     OP(opt) = OPTIMIZED;
4445                 scan= upto;
4446                 continue;
4447             }
4448             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4449                 || OP(scan) == UNLESSM )
4450             {
4451                 /* Negative Lookahead/lookbehind
4452                    In this case we can't do fixed string optimisation.
4453                 */
4454
4455                 I32 deltanext, minnext, fake = 0;
4456                 regnode *nscan;
4457                 struct regnode_charclass_class intrnl;
4458                 int f = 0;
4459
4460                 data_fake.flags = 0;
4461                 if (data) {
4462                     data_fake.whilem_c = data->whilem_c;
4463                     data_fake.last_closep = data->last_closep;
4464                 }
4465                 else
4466                     data_fake.last_closep = &fake;
4467                 data_fake.pos_delta = delta;
4468                 if ( flags & SCF_DO_STCLASS && !scan->flags
4469                      && OP(scan) == IFMATCH ) { /* Lookahead */
4470                     cl_init(pRExC_state, &intrnl);
4471                     data_fake.start_class = &intrnl;
4472                     f |= SCF_DO_STCLASS_AND;
4473                 }
4474                 if (flags & SCF_WHILEM_VISITED_POS)
4475                     f |= SCF_WHILEM_VISITED_POS;
4476                 next = regnext(scan);
4477                 nscan = NEXTOPER(NEXTOPER(scan));
4478                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4479                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4480                 if (scan->flags) {
4481                     if (deltanext) {
4482                         FAIL("Variable length lookbehind not implemented");
4483                     }
4484                     else if (minnext > (I32)U8_MAX) {
4485                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4486                     }
4487                     scan->flags = (U8)minnext;
4488                 }
4489                 if (data) {
4490                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4491                         pars++;
4492                     if (data_fake.flags & SF_HAS_EVAL)
4493                         data->flags |= SF_HAS_EVAL;
4494                     data->whilem_c = data_fake.whilem_c;
4495                 }
4496                 if (f & SCF_DO_STCLASS_AND) {
4497                     if (flags & SCF_DO_STCLASS_OR) {
4498                         /* OR before, AND after: ideally we would recurse with
4499                          * data_fake to get the AND applied by study of the
4500                          * remainder of the pattern, and then derecurse;
4501                          * *** HACK *** for now just treat as "no information".
4502                          * See [perl #56690].
4503                          */
4504                         cl_init(pRExC_state, data->start_class);
4505                     }  else {
4506                         /* AND before and after: combine and continue */
4507                         const int was = (data->start_class->flags & ANYOF_EOS);
4508
4509                         cl_and(data->start_class, &intrnl);
4510                         if (was)
4511                             data->start_class->flags |= ANYOF_EOS;
4512                     }
4513                 }
4514             }
4515 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4516             else {
4517                 /* Positive Lookahead/lookbehind
4518                    In this case we can do fixed string optimisation,
4519                    but we must be careful about it. Note in the case of
4520                    lookbehind the positions will be offset by the minimum
4521                    length of the pattern, something we won't know about
4522                    until after the recurse.
4523                 */
4524                 I32 deltanext, fake = 0;
4525                 regnode *nscan;
4526                 struct regnode_charclass_class intrnl;
4527                 int f = 0;
4528                 /* We use SAVEFREEPV so that when the full compile
4529                     is finished perl will clean up the allocated
4530                     minlens when it's all done. This way we don't
4531                     have to worry about freeing them when we know
4532                     they wont be used, which would be a pain.
4533                  */
4534                 I32 *minnextp;
4535                 Newx( minnextp, 1, I32 );
4536                 SAVEFREEPV(minnextp);
4537
4538                 if (data) {
4539                     StructCopy(data, &data_fake, scan_data_t);
4540                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4541                         f |= SCF_DO_SUBSTR;
4542                         if (scan->flags)
4543                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4544                         data_fake.last_found=newSVsv(data->last_found);
4545                     }
4546                 }
4547                 else
4548                     data_fake.last_closep = &fake;
4549                 data_fake.flags = 0;
4550                 data_fake.pos_delta = delta;
4551                 if (is_inf)
4552                     data_fake.flags |= SF_IS_INF;
4553                 if ( flags & SCF_DO_STCLASS && !scan->flags
4554                      && OP(scan) == IFMATCH ) { /* Lookahead */
4555                     cl_init(pRExC_state, &intrnl);
4556                     data_fake.start_class = &intrnl;
4557                     f |= SCF_DO_STCLASS_AND;
4558                 }
4559                 if (flags & SCF_WHILEM_VISITED_POS)
4560                     f |= SCF_WHILEM_VISITED_POS;
4561                 next = regnext(scan);
4562                 nscan = NEXTOPER(NEXTOPER(scan));
4563
4564                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4565                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4566                 if (scan->flags) {
4567                     if (deltanext) {
4568                         FAIL("Variable length lookbehind not implemented");
4569                     }
4570                     else if (*minnextp > (I32)U8_MAX) {
4571                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4572                     }
4573                     scan->flags = (U8)*minnextp;
4574                 }
4575
4576                 *minnextp += min;
4577
4578                 if (f & SCF_DO_STCLASS_AND) {
4579                     const int was = (data->start_class->flags & ANYOF_EOS);
4580
4581                     cl_and(data->start_class, &intrnl);
4582                     if (was)
4583                         data->start_class->flags |= ANYOF_EOS;
4584                 }
4585                 if (data) {
4586                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4587                         pars++;
4588                     if (data_fake.flags & SF_HAS_EVAL)
4589                         data->flags |= SF_HAS_EVAL;
4590                     data->whilem_c = data_fake.whilem_c;
4591                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4592                         if (RExC_rx->minlen<*minnextp)
4593                             RExC_rx->minlen=*minnextp;
4594                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4595                         SvREFCNT_dec(data_fake.last_found);
4596
4597                         if ( data_fake.minlen_fixed != minlenp )
4598                         {
4599                             data->offset_fixed= data_fake.offset_fixed;
4600                             data->minlen_fixed= data_fake.minlen_fixed;
4601                             data->lookbehind_fixed+= scan->flags;
4602                         }
4603                         if ( data_fake.minlen_float != minlenp )
4604                         {
4605                             data->minlen_float= data_fake.minlen_float;
4606                             data->offset_float_min=data_fake.offset_float_min;
4607                             data->offset_float_max=data_fake.offset_float_max;
4608                             data->lookbehind_float+= scan->flags;
4609                         }
4610                     }
4611                 }
4612             }
4613 #endif
4614         }
4615         else if (OP(scan) == OPEN) {
4616             if (stopparen != (I32)ARG(scan))
4617                 pars++;
4618         }
4619         else if (OP(scan) == CLOSE) {
4620             if (stopparen == (I32)ARG(scan)) {
4621                 break;
4622             }
4623             if ((I32)ARG(scan) == is_par) {
4624                 next = regnext(scan);
4625
4626                 if ( next && (OP(next) != WHILEM) && next < last)
4627                     is_par = 0;         /* Disable optimization */
4628             }
4629             if (data)
4630                 *(data->last_closep) = ARG(scan);
4631         }
4632         else if (OP(scan) == EVAL) {
4633                 if (data)
4634                     data->flags |= SF_HAS_EVAL;
4635         }
4636         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4637             if (flags & SCF_DO_SUBSTR) {
4638                 SCAN_COMMIT(pRExC_state,data,minlenp);
4639                 flags &= ~SCF_DO_SUBSTR;
4640             }
4641             if (data && OP(scan)==ACCEPT) {
4642                 data->flags |= SCF_SEEN_ACCEPT;
4643                 if (stopmin > min)
4644                     stopmin = min;
4645             }
4646         }
4647         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4648         {
4649                 if (flags & SCF_DO_SUBSTR) {
4650                     SCAN_COMMIT(pRExC_state,data,minlenp);
4651                     data->longest = &(data->longest_float);
4652                 }
4653                 is_inf = is_inf_internal = 1;
4654                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4655                     cl_anything(pRExC_state, data->start_class);
4656                 flags &= ~SCF_DO_STCLASS;
4657         }
4658         else if (OP(scan) == GPOS) {
4659             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4660                 !(delta || is_inf || (data && data->pos_delta)))
4661             {
4662                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4663                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4664                 if (RExC_rx->gofs < (U32)min)
4665                     RExC_rx->gofs = min;
4666             } else {
4667                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4668                 RExC_rx->gofs = 0;
4669             }
4670         }
4671 #ifdef TRIE_STUDY_OPT
4672 #ifdef FULL_TRIE_STUDY
4673         else if (PL_regkind[OP(scan)] == TRIE) {
4674             /* NOTE - There is similar code to this block above for handling
4675                BRANCH nodes on the initial study.  If you change stuff here
4676                check there too. */
4677             regnode *trie_node= scan;
4678             regnode *tail= regnext(scan);
4679             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4680             I32 max1 = 0, min1 = I32_MAX;
4681             struct regnode_charclass_class accum;
4682
4683             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4684                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4685             if (flags & SCF_DO_STCLASS)
4686                 cl_init_zero(pRExC_state, &accum);
4687
4688             if (!trie->jump) {
4689                 min1= trie->minlen;
4690                 max1= trie->maxlen;
4691             } else {
4692                 const regnode *nextbranch= NULL;
4693                 U32 word;
4694
4695                 for ( word=1 ; word <= trie->wordcount ; word++)
4696                 {
4697                     I32 deltanext=0, minnext=0, f = 0, fake;
4698                     struct regnode_charclass_class this_class;
4699
4700                     data_fake.flags = 0;
4701                     if (data) {
4702                         data_fake.whilem_c = data->whilem_c;
4703                         data_fake.last_closep = data->last_closep;
4704                     }
4705                     else
4706                         data_fake.last_closep = &fake;
4707                     data_fake.pos_delta = delta;
4708                     if (flags & SCF_DO_STCLASS) {
4709                         cl_init(pRExC_state, &this_class);
4710                         data_fake.start_class = &this_class;
4711                         f = SCF_DO_STCLASS_AND;
4712                     }
4713                     if (flags & SCF_WHILEM_VISITED_POS)
4714                         f |= SCF_WHILEM_VISITED_POS;
4715
4716                     if (trie->jump[word]) {
4717                         if (!nextbranch)
4718                             nextbranch = trie_node + trie->jump[0];
4719                         scan= trie_node + trie->jump[word];
4720                         /* We go from the jump point to the branch that follows
4721                            it. Note this means we need the vestigal unused branches
4722                            even though they arent otherwise used.
4723                          */
4724                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4725                             &deltanext, (regnode *)nextbranch, &data_fake,
4726                             stopparen, recursed, NULL, f,depth+1);
4727                     }
4728                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4729                         nextbranch= regnext((regnode*)nextbranch);
4730
4731                     if (min1 > (I32)(minnext + trie->minlen))
4732                         min1 = minnext + trie->minlen;
4733                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4734                         max1 = minnext + deltanext + trie->maxlen;
4735                     if (deltanext == I32_MAX)
4736                         is_inf = is_inf_internal = 1;
4737
4738                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4739                         pars++;
4740                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4741                         if ( stopmin > min + min1)
4742                             stopmin = min + min1;
4743                         flags &= ~SCF_DO_SUBSTR;
4744                         if (data)
4745                             data->flags |= SCF_SEEN_ACCEPT;
4746                     }
4747                     if (data) {
4748                         if (data_fake.flags & SF_HAS_EVAL)
4749                             data->flags |= SF_HAS_EVAL;
4750                         data->whilem_c = data_fake.whilem_c;
4751                     }
4752                     if (flags & SCF_DO_STCLASS)
4753                         cl_or(pRExC_state, &accum, &this_class);
4754                 }
4755             }
4756             if (flags & SCF_DO_SUBSTR) {
4757                 data->pos_min += min1;
4758                 data->pos_delta += max1 - min1;
4759                 if (max1 != min1 || is_inf)
4760                     data->longest = &(data->longest_float);
4761             }
4762             min += min1;
4763             delta += max1 - min1;
4764             if (flags & SCF_DO_STCLASS_OR) {
4765                 cl_or(pRExC_state, data->start_class, &accum);
4766                 if (min1) {
4767                     cl_and(data->start_class, and_withp);
4768                     flags &= ~SCF_DO_STCLASS;
4769                 }
4770             }
4771             else if (flags & SCF_DO_STCLASS_AND) {
4772                 if (min1) {
4773                     cl_and(data->start_class, &accum);
4774                     flags &= ~SCF_DO_STCLASS;
4775                 }
4776                 else {
4777                     /* Switch to OR mode: cache the old value of
4778                      * data->start_class */
4779                     INIT_AND_WITHP;
4780                     StructCopy(data->start_class, and_withp,
4781                                struct regnode_charclass_class);
4782                     flags &= ~SCF_DO_STCLASS_AND;
4783                     StructCopy(&accum, data->start_class,
4784                                struct regnode_charclass_class);
4785                     flags |= SCF_DO_STCLASS_OR;
4786                     data->start_class->flags |= ANYOF_EOS;
4787                 }
4788             }
4789             scan= tail;
4790             continue;
4791         }
4792 #else
4793         else if (PL_regkind[OP(scan)] == TRIE) {
4794             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4795             U8*bang=NULL;
4796
4797             min += trie->minlen;
4798             delta += (trie->maxlen - trie->minlen);
4799             flags &= ~SCF_DO_STCLASS; /* xxx */
4800             if (flags & SCF_DO_SUBSTR) {
4801                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4802                 data->pos_min += trie->minlen;
4803                 data->pos_delta += (trie->maxlen - trie->minlen);
4804                 if (trie->maxlen != trie->minlen)
4805                     data->longest = &(data->longest_float);
4806             }
4807             if (trie->jump) /* no more substrings -- for now /grr*/
4808                 flags &= ~SCF_DO_SUBSTR;
4809         }
4810 #endif /* old or new */
4811 #endif /* TRIE_STUDY_OPT */
4812
4813         /* Else: zero-length, ignore. */
4814         scan = regnext(scan);
4815     }
4816     if (frame) {
4817         last = frame->last;
4818         scan = frame->next;
4819         stopparen = frame->stop;
4820         frame = frame->prev;
4821         goto fake_study_recurse;
4822     }
4823
4824   finish:
4825     assert(!frame);
4826     DEBUG_STUDYDATA("pre-fin:",data,depth);
4827
4828     *scanp = scan;
4829     *deltap = is_inf_internal ? I32_MAX : delta;
4830     if (flags & SCF_DO_SUBSTR && is_inf)
4831         data->pos_delta = I32_MAX - data->pos_min;
4832     if (is_par > (I32)U8_MAX)
4833         is_par = 0;
4834     if (is_par && pars==1 && data) {
4835         data->flags |= SF_IN_PAR;
4836         data->flags &= ~SF_HAS_PAR;
4837     }
4838     else if (pars && data) {
4839         data->flags |= SF_HAS_PAR;
4840         data->flags &= ~SF_IN_PAR;
4841     }
4842     if (flags & SCF_DO_STCLASS_OR)
4843         cl_and(data->start_class, and_withp);
4844     if (flags & SCF_TRIE_RESTUDY)
4845         data->flags |=  SCF_TRIE_RESTUDY;
4846
4847     DEBUG_STUDYDATA("post-fin:",data,depth);
4848
4849     return min < stopmin ? min : stopmin;
4850 }
4851
4852 STATIC U32
4853 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4854 {
4855     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4856
4857     PERL_ARGS_ASSERT_ADD_DATA;
4858
4859     Renewc(RExC_rxi->data,
4860            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4861            char, struct reg_data);
4862     if(count)
4863         Renew(RExC_rxi->data->what, count + n, U8);
4864     else
4865         Newx(RExC_rxi->data->what, n, U8);
4866     RExC_rxi->data->count = count + n;
4867     Copy(s, RExC_rxi->data->what + count, n, U8);
4868     return count;
4869 }
4870
4871 /*XXX: todo make this not included in a non debugging perl */
4872 #ifndef PERL_IN_XSUB_RE
4873 void
4874 Perl_reginitcolors(pTHX)
4875 {
4876     dVAR;
4877     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4878     if (s) {
4879         char *t = savepv(s);
4880         int i = 0;
4881         PL_colors[0] = t;
4882         while (++i < 6) {
4883             t = strchr(t, '\t');
4884             if (t) {
4885                 *t = '\0';
4886                 PL_colors[i] = ++t;
4887             }
4888             else
4889                 PL_colors[i] = t = (char *)"";
4890         }
4891     } else {
4892         int i = 0;
4893         while (i < 6)
4894             PL_colors[i++] = (char *)"";
4895     }
4896     PL_colorset = 1;
4897 }
4898 #endif
4899
4900
4901 #ifdef TRIE_STUDY_OPT
4902 #define CHECK_RESTUDY_GOTO                                  \
4903         if (                                                \
4904               (data.flags & SCF_TRIE_RESTUDY)               \
4905               && ! restudied++                              \
4906         )     goto reStudy
4907 #else
4908 #define CHECK_RESTUDY_GOTO
4909 #endif
4910
4911 /*
4912  * pregcomp - compile a regular expression into internal code
4913  *
4914  * Decides which engine's compiler to call based on the hint currently in
4915  * scope
4916  */
4917
4918 #ifndef PERL_IN_XSUB_RE
4919
4920 /* return the currently in-scope regex engine (or the default if none)  */
4921
4922 regexp_engine const *
4923 Perl_current_re_engine(pTHX)
4924 {
4925     dVAR;
4926
4927     if (IN_PERL_COMPILETIME) {
4928         HV * const table = GvHV(PL_hintgv);
4929         SV **ptr;
4930
4931         if (!table)
4932             return &PL_core_reg_engine;
4933         ptr = hv_fetchs(table, "regcomp", FALSE);
4934         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4935             return &PL_core_reg_engine;
4936         return INT2PTR(regexp_engine*,SvIV(*ptr));
4937     }
4938     else {
4939         SV *ptr;
4940         if (!PL_curcop->cop_hints_hash)
4941             return &PL_core_reg_engine;
4942         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4943         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4944             return &PL_core_reg_engine;
4945         return INT2PTR(regexp_engine*,SvIV(ptr));
4946     }
4947 }
4948
4949
4950 REGEXP *
4951 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4952 {
4953     dVAR;
4954     regexp_engine const *eng = current_re_engine();
4955     GET_RE_DEBUG_FLAGS_DECL;
4956
4957     PERL_ARGS_ASSERT_PREGCOMP;
4958
4959     /* Dispatch a request to compile a regexp to correct regexp engine. */
4960     DEBUG_COMPILE_r({
4961         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4962                         PTR2UV(eng));
4963     });
4964     return CALLREGCOMP_ENG(eng, pattern, flags);
4965 }
4966 #endif
4967
4968 /* public(ish) entry point for the perl core's own regex compiling code.
4969  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4970  * pattern rather than a list of OPs, and uses the internal engine rather
4971  * than the current one */
4972
4973 REGEXP *
4974 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4975 {
4976     SV *pat = pattern; /* defeat constness! */
4977     PERL_ARGS_ASSERT_RE_COMPILE;
4978     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4979 #ifdef PERL_IN_XSUB_RE
4980                                 &my_reg_engine,
4981 #else
4982                                 &PL_core_reg_engine,
4983 #endif
4984                                 NULL, NULL, rx_flags, 0);
4985 }
4986
4987 /* see if there are any run-time code blocks in the pattern.
4988  * False positives are allowed */
4989
4990 static bool
4991 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4992                     U32 pm_flags, char *pat, STRLEN plen)
4993 {
4994     int n = 0;
4995     STRLEN s;
4996
4997     /* avoid infinitely recursing when we recompile the pattern parcelled up
4998      * as qr'...'. A single constant qr// string can't have have any
4999      * run-time component in it, and thus, no runtime code. (A non-qr
5000      * string, however, can, e.g. $x =~ '(?{})') */
5001     if  ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
5002         return 0;
5003
5004     for (s = 0; s < plen; s++) {
5005         if (n < pRExC_state->num_code_blocks
5006             && s == pRExC_state->code_blocks[n].start)
5007         {
5008             s = pRExC_state->code_blocks[n].end;
5009             n++;
5010             continue;
5011         }
5012         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5013          * positives here */
5014         if (pat[s] == '(' && pat[s+1] == '?' &&
5015             (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
5016         )
5017             return 1;
5018     }
5019     return 0;
5020 }
5021
5022 /* Handle run-time code blocks. We will already have compiled any direct
5023  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5024  * copy of it, but with any literal code blocks blanked out and
5025  * appropriate chars escaped; then feed it into
5026  *
5027  *    eval "qr'modified_pattern'"
5028  *
5029  * For example,
5030  *
5031  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5032  *
5033  * becomes
5034  *
5035  *    qr'a\\bc                       def\'ghi\\\\jkl(?{"this is runtime"})mno'
5036  *
5037  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5038  * and merge them with any code blocks of the original regexp.
5039  *
5040  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5041  * instead, just save the qr and return FALSE; this tells our caller that
5042  * the original pattern needs upgrading to utf8.
5043  */
5044
5045 static bool
5046 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5047     char *pat, STRLEN plen)
5048 {
5049     SV *qr;
5050
5051     GET_RE_DEBUG_FLAGS_DECL;
5052
5053     if (pRExC_state->runtime_code_qr) {
5054         /* this is the second time we've been called; this should
5055          * only happen if the main pattern got upgraded to utf8
5056          * during compilation; re-use the qr we compiled first time
5057          * round (which should be utf8 too)
5058          */
5059         qr = pRExC_state->runtime_code_qr;
5060         pRExC_state->runtime_code_qr = NULL;
5061         assert(RExC_utf8 && SvUTF8(qr));
5062     }
5063     else {
5064         int n = 0;
5065         STRLEN s;
5066         char *p, *newpat;
5067         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5068         SV *sv, *qr_ref;
5069         dSP;
5070
5071         /* determine how many extra chars we need for ' and \ escaping */
5072         for (s = 0; s < plen; s++) {
5073             if (pat[s] == '\'' || pat[s] == '\\')
5074                 newlen++;
5075         }
5076
5077         Newx(newpat, newlen, char);
5078         p = newpat;
5079         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5080
5081         for (s = 0; s < plen; s++) {
5082             if (n < pRExC_state->num_code_blocks
5083                 && s == pRExC_state->code_blocks[n].start)
5084             {
5085                 /* blank out literal code block */
5086                 assert(pat[s] == '(');
5087                 while (s <= pRExC_state->code_blocks[n].end) {
5088                     *p++ = ' ';
5089                     s++;
5090                 }
5091                 s--;
5092                 n++;
5093                 continue;
5094             }
5095             if (pat[s] == '\'' || pat[s] == '\\')
5096                 *p++ = '\\';
5097             *p++ = pat[s];
5098         }
5099         *p++ = '\'';
5100         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5101             *p++ = 'x';
5102         *p++ = '\0';
5103         DEBUG_COMPILE_r({
5104             PerlIO_printf(Perl_debug_log,
5105                 "%sre-parsing pattern for runtime code:%s %s\n",
5106                 PL_colors[4],PL_colors[5],newpat);
5107         });
5108
5109         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5110         Safefree(newpat);
5111
5112         ENTER;
5113         SAVETMPS;
5114         save_re_context();
5115         PUSHSTACKi(PERLSI_REQUIRE);
5116         /* this causes the toker to collapse \\ into \ when parsing
5117          * qr''; normally only q'' does this. It also alters hints
5118          * handling */
5119         PL_reg_state.re_reparsing = TRUE;
5120         eval_sv(sv, G_SCALAR);
5121         SvREFCNT_dec(sv);
5122         SPAGAIN;
5123         qr_ref = POPs;
5124         PUTBACK;
5125         if (SvTRUE(ERRSV))
5126             Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5127         assert(SvROK(qr_ref));
5128         qr = SvRV(qr_ref);
5129         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5130         /* the leaving below frees the tmp qr_ref.
5131          * Give qr a life of its own */
5132         SvREFCNT_inc(qr);
5133         POPSTACK;
5134         FREETMPS;
5135         LEAVE;
5136
5137     }
5138
5139     if (!RExC_utf8 && SvUTF8(qr)) {
5140         /* first time through; the pattern got upgraded; save the
5141          * qr for the next time through */
5142         assert(!pRExC_state->runtime_code_qr);
5143         pRExC_state->runtime_code_qr = qr;
5144         return 0;
5145     }
5146
5147
5148     /* extract any code blocks within the returned qr//  */
5149
5150
5151     /* merge the main (r1) and run-time (r2) code blocks into one */
5152     {
5153         RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5154         struct reg_code_block *new_block, *dst;
5155         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5156         int i1 = 0, i2 = 0;
5157
5158         if (!r2->num_code_blocks) /* we guessed wrong */
5159             return 1;
5160
5161         Newx(new_block,
5162             r1->num_code_blocks + r2->num_code_blocks,
5163             struct reg_code_block);
5164         dst = new_block;
5165
5166         while (    i1 < r1->num_code_blocks
5167                 || i2 < r2->num_code_blocks)
5168         {
5169             struct reg_code_block *src;
5170             bool is_qr = 0;
5171
5172             if (i1 == r1->num_code_blocks) {
5173                 src = &r2->code_blocks[i2++];
5174                 is_qr = 1;
5175             }
5176             else if (i2 == r2->num_code_blocks)
5177                 src = &r1->code_blocks[i1++];
5178             else if (  r1->code_blocks[i1].start
5179                      < r2->code_blocks[i2].start)
5180             {
5181                 src = &r1->code_blocks[i1++];
5182                 assert(src->end < r2->code_blocks[i2].start);
5183             }
5184             else {
5185                 assert(  r1->code_blocks[i1].start
5186                        > r2->code_blocks[i2].start);
5187                 src = &r2->code_blocks[i2++];
5188                 is_qr = 1;
5189                 assert(src->end < r1->code_blocks[i1].start);
5190             }
5191
5192             assert(pat[src->start] == '(');
5193             assert(pat[src->end]   == ')');
5194             dst->start      = src->start;
5195             dst->end        = src->end;
5196             dst->block      = src->block;
5197             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5198                                     : src->src_regex;
5199             dst++;
5200         }
5201         r1->num_code_blocks += r2->num_code_blocks;
5202         Safefree(r1->code_blocks);
5203         r1->code_blocks = new_block;
5204     }
5205
5206     SvREFCNT_dec(qr);
5207     return 1;
5208 }
5209
5210
5211 STATIC bool
5212 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5213 {
5214     /* This is the common code for setting up the floating and fixed length
5215      * string data extracted from Perlre_op_compile() below.  Returns a boolean
5216      * as to whether succeeded or not */
5217
5218     I32 t,ml;
5219
5220     if (! (longest_length
5221            || (eol /* Can't have SEOL and MULTI */
5222                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5223           )
5224             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5225         || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5226     {
5227         return FALSE;
5228     }
5229
5230     /* copy the information about the longest from the reg_scan_data
5231         over to the program. */
5232     if (SvUTF8(sv_longest)) {
5233         *rx_utf8 = sv_longest;
5234         *rx_substr = NULL;
5235     } else {
5236         *rx_substr = sv_longest;
5237         *rx_utf8 = NULL;
5238     }
5239     /* end_shift is how many chars that must be matched that
5240         follow this item. We calculate it ahead of time as once the
5241         lookbehind offset is added in we lose the ability to correctly
5242         calculate it.*/
5243     ml = minlen ? *(minlen) : (I32)longest_length;
5244     *rx_end_shift = ml - offset
5245         - longest_length + (SvTAIL(sv_longest) != 0)
5246         + lookbehind;
5247
5248     t = (eol/* Can't have SEOL and MULTI */
5249          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5250     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5251
5252     return TRUE;
5253 }
5254
5255 /*
5256  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5257  * regular expression into internal code.
5258  * The pattern may be passed either as:
5259  *    a list of SVs (patternp plus pat_count)
5260  *    a list of OPs (expr)
5261  * If both are passed, the SV list is used, but the OP list indicates
5262  * which SVs are actually pre-compiled code blocks
5263  *
5264  * The SVs in the list have magic and qr overloading applied to them (and
5265  * the list may be modified in-place with replacement SVs in the latter
5266  * case).
5267  *
5268  * If the pattern hasn't changed from old_re, then old_re will be
5269  * returned.
5270  *
5271  * eng is the current engine. If that engine has an op_comp method, then
5272  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5273  * do the initial concatenation of arguments and pass on to the external
5274  * engine.
5275  *
5276  * If is_bare_re is not null, set it to a boolean indicating whether the
5277  * arg list reduced (after overloading) to a single bare regex which has
5278  * been returned (i.e. /$qr/).
5279  *
5280  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5281  *
5282  * pm_flags contains the PMf_* flags, typically based on those from the
5283  * pm_flags field of the related PMOP. Currently we're only interested in
5284  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5285  *
5286  * We can't allocate space until we know how big the compiled form will be,
5287  * but we can't compile it (and thus know how big it is) until we've got a
5288  * place to put the code.  So we cheat:  we compile it twice, once with code
5289  * generation turned off and size counting turned on, and once "for real".
5290  * This also means that we don't allocate space until we are sure that the
5291  * thing really will compile successfully, and we never have to move the
5292  * code and thus invalidate pointers into it.  (Note that it has to be in
5293  * one piece because free() must be able to free it all.) [NB: not true in perl]
5294  *
5295  * Beware that the optimization-preparation code in here knows about some
5296  * of the structure of the compiled regexp.  [I'll say.]
5297  */
5298
5299 REGEXP *
5300 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5301                     OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5302                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5303 {
5304     dVAR;
5305     REGEXP *rx;
5306     struct regexp *r;
5307     regexp_internal *ri;
5308     STRLEN plen;
5309     char  * VOL exp;
5310     char* xend;
5311     regnode *scan;
5312     I32 flags;
5313     I32 minlen = 0;
5314     U32 rx_flags;
5315     SV * VOL pat;
5316
5317     /* these are all flags - maybe they should be turned
5318      * into a single int with different bit masks */
5319     I32 sawlookahead = 0;
5320     I32 sawplus = 0;
5321     I32 sawopen = 0;
5322     bool used_setjump = FALSE;
5323     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5324     bool code_is_utf8 = 0;
5325     bool VOL recompile = 0;
5326     bool runtime_code = 0;
5327     U8 jump_ret = 0;
5328     dJMPENV;
5329     scan_data_t data;
5330     RExC_state_t RExC_state;
5331     RExC_state_t * const pRExC_state = &RExC_state;
5332 #ifdef TRIE_STUDY_OPT
5333     int restudied;
5334     RExC_state_t copyRExC_state;
5335 #endif
5336     GET_RE_DEBUG_FLAGS_DECL;
5337
5338     PERL_ARGS_ASSERT_RE_OP_COMPILE;
5339
5340     DEBUG_r(if (!PL_colorset) reginitcolors());
5341
5342 #ifndef PERL_IN_XSUB_RE
5343     /* Initialize these here instead of as-needed, as is quick and avoids
5344      * having to test them each time otherwise */
5345     if (! PL_AboveLatin1) {
5346         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5347         PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5348         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5349
5350         PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5351         PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5352
5353         PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5354         PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5355
5356         PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5357         PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5358
5359         PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5360
5361         PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5362         PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5363
5364         PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5365
5366         PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5367         PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5368
5369         PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5370         PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5371
5372         PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5373         PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5374
5375         PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5376         PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5377
5378         PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5379         PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5380
5381         PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5382         PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5383
5384         PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5385         PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5386
5387         PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5388
5389         PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5390         PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5391
5392         PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5393         PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5394     }
5395 #endif
5396
5397     pRExC_state->code_blocks = NULL;
5398     pRExC_state->num_code_blocks = 0;
5399
5400     if (is_bare_re)
5401         *is_bare_re = FALSE;
5402
5403     if (expr && (expr->op_type == OP_LIST ||
5404                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5405
5406         /* is the source UTF8, and how many code blocks are there? */
5407         OP *o;
5408         int ncode = 0;
5409
5410         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5411             if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5412                 code_is_utf8 = 1;
5413             else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5414                 /* count of DO blocks */
5415                 ncode++;
5416         }
5417         if (ncode) {
5418             pRExC_state->num_code_blocks = ncode;
5419             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5420         }
5421     }
5422
5423     if (pat_count) {
5424         /* handle a list of SVs */
5425
5426         SV **svp;
5427
5428         /* apply magic and RE overloading to each arg */
5429         for (svp = patternp; svp < patternp + pat_count; svp++) {
5430             SV *rx = *svp;
5431             SvGETMAGIC(rx);
5432             if (SvROK(rx) && SvAMAGIC(rx)) {
5433                 SV *sv = AMG_CALLunary(rx, regexp_amg);
5434                 if (sv) {
5435                     if (SvROK(sv))
5436                         sv = SvRV(sv);
5437                     if (SvTYPE(sv) != SVt_REGEXP)
5438                         Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5439                     *svp = sv;
5440                 }
5441             }
5442         }
5443
5444         if (pat_count > 1) {
5445             /* concat multiple args and find any code block indexes */
5446
5447             OP *o = NULL;
5448             int n = 0;
5449             bool utf8 = 0;
5450             STRLEN orig_patlen = 0;
5451
5452             if (pRExC_state->num_code_blocks) {
5453                 o = cLISTOPx(expr)->op_first;
5454                 assert(o->op_type == OP_PUSHMARK);
5455                 o = o->op_sibling;
5456             }
5457
5458             pat = newSVpvn("", 0);
5459             SAVEFREESV(pat);
5460
5461             /* determine if the pattern is going to be utf8 (needed
5462              * in advance to align code block indices correctly).
5463              * XXX This could fail to be detected for an arg with
5464              * overloading but not concat overloading; but the main effect
5465              * in this obscure case is to need a 'use re eval' for a
5466              * literal code block */
5467             for (svp = patternp; svp < patternp + pat_count; svp++) {
5468                 if (SvUTF8(*svp))
5469                     utf8 = 1;
5470             }
5471             if (utf8)
5472                 SvUTF8_on(pat);
5473
5474             for (svp = patternp; svp < patternp + pat_count; svp++) {
5475                 SV *sv, *msv = *svp;
5476                 SV *rx;
5477                 bool code = 0;
5478                 if (o) {
5479                     if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5480                         assert(n < pRExC_state->num_code_blocks);
5481                         pRExC_state->code_blocks[n].start = SvCUR(pat);
5482                         pRExC_state->code_blocks[n].block = o;
5483                         pRExC_state->code_blocks[n].src_regex = NULL;
5484                         n++;
5485                         code = 1;
5486                         o = o->op_sibling; /* skip CONST */
5487                         assert(o);
5488                     }
5489                     o = o->op_sibling;;
5490                 }
5491
5492                 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5493                         (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5494                 {
5495                     sv_setsv(pat, sv);
5496                     /* overloading involved: all bets are off over literal
5497                      * code. Pretend we haven't seen it */
5498                     pRExC_state->num_code_blocks -= n;
5499                     n = 0;
5500                     rx = NULL;
5501
5502                 }
5503                 else  {
5504                     while (SvAMAGIC(msv)
5505                             && (sv = AMG_CALLunary(msv, string_amg))
5506                             && sv != msv)
5507                     {
5508                         msv = sv;
5509                         SvGETMAGIC(msv);
5510                     }
5511                     if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5512                         msv = SvRV(msv);
5513                     orig_patlen = SvCUR(pat);
5514                     sv_catsv_nomg(pat, msv);
5515                     rx = msv;
5516                     if (code)
5517                         pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5518                 }
5519
5520                 /* extract any code blocks within any embedded qr//'s */
5521                 if (rx && SvTYPE(rx) == SVt_REGEXP
5522                     && RX_ENGINE((REGEXP*)rx)->op_comp)
5523                 {
5524
5525                     RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5526                     if (ri->num_code_blocks) {
5527                         int i;
5528                         /* the presence of an embedded qr// with code means
5529                          * we should always recompile: the text of the
5530                          * qr// may not have changed, but it may be a
5531                          * different closure than last time */
5532                         recompile = 1;
5533                         Renew(pRExC_state->code_blocks,
5534                             pRExC_state->num_code_blocks + ri->num_code_blocks,
5535                             struct reg_code_block);
5536                         pRExC_state->num_code_blocks += ri->num_code_blocks;
5537                         for (i=0; i < ri->num_code_blocks; i++) {
5538                             struct reg_code_block *src, *dst;
5539                             STRLEN offset =  orig_patlen
5540                                 + ((struct regexp *)SvANY(rx))->pre_prefix;
5541                             assert(n < pRExC_state->num_code_blocks);
5542                             src = &ri->code_blocks[i];
5543                             dst = &pRExC_state->code_blocks[n];
5544                             dst->start      = src->start + offset;
5545                             dst->end        = src->end   + offset;
5546                             dst->block      = src->block;
5547                             dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5548                                                     src->src_regex
5549                                                         ? src->src_regex
5550                                                         : (REGEXP*)rx);
5551                             n++;
5552                         }
5553                     }
5554                 }
5555             }
5556             SvSETMAGIC(pat);
5557         }
5558         else {
5559             SV *sv;
5560             pat = *patternp;
5561             while (SvAMAGIC(pat)
5562                     && (sv = AMG_CALLunary(pat, string_amg))
5563                     && sv != pat)
5564             {
5565                 pat = sv;
5566                 SvGETMAGIC(pat);
5567             }
5568         }
5569
5570         /* handle bare regex: foo =~ $re */
5571         {
5572             SV *re = pat;
5573             if (SvROK(re))
5574                 re = SvRV(re);
5575             if (SvTYPE(re) == SVt_REGEXP) {
5576                 if (is_bare_re)
5577                     *is_bare_re = TRUE;
5578                 SvREFCNT_inc(re);
5579                 Safefree(pRExC_state->code_blocks);
5580                 return (REGEXP*)re;
5581             }
5582         }
5583     }
5584     else {
5585         /* not a list of SVs, so must be a list of OPs */
5586         assert(expr);
5587         if (expr->op_type == OP_LIST) {
5588             int i = -1;
5589             bool is_code = 0;
5590             OP *o;
5591
5592             pat = newSVpvn("", 0);
5593             SAVEFREESV(pat);
5594             if (code_is_utf8)
5595                 SvUTF8_on(pat);
5596
5597             /* given a list of CONSTs and DO blocks in expr, append all
5598              * the CONSTs to pat, and record the start and end of each
5599              * code block in code_blocks[] (each DO{} op is followed by an
5600              * OP_CONST containing the corresponding literal '(?{...})
5601              * text)
5602              */
5603             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5604                 if (o->op_type == OP_CONST) {
5605                     sv_catsv(pat, cSVOPo_sv);
5606                     if (is_code) {
5607                         pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5608                         is_code = 0;
5609                     }
5610                 }
5611                 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5612                     assert(i+1 < pRExC_state->num_code_blocks);
5613                     pRExC_state->code_blocks[++i].start = SvCUR(pat);
5614                     pRExC_state->code_blocks[i].block = o;
5615                     pRExC_state->code_blocks[i].src_regex = NULL;
5616                     is_code = 1;
5617                 }
5618             }
5619         }
5620         else {
5621             assert(expr->op_type == OP_CONST);
5622             pat = cSVOPx_sv(expr);
5623         }
5624     }
5625
5626     exp = SvPV_nomg(pat, plen);
5627
5628     if (!eng->op_comp) {
5629         if ((SvUTF8(pat) && IN_BYTES)
5630                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5631         {
5632             /* make a temporary copy; either to convert to bytes,
5633              * or to avoid repeating get-magic / overloaded stringify */
5634             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5635                                         (IN_BYTES ? 0 : SvUTF8(pat)));
5636         }
5637         Safefree(pRExC_state->code_blocks);
5638         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5639     }
5640
5641     /* ignore the utf8ness if the pattern is 0 length */
5642     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5643     RExC_uni_semantics = 0;
5644     RExC_contains_locale = 0;
5645     pRExC_state->runtime_code_qr = NULL;
5646
5647     /****************** LONG JUMP TARGET HERE***********************/
5648     /* Longjmp back to here if have to switch in midstream to utf8 */
5649     if (! RExC_orig_utf8) {
5650         JMPENV_PUSH(jump_ret);
5651         used_setjump = TRUE;
5652     }
5653
5654     if (jump_ret == 0) {    /* First time through */
5655         xend = exp + plen;
5656
5657         DEBUG_COMPILE_r({
5658             SV *dsv= sv_newmortal();
5659             RE_PV_QUOTED_DECL(s, RExC_utf8,
5660                 dsv, exp, plen, 60);
5661             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5662                            PL_colors[4],PL_colors[5],s);
5663         });
5664     }
5665     else {  /* longjumped back */
5666         U8 *src, *dst;
5667         int n=0;
5668         STRLEN s = 0, d = 0;
5669         bool do_end = 0;
5670
5671         /* If the cause for the longjmp was other than changing to utf8, pop
5672          * our own setjmp, and longjmp to the correct handler */
5673         if (jump_ret != UTF8_LONGJMP) {
5674             JMPENV_POP;
5675             JMPENV_JUMP(jump_ret);
5676         }
5677
5678         GET_RE_DEBUG_FLAGS;
5679
5680         /* It's possible to write a regexp in ascii that represents Unicode
5681         codepoints outside of the byte range, such as via \x{100}. If we
5682         detect such a sequence we have to convert the entire pattern to utf8
5683         and then recompile, as our sizing calculation will have been based
5684         on 1 byte == 1 character, but we will need to use utf8 to encode
5685         at least some part of the pattern, and therefore must convert the whole
5686         thing.
5687         -- dmq */
5688         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5689             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5690
5691         /* upgrade pattern to UTF8, and if there are code blocks,
5692          * recalculate the indices.
5693          * This is essentially an unrolled Perl_bytes_to_utf8() */
5694
5695         src = (U8*)SvPV_nomg(pat, plen);
5696         Newx(dst, plen * 2 + 1, U8);
5697
5698         while (s < plen) {
5699             const UV uv = NATIVE_TO_ASCII(src[s]);
5700             if (UNI_IS_INVARIANT(uv))
5701                 dst[d]   = (U8)UTF_TO_NATIVE(uv);
5702             else {
5703                 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5704                 dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
5705             }
5706             if (n < pRExC_state->num_code_blocks) {
5707                 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5708                     pRExC_state->code_blocks[n].start = d;
5709                     assert(dst[d] == '(');
5710                     do_end = 1;
5711                 }
5712                 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5713                     pRExC_state->code_blocks[n].end = d;
5714                     assert(dst[d] == ')');
5715                     do_end = 0;
5716                     n++;
5717                 }
5718             }
5719             s++;
5720             d++;
5721         }
5722         dst[d] = '\0';
5723         plen = d;
5724         exp = (char*) dst;
5725         xend = exp + plen;
5726         SAVEFREEPV(exp);
5727         RExC_orig_utf8 = RExC_utf8 = 1;
5728     }
5729
5730     /* return old regex if pattern hasn't changed */
5731
5732     if (   old_re
5733         && !recompile
5734         && !!RX_UTF8(old_re) == !!RExC_utf8
5735         && RX_PRECOMP(old_re)
5736         && RX_PRELEN(old_re) == plen
5737         && memEQ(RX_PRECOMP(old_re), exp, plen))
5738     {
5739         /* with runtime code, always recompile */
5740         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5741                                             exp, plen);
5742         if (!runtime_code) {
5743             if (used_setjump) {
5744                 JMPENV_POP;
5745             }
5746             Safefree(pRExC_state->code_blocks);
5747             return old_re;
5748         }
5749     }
5750     else if ((pm_flags & PMf_USE_RE_EVAL)
5751                 /* this second condition covers the non-regex literal case,
5752                  * i.e.  $foo =~ '(?{})'. */
5753                 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5754                     && (PL_hints & HINT_RE_EVAL))
5755     )
5756         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5757                             exp, plen);
5758
5759 #ifdef TRIE_STUDY_OPT
5760     restudied = 0;
5761 #endif
5762
5763     rx_flags = orig_rx_flags;
5764
5765     if (initial_charset == REGEX_LOCALE_CHARSET) {
5766         RExC_contains_locale = 1;
5767     }
5768     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5769
5770         /* Set to use unicode semantics if the pattern is in utf8 and has the
5771          * 'depends' charset specified, as it means unicode when utf8  */
5772         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5773     }
5774
5775     RExC_precomp = exp;
5776     RExC_flags = rx_flags;
5777     RExC_pm_flags = pm_flags;
5778
5779     if (runtime_code) {
5780         if (PL_tainting && PL_tainted)
5781             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5782
5783         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5784             /* whoops, we have a non-utf8 pattern, whilst run-time code
5785              * got compiled as utf8. Try again with a utf8 pattern */
5786              JMPENV_JUMP(UTF8_LONGJMP);
5787         }
5788     }
5789     assert(!pRExC_state->runtime_code_qr);
5790
5791     RExC_sawback = 0;
5792
5793     RExC_seen = 0;
5794     RExC_in_lookbehind = 0;
5795     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5796     RExC_extralen = 0;
5797     RExC_override_recoding = 0;
5798
5799     /* First pass: determine size, legality. */
5800     RExC_parse = exp;
5801     RExC_start = exp;
5802     RExC_end = xend;
5803     RExC_naughty = 0;
5804     RExC_npar = 1;
5805     RExC_nestroot = 0;
5806     RExC_size = 0L;
5807     RExC_emit = &PL_regdummy;
5808     RExC_whilem_seen = 0;
5809     RExC_open_parens = NULL;
5810     RExC_close_parens = NULL;
5811     RExC_opend = NULL;
5812     RExC_paren_names = NULL;
5813 #ifdef DEBUGGING
5814     RExC_paren_name_list = NULL;
5815 #endif
5816     RExC_recurse = NULL;
5817     RExC_recurse_count = 0;
5818     pRExC_state->code_index = 0;
5819
5820 #if 0 /* REGC() is (currently) a NOP at the first pass.
5821        * Clever compilers notice this and complain. --jhi */
5822     REGC((U8)REG_MAGIC, (char*)RExC_emit);
5823 #endif
5824     DEBUG_PARSE_r(
5825         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5826         RExC_lastnum=0;
5827         RExC_lastparse=NULL;
5828     );
5829     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5830         RExC_precomp = NULL;
5831         Safefree(pRExC_state->code_blocks);
5832         return(NULL);
5833     }
5834
5835     /* Here, finished first pass.  Get rid of any added setjmp */
5836     if (used_setjump) {
5837         JMPENV_POP;
5838     }
5839
5840     DEBUG_PARSE_r({
5841         PerlIO_printf(Perl_debug_log,
5842             "Required size %"IVdf" nodes\n"
5843             "Starting second pass (creation)\n",
5844             (IV)RExC_size);
5845         RExC_lastnum=0;
5846         RExC_lastparse=NULL;
5847     });
5848
5849     /* The first pass could have found things that force Unicode semantics */
5850     if ((RExC_utf8 || RExC_uni_semantics)
5851          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5852     {
5853         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5854     }
5855
5856     /* Small enough for pointer-storage convention?
5857        If extralen==0, this means that we will not need long jumps. */
5858     if (RExC_size >= 0x10000L && RExC_extralen)
5859         RExC_size += RExC_extralen;
5860     else
5861         RExC_extralen = 0;
5862     if (RExC_whilem_seen > 15)
5863         RExC_whilem_seen = 15;
5864
5865     /* Allocate space and zero-initialize. Note, the two step process
5866        of zeroing when in debug mode, thus anything assigned has to
5867        happen after that */
5868     rx = (REGEXP*) newSV_type(SVt_REGEXP);
5869     r = (struct regexp*)SvANY(rx);
5870     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5871          char, regexp_internal);
5872     if ( r == NULL || ri == NULL )
5873         FAIL("Regexp out of space");
5874 #ifdef DEBUGGING
5875     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5876     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5877 #else
5878     /* bulk initialize base fields with 0. */
5879     Zero(ri, sizeof(regexp_internal), char);
5880 #endif
5881
5882     /* non-zero initialization begins here */
5883     RXi_SET( r, ri );
5884     r->engine= eng;
5885     r->extflags = rx_flags;
5886     if (pm_flags & PMf_IS_QR) {
5887         ri->code_blocks = pRExC_state->code_blocks;
5888         ri->num_code_blocks = pRExC_state->num_code_blocks;
5889     }
5890     else
5891         SAVEFREEPV(pRExC_state->code_blocks);
5892
5893     {
5894         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5895         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5896
5897         /* The caret is output if there are any defaults: if not all the STD
5898          * flags are set, or if no character set specifier is needed */
5899         bool has_default =
5900                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5901                     || ! has_charset);
5902         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5903         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5904                             >> RXf_PMf_STD_PMMOD_SHIFT);
5905         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5906         char *p;
5907         /* Allocate for the worst case, which is all the std flags are turned
5908          * on.  If more precision is desired, we could do a population count of
5909          * the flags set.  This could be done with a small lookup table, or by
5910          * shifting, masking and adding, or even, when available, assembly
5911          * language for a machine-language population count.
5912          * We never output a minus, as all those are defaults, so are
5913          * covered by the caret */
5914         const STRLEN wraplen = plen + has_p + has_runon
5915             + has_default       /* If needs a caret */
5916
5917                 /* If needs a character set specifier */
5918             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5919             + (sizeof(STD_PAT_MODS) - 1)
5920             + (sizeof("(?:)") - 1);
5921
5922         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5923         SvPOK_on(rx);
5924         if (RExC_utf8)
5925             SvFLAGS(rx) |= SVf_UTF8;
5926         *p++='('; *p++='?';
5927
5928         /* If a default, cover it using the caret */
5929         if (has_default) {
5930             *p++= DEFAULT_PAT_MOD;
5931         }
5932         if (has_charset) {
5933             STRLEN len;
5934             const char* const name = get_regex_charset_name(r->extflags, &len);
5935             Copy(name, p, len, char);
5936             p += len;
5937         }
5938         if (has_p)
5939             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5940         {
5941             char ch;
5942             while((ch = *fptr++)) {
5943                 if(reganch & 1)
5944                     *p++ = ch;
5945                 reganch >>= 1;
5946             }
5947         }
5948
5949         *p++ = ':';
5950         Copy(RExC_precomp, p, plen, char);
5951         assert ((RX_WRAPPED(rx) - p) < 16);
5952         r->pre_prefix = p - RX_WRAPPED(rx);
5953         p += plen;
5954         if (has_runon)
5955             *p++ = '\n';
5956         *p++ = ')';
5957         *p = 0;
5958         SvCUR_set(rx, p - SvPVX_const(rx));
5959     }
5960
5961     r->intflags = 0;
5962     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5963
5964     if (RExC_seen & REG_SEEN_RECURSE) {
5965         Newxz(RExC_open_parens, RExC_npar,regnode *);
5966         SAVEFREEPV(RExC_open_parens);
5967         Newxz(RExC_close_parens,RExC_npar,regnode *);
5968         SAVEFREEPV(RExC_close_parens);
5969     }
5970
5971     /* Useful during FAIL. */
5972 #ifdef RE_TRACK_PATTERN_OFFSETS
5973     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5974     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5975                           "%s %"UVuf" bytes for offset annotations.\n",
5976                           ri->u.offsets ? "Got" : "Couldn't get",
5977                           (UV)((2*RExC_size+1) * sizeof(U32))));
5978 #endif
5979     SetProgLen(ri,RExC_size);
5980     RExC_rx_sv = rx;
5981     RExC_rx = r;
5982     RExC_rxi = ri;
5983     REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5984
5985     /* Second pass: emit code. */
5986     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
5987     RExC_pm_flags = pm_flags;
5988     RExC_parse = exp;
5989     RExC_end = xend;
5990     RExC_naughty = 0;
5991     RExC_npar = 1;
5992     RExC_emit_start = ri->program;
5993     RExC_emit = ri->program;
5994     RExC_emit_bound = ri->program + RExC_size + 1;
5995     pRExC_state->code_index = 0;
5996
5997     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5998     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5999         ReREFCNT_dec(rx);
6000         return(NULL);
6001     }
6002     /* XXXX To minimize changes to RE engine we always allocate
6003        3-units-long substrs field. */
6004     Newx(r->substrs, 1, struct reg_substr_data);
6005     if (RExC_recurse_count) {
6006         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6007         SAVEFREEPV(RExC_recurse);
6008     }
6009
6010 reStudy:
6011     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6012     Zero(r->substrs, 1, struct reg_substr_data);
6013
6014 #ifdef TRIE_STUDY_OPT
6015     if (!restudied) {
6016         StructCopy(&zero_scan_data, &data, scan_data_t);
6017         copyRExC_state = RExC_state;
6018     } else {
6019         U32 seen=RExC_seen;
6020         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6021
6022         RExC_state = copyRExC_state;
6023         if (seen & REG_TOP_LEVEL_BRANCHES)
6024             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6025         else
6026             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6027         if (data.last_found) {
6028             SvREFCNT_dec(data.longest_fixed);
6029             SvREFCNT_dec(data.longest_float);
6030             SvREFCNT_dec(data.last_found);
6031         }
6032         StructCopy(&zero_scan_data, &data, scan_data_t);
6033     }
6034 #else
6035     StructCopy(&zero_scan_data, &data, scan_data_t);
6036 #endif
6037
6038     /* Dig out information for optimizations. */
6039     r->extflags = RExC_flags; /* was pm_op */
6040     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6041
6042     if (UTF)
6043         SvUTF8_on(rx);  /* Unicode in it? */
6044     ri->regstclass = NULL;
6045     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6046         r->intflags |= PREGf_NAUGHTY;
6047     scan = ri->program + 1;             /* First BRANCH. */
6048
6049     /* testing for BRANCH here tells us whether there is "must appear"
6050        data in the pattern. If there is then we can use it for optimisations */
6051     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6052         I32 fake;
6053         STRLEN longest_float_length, longest_fixed_length;
6054         struct regnode_charclass_class ch_class; /* pointed to by data */
6055         int stclass_flag;
6056         I32 last_close = 0; /* pointed to by data */
6057         regnode *first= scan;
6058         regnode *first_next= regnext(first);
6059         /*
6060          * Skip introductions and multiplicators >= 1
6061          * so that we can extract the 'meat' of the pattern that must
6062          * match in the large if() sequence following.
6063          * NOTE that EXACT is NOT covered here, as it is normally
6064          * picked up by the optimiser separately.
6065          *
6066          * This is unfortunate as the optimiser isnt handling lookahead
6067          * properly currently.
6068          *
6069          */
6070         while ((OP(first) == OPEN && (sawopen = 1)) ||
6071                /* An OR of *one* alternative - should not happen now. */
6072             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6073             /* for now we can't handle lookbehind IFMATCH*/
6074             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6075             (OP(first) == PLUS) ||
6076             (OP(first) == MINMOD) ||
6077                /* An {n,m} with n>0 */
6078             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6079             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6080         {
6081                 /*
6082                  * the only op that could be a regnode is PLUS, all the rest
6083                  * will be regnode_1 or regnode_2.
6084                  *
6085                  */
6086                 if (OP(first) == PLUS)
6087                     sawplus = 1;
6088                 else
6089                     first += regarglen[OP(first)];
6090
6091                 first = NEXTOPER(first);
6092                 first_next= regnext(first);
6093         }
6094
6095         /* Starting-point info. */
6096       again:
6097         DEBUG_PEEP("first:",first,0);
6098         /* Ignore EXACT as we deal with it later. */
6099         if (PL_regkind[OP(first)] == EXACT) {
6100             if (OP(first) == EXACT)
6101                 NOOP;   /* Empty, get anchored substr later. */
6102             else
6103                 ri->regstclass = first;
6104         }
6105 #ifdef TRIE_STCLASS
6106         else if (PL_regkind[OP(first)] == TRIE &&
6107                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6108         {
6109             regnode *trie_op;
6110             /* this can happen only on restudy */
6111             if ( OP(first) == TRIE ) {
6112                 struct regnode_1 *trieop = (struct regnode_1 *)
6113                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6114                 StructCopy(first,trieop,struct regnode_1);
6115                 trie_op=(regnode *)trieop;
6116             } else {
6117                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6118                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6119                 StructCopy(first,trieop,struct regnode_charclass);
6120                 trie_op=(regnode *)trieop;
6121             }
6122             OP(trie_op)+=2;
6123             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6124             ri->regstclass = trie_op;
6125         }
6126 #endif
6127         else if (REGNODE_SIMPLE(OP(first)))
6128             ri->regstclass = first;
6129         else if (PL_regkind[OP(first)] == BOUND ||
6130                  PL_regkind[OP(first)] == NBOUND)
6131             ri->regstclass = first;
6132         else if (PL_regkind[OP(first)] == BOL) {
6133             r->extflags |= (OP(first) == MBOL
6134                            ? RXf_ANCH_MBOL
6135                            : (OP(first) == SBOL
6136                               ? RXf_ANCH_SBOL
6137                               : RXf_ANCH_BOL));
6138             first = NEXTOPER(first);
6139             goto again;
6140         }
6141         else if (OP(first) == GPOS) {
6142             r->extflags |= RXf_ANCH_GPOS;
6143             first = NEXTOPER(first);
6144             goto again;
6145         }
6146         else if ((!sawopen || !RExC_sawback) &&
6147             (OP(first) == STAR &&
6148             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6149             !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6150         {
6151             /* turn .* into ^.* with an implied $*=1 */
6152             const int type =
6153                 (OP(NEXTOPER(first)) == REG_ANY)
6154                     ? RXf_ANCH_MBOL
6155                     : RXf_ANCH_SBOL;
6156             r->extflags |= type;
6157             r->intflags |= PREGf_IMPLICIT;
6158             first = NEXTOPER(first);
6159             goto again;
6160         }
6161         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6162             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6163             /* x+ must match at the 1st pos of run of x's */
6164             r->intflags |= PREGf_SKIP;
6165
6166         /* Scan is after the zeroth branch, first is atomic matcher. */
6167 #ifdef TRIE_STUDY_OPT
6168         DEBUG_PARSE_r(
6169             if (!restudied)
6170                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6171                               (IV)(first - scan + 1))
6172         );
6173 #else
6174         DEBUG_PARSE_r(
6175             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6176                 (IV)(first - scan + 1))
6177         );
6178 #endif
6179
6180
6181         /*
6182         * If there's something expensive in the r.e., find the
6183         * longest literal string that must appear and make it the
6184         * regmust.  Resolve ties in favor of later strings, since
6185         * the regstart check works with the beginning of the r.e.
6186         * and avoiding duplication strengthens checking.  Not a
6187         * strong reason, but sufficient in the absence of others.
6188         * [Now we resolve ties in favor of the earlier string if
6189         * it happens that c_offset_min has been invalidated, since the
6190         * earlier string may buy us something the later one won't.]
6191         */
6192
6193         data.longest_fixed = newSVpvs("");
6194         data.longest_float = newSVpvs("");
6195         data.last_found = newSVpvs("");
6196         data.longest = &(data.longest_fixed);
6197         first = scan;
6198         if (!ri->regstclass) {
6199             cl_init(pRExC_state, &ch_class);
6200             data.start_class = &ch_class;
6201             stclass_flag = SCF_DO_STCLASS_AND;
6202         } else                          /* XXXX Check for BOUND? */
6203             stclass_flag = 0;
6204         data.last_closep = &last_close;
6205
6206         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6207             &data, -1, NULL, NULL,
6208             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6209
6210
6211         CHECK_RESTUDY_GOTO;
6212
6213
6214         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6215              && data.last_start_min == 0 && data.last_end > 0
6216              && !RExC_seen_zerolen
6217              && !(RExC_seen & REG_SEEN_VERBARG)
6218              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6219             r->extflags |= RXf_CHECK_ALL;
6220         scan_commit(pRExC_state, &data,&minlen,0);
6221         SvREFCNT_dec(data.last_found);
6222
6223         longest_float_length = CHR_SVLEN(data.longest_float);
6224
6225         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6226                    && data.offset_fixed == data.offset_float_min
6227                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6228             && S_setup_longest (aTHX_ pRExC_state,
6229                                     data.longest_float,
6230                                     &(r->float_utf8),
6231                                     &(r->float_substr),
6232                                     &(r->float_end_shift),
6233                                     data.lookbehind_float,
6234                                     data.offset_float_min,
6235                                     data.minlen_float,
6236                                     longest_float_length,
6237                                     data.flags & SF_FL_BEFORE_EOL,
6238                                     data.flags & SF_FL_BEFORE_MEOL))
6239         {
6240             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6241             r->float_max_offset = data.offset_float_max;
6242             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6243                 r->float_max_offset -= data.lookbehind_float;
6244         }
6245         else {
6246             r->float_substr = r->float_utf8 = NULL;
6247             SvREFCNT_dec(data.longest_float);
6248             longest_float_length = 0;
6249         }
6250
6251         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6252
6253         if (S_setup_longest (aTHX_ pRExC_state,
6254                                 data.longest_fixed,
6255                                 &(r->anchored_utf8),
6256                                 &(r->anchored_substr),
6257                                 &(r->anchored_end_shift),
6258                                 data.lookbehind_fixed,
6259                                 data.offset_fixed,
6260                                 data.minlen_fixed,
6261                                 longest_fixed_length,
6262                                 data.flags & SF_FIX_BEFORE_EOL,
6263                                 data.flags & SF_FIX_BEFORE_MEOL))
6264         {
6265             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6266         }
6267         else {
6268             r->anchored_substr = r->anchored_utf8 = NULL;
6269             SvREFCNT_dec(data.longest_fixed);
6270             longest_fixed_length = 0;
6271         }
6272
6273         if (ri->regstclass
6274             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6275             ri->regstclass = NULL;
6276
6277         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6278             && stclass_flag
6279             && !(data.start_class->flags & ANYOF_EOS)
6280             && !cl_is_anything(data.start_class))
6281         {
6282             const U32 n = add_data(pRExC_state, 1, "f");
6283             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6284
6285             Newx(RExC_rxi->data->data[n], 1,
6286                 struct regnode_charclass_class);
6287             StructCopy(data.start_class,
6288                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6289                        struct regnode_charclass_class);
6290             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6291             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6292             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6293                       regprop(r, sv, (regnode*)data.start_class);
6294                       PerlIO_printf(Perl_debug_log,
6295                                     "synthetic stclass \"%s\".\n",
6296                                     SvPVX_const(sv));});
6297         }
6298
6299         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6300         if (longest_fixed_length > longest_float_length) {
6301             r->check_end_shift = r->anchored_end_shift;
6302             r->check_substr = r->anchored_substr;
6303             r->check_utf8 = r->anchored_utf8;
6304             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6305             if (r->extflags & RXf_ANCH_SINGLE)
6306                 r->extflags |= RXf_NOSCAN;
6307         }
6308         else {
6309             r->check_end_shift = r->float_end_shift;
6310             r->check_substr = r->float_substr;
6311             r->check_utf8 = r->float_utf8;
6312             r->check_offset_min = r->float_min_offset;
6313             r->check_offset_max = r->float_max_offset;
6314         }
6315         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6316            This should be changed ASAP!  */
6317         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6318             r->extflags |= RXf_USE_INTUIT;
6319             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6320                 r->extflags |= RXf_INTUIT_TAIL;
6321         }
6322         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6323         if ( (STRLEN)minlen < longest_float_length )
6324             minlen= longest_float_length;
6325         if ( (STRLEN)minlen < longest_fixed_length )
6326             minlen= longest_fixed_length;
6327         */
6328     }
6329     else {
6330         /* Several toplevels. Best we can is to set minlen. */
6331         I32 fake;
6332         struct regnode_charclass_class ch_class;
6333         I32 last_close = 0;
6334
6335         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6336
6337         scan = ri->program + 1;
6338         cl_init(pRExC_state, &ch_class);
6339         data.start_class = &ch_class;
6340         data.last_closep = &last_close;
6341
6342
6343         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6344             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6345
6346         CHECK_RESTUDY_GOTO;
6347
6348         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6349                 = r->float_substr = r->float_utf8 = NULL;
6350
6351         if (!(data.start_class->flags & ANYOF_EOS)
6352             && !cl_is_anything(data.start_class))
6353         {
6354             const U32 n = add_data(pRExC_state, 1, "f");
6355             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6356
6357             Newx(RExC_rxi->data->data[n], 1,
6358                 struct regnode_charclass_class);
6359             StructCopy(data.start_class,
6360                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6361                        struct regnode_charclass_class);
6362             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6363             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6364             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6365                       regprop(r, sv, (regnode*)data.start_class);
6366                       PerlIO_printf(Perl_debug_log,
6367                                     "synthetic stclass \"%s\".\n",
6368                                     SvPVX_const(sv));});
6369         }
6370     }
6371
6372     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6373        the "real" pattern. */
6374     DEBUG_OPTIMISE_r({
6375         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6376                       (IV)minlen, (IV)r->minlen);
6377     });
6378     r->minlenret = minlen;
6379     if (r->minlen < minlen)
6380         r->minlen = minlen;
6381
6382     if (RExC_seen & REG_SEEN_GPOS)
6383         r->extflags |= RXf_GPOS_SEEN;
6384     if (RExC_seen & REG_SEEN_LOOKBEHIND)
6385         r->extflags |= RXf_LOOKBEHIND_SEEN;
6386     if (pRExC_state->num_code_blocks)
6387         r->extflags |= RXf_EVAL_SEEN;
6388     if (RExC_seen & REG_SEEN_CANY)
6389         r->extflags |= RXf_CANY_SEEN;
6390     if (RExC_seen & REG_SEEN_VERBARG)
6391         r->intflags |= PREGf_VERBARG_SEEN;
6392     if (RExC_seen & REG_SEEN_CUTGROUP)
6393         r->intflags |= PREGf_CUTGROUP_SEEN;
6394     if (pm_flags & PMf_USE_RE_EVAL)
6395         r->intflags |= PREGf_USE_RE_EVAL;
6396     if (RExC_paren_names)
6397         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6398     else
6399         RXp_PAREN_NAMES(r) = NULL;
6400
6401 #ifdef STUPID_PATTERN_CHECKS
6402     if (RX_PRELEN(rx) == 0)
6403         r->extflags |= RXf_NULL;
6404     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6405         /* XXX: this should happen BEFORE we compile */
6406         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6407     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6408         r->extflags |= RXf_WHITE;
6409     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6410         r->extflags |= RXf_START_ONLY;
6411 #else
6412     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6413             /* XXX: this should happen BEFORE we compile */
6414             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6415     else {
6416         regnode *first = ri->program + 1;
6417         U8 fop = OP(first);
6418
6419         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6420             r->extflags |= RXf_NULL;
6421         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6422             r->extflags |= RXf_START_ONLY;
6423         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6424                              && OP(regnext(first)) == END)
6425             r->extflags |= RXf_WHITE;
6426     }
6427 #endif
6428 #ifdef DEBUGGING
6429     if (RExC_paren_names) {
6430         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6431         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6432     } else
6433 #endif
6434         ri->name_list_idx = 0;
6435
6436     if (RExC_recurse_count) {
6437         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6438             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6439             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6440         }
6441     }
6442     Newxz(r->offs, RExC_npar, regexp_paren_pair);
6443     /* assume we don't need to swap parens around before we match */
6444
6445     DEBUG_DUMP_r({
6446         PerlIO_printf(Perl_debug_log,"Final program:\n");
6447         regdump(r);
6448     });
6449 #ifdef RE_TRACK_PATTERN_OFFSETS
6450     DEBUG_OFFSETS_r(if (ri->u.offsets) {
6451         const U32 len = ri->u.offsets[0];
6452         U32 i;
6453         GET_RE_DEBUG_FLAGS_DECL;
6454         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6455         for (i = 1; i <= len; i++) {
6456             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6457                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6458                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6459             }
6460         PerlIO_printf(Perl_debug_log, "\n");
6461     });
6462 #endif
6463     return rx;
6464 }
6465
6466
6467 SV*
6468 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6469                     const U32 flags)
6470 {
6471     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6472
6473     PERL_UNUSED_ARG(value);
6474
6475     if (flags & RXapif_FETCH) {
6476         return reg_named_buff_fetch(rx, key, flags);
6477     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6478         Perl_croak_no_modify(aTHX);
6479         return NULL;
6480     } else if (flags & RXapif_EXISTS) {
6481         return reg_named_buff_exists(rx, key, flags)
6482             ? &PL_sv_yes
6483             : &PL_sv_no;
6484     } else if (flags & RXapif_REGNAMES) {
6485         return reg_named_buff_all(rx, flags);
6486     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6487         return reg_named_buff_scalar(rx, flags);
6488     } else {
6489         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6490         return NULL;
6491     }
6492 }
6493
6494 SV*
6495 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6496                          const U32 flags)
6497 {
6498     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6499     PERL_UNUSED_ARG(lastkey);
6500
6501     if (flags & RXapif_FIRSTKEY)
6502         return reg_named_buff_firstkey(rx, flags);
6503     else if (flags & RXapif_NEXTKEY)
6504         return reg_named_buff_nextkey(rx, flags);
6505     else {
6506         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6507         return NULL;
6508     }
6509 }
6510
6511 SV*
6512 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6513                           const U32 flags)
6514 {
6515     AV *retarray = NULL;
6516     SV *ret;
6517     struct regexp *const rx = (struct regexp *)SvANY(r);
6518
6519     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6520
6521     if (flags & RXapif_ALL)
6522         retarray=newAV();
6523
6524     if (rx && RXp_PAREN_NAMES(rx)) {
6525         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6526         if (he_str) {
6527             IV i;
6528             SV* sv_dat=HeVAL(he_str);
6529             I32 *nums=(I32*)SvPVX(sv_dat);
6530             for ( i=0; i<SvIVX(sv_dat); i++ ) {
6531                 if ((I32)(rx->nparens) >= nums[i]
6532                     && rx->offs[nums[i]].start != -1
6533                     && rx->offs[nums[i]].end != -1)
6534                 {
6535                     ret = newSVpvs("");
6536                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6537                     if (!retarray)
6538                         return ret;
6539                 } else {
6540                     if (retarray)
6541                         ret = newSVsv(&PL_sv_undef);
6542                 }
6543                 if (retarray)
6544                     av_push(retarray, ret);
6545             }
6546             if (retarray)
6547                 return newRV_noinc(MUTABLE_SV(retarray));
6548         }
6549     }
6550     return NULL;
6551 }
6552
6553 bool
6554 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6555                            const U32 flags)
6556 {
6557     struct regexp *const rx = (struct regexp *)SvANY(r);
6558
6559     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6560
6561     if (rx && RXp_PAREN_NAMES(rx)) {
6562         if (flags & RXapif_ALL) {
6563             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6564         } else {
6565             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6566             if (sv) {
6567                 SvREFCNT_dec(sv);
6568                 return TRUE;
6569             } else {
6570                 return FALSE;
6571             }
6572         }
6573     } else {
6574         return FALSE;
6575     }
6576 }
6577
6578 SV*
6579 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6580 {
6581     struct regexp *const rx = (struct regexp *)SvANY(r);
6582
6583     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6584
6585     if ( rx && RXp_PAREN_NAMES(rx) ) {
6586         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6587
6588         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6589     } else {
6590         return FALSE;
6591     }
6592 }
6593
6594 SV*
6595 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6596 {
6597     struct regexp *const rx = (struct regexp *)SvANY(r);
6598     GET_RE_DEBUG_FLAGS_DECL;
6599
6600     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6601
6602     if (rx && RXp_PAREN_NAMES(rx)) {
6603         HV *hv = RXp_PAREN_NAMES(rx);
6604         HE *temphe;
6605         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6606             IV i;
6607             IV parno = 0;
6608             SV* sv_dat = HeVAL(temphe);
6609             I32 *nums = (I32*)SvPVX(sv_dat);
6610             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6611                 if ((I32)(rx->lastparen) >= nums[i] &&
6612                     rx->offs[nums[i]].start != -1 &&
6613                     rx->offs[nums[i]].end != -1)
6614                 {
6615                     parno = nums[i];
6616                     break;
6617                 }
6618             }
6619             if (parno || flags & RXapif_ALL) {
6620                 return newSVhek(HeKEY_hek(temphe));
6621             }
6622         }
6623     }
6624     return NULL;
6625 }
6626
6627 SV*
6628 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6629 {
6630     SV *ret;
6631     AV *av;
6632     I32 length;
6633     struct regexp *const rx = (struct regexp *)SvANY(r);
6634
6635     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6636
6637     if (rx && RXp_PAREN_NAMES(rx)) {
6638         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6639             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6640         } else if (flags & RXapif_ONE) {
6641             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6642             av = MUTABLE_AV(SvRV(ret));
6643             length = av_len(av);
6644             SvREFCNT_dec(ret);
6645             return newSViv(length + 1);
6646         } else {
6647             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6648             return NULL;
6649         }
6650     }
6651     return &PL_sv_undef;
6652 }
6653
6654 SV*
6655 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6656 {
6657     struct regexp *const rx = (struct regexp *)SvANY(r);
6658     AV *av = newAV();
6659
6660     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6661
6662     if (rx && RXp_PAREN_NAMES(rx)) {
6663         HV *hv= RXp_PAREN_NAMES(rx);
6664         HE *temphe;
6665         (void)hv_iterinit(hv);
6666         while ( (temphe = hv_iternext_flags(hv,0)) ) {
6667             IV i;
6668             IV parno = 0;
6669             SV* sv_dat = HeVAL(temphe);
6670             I32 *nums = (I32*)SvPVX(sv_dat);
6671             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6672                 if ((I32)(rx->lastparen) >= nums[i] &&
6673                     rx->offs[nums[i]].start != -1 &&
6674                     rx->offs[nums[i]].end != -1)
6675                 {
6676                     parno = nums[i];
6677                     break;
6678                 }
6679             }
6680             if (parno || flags & RXapif_ALL) {
6681                 av_push(av, newSVhek(HeKEY_hek(temphe)));
6682             }
6683         }
6684     }
6685
6686     return newRV_noinc(MUTABLE_SV(av));
6687 }
6688
6689 void
6690 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6691                              SV * const sv)
6692 {
6693     struct regexp *const rx = (struct regexp *)SvANY(r);
6694     char *s = NULL;
6695     I32 i = 0;
6696     I32 s1, t1;
6697
6698     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6699
6700     if (!rx->subbeg) {
6701         sv_setsv(sv,&PL_sv_undef);
6702         return;
6703     }
6704     else
6705     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
6706         /* $` */
6707         i = rx->offs[0].start;
6708         s = rx->subbeg;
6709     }
6710     else
6711     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6712         /* $' */
6713         s = rx->subbeg + rx->offs[0].end;
6714         i = rx->sublen - rx->offs[0].end;
6715     }
6716     else
6717     if ( 0 <= paren && paren <= (I32)rx->nparens &&
6718         (s1 = rx->offs[paren].start) != -1 &&
6719         (t1 = rx->offs[paren].end) != -1)
6720     {
6721         /* $& $1 ... */
6722         i = t1 - s1;
6723         s = rx->subbeg + s1;
6724     } else {
6725         sv_setsv(sv,&PL_sv_undef);
6726         return;
6727     }
6728     assert(rx->sublen >= (s - rx->subbeg) + i );
6729     if (i >= 0) {
6730         const int oldtainted = PL_tainted;
6731         TAINT_NOT;
6732         sv_setpvn(sv, s, i);
6733         PL_tainted = oldtainted;
6734         if ( (rx->extflags & RXf_CANY_SEEN)
6735             ? (RXp_MATCH_UTF8(rx)
6736                         && (!i || is_utf8_string((U8*)s, i)))
6737             : (RXp_MATCH_UTF8(rx)) )
6738         {
6739             SvUTF8_on(sv);
6740         }
6741         else
6742             SvUTF8_off(sv);
6743         if (PL_tainting) {
6744             if (RXp_MATCH_TAINTED(rx)) {
6745                 if (SvTYPE(sv) >= SVt_PVMG) {
6746                     MAGIC* const mg = SvMAGIC(sv);
6747                     MAGIC* mgt;
6748                     PL_tainted = 1;
6749                     SvMAGIC_set(sv, mg->mg_moremagic);
6750                     SvTAINT(sv);
6751                     if ((mgt = SvMAGIC(sv))) {
6752                         mg->mg_moremagic = mgt;
6753                         SvMAGIC_set(sv, mg);
6754                     }
6755                 } else {
6756                     PL_tainted = 1;
6757                     SvTAINT(sv);
6758                 }
6759             } else
6760                 SvTAINTED_off(sv);
6761         }
6762     } else {
6763         sv_setsv(sv,&PL_sv_undef);
6764         return;
6765     }
6766 }
6767
6768 void
6769 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6770                                                          SV const * const value)
6771 {
6772     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6773
6774     PERL_UNUSED_ARG(rx);
6775     PERL_UNUSED_ARG(paren);
6776     PERL_UNUSED_ARG(value);
6777
6778     if (!PL_localizing)
6779         Perl_croak_no_modify(aTHX);
6780 }
6781
6782 I32
6783 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6784                               const I32 paren)
6785 {
6786     struct regexp *const rx = (struct regexp *)SvANY(r);
6787     I32 i;
6788     I32 s1, t1;
6789
6790     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6791
6792     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6793         switch (paren) {
6794       /* $` / ${^PREMATCH} */
6795       case RX_BUFF_IDX_PREMATCH:
6796         if (rx->offs[0].start != -1) {
6797                         i = rx->offs[0].start;
6798                         if (i > 0) {
6799                                 s1 = 0;
6800                                 t1 = i;
6801                                 goto getlen;
6802                         }
6803             }
6804         return 0;
6805       /* $' / ${^POSTMATCH} */
6806       case RX_BUFF_IDX_POSTMATCH:
6807             if (rx->offs[0].end != -1) {
6808                         i = rx->sublen - rx->offs[0].end;
6809                         if (i > 0) {
6810                                 s1 = rx->offs[0].end;
6811                                 t1 = rx->sublen;
6812                                 goto getlen;
6813                         }
6814             }
6815         return 0;
6816       /* $& / ${^MATCH}, $1, $2, ... */
6817       default:
6818             if (paren <= (I32)rx->nparens &&
6819             (s1 = rx->offs[paren].start) != -1 &&
6820             (t1 = rx->offs[paren].end) != -1)
6821             {
6822             i = t1 - s1;
6823             goto getlen;
6824         } else {
6825             if (ckWARN(WARN_UNINITIALIZED))
6826                 report_uninit((const SV *)sv);
6827             return 0;
6828         }
6829     }
6830   getlen:
6831     if (i > 0 && RXp_MATCH_UTF8(rx)) {
6832         const char * const s = rx->subbeg + s1;
6833         const U8 *ep;
6834         STRLEN el;
6835
6836         i = t1 - s1;
6837         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6838                         i = el;
6839     }
6840     return i;
6841 }
6842
6843 SV*
6844 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6845 {
6846     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6847         PERL_UNUSED_ARG(rx);
6848         if (0)
6849             return NULL;
6850         else
6851             return newSVpvs("Regexp");
6852 }
6853
6854 /* Scans the name of a named buffer from the pattern.
6855  * If flags is REG_RSN_RETURN_NULL returns null.
6856  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6857  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6858  * to the parsed name as looked up in the RExC_paren_names hash.
6859  * If there is an error throws a vFAIL().. type exception.
6860  */
6861
6862 #define REG_RSN_RETURN_NULL    0
6863 #define REG_RSN_RETURN_NAME    1
6864 #define REG_RSN_RETURN_DATA    2
6865
6866 STATIC SV*
6867 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6868 {
6869     char *name_start = RExC_parse;
6870
6871     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6872
6873     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6874          /* skip IDFIRST by using do...while */
6875         if (UTF)
6876             do {
6877                 RExC_parse += UTF8SKIP(RExC_parse);
6878             } while (isALNUM_utf8((U8*)RExC_parse));
6879         else
6880             do {
6881                 RExC_parse++;
6882             } while (isALNUM(*RExC_parse));
6883     } else {
6884         RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6885         vFAIL("Group name must start with a non-digit word character");
6886     }
6887     if ( flags ) {
6888         SV* sv_name
6889             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6890                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6891         if ( flags == REG_RSN_RETURN_NAME)
6892             return sv_name;
6893         else if (flags==REG_RSN_RETURN_DATA) {
6894             HE *he_str = NULL;
6895             SV *sv_dat = NULL;
6896             if ( ! sv_name )      /* should not happen*/
6897                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6898             if (RExC_paren_names)
6899                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6900             if ( he_str )
6901                 sv_dat = HeVAL(he_str);
6902             if ( ! sv_dat )
6903                 vFAIL("Reference to nonexistent named group");
6904             return sv_dat;
6905         }
6906         else {
6907             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6908                        (unsigned long) flags);
6909         }
6910         assert(0); /* NOT REACHED */
6911     }
6912     return NULL;
6913 }
6914
6915 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6916     int rem=(int)(RExC_end - RExC_parse);                       \
6917     int cut;                                                    \
6918     int num;                                                    \
6919     int iscut=0;                                                \
6920     if (rem>10) {                                               \
6921         rem=10;                                                 \
6922         iscut=1;                                                \
6923     }                                                           \
6924     cut=10-rem;                                                 \
6925     if (RExC_lastparse!=RExC_parse)                             \
6926         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6927             rem, RExC_parse,                                    \
6928             cut + 4,                                            \
6929             iscut ? "..." : "<"                                 \
6930         );                                                      \
6931     else                                                        \
6932         PerlIO_printf(Perl_debug_log,"%16s","");                \
6933                                                                 \
6934     if (SIZE_ONLY)                                              \
6935        num = RExC_size + 1;                                     \
6936     else                                                        \
6937        num=REG_NODE_NUM(RExC_emit);                             \
6938     if (RExC_lastnum!=num)                                      \
6939        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6940     else                                                        \
6941        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6942     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6943         (int)((depth*2)), "",                                   \
6944         (funcname)                                              \
6945     );                                                          \
6946     RExC_lastnum=num;                                           \
6947     RExC_lastparse=RExC_parse;                                  \
6948 })
6949
6950
6951
6952 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6953     DEBUG_PARSE_MSG((funcname));                            \
6954     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6955 })
6956 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6957     DEBUG_PARSE_MSG((funcname));                            \
6958     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6959 })
6960
6961 /* This section of code defines the inversion list object and its methods.  The
6962  * interfaces are highly subject to change, so as much as possible is static to
6963  * this file.  An inversion list is here implemented as a malloc'd C UV array
6964  * with some added info that is placed as UVs at the beginning in a header
6965  * portion.  An inversion list for Unicode is an array of code points, sorted
6966  * by ordinal number.  The zeroth element is the first code point in the list.
6967  * The 1th element is the first element beyond that not in the list.  In other
6968  * words, the first range is
6969  *  invlist[0]..(invlist[1]-1)
6970  * The other ranges follow.  Thus every element whose index is divisible by two
6971  * marks the beginning of a range that is in the list, and every element not
6972  * divisible by two marks the beginning of a range not in the list.  A single
6973  * element inversion list that contains the single code point N generally
6974  * consists of two elements
6975  *  invlist[0] == N
6976  *  invlist[1] == N+1
6977  * (The exception is when N is the highest representable value on the
6978  * machine, in which case the list containing just it would be a single
6979  * element, itself.  By extension, if the last range in the list extends to
6980  * infinity, then the first element of that range will be in the inversion list
6981  * at a position that is divisible by two, and is the final element in the
6982  * list.)
6983  * Taking the complement (inverting) an inversion list is quite simple, if the
6984  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6985  * This implementation reserves an element at the beginning of each inversion list
6986  * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
6987  * beginning of the list is either that element if 0, or the next one if 1.
6988  *
6989  * More about inversion lists can be found in "Unicode Demystified"
6990  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6991  * More will be coming when functionality is added later.
6992  *
6993  * The inversion list data structure is currently implemented as an SV pointing
6994  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6995  * array of UV whose memory management is automatically handled by the existing
6996  * facilities for SV's.
6997  *
6998  * Some of the methods should always be private to the implementation, and some
6999  * should eventually be made public */
7000
7001 #define INVLIST_LEN_OFFSET 0    /* Number of elements in the inversion list */
7002 #define INVLIST_ITER_OFFSET 1   /* Current iteration position */
7003
7004 /* This is a combination of a version and data structure type, so that one
7005  * being passed in can be validated to be an inversion list of the correct
7006  * vintage.  When the structure of the header is changed, a new random number
7007  * in the range 2**31-1 should be generated and the new() method changed to
7008  * insert that at this location.  Then, if an auxiliary program doesn't change
7009  * correspondingly, it will be discovered immediately */
7010 #define INVLIST_VERSION_ID_OFFSET 2
7011 #define INVLIST_VERSION_ID 1064334010
7012
7013 /* For safety, when adding new elements, remember to #undef them at the end of
7014  * the inversion list code section */
7015
7016 #define INVLIST_ZERO_OFFSET 3   /* 0 or 1; must be last element in header */
7017 /* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
7018  * contains the code point U+00000, and begins here.  If 1, the inversion list
7019  * doesn't contain U+0000, and it begins at the next UV in the array.
7020  * Inverting an inversion list consists of adding or removing the 0 at the
7021  * beginning of it.  By reserving a space for that 0, inversion can be made
7022  * very fast */
7023
7024 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
7025
7026 /* Internally things are UVs */
7027 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7028 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7029
7030 #define INVLIST_INITIAL_LEN 10
7031
7032 PERL_STATIC_INLINE UV*
7033 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7034 {
7035     /* Returns a pointer to the first element in the inversion list's array.
7036      * This is called upon initialization of an inversion list.  Where the
7037      * array begins depends on whether the list has the code point U+0000
7038      * in it or not.  The other parameter tells it whether the code that
7039      * follows this call is about to put a 0 in the inversion list or not.
7040      * The first element is either the element with 0, if 0, or the next one,
7041      * if 1 */
7042
7043     UV* zero = get_invlist_zero_addr(invlist);
7044
7045     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7046
7047     /* Must be empty */
7048     assert(! *get_invlist_len_addr(invlist));
7049
7050     /* 1^1 = 0; 1^0 = 1 */
7051     *zero = 1 ^ will_have_0;
7052     return zero + *zero;
7053 }
7054
7055 PERL_STATIC_INLINE UV*
7056 S_invlist_array(pTHX_ SV* const invlist)
7057 {
7058     /* Returns the pointer to the inversion list's array.  Every time the
7059      * length changes, this needs to be called in case malloc or realloc moved
7060      * it */
7061
7062     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7063
7064     /* Must not be empty.  If these fail, you probably didn't check for <len>
7065      * being non-zero before trying to get the array */
7066     assert(*get_invlist_len_addr(invlist));
7067     assert(*get_invlist_zero_addr(invlist) == 0
7068            || *get_invlist_zero_addr(invlist) == 1);
7069
7070     /* The array begins either at the element reserved for zero if the
7071      * list contains 0 (that element will be set to 0), or otherwise the next
7072      * element (in which case the reserved element will be set to 1). */
7073     return (UV *) (get_invlist_zero_addr(invlist)
7074                    + *get_invlist_zero_addr(invlist));
7075 }
7076
7077 PERL_STATIC_INLINE UV*
7078 S_get_invlist_len_addr(pTHX_ SV* invlist)
7079 {
7080     /* Return the address of the UV that contains the current number
7081      * of used elements in the inversion list */
7082
7083     PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
7084
7085     return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
7086 }
7087
7088 PERL_STATIC_INLINE UV
7089 S_invlist_len(pTHX_ SV* const invlist)
7090 {
7091     /* Returns the current number of elements stored in the inversion list's
7092      * array */
7093
7094     PERL_ARGS_ASSERT_INVLIST_LEN;
7095
7096     return *get_invlist_len_addr(invlist);
7097 }
7098
7099 PERL_STATIC_INLINE void
7100 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7101 {
7102     /* Sets the current number of elements stored in the inversion list */
7103
7104     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7105
7106     *get_invlist_len_addr(invlist) = len;
7107
7108     assert(len <= SvLEN(invlist));
7109
7110     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7111     /* If the list contains U+0000, that element is part of the header,
7112      * and should not be counted as part of the array.  It will contain
7113      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7114      * subtract:
7115      *  SvCUR_set(invlist,
7116      *            TO_INTERNAL_SIZE(len
7117      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
7118      * But, this is only valid if len is not 0.  The consequences of not doing
7119      * this is that the memory allocation code may think that 1 more UV is
7120      * being used than actually is, and so might do an unnecessary grow.  That
7121      * seems worth not bothering to make this the precise amount.
7122      *
7123      * Note that when inverting, SvCUR shouldn't change */
7124 }
7125
7126 PERL_STATIC_INLINE UV
7127 S_invlist_max(pTHX_ SV* const invlist)
7128 {
7129     /* Returns the maximum number of elements storable in the inversion list's
7130      * array, without having to realloc() */
7131
7132     PERL_ARGS_ASSERT_INVLIST_MAX;
7133
7134     return FROM_INTERNAL_SIZE(SvLEN(invlist));
7135 }
7136
7137 PERL_STATIC_INLINE UV*
7138 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7139 {
7140     /* Return the address of the UV that is reserved to hold 0 if the inversion
7141      * list contains 0.  This has to be the last element of the heading, as the
7142      * list proper starts with either it if 0, or the next element if not.
7143      * (But we force it to contain either 0 or 1) */
7144
7145     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7146
7147     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7148 }
7149
7150 #ifndef PERL_IN_XSUB_RE
7151 SV*
7152 Perl__new_invlist(pTHX_ IV initial_size)
7153 {
7154
7155     /* Return a pointer to a newly constructed inversion list, with enough
7156      * space to store 'initial_size' elements.  If that number is negative, a
7157      * system default is used instead */
7158
7159     SV* new_list;
7160
7161     if (initial_size < 0) {
7162         initial_size = INVLIST_INITIAL_LEN;
7163     }
7164
7165     /* Allocate the initial space */
7166     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7167     invlist_set_len(new_list, 0);
7168
7169     /* Force iterinit() to be used to get iteration to work */
7170     *get_invlist_iter_addr(new_list) = UV_MAX;
7171
7172     /* This should force a segfault if a method doesn't initialize this
7173      * properly */
7174     *get_invlist_zero_addr(new_list) = UV_MAX;
7175
7176     *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7177 #if HEADER_LENGTH != 4
7178 #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7179 #endif
7180
7181     return new_list;
7182 }
7183 #endif
7184
7185 STATIC SV*
7186 S__new_invlist_C_array(pTHX_ UV* list)
7187 {
7188     /* Return a pointer to a newly constructed inversion list, initialized to
7189      * point to <list>, which has to be in the exact correct inversion list
7190      * form, including internal fields.  Thus this is a dangerous routine that
7191      * should not be used in the wrong hands */
7192
7193     SV* invlist = newSV_type(SVt_PV);
7194
7195     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7196
7197     SvPV_set(invlist, (char *) list);
7198     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7199                                shouldn't touch it */
7200     SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
7201
7202     if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7203         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7204     }
7205
7206     return invlist;
7207 }
7208
7209 STATIC void
7210 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7211 {
7212     /* Grow the maximum size of an inversion list */
7213
7214     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7215
7216     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7217 }
7218
7219 PERL_STATIC_INLINE void
7220 S_invlist_trim(pTHX_ SV* const invlist)
7221 {
7222     PERL_ARGS_ASSERT_INVLIST_TRIM;
7223
7224     /* Change the length of the inversion list to how many entries it currently
7225      * has */
7226
7227     SvPV_shrink_to_cur((SV *) invlist);
7228 }
7229
7230 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
7231  * etc */
7232 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
7233 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
7234
7235 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7236
7237 STATIC void
7238 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7239 {
7240    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7241     * the end of the inversion list.  The range must be above any existing
7242     * ones. */
7243
7244     UV* array;
7245     UV max = invlist_max(invlist);
7246     UV len = invlist_len(invlist);
7247
7248     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7249
7250     if (len == 0) { /* Empty lists must be initialized */
7251         array = _invlist_array_init(invlist, start == 0);
7252     }
7253     else {
7254         /* Here, the existing list is non-empty. The current max entry in the
7255          * list is generally the first value not in the set, except when the
7256          * set extends to the end of permissible values, in which case it is
7257          * the first entry in that final set, and so this call is an attempt to
7258          * append out-of-order */
7259
7260         UV final_element = len - 1;
7261         array = invlist_array(invlist);
7262         if (array[final_element] > start
7263             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7264         {
7265             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7266                        array[final_element], start,
7267                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7268         }
7269
7270         /* Here, it is a legal append.  If the new range begins with the first
7271          * value not in the set, it is extending the set, so the new first
7272          * value not in the set is one greater than the newly extended range.
7273          * */
7274         if (array[final_element] == start) {
7275             if (end != UV_MAX) {
7276                 array[final_element] = end + 1;
7277             }
7278             else {
7279                 /* But if the end is the maximum representable on the machine,
7280                  * just let the range that this would extend to have no end */
7281                 invlist_set_len(invlist, len - 1);
7282             }
7283             return;
7284         }
7285     }
7286
7287     /* Here the new range doesn't extend any existing set.  Add it */
7288
7289     len += 2;   /* Includes an element each for the start and end of range */
7290
7291     /* If overflows the existing space, extend, which may cause the array to be
7292      * moved */
7293     if (max < len) {
7294         invlist_extend(invlist, len);
7295         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
7296                                            failure in invlist_array() */
7297         array = invlist_array(invlist);
7298     }
7299     else {
7300         invlist_set_len(invlist, len);
7301     }
7302
7303     /* The next item on the list starts the range, the one after that is
7304      * one past the new range.  */
7305     array[len - 2] = start;
7306     if (end != UV_MAX) {
7307         array[len - 1] = end + 1;
7308     }
7309     else {
7310         /* But if the end is the maximum representable on the machine, just let
7311          * the range have no end */
7312         invlist_set_len(invlist, len - 1);
7313     }
7314 }
7315
7316 #ifndef PERL_IN_XSUB_RE
7317
7318 IV
7319 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7320 {
7321     /* Searches the inversion list for the entry that contains the input code
7322      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7323      * return value is the index into the list's array of the range that
7324      * contains <cp> */
7325
7326     IV low = 0;
7327     IV high = invlist_len(invlist);
7328     const UV * const array = invlist_array(invlist);
7329
7330     PERL_ARGS_ASSERT__INVLIST_SEARCH;
7331
7332     /* If list is empty or the code point is before the first element, return
7333      * failure. */
7334     if (high == 0 || cp < array[0]) {
7335         return -1;
7336     }
7337
7338     /* Binary search.  What we are looking for is <i> such that
7339      *  array[i] <= cp < array[i+1]
7340      * The loop below converges on the i+1. */
7341     while (low < high) {
7342         IV mid = (low + high) / 2;
7343         if (array[mid] <= cp) {
7344             low = mid + 1;
7345
7346             /* We could do this extra test to exit the loop early.
7347             if (cp < array[low]) {
7348                 return mid;
7349             }
7350             */
7351         }
7352         else { /* cp < array[mid] */
7353             high = mid;
7354         }
7355     }
7356
7357     return high - 1;
7358 }
7359
7360 void
7361 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7362 {
7363     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7364      * but is used when the swash has an inversion list.  This makes this much
7365      * faster, as it uses a binary search instead of a linear one.  This is
7366      * intimately tied to that function, and perhaps should be in utf8.c,
7367      * except it is intimately tied to inversion lists as well.  It assumes
7368      * that <swatch> is all 0's on input */
7369
7370     UV current = start;
7371     const IV len = invlist_len(invlist);
7372     IV i;
7373     const UV * array;
7374
7375     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7376
7377     if (len == 0) { /* Empty inversion list */
7378         return;
7379     }
7380
7381     array = invlist_array(invlist);
7382
7383     /* Find which element it is */
7384     i = _invlist_search(invlist, start);
7385
7386     /* We populate from <start> to <end> */
7387     while (current < end) {
7388         UV upper;
7389
7390         /* The inversion list gives the results for every possible code point
7391          * after the first one in the list.  Only those ranges whose index is
7392          * even are ones that the inversion list matches.  For the odd ones,
7393          * and if the initial code point is not in the list, we have to skip
7394          * forward to the next element */
7395         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7396             i++;
7397             if (i >= len) { /* Finished if beyond the end of the array */
7398                 return;
7399             }
7400             current = array[i];
7401             if (current >= end) {   /* Finished if beyond the end of what we
7402                                        are populating */
7403                 return;
7404             }
7405         }
7406         assert(current >= start);
7407
7408         /* The current range ends one below the next one, except don't go past
7409          * <end> */
7410         i++;
7411         upper = (i < len && array[i] < end) ? array[i] : end;
7412
7413         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7414          * for each code point in it */
7415         for (; current < upper; current++) {
7416             const STRLEN offset = (STRLEN)(current - start);
7417             swatch[offset >> 3] |= 1 << (offset & 7);
7418         }
7419
7420         /* Quit if at the end of the list */
7421         if (i >= len) {
7422
7423             /* But first, have to deal with the highest possible code point on
7424              * the platform.  The previous code assumes that <end> is one
7425              * beyond where we want to populate, but that is impossible at the
7426              * platform's infinity, so have to handle it specially */
7427             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7428             {
7429                 const STRLEN offset = (STRLEN)(end - start);
7430                 swatch[offset >> 3] |= 1 << (offset & 7);
7431             }
7432             return;
7433         }
7434
7435         /* Advance to the next range, which will be for code points not in the
7436          * inversion list */
7437         current = array[i];
7438     }
7439
7440     return;
7441 }
7442
7443 void
7444 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7445 {
7446     /* Take the union of two inversion lists and point <output> to it.  *output
7447      * should be defined upon input, and if it points to one of the two lists,
7448      * the reference count to that list will be decremented.  The first list,
7449      * <a>, may be NULL, in which case a copy of the second list is returned.
7450      * If <complement_b> is TRUE, the union is taken of the complement
7451      * (inversion) of <b> instead of b itself.
7452      *
7453      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7454      * Richard Gillam, published by Addison-Wesley, and explained at some
7455      * length there.  The preface says to incorporate its examples into your
7456      * code at your own risk.
7457      *
7458      * The algorithm is like a merge sort.
7459      *
7460      * XXX A potential performance improvement is to keep track as we go along
7461      * if only one of the inputs contributes to the result, meaning the other
7462      * is a subset of that one.  In that case, we can skip the final copy and
7463      * return the larger of the input lists, but then outside code might need
7464      * to keep track of whether to free the input list or not */
7465
7466     UV* array_a;    /* a's array */
7467     UV* array_b;
7468     UV len_a;       /* length of a's array */
7469     UV len_b;
7470
7471     SV* u;                      /* the resulting union */
7472     UV* array_u;
7473     UV len_u;
7474
7475     UV i_a = 0;             /* current index into a's array */
7476     UV i_b = 0;
7477     UV i_u = 0;
7478
7479     /* running count, as explained in the algorithm source book; items are
7480      * stopped accumulating and are output when the count changes to/from 0.
7481      * The count is incremented when we start a range that's in the set, and
7482      * decremented when we start a range that's not in the set.  So its range
7483      * is 0 to 2.  Only when the count is zero is something not in the set.
7484      */
7485     UV count = 0;
7486
7487     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7488     assert(a != b);
7489
7490     /* If either one is empty, the union is the other one */
7491     if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
7492         if (*output == a) {
7493             if (a != NULL) {
7494                 SvREFCNT_dec(a);
7495             }
7496         }
7497         if (*output != b) {
7498             *output = invlist_clone(b);
7499             if (complement_b) {
7500                 _invlist_invert(*output);
7501             }
7502         } /* else *output already = b; */
7503         return;
7504     }
7505     else if ((len_b = invlist_len(b)) == 0) {
7506         if (*output == b) {
7507             SvREFCNT_dec(b);
7508         }
7509
7510         /* The complement of an empty list is a list that has everything in it,
7511          * so the union with <a> includes everything too */
7512         if (complement_b) {
7513             if (a == *output) {
7514                 SvREFCNT_dec(a);
7515             }
7516             *output = _new_invlist(1);
7517             _append_range_to_invlist(*output, 0, UV_MAX);
7518         }
7519         else if (*output != a) {
7520             *output = invlist_clone(a);
7521         }
7522         /* else *output already = a; */
7523         return;
7524     }
7525
7526     /* Here both lists exist and are non-empty */
7527     array_a = invlist_array(a);
7528     array_b = invlist_array(b);
7529
7530     /* If are to take the union of 'a' with the complement of b, set it
7531      * up so are looking at b's complement. */
7532     if (complement_b) {
7533
7534         /* To complement, we invert: if the first element is 0, remove it.  To
7535          * do this, we just pretend the array starts one later, and clear the
7536          * flag as we don't have to do anything else later */
7537         if (array_b[0] == 0) {
7538             array_b++;
7539             len_b--;
7540             complement_b = FALSE;
7541         }
7542         else {
7543
7544             /* But if the first element is not zero, we unshift a 0 before the
7545              * array.  The data structure reserves a space for that 0 (which
7546              * should be a '1' right now), so physical shifting is unneeded,
7547              * but temporarily change that element to 0.  Before exiting the
7548              * routine, we must restore the element to '1' */
7549             array_b--;
7550             len_b++;
7551             array_b[0] = 0;
7552         }
7553     }
7554
7555     /* Size the union for the worst case: that the sets are completely
7556      * disjoint */
7557     u = _new_invlist(len_a + len_b);
7558
7559     /* Will contain U+0000 if either component does */
7560     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7561                                       || (len_b > 0 && array_b[0] == 0));
7562
7563     /* Go through each list item by item, stopping when exhausted one of
7564      * them */
7565     while (i_a < len_a && i_b < len_b) {
7566         UV cp;      /* The element to potentially add to the union's array */
7567         bool cp_in_set;   /* is it in the the input list's set or not */
7568
7569         /* We need to take one or the other of the two inputs for the union.
7570          * Since we are merging two sorted lists, we take the smaller of the
7571          * next items.  In case of a tie, we take the one that is in its set
7572          * first.  If we took one not in the set first, it would decrement the
7573          * count, possibly to 0 which would cause it to be output as ending the
7574          * range, and the next time through we would take the same number, and
7575          * output it again as beginning the next range.  By doing it the
7576          * opposite way, there is no possibility that the count will be
7577          * momentarily decremented to 0, and thus the two adjoining ranges will
7578          * be seamlessly merged.  (In a tie and both are in the set or both not
7579          * in the set, it doesn't matter which we take first.) */
7580         if (array_a[i_a] < array_b[i_b]
7581             || (array_a[i_a] == array_b[i_b]
7582                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7583         {
7584             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7585             cp= array_a[i_a++];
7586         }
7587         else {
7588             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7589             cp= array_b[i_b++];
7590         }
7591
7592         /* Here, have chosen which of the two inputs to look at.  Only output
7593          * if the running count changes to/from 0, which marks the
7594          * beginning/end of a range in that's in the set */
7595         if (cp_in_set) {
7596             if (count == 0) {
7597                 array_u[i_u++] = cp;
7598             }
7599             count++;
7600         }
7601         else {
7602             count--;
7603             if (count == 0) {
7604                 array_u[i_u++] = cp;
7605             }
7606         }
7607     }
7608
7609     /* Here, we are finished going through at least one of the lists, which
7610      * means there is something remaining in at most one.  We check if the list
7611      * that hasn't been exhausted is positioned such that we are in the middle
7612      * of a range in its set or not.  (i_a and i_b point to the element beyond
7613      * the one we care about.) If in the set, we decrement 'count'; if 0, there
7614      * is potentially more to output.
7615      * There are four cases:
7616      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
7617      *     in the union is entirely from the non-exhausted set.
7618      *  2) Both were in their sets, count is 2.  Nothing further should
7619      *     be output, as everything that remains will be in the exhausted
7620      *     list's set, hence in the union; decrementing to 1 but not 0 insures
7621      *     that
7622      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
7623      *     Nothing further should be output because the union includes
7624      *     everything from the exhausted set.  Not decrementing ensures that.
7625      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7626      *     decrementing to 0 insures that we look at the remainder of the
7627      *     non-exhausted set */
7628     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7629         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7630     {
7631         count--;
7632     }
7633
7634     /* The final length is what we've output so far, plus what else is about to
7635      * be output.  (If 'count' is non-zero, then the input list we exhausted
7636      * has everything remaining up to the machine's limit in its set, and hence
7637      * in the union, so there will be no further output. */
7638     len_u = i_u;
7639     if (count == 0) {
7640         /* At most one of the subexpressions will be non-zero */
7641         len_u += (len_a - i_a) + (len_b - i_b);
7642     }
7643
7644     /* Set result to final length, which can change the pointer to array_u, so
7645      * re-find it */
7646     if (len_u != invlist_len(u)) {
7647         invlist_set_len(u, len_u);
7648         invlist_trim(u);
7649         array_u = invlist_array(u);
7650     }
7651
7652     /* When 'count' is 0, the list that was exhausted (if one was shorter than
7653      * the other) ended with everything above it not in its set.  That means
7654      * that the remaining part of the union is precisely the same as the
7655      * non-exhausted list, so can just copy it unchanged.  (If both list were
7656      * exhausted at the same time, then the operations below will be both 0.)
7657      */
7658     if (count == 0) {
7659         IV copy_count; /* At most one will have a non-zero copy count */
7660         if ((copy_count = len_a - i_a) > 0) {
7661             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7662         }
7663         else if ((copy_count = len_b - i_b) > 0) {
7664             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7665         }
7666     }
7667
7668     /*  We may be removing a reference to one of the inputs */
7669     if (a == *output || b == *output) {
7670         SvREFCNT_dec(*output);
7671     }
7672
7673     /* If we've changed b, restore it */
7674     if (complement_b) {
7675         array_b[0] = 1;
7676     }
7677
7678     *output = u;
7679     return;
7680 }
7681
7682 void
7683 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7684 {
7685     /* Take the intersection of two inversion lists and point <i> to it.  *i
7686      * should be defined upon input, and if it points to one of the two lists,
7687      * the reference count to that list will be decremented.
7688      * If <complement_b> is TRUE, the result will be the intersection of <a>
7689      * and the complement (or inversion) of <b> instead of <b> directly.
7690      *
7691      * The basis for this comes from "Unicode Demystified" Chapter 13 by
7692      * Richard Gillam, published by Addison-Wesley, and explained at some
7693      * length there.  The preface says to incorporate its examples into your
7694      * code at your own risk.  In fact, it had bugs
7695      *
7696      * The algorithm is like a merge sort, and is essentially the same as the
7697      * union above
7698      */
7699
7700     UV* array_a;                /* a's array */
7701     UV* array_b;
7702     UV len_a;   /* length of a's array */
7703     UV len_b;
7704
7705     SV* r;                   /* the resulting intersection */
7706     UV* array_r;
7707     UV len_r;
7708
7709     UV i_a = 0;             /* current index into a's array */
7710     UV i_b = 0;
7711     UV i_r = 0;
7712
7713     /* running count, as explained in the algorithm source book; items are
7714      * stopped accumulating and are output when the count changes to/from 2.
7715      * The count is incremented when we start a range that's in the set, and
7716      * decremented when we start a range that's not in the set.  So its range
7717      * is 0 to 2.  Only when the count is 2 is something in the intersection.
7718      */
7719     UV count = 0;
7720
7721     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7722     assert(a != b);
7723
7724     /* Special case if either one is empty */
7725     len_a = invlist_len(a);
7726     if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7727
7728         if (len_a != 0 && complement_b) {
7729
7730             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7731              * be empty.  Here, also we are using 'b's complement, which hence
7732              * must be every possible code point.  Thus the intersection is
7733              * simply 'a'. */
7734             if (*i != a) {
7735                 *i = invlist_clone(a);
7736
7737                 if (*i == b) {
7738                     SvREFCNT_dec(b);
7739                 }
7740             }
7741             /* else *i is already 'a' */
7742             return;
7743         }
7744
7745         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7746          * intersection must be empty */
7747         if (*i == a) {
7748             SvREFCNT_dec(a);
7749         }
7750         else if (*i == b) {
7751             SvREFCNT_dec(b);
7752         }
7753         *i = _new_invlist(0);
7754         return;
7755     }
7756
7757     /* Here both lists exist and are non-empty */
7758     array_a = invlist_array(a);
7759     array_b = invlist_array(b);
7760
7761     /* If are to take the intersection of 'a' with the complement of b, set it
7762      * up so are looking at b's complement. */
7763     if (complement_b) {
7764
7765         /* To complement, we invert: if the first element is 0, remove it.  To
7766          * do this, we just pretend the array starts one later, and clear the
7767          * flag as we don't have to do anything else later */
7768         if (array_b[0] == 0) {
7769             array_b++;
7770             len_b--;
7771             complement_b = FALSE;
7772         }
7773         else {
7774
7775             /* But if the first element is not zero, we unshift a 0 before the
7776              * array.  The data structure reserves a space for that 0 (which
7777              * should be a '1' right now), so physical shifting is unneeded,
7778              * but temporarily change that element to 0.  Before exiting the
7779              * routine, we must restore the element to '1' */
7780             array_b--;
7781             len_b++;
7782             array_b[0] = 0;
7783         }
7784     }
7785
7786     /* Size the intersection for the worst case: that the intersection ends up
7787      * fragmenting everything to be completely disjoint */
7788     r= _new_invlist(len_a + len_b);
7789
7790     /* Will contain U+0000 iff both components do */
7791     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7792                                      && len_b > 0 && array_b[0] == 0);
7793
7794     /* Go through each list item by item, stopping when exhausted one of
7795      * them */
7796     while (i_a < len_a && i_b < len_b) {
7797         UV cp;      /* The element to potentially add to the intersection's
7798                        array */
7799         bool cp_in_set; /* Is it in the input list's set or not */
7800
7801         /* We need to take one or the other of the two inputs for the
7802          * intersection.  Since we are merging two sorted lists, we take the
7803          * smaller of the next items.  In case of a tie, we take the one that
7804          * is not in its set first (a difference from the union algorithm).  If
7805          * we took one in the set first, it would increment the count, possibly
7806          * to 2 which would cause it to be output as starting a range in the
7807          * intersection, and the next time through we would take that same
7808          * number, and output it again as ending the set.  By doing it the
7809          * opposite of this, there is no possibility that the count will be
7810          * momentarily incremented to 2.  (In a tie and both are in the set or
7811          * both not in the set, it doesn't matter which we take first.) */
7812         if (array_a[i_a] < array_b[i_b]
7813             || (array_a[i_a] == array_b[i_b]
7814                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7815         {
7816             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7817             cp= array_a[i_a++];
7818         }
7819         else {
7820             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7821             cp= array_b[i_b++];
7822         }
7823
7824         /* Here, have chosen which of the two inputs to look at.  Only output
7825          * if the running count changes to/from 2, which marks the
7826          * beginning/end of a range that's in the intersection */
7827         if (cp_in_set) {
7828             count++;
7829             if (count == 2) {
7830                 array_r[i_r++] = cp;
7831             }
7832         }
7833         else {
7834             if (count == 2) {
7835                 array_r[i_r++] = cp;
7836             }
7837             count--;
7838         }
7839     }
7840
7841     /* Here, we are finished going through at least one of the lists, which
7842      * means there is something remaining in at most one.  We check if the list
7843      * that has been exhausted is positioned such that we are in the middle
7844      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7845      * the ones we care about.)  There are four cases:
7846      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
7847      *     nothing left in the intersection.
7848      *  2) Both were in their sets, count is 2 and perhaps is incremented to
7849      *     above 2.  What should be output is exactly that which is in the
7850      *     non-exhausted set, as everything it has is also in the intersection
7851      *     set, and everything it doesn't have can't be in the intersection
7852      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7853      *     gets incremented to 2.  Like the previous case, the intersection is
7854      *     everything that remains in the non-exhausted set.
7855      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7856      *     remains 1.  And the intersection has nothing more. */
7857     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7858         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7859     {
7860         count++;
7861     }
7862
7863     /* The final length is what we've output so far plus what else is in the
7864      * intersection.  At most one of the subexpressions below will be non-zero */
7865     len_r = i_r;
7866     if (count >= 2) {
7867         len_r += (len_a - i_a) + (len_b - i_b);
7868     }
7869
7870     /* Set result to final length, which can change the pointer to array_r, so
7871      * re-find it */
7872     if (len_r != invlist_len(r)) {
7873         invlist_set_len(r, len_r);
7874         invlist_trim(r);
7875         array_r = invlist_array(r);
7876     }
7877
7878     /* Finish outputting any remaining */
7879     if (count >= 2) { /* At most one will have a non-zero copy count */
7880         IV copy_count;
7881         if ((copy_count = len_a - i_a) > 0) {
7882             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7883         }
7884         else if ((copy_count = len_b - i_b) > 0) {
7885             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7886         }
7887     }
7888
7889     /*  We may be removing a reference to one of the inputs */
7890     if (a == *i || b == *i) {
7891         SvREFCNT_dec(*i);
7892     }
7893
7894     /* If we've changed b, restore it */
7895     if (complement_b) {
7896         array_b[0] = 1;
7897     }
7898
7899     *i = r;
7900     return;
7901 }
7902
7903 SV*
7904 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7905 {
7906     /* Add the range from 'start' to 'end' inclusive to the inversion list's
7907      * set.  A pointer to the inversion list is returned.  This may actually be
7908      * a new list, in which case the passed in one has been destroyed.  The
7909      * passed in inversion list can be NULL, in which case a new one is created
7910      * with just the one range in it */
7911
7912     SV* range_invlist;
7913     UV len;
7914
7915     if (invlist == NULL) {
7916         invlist = _new_invlist(2);
7917         len = 0;
7918     }
7919     else {
7920         len = invlist_len(invlist);
7921     }
7922
7923     /* If comes after the final entry, can just append it to the end */
7924     if (len == 0
7925         || start >= invlist_array(invlist)
7926                                     [invlist_len(invlist) - 1])
7927     {
7928         _append_range_to_invlist(invlist, start, end);
7929         return invlist;
7930     }
7931
7932     /* Here, can't just append things, create and return a new inversion list
7933      * which is the union of this range and the existing inversion list */
7934     range_invlist = _new_invlist(2);
7935     _append_range_to_invlist(range_invlist, start, end);
7936
7937     _invlist_union(invlist, range_invlist, &invlist);
7938
7939     /* The temporary can be freed */
7940     SvREFCNT_dec(range_invlist);
7941
7942     return invlist;
7943 }
7944
7945 #endif
7946
7947 PERL_STATIC_INLINE bool
7948 S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp)
7949 {
7950     /* Does <invlist> contain code point <cp> as part of the set? */
7951
7952     IV index = _invlist_search(invlist, cp);
7953
7954     PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP;
7955
7956     return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index);
7957 }
7958
7959 PERL_STATIC_INLINE SV*
7960 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7961     return _add_range_to_invlist(invlist, cp, cp);
7962 }
7963
7964 #ifndef PERL_IN_XSUB_RE
7965 void
7966 Perl__invlist_invert(pTHX_ SV* const invlist)
7967 {
7968     /* Complement the input inversion list.  This adds a 0 if the list didn't
7969      * have a zero; removes it otherwise.  As described above, the data
7970      * structure is set up so that this is very efficient */
7971
7972     UV* len_pos = get_invlist_len_addr(invlist);
7973
7974     PERL_ARGS_ASSERT__INVLIST_INVERT;
7975
7976     /* The inverse of matching nothing is matching everything */
7977     if (*len_pos == 0) {
7978         _append_range_to_invlist(invlist, 0, UV_MAX);
7979         return;
7980     }
7981
7982     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
7983      * zero element was a 0, so it is being removed, so the length decrements
7984      * by 1; and vice-versa.  SvCUR is unaffected */
7985     if (*get_invlist_zero_addr(invlist) ^= 1) {
7986         (*len_pos)--;
7987     }
7988     else {
7989         (*len_pos)++;
7990     }
7991 }
7992
7993 void
7994 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7995 {
7996     /* Complement the input inversion list (which must be a Unicode property,
7997      * all of which don't match above the Unicode maximum code point.)  And
7998      * Perl has chosen to not have the inversion match above that either.  This
7999      * adds a 0x110000 if the list didn't end with it, and removes it if it did
8000      */
8001
8002     UV len;
8003     UV* array;
8004
8005     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8006
8007     _invlist_invert(invlist);
8008
8009     len = invlist_len(invlist);
8010
8011     if (len != 0) { /* If empty do nothing */
8012         array = invlist_array(invlist);
8013         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8014             /* Add 0x110000.  First, grow if necessary */
8015             len++;
8016             if (invlist_max(invlist) < len) {
8017                 invlist_extend(invlist, len);
8018                 array = invlist_array(invlist);
8019             }
8020             invlist_set_len(invlist, len);
8021             array[len - 1] = PERL_UNICODE_MAX + 1;
8022         }
8023         else {  /* Remove the 0x110000 */
8024             invlist_set_len(invlist, len - 1);
8025         }
8026     }
8027
8028     return;
8029 }
8030 #endif
8031
8032 PERL_STATIC_INLINE SV*
8033 S_invlist_clone(pTHX_ SV* const invlist)
8034 {
8035
8036     /* Return a new inversion list that is a copy of the input one, which is
8037      * unchanged */
8038
8039     /* Need to allocate extra space to accommodate Perl's addition of a
8040      * trailing NUL to SvPV's, since it thinks they are always strings */
8041     SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
8042     STRLEN length = SvCUR(invlist);
8043
8044     PERL_ARGS_ASSERT_INVLIST_CLONE;
8045
8046     SvCUR_set(new_invlist, length); /* This isn't done automatically */
8047     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8048
8049     return new_invlist;
8050 }
8051
8052 PERL_STATIC_INLINE UV*
8053 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8054 {
8055     /* Return the address of the UV that contains the current iteration
8056      * position */
8057
8058     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8059
8060     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8061 }
8062
8063 PERL_STATIC_INLINE UV*
8064 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8065 {
8066     /* Return the address of the UV that contains the version id. */
8067
8068     PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8069
8070     return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8071 }
8072
8073 PERL_STATIC_INLINE void
8074 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8075 {
8076     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8077
8078     *get_invlist_iter_addr(invlist) = 0;
8079 }
8080
8081 STATIC bool
8082 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8083 {
8084     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8085      * This call sets in <*start> and <*end>, the next range in <invlist>.
8086      * Returns <TRUE> if successful and the next call will return the next
8087      * range; <FALSE> if was already at the end of the list.  If the latter,
8088      * <*start> and <*end> are unchanged, and the next call to this function
8089      * will start over at the beginning of the list */
8090
8091     UV* pos = get_invlist_iter_addr(invlist);
8092     UV len = invlist_len(invlist);
8093     UV *array;
8094
8095     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8096
8097     if (*pos >= len) {
8098         *pos = UV_MAX;  /* Force iternit() to be required next time */
8099         return FALSE;
8100     }
8101
8102     array = invlist_array(invlist);
8103
8104     *start = array[(*pos)++];
8105
8106     if (*pos >= len) {
8107         *end = UV_MAX;
8108     }
8109     else {
8110         *end = array[(*pos)++] - 1;
8111     }
8112
8113     return TRUE;
8114 }
8115
8116 PERL_STATIC_INLINE UV
8117 S_invlist_highest(pTHX_ SV* const invlist)
8118 {
8119     /* Returns the highest code point that matches an inversion list.  This API
8120      * has an ambiguity, as it returns 0 under either the highest is actually
8121      * 0, or if the list is empty.  If this distinction matters to you, check
8122      * for emptiness before calling this function */
8123
8124     UV len = invlist_len(invlist);
8125     UV *array;
8126
8127     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8128
8129     if (len == 0) {
8130         return 0;
8131     }
8132
8133     array = invlist_array(invlist);
8134
8135     /* The last element in the array in the inversion list always starts a
8136      * range that goes to infinity.  That range may be for code points that are
8137      * matched in the inversion list, or it may be for ones that aren't
8138      * matched.  In the latter case, the highest code point in the set is one
8139      * less than the beginning of this range; otherwise it is the final element
8140      * of this range: infinity */
8141     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8142            ? UV_MAX
8143            : array[len - 1] - 1;
8144 }
8145
8146 #ifndef PERL_IN_XSUB_RE
8147 SV *
8148 Perl__invlist_contents(pTHX_ SV* const invlist)
8149 {
8150     /* Get the contents of an inversion list into a string SV so that they can
8151      * be printed out.  It uses the format traditionally done for debug tracing
8152      */
8153
8154     UV start, end;
8155     SV* output = newSVpvs("\n");
8156
8157     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8158
8159     invlist_iterinit(invlist);
8160     while (invlist_iternext(invlist, &start, &end)) {
8161         if (end == UV_MAX) {
8162             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8163         }
8164         else if (end != start) {
8165             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8166                     start,       end);
8167         }
8168         else {
8169             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8170         }
8171     }
8172
8173     return output;
8174 }
8175 #endif
8176
8177 #if 0
8178 void
8179 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8180 {
8181     /* Dumps out the ranges in an inversion list.  The string 'header'
8182      * if present is output on a line before the first range */
8183
8184     UV start, end;
8185
8186     if (header && strlen(header)) {
8187         PerlIO_printf(Perl_debug_log, "%s\n", header);
8188     }
8189     invlist_iterinit(invlist);
8190     while (invlist_iternext(invlist, &start, &end)) {
8191         if (end == UV_MAX) {
8192             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8193         }
8194         else {
8195             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8196         }
8197     }
8198 }
8199 #endif
8200
8201 #if 0
8202 bool
8203 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8204 {
8205     /* Return a boolean as to if the two passed in inversion lists are
8206      * identical.  The final argument, if TRUE, says to take the complement of
8207      * the second inversion list before doing the comparison */
8208
8209     UV* array_a = invlist_array(a);
8210     UV* array_b = invlist_array(b);
8211     UV len_a = invlist_len(a);
8212     UV len_b = invlist_len(b);
8213
8214     UV i = 0;               /* current index into the arrays */
8215     bool retval = TRUE;     /* Assume are identical until proven otherwise */
8216
8217     PERL_ARGS_ASSERT__INVLISTEQ;
8218
8219     /* If are to compare 'a' with the complement of b, set it
8220      * up so are looking at b's complement. */
8221     if (complement_b) {
8222
8223         /* The complement of nothing is everything, so <a> would have to have
8224          * just one element, starting at zero (ending at infinity) */
8225         if (len_b == 0) {
8226             return (len_a == 1 && array_a[0] == 0);
8227         }
8228         else if (array_b[0] == 0) {
8229
8230             /* Otherwise, to complement, we invert.  Here, the first element is
8231              * 0, just remove it.  To do this, we just pretend the array starts
8232              * one later, and clear the flag as we don't have to do anything
8233              * else later */
8234
8235             array_b++;
8236             len_b--;
8237             complement_b = FALSE;
8238         }
8239         else {
8240
8241             /* But if the first element is not zero, we unshift a 0 before the
8242              * array.  The data structure reserves a space for that 0 (which
8243              * should be a '1' right now), so physical shifting is unneeded,
8244              * but temporarily change that element to 0.  Before exiting the
8245              * routine, we must restore the element to '1' */
8246             array_b--;
8247             len_b++;
8248             array_b[0] = 0;
8249         }
8250     }
8251
8252     /* Make sure that the lengths are the same, as well as the final element
8253      * before looping through the remainder.  (Thus we test the length, final,
8254      * and first elements right off the bat) */
8255     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8256         retval = FALSE;
8257     }
8258     else for (i = 0; i < len_a - 1; i++) {
8259         if (array_a[i] != array_b[i]) {
8260             retval = FALSE;
8261             break;
8262         }
8263     }
8264
8265     if (complement_b) {
8266         array_b[0] = 1;
8267     }
8268     return retval;
8269 }
8270 #endif
8271
8272 #undef HEADER_LENGTH
8273 #undef INVLIST_INITIAL_LENGTH
8274 #undef TO_INTERNAL_SIZE
8275 #undef FROM_INTERNAL_SIZE
8276 #undef INVLIST_LEN_OFFSET
8277 #undef INVLIST_ZERO_OFFSET
8278 #undef INVLIST_ITER_OFFSET
8279 #undef INVLIST_VERSION_ID
8280
8281 /* End of inversion list object */
8282
8283 /*
8284  - reg - regular expression, i.e. main body or parenthesized thing
8285  *
8286  * Caller must absorb opening parenthesis.
8287  *
8288  * Combining parenthesis handling with the base level of regular expression
8289  * is a trifle forced, but the need to tie the tails of the branches to what
8290  * follows makes it hard to avoid.
8291  */
8292 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8293 #ifdef DEBUGGING
8294 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8295 #else
8296 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8297 #endif
8298
8299 STATIC regnode *
8300 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8301     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8302 {
8303     dVAR;
8304     regnode *ret;               /* Will be the head of the group. */
8305     regnode *br;
8306     regnode *lastbr;
8307     regnode *ender = NULL;
8308     I32 parno = 0;
8309     I32 flags;
8310     U32 oregflags = RExC_flags;
8311     bool have_branch = 0;
8312     bool is_open = 0;
8313     I32 freeze_paren = 0;
8314     I32 after_freeze = 0;
8315
8316     /* for (?g), (?gc), and (?o) warnings; warning
8317        about (?c) will warn about (?g) -- japhy    */
8318
8319 #define WASTED_O  0x01
8320 #define WASTED_G  0x02
8321 #define WASTED_C  0x04
8322 #define WASTED_GC (0x02|0x04)
8323     I32 wastedflags = 0x00;
8324
8325     char * parse_start = RExC_parse; /* MJD */
8326     char * const oregcomp_parse = RExC_parse;
8327
8328     GET_RE_DEBUG_FLAGS_DECL;
8329
8330     PERL_ARGS_ASSERT_REG;
8331     DEBUG_PARSE("reg ");
8332
8333     *flagp = 0;                         /* Tentatively. */
8334
8335
8336     /* Make an OPEN node, if parenthesized. */
8337     if (paren) {
8338         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8339             char *start_verb = RExC_parse;
8340             STRLEN verb_len = 0;
8341             char *start_arg = NULL;
8342             unsigned char op = 0;
8343             int argok = 1;
8344             int internal_argval = 0; /* internal_argval is only useful if !argok */
8345             while ( *RExC_parse && *RExC_parse != ')' ) {
8346                 if ( *RExC_parse == ':' ) {
8347                     start_arg = RExC_parse + 1;
8348                     break;
8349                 }
8350                 RExC_parse++;
8351             }
8352             ++start_verb;
8353             verb_len = RExC_parse - start_verb;
8354             if ( start_arg ) {
8355                 RExC_parse++;
8356                 while ( *RExC_parse && *RExC_parse != ')' )
8357                     RExC_parse++;
8358                 if ( *RExC_parse != ')' )
8359                     vFAIL("Unterminated verb pattern argument");
8360                 if ( RExC_parse == start_arg )
8361                     start_arg = NULL;
8362             } else {
8363                 if ( *RExC_parse != ')' )
8364                     vFAIL("Unterminated verb pattern");
8365             }
8366
8367             switch ( *start_verb ) {
8368             case 'A':  /* (*ACCEPT) */
8369                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8370                     op = ACCEPT;
8371                     internal_argval = RExC_nestroot;
8372                 }
8373                 break;
8374             case 'C':  /* (*COMMIT) */
8375                 if ( memEQs(start_verb,verb_len,"COMMIT") )
8376                     op = COMMIT;
8377                 break;
8378             case 'F':  /* (*FAIL) */
8379                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8380                     op = OPFAIL;
8381                     argok = 0;
8382                 }
8383                 break;
8384             case ':':  /* (*:NAME) */
8385             case 'M':  /* (*MARK:NAME) */
8386                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8387                     op = MARKPOINT;
8388                     argok = -1;
8389                 }
8390                 break;
8391             case 'P':  /* (*PRUNE) */
8392                 if ( memEQs(start_verb,verb_len,"PRUNE") )
8393                     op = PRUNE;
8394                 break;
8395             case 'S':   /* (*SKIP) */
8396                 if ( memEQs(start_verb,verb_len,"SKIP") )
8397                     op = SKIP;
8398                 break;
8399             case 'T':  /* (*THEN) */
8400                 /* [19:06] <TimToady> :: is then */
8401                 if ( memEQs(start_verb,verb_len,"THEN") ) {
8402                     op = CUTGROUP;
8403                     RExC_seen |= REG_SEEN_CUTGROUP;
8404                 }
8405                 break;
8406             }
8407             if ( ! op ) {
8408                 RExC_parse++;
8409                 vFAIL3("Unknown verb pattern '%.*s'",
8410                     verb_len, start_verb);
8411             }
8412             if ( argok ) {
8413                 if ( start_arg && internal_argval ) {
8414                     vFAIL3("Verb pattern '%.*s' may not have an argument",
8415                         verb_len, start_verb);
8416                 } else if ( argok < 0 && !start_arg ) {
8417                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8418                         verb_len, start_verb);
8419                 } else {
8420                     ret = reganode(pRExC_state, op, internal_argval);
8421                     if ( ! internal_argval && ! SIZE_ONLY ) {
8422                         if (start_arg) {
8423                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8424                             ARG(ret) = add_data( pRExC_state, 1, "S" );
8425                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8426                             ret->flags = 0;
8427                         } else {
8428                             ret->flags = 1;
8429                         }
8430                     }
8431                 }
8432                 if (!internal_argval)
8433                     RExC_seen |= REG_SEEN_VERBARG;
8434             } else if ( start_arg ) {
8435                 vFAIL3("Verb pattern '%.*s' may not have an argument",
8436                         verb_len, start_verb);
8437             } else {
8438                 ret = reg_node(pRExC_state, op);
8439             }
8440             nextchar(pRExC_state);
8441             return ret;
8442         } else
8443         if (*RExC_parse == '?') { /* (?...) */
8444             bool is_logical = 0;
8445             const char * const seqstart = RExC_parse;
8446             bool has_use_defaults = FALSE;
8447
8448             RExC_parse++;
8449             paren = *RExC_parse++;
8450             ret = NULL;                 /* For look-ahead/behind. */
8451             switch (paren) {
8452
8453             case 'P':   /* (?P...) variants for those used to PCRE/Python */
8454                 paren = *RExC_parse++;
8455                 if ( paren == '<')         /* (?P<...>) named capture */
8456                     goto named_capture;
8457                 else if (paren == '>') {   /* (?P>name) named recursion */
8458                     goto named_recursion;
8459                 }
8460                 else if (paren == '=') {   /* (?P=...)  named backref */
8461                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
8462                        you change this make sure you change that */
8463                     char* name_start = RExC_parse;
8464                     U32 num = 0;
8465                     SV *sv_dat = reg_scan_name(pRExC_state,
8466                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8467                     if (RExC_parse == name_start || *RExC_parse != ')')
8468                         vFAIL2("Sequence %.3s... not terminated",parse_start);
8469
8470                     if (!SIZE_ONLY) {
8471                         num = add_data( pRExC_state, 1, "S" );
8472                         RExC_rxi->data->data[num]=(void*)sv_dat;
8473                         SvREFCNT_inc_simple_void(sv_dat);
8474                     }
8475                     RExC_sawback = 1;
8476                     ret = reganode(pRExC_state,
8477                                    ((! FOLD)
8478                                      ? NREF
8479                                      : (ASCII_FOLD_RESTRICTED)
8480                                        ? NREFFA
8481                                        : (AT_LEAST_UNI_SEMANTICS)
8482                                          ? NREFFU
8483                                          : (LOC)
8484                                            ? NREFFL
8485                                            : NREFF),
8486                                     num);
8487                     *flagp |= HASWIDTH;
8488
8489                     Set_Node_Offset(ret, parse_start+1);
8490                     Set_Node_Cur_Length(ret); /* MJD */
8491
8492                     nextchar(pRExC_state);
8493                     return ret;
8494                 }
8495                 RExC_parse++;
8496                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8497                 /*NOTREACHED*/
8498             case '<':           /* (?<...) */
8499                 if (*RExC_parse == '!')
8500                     paren = ',';
8501                 else if (*RExC_parse != '=')
8502               named_capture:
8503                 {               /* (?<...>) */
8504                     char *name_start;
8505                     SV *svname;
8506                     paren= '>';
8507             case '\'':          /* (?'...') */
8508                     name_start= RExC_parse;
8509                     svname = reg_scan_name(pRExC_state,
8510                         SIZE_ONLY ?  /* reverse test from the others */
8511                         REG_RSN_RETURN_NAME :
8512                         REG_RSN_RETURN_NULL);
8513                     if (RExC_parse == name_start) {
8514                         RExC_parse++;
8515                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8516                         /*NOTREACHED*/
8517                     }
8518                     if (*RExC_parse != paren)
8519                         vFAIL2("Sequence (?%c... not terminated",
8520                             paren=='>' ? '<' : paren);
8521                     if (SIZE_ONLY) {
8522                         HE *he_str;
8523                         SV *sv_dat = NULL;
8524                         if (!svname) /* shouldn't happen */
8525                             Perl_croak(aTHX_
8526                                 "panic: reg_scan_name returned NULL");
8527                         if (!RExC_paren_names) {
8528                             RExC_paren_names= newHV();
8529                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
8530 #ifdef DEBUGGING
8531                             RExC_paren_name_list= newAV();
8532                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8533 #endif
8534                         }
8535                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8536                         if ( he_str )
8537                             sv_dat = HeVAL(he_str);
8538                         if ( ! sv_dat ) {
8539                             /* croak baby croak */
8540                             Perl_croak(aTHX_
8541                                 "panic: paren_name hash element allocation failed");
8542                         } else if ( SvPOK(sv_dat) ) {
8543                             /* (?|...) can mean we have dupes so scan to check
8544                                its already been stored. Maybe a flag indicating
8545                                we are inside such a construct would be useful,
8546                                but the arrays are likely to be quite small, so
8547                                for now we punt -- dmq */
8548                             IV count = SvIV(sv_dat);
8549                             I32 *pv = (I32*)SvPVX(sv_dat);
8550                             IV i;
8551                             for ( i = 0 ; i < count ; i++ ) {
8552                                 if ( pv[i] == RExC_npar ) {
8553                                     count = 0;
8554                                     break;
8555                                 }
8556                             }
8557                             if ( count ) {
8558                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8559                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8560                                 pv[count] = RExC_npar;
8561                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8562                             }
8563                         } else {
8564                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
8565                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8566                             SvIOK_on(sv_dat);
8567                             SvIV_set(sv_dat, 1);
8568                         }
8569 #ifdef DEBUGGING
8570                         /* Yes this does cause a memory leak in debugging Perls */
8571                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8572                             SvREFCNT_dec(svname);
8573 #endif
8574
8575                         /*sv_dump(sv_dat);*/
8576                     }
8577                     nextchar(pRExC_state);
8578                     paren = 1;
8579                     goto capturing_parens;
8580                 }
8581                 RExC_seen |= REG_SEEN_LOOKBEHIND;
8582                 RExC_in_lookbehind++;
8583                 RExC_parse++;
8584             case '=':           /* (?=...) */
8585                 RExC_seen_zerolen++;
8586                 break;
8587             case '!':           /* (?!...) */
8588                 RExC_seen_zerolen++;
8589                 if (*RExC_parse == ')') {
8590                     ret=reg_node(pRExC_state, OPFAIL);
8591                     nextchar(pRExC_state);
8592                     return ret;
8593                 }
8594                 break;
8595             case '|':           /* (?|...) */
8596                 /* branch reset, behave like a (?:...) except that
8597                    buffers in alternations share the same numbers */
8598                 paren = ':';
8599                 after_freeze = freeze_paren = RExC_npar;
8600                 break;
8601             case ':':           /* (?:...) */
8602             case '>':           /* (?>...) */
8603                 break;
8604             case '$':           /* (?$...) */
8605             case '@':           /* (?@...) */
8606                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8607                 break;
8608             case '#':           /* (?#...) */
8609                 while (*RExC_parse && *RExC_parse != ')')
8610                     RExC_parse++;
8611                 if (*RExC_parse != ')')
8612                     FAIL("Sequence (?#... not terminated");
8613                 nextchar(pRExC_state);
8614                 *flagp = TRYAGAIN;
8615                 return NULL;
8616             case '0' :           /* (?0) */
8617             case 'R' :           /* (?R) */
8618                 if (*RExC_parse != ')')
8619                     FAIL("Sequence (?R) not terminated");
8620                 ret = reg_node(pRExC_state, GOSTART);
8621                 *flagp |= POSTPONED;
8622                 nextchar(pRExC_state);
8623                 return ret;
8624                 /*notreached*/
8625             { /* named and numeric backreferences */
8626                 I32 num;
8627             case '&':            /* (?&NAME) */
8628                 parse_start = RExC_parse - 1;
8629               named_recursion:
8630                 {
8631                     SV *sv_dat = reg_scan_name(pRExC_state,
8632                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8633                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8634                 }
8635                 goto gen_recurse_regop;
8636                 assert(0); /* NOT REACHED */
8637             case '+':
8638                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8639                     RExC_parse++;
8640                     vFAIL("Illegal pattern");
8641                 }
8642                 goto parse_recursion;
8643                 /* NOT REACHED*/
8644             case '-': /* (?-1) */
8645                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8646                     RExC_parse--; /* rewind to let it be handled later */
8647                     goto parse_flags;
8648                 }
8649                 /*FALLTHROUGH */
8650             case '1': case '2': case '3': case '4': /* (?1) */
8651             case '5': case '6': case '7': case '8': case '9':
8652                 RExC_parse--;
8653               parse_recursion:
8654                 num = atoi(RExC_parse);
8655                 parse_start = RExC_parse - 1; /* MJD */
8656                 if (*RExC_parse == '-')
8657                     RExC_parse++;
8658                 while (isDIGIT(*RExC_parse))
8659                         RExC_parse++;
8660                 if (*RExC_parse!=')')
8661                     vFAIL("Expecting close bracket");
8662
8663               gen_recurse_regop:
8664                 if ( paren == '-' ) {
8665                     /*
8666                     Diagram of capture buffer numbering.
8667                     Top line is the normal capture buffer numbers
8668                     Bottom line is the negative indexing as from
8669                     the X (the (?-2))
8670
8671                     +   1 2    3 4 5 X          6 7
8672                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8673                     -   5 4    3 2 1 X          x x
8674
8675                     */
8676                     num = RExC_npar + num;
8677                     if (num < 1)  {
8678                         RExC_parse++;
8679                         vFAIL("Reference to nonexistent group");
8680                     }
8681                 } else if ( paren == '+' ) {
8682                     num = RExC_npar + num - 1;
8683                 }
8684
8685                 ret = reganode(pRExC_state, GOSUB, num);
8686                 if (!SIZE_ONLY) {
8687                     if (num > (I32)RExC_rx->nparens) {
8688                         RExC_parse++;
8689                         vFAIL("Reference to nonexistent group");
8690                     }
8691                     ARG2L_SET( ret, RExC_recurse_count++);
8692                     RExC_emit++;
8693                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8694                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8695                 } else {
8696                     RExC_size++;
8697                 }
8698                 RExC_seen |= REG_SEEN_RECURSE;
8699                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8700                 Set_Node_Offset(ret, parse_start); /* MJD */
8701
8702                 *flagp |= POSTPONED;
8703                 nextchar(pRExC_state);
8704                 return ret;
8705             } /* named and numeric backreferences */
8706             assert(0); /* NOT REACHED */
8707
8708             case '?':           /* (??...) */
8709                 is_logical = 1;
8710                 if (*RExC_parse != '{') {
8711                     RExC_parse++;
8712                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8713                     /*NOTREACHED*/
8714                 }
8715                 *flagp |= POSTPONED;
8716                 paren = *RExC_parse++;
8717                 /* FALL THROUGH */
8718             case '{':           /* (?{...}) */
8719             {
8720                 U32 n = 0;
8721                 struct reg_code_block *cb;
8722
8723                 RExC_seen_zerolen++;
8724
8725                 if (   !pRExC_state->num_code_blocks
8726                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
8727                     || pRExC_state->code_blocks[pRExC_state->code_index].start
8728                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8729                             - RExC_start)
8730                 ) {
8731                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
8732                         FAIL("panic: Sequence (?{...}): no code block found\n");
8733                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
8734                 }
8735                 /* this is a pre-compiled code block (?{...}) */
8736                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8737                 RExC_parse = RExC_start + cb->end;
8738                 if (!SIZE_ONLY) {
8739                     OP *o = cb->block;
8740                     if (cb->src_regex) {
8741                         n = add_data(pRExC_state, 2, "rl");
8742                         RExC_rxi->data->data[n] =
8743                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
8744                         RExC_rxi->data->data[n+1] = (void*)o;
8745                     }
8746                     else {
8747                         n = add_data(pRExC_state, 1,
8748                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8749                         RExC_rxi->data->data[n] = (void*)o;
8750                     }
8751                 }
8752                 pRExC_state->code_index++;
8753                 nextchar(pRExC_state);
8754
8755                 if (is_logical) {
8756                     regnode *eval;
8757                     ret = reg_node(pRExC_state, LOGICAL);
8758                     eval = reganode(pRExC_state, EVAL, n);
8759                     if (!SIZE_ONLY) {
8760                         ret->flags = 2;
8761                         /* for later propagation into (??{}) return value */
8762                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8763                     }
8764                     REGTAIL(pRExC_state, ret, eval);
8765                     /* deal with the length of this later - MJD */
8766                     return ret;
8767                 }
8768                 ret = reganode(pRExC_state, EVAL, n);
8769                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8770                 Set_Node_Offset(ret, parse_start);
8771                 return ret;
8772             }
8773             case '(':           /* (?(?{...})...) and (?(?=...)...) */
8774             {
8775                 int is_define= 0;
8776                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
8777                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8778                         || RExC_parse[1] == '<'
8779                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
8780                         I32 flag;
8781
8782                         ret = reg_node(pRExC_state, LOGICAL);
8783                         if (!SIZE_ONLY)
8784                             ret->flags = 1;
8785                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8786                         goto insert_if;
8787                     }
8788                 }
8789                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
8790                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8791                 {
8792                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
8793                     char *name_start= RExC_parse++;
8794                     U32 num = 0;
8795                     SV *sv_dat=reg_scan_name(pRExC_state,
8796                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8797                     if (RExC_parse == name_start || *RExC_parse != ch)
8798                         vFAIL2("Sequence (?(%c... not terminated",
8799                             (ch == '>' ? '<' : ch));
8800                     RExC_parse++;
8801                     if (!SIZE_ONLY) {
8802                         num = add_data( pRExC_state, 1, "S" );
8803                         RExC_rxi->data->data[num]=(void*)sv_dat;
8804                         SvREFCNT_inc_simple_void(sv_dat);
8805                     }
8806                     ret = reganode(pRExC_state,NGROUPP,num);
8807                     goto insert_if_check_paren;
8808                 }
8809                 else if (RExC_parse[0] == 'D' &&
8810                          RExC_parse[1] == 'E' &&
8811                          RExC_parse[2] == 'F' &&
8812                          RExC_parse[3] == 'I' &&
8813                          RExC_parse[4] == 'N' &&
8814                          RExC_parse[5] == 'E')
8815                 {
8816                     ret = reganode(pRExC_state,DEFINEP,0);
8817                     RExC_parse +=6 ;
8818                     is_define = 1;
8819                     goto insert_if_check_paren;
8820                 }
8821                 else if (RExC_parse[0] == 'R') {
8822                     RExC_parse++;
8823                     parno = 0;
8824                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8825                         parno = atoi(RExC_parse++);
8826                         while (isDIGIT(*RExC_parse))
8827                             RExC_parse++;
8828                     } else if (RExC_parse[0] == '&') {
8829                         SV *sv_dat;
8830                         RExC_parse++;
8831                         sv_dat = reg_scan_name(pRExC_state,
8832                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8833                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8834                     }
8835                     ret = reganode(pRExC_state,INSUBP,parno);
8836                     goto insert_if_check_paren;
8837                 }
8838                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8839                     /* (?(1)...) */
8840                     char c;
8841                     parno = atoi(RExC_parse++);
8842
8843                     while (isDIGIT(*RExC_parse))
8844                         RExC_parse++;
8845                     ret = reganode(pRExC_state, GROUPP, parno);
8846
8847                  insert_if_check_paren:
8848                     if ((c = *nextchar(pRExC_state)) != ')')
8849                         vFAIL("Switch condition not recognized");
8850                   insert_if:
8851                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8852                     br = regbranch(pRExC_state, &flags, 1,depth+1);
8853                     if (br == NULL)
8854                         br = reganode(pRExC_state, LONGJMP, 0);
8855                     else
8856                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8857                     c = *nextchar(pRExC_state);
8858                     if (flags&HASWIDTH)
8859                         *flagp |= HASWIDTH;
8860                     if (c == '|') {
8861                         if (is_define)
8862                             vFAIL("(?(DEFINE)....) does not allow branches");
8863                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8864                         regbranch(pRExC_state, &flags, 1,depth+1);
8865                         REGTAIL(pRExC_state, ret, lastbr);
8866                         if (flags&HASWIDTH)
8867                             *flagp |= HASWIDTH;
8868                         c = *nextchar(pRExC_state);
8869                     }
8870                     else
8871                         lastbr = NULL;
8872                     if (c != ')')
8873                         vFAIL("Switch (?(condition)... contains too many branches");
8874                     ender = reg_node(pRExC_state, TAIL);
8875                     REGTAIL(pRExC_state, br, ender);
8876                     if (lastbr) {
8877                         REGTAIL(pRExC_state, lastbr, ender);
8878                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8879                     }
8880                     else
8881                         REGTAIL(pRExC_state, ret, ender);
8882                     RExC_size++; /* XXX WHY do we need this?!!
8883                                     For large programs it seems to be required
8884                                     but I can't figure out why. -- dmq*/
8885                     return ret;
8886                 }
8887                 else {
8888                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8889                 }
8890             }
8891             case 0:
8892                 RExC_parse--; /* for vFAIL to print correctly */
8893                 vFAIL("Sequence (? incomplete");
8894                 break;
8895             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
8896                                        that follow */
8897                 has_use_defaults = TRUE;
8898                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8899                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8900                                                 ? REGEX_UNICODE_CHARSET
8901                                                 : REGEX_DEPENDS_CHARSET);
8902                 goto parse_flags;
8903             default:
8904                 --RExC_parse;
8905                 parse_flags:      /* (?i) */
8906             {
8907                 U32 posflags = 0, negflags = 0;
8908                 U32 *flagsp = &posflags;
8909                 char has_charset_modifier = '\0';
8910                 regex_charset cs = get_regex_charset(RExC_flags);
8911                 if (cs == REGEX_DEPENDS_CHARSET
8912                     && (RExC_utf8 || RExC_uni_semantics))
8913                 {
8914                     cs = REGEX_UNICODE_CHARSET;
8915                 }
8916
8917                 while (*RExC_parse) {
8918                     /* && strchr("iogcmsx", *RExC_parse) */
8919                     /* (?g), (?gc) and (?o) are useless here
8920                        and must be globally applied -- japhy */
8921                     switch (*RExC_parse) {
8922                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8923                     case LOCALE_PAT_MOD:
8924                         if (has_charset_modifier) {
8925                             goto excess_modifier;
8926                         }
8927                         else if (flagsp == &negflags) {
8928                             goto neg_modifier;
8929                         }
8930                         cs = REGEX_LOCALE_CHARSET;
8931                         has_charset_modifier = LOCALE_PAT_MOD;
8932                         RExC_contains_locale = 1;
8933                         break;
8934                     case UNICODE_PAT_MOD:
8935                         if (has_charset_modifier) {
8936                             goto excess_modifier;
8937                         }
8938                         else if (flagsp == &negflags) {
8939                             goto neg_modifier;
8940                         }
8941                         cs = REGEX_UNICODE_CHARSET;
8942                         has_charset_modifier = UNICODE_PAT_MOD;
8943                         break;
8944                     case ASCII_RESTRICT_PAT_MOD:
8945                         if (flagsp == &negflags) {
8946                             goto neg_modifier;
8947                         }
8948                         if (has_charset_modifier) {
8949                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8950                                 goto excess_modifier;
8951                             }
8952                             /* Doubled modifier implies more restricted */
8953                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8954                         }
8955                         else {
8956                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
8957                         }
8958                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8959                         break;
8960                     case DEPENDS_PAT_MOD:
8961                         if (has_use_defaults) {
8962                             goto fail_modifiers;
8963                         }
8964                         else if (flagsp == &negflags) {
8965                             goto neg_modifier;
8966                         }
8967                         else if (has_charset_modifier) {
8968                             goto excess_modifier;
8969                         }
8970
8971                         /* The dual charset means unicode semantics if the
8972                          * pattern (or target, not known until runtime) are
8973                          * utf8, or something in the pattern indicates unicode
8974                          * semantics */
8975                         cs = (RExC_utf8 || RExC_uni_semantics)
8976                              ? REGEX_UNICODE_CHARSET
8977                              : REGEX_DEPENDS_CHARSET;
8978                         has_charset_modifier = DEPENDS_PAT_MOD;
8979                         break;
8980                     excess_modifier:
8981                         RExC_parse++;
8982                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8983                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8984                         }
8985                         else if (has_charset_modifier == *(RExC_parse - 1)) {
8986                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8987                         }
8988                         else {
8989                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8990                         }
8991                         /*NOTREACHED*/
8992                     neg_modifier:
8993                         RExC_parse++;
8994                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8995                         /*NOTREACHED*/
8996                     case ONCE_PAT_MOD: /* 'o' */
8997                     case GLOBAL_PAT_MOD: /* 'g' */
8998                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8999                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9000                             if (! (wastedflags & wflagbit) ) {
9001                                 wastedflags |= wflagbit;
9002                                 vWARN5(
9003                                     RExC_parse + 1,
9004                                     "Useless (%s%c) - %suse /%c modifier",
9005                                     flagsp == &negflags ? "?-" : "?",
9006                                     *RExC_parse,
9007                                     flagsp == &negflags ? "don't " : "",
9008                                     *RExC_parse
9009                                 );
9010                             }
9011                         }
9012                         break;
9013
9014                     case CONTINUE_PAT_MOD: /* 'c' */
9015                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9016                             if (! (wastedflags & WASTED_C) ) {
9017                                 wastedflags |= WASTED_GC;
9018                                 vWARN3(
9019                                     RExC_parse + 1,
9020                                     "Useless (%sc) - %suse /gc modifier",
9021                                     flagsp == &negflags ? "?-" : "?",
9022                                     flagsp == &negflags ? "don't " : ""
9023                                 );
9024                             }
9025                         }
9026                         break;
9027                     case KEEPCOPY_PAT_MOD: /* 'p' */
9028                         if (flagsp == &negflags) {
9029                             if (SIZE_ONLY)
9030                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9031                         } else {
9032                             *flagsp |= RXf_PMf_KEEPCOPY;
9033                         }
9034                         break;
9035                     case '-':
9036                         /* A flag is a default iff it is following a minus, so
9037                          * if there is a minus, it means will be trying to
9038                          * re-specify a default which is an error */
9039                         if (has_use_defaults || flagsp == &negflags) {
9040             fail_modifiers:
9041                             RExC_parse++;
9042                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9043                             /*NOTREACHED*/
9044                         }
9045                         flagsp = &negflags;
9046                         wastedflags = 0;  /* reset so (?g-c) warns twice */
9047                         break;
9048                     case ':':
9049                         paren = ':';
9050                         /*FALLTHROUGH*/
9051                     case ')':
9052                         RExC_flags |= posflags;
9053                         RExC_flags &= ~negflags;
9054                         set_regex_charset(&RExC_flags, cs);
9055                         if (paren != ':') {
9056                             oregflags |= posflags;
9057                             oregflags &= ~negflags;
9058                             set_regex_charset(&oregflags, cs);
9059                         }
9060                         nextchar(pRExC_state);
9061                         if (paren != ':') {
9062                             *flagp = TRYAGAIN;
9063                             return NULL;
9064                         } else {
9065                             ret = NULL;
9066                             goto parse_rest;
9067                         }
9068                         /*NOTREACHED*/
9069                     default:
9070                         RExC_parse++;
9071                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9072                         /*NOTREACHED*/
9073                     }
9074                     ++RExC_parse;
9075                 }
9076             }} /* one for the default block, one for the switch */
9077         }
9078         else {                  /* (...) */
9079           capturing_parens:
9080             parno = RExC_npar;
9081             RExC_npar++;
9082
9083             ret = reganode(pRExC_state, OPEN, parno);
9084             if (!SIZE_ONLY ){
9085                 if (!RExC_nestroot)
9086                     RExC_nestroot = parno;
9087                 if (RExC_seen & REG_SEEN_RECURSE
9088                     && !RExC_open_parens[parno-1])
9089                 {
9090                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9091                         "Setting open paren #%"IVdf" to %d\n",
9092                         (IV)parno, REG_NODE_NUM(ret)));
9093                     RExC_open_parens[parno-1]= ret;
9094                 }
9095             }
9096             Set_Node_Length(ret, 1); /* MJD */
9097             Set_Node_Offset(ret, RExC_parse); /* MJD */
9098             is_open = 1;
9099         }
9100     }
9101     else                        /* ! paren */
9102         ret = NULL;
9103
9104    parse_rest:
9105     /* Pick up the branches, linking them together. */
9106     parse_start = RExC_parse;   /* MJD */
9107     br = regbranch(pRExC_state, &flags, 1,depth+1);
9108
9109     /*     branch_len = (paren != 0); */
9110
9111     if (br == NULL)
9112         return(NULL);
9113     if (*RExC_parse == '|') {
9114         if (!SIZE_ONLY && RExC_extralen) {
9115             reginsert(pRExC_state, BRANCHJ, br, depth+1);
9116         }
9117         else {                  /* MJD */
9118             reginsert(pRExC_state, BRANCH, br, depth+1);
9119             Set_Node_Length(br, paren != 0);
9120             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9121         }
9122         have_branch = 1;
9123         if (SIZE_ONLY)
9124             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
9125     }
9126     else if (paren == ':') {
9127         *flagp |= flags&SIMPLE;
9128     }
9129     if (is_open) {                              /* Starts with OPEN. */
9130         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9131     }
9132     else if (paren != '?')              /* Not Conditional */
9133         ret = br;
9134     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9135     lastbr = br;
9136     while (*RExC_parse == '|') {
9137         if (!SIZE_ONLY && RExC_extralen) {
9138             ender = reganode(pRExC_state, LONGJMP,0);
9139             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9140         }
9141         if (SIZE_ONLY)
9142             RExC_extralen += 2;         /* Account for LONGJMP. */
9143         nextchar(pRExC_state);
9144         if (freeze_paren) {
9145             if (RExC_npar > after_freeze)
9146                 after_freeze = RExC_npar;
9147             RExC_npar = freeze_paren;
9148         }
9149         br = regbranch(pRExC_state, &flags, 0, depth+1);
9150
9151         if (br == NULL)
9152             return(NULL);
9153         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9154         lastbr = br;
9155         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9156     }
9157
9158     if (have_branch || paren != ':') {
9159         /* Make a closing node, and hook it on the end. */
9160         switch (paren) {
9161         case ':':
9162             ender = reg_node(pRExC_state, TAIL);
9163             break;
9164         case 1:
9165             ender = reganode(pRExC_state, CLOSE, parno);
9166             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9167                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9168                         "Setting close paren #%"IVdf" to %d\n",
9169                         (IV)parno, REG_NODE_NUM(ender)));
9170                 RExC_close_parens[parno-1]= ender;
9171                 if (RExC_nestroot == parno)
9172                     RExC_nestroot = 0;
9173             }
9174             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9175             Set_Node_Length(ender,1); /* MJD */
9176             break;
9177         case '<':
9178         case ',':
9179         case '=':
9180         case '!':
9181             *flagp &= ~HASWIDTH;
9182             /* FALL THROUGH */
9183         case '>':
9184             ender = reg_node(pRExC_state, SUCCEED);
9185             break;
9186         case 0:
9187             ender = reg_node(pRExC_state, END);
9188             if (!SIZE_ONLY) {
9189                 assert(!RExC_opend); /* there can only be one! */
9190                 RExC_opend = ender;
9191             }
9192             break;
9193         }
9194         DEBUG_PARSE_r(if (!SIZE_ONLY) {
9195             SV * const mysv_val1=sv_newmortal();
9196             SV * const mysv_val2=sv_newmortal();
9197             DEBUG_PARSE_MSG("lsbr");
9198             regprop(RExC_rx, mysv_val1, lastbr);
9199             regprop(RExC_rx, mysv_val2, ender);
9200             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9201                           SvPV_nolen_const(mysv_val1),
9202                           (IV)REG_NODE_NUM(lastbr),
9203                           SvPV_nolen_const(mysv_val2),
9204                           (IV)REG_NODE_NUM(ender),
9205                           (IV)(ender - lastbr)
9206             );
9207         });
9208         REGTAIL(pRExC_state, lastbr, ender);
9209
9210         if (have_branch && !SIZE_ONLY) {
9211             char is_nothing= 1;
9212             if (depth==1)
9213                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9214
9215             /* Hook the tails of the branches to the closing node. */
9216             for (br = ret; br; br = regnext(br)) {
9217                 const U8 op = PL_regkind[OP(br)];
9218                 if (op == BRANCH) {
9219                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9220                     if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9221                         is_nothing= 0;
9222                 }
9223                 else if (op == BRANCHJ) {
9224                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9225                     /* for now we always disable this optimisation * /
9226                     if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9227                     */
9228                         is_nothing= 0;
9229                 }
9230             }
9231             if (is_nothing) {
9232                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9233                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9234                     SV * const mysv_val1=sv_newmortal();
9235                     SV * const mysv_val2=sv_newmortal();
9236                     DEBUG_PARSE_MSG("NADA");
9237                     regprop(RExC_rx, mysv_val1, ret);
9238                     regprop(RExC_rx, mysv_val2, ender);
9239                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9240                                   SvPV_nolen_const(mysv_val1),
9241                                   (IV)REG_NODE_NUM(ret),
9242                                   SvPV_nolen_const(mysv_val2),
9243                                   (IV)REG_NODE_NUM(ender),
9244                                   (IV)(ender - ret)
9245                     );
9246                 });
9247                 OP(br)= NOTHING;
9248                 if (OP(ender) == TAIL) {
9249                     NEXT_OFF(br)= 0;
9250                     RExC_emit= br + 1;
9251                 } else {
9252                     regnode *opt;
9253                     for ( opt= br + 1; opt < ender ; opt++ )
9254                         OP(opt)= OPTIMIZED;
9255                     NEXT_OFF(br)= ender - br;
9256                 }
9257             }
9258         }
9259     }
9260
9261     {
9262         const char *p;
9263         static const char parens[] = "=!<,>";
9264
9265         if (paren && (p = strchr(parens, paren))) {
9266             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9267             int flag = (p - parens) > 1;
9268
9269             if (paren == '>')
9270                 node = SUSPEND, flag = 0;
9271             reginsert(pRExC_state, node,ret, depth+1);
9272             Set_Node_Cur_Length(ret);
9273             Set_Node_Offset(ret, parse_start + 1);
9274             ret->flags = flag;
9275             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9276         }
9277     }
9278
9279     /* Check for proper termination. */
9280     if (paren) {
9281         RExC_flags = oregflags;
9282         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9283             RExC_parse = oregcomp_parse;
9284             vFAIL("Unmatched (");
9285         }
9286     }
9287     else if (!paren && RExC_parse < RExC_end) {
9288         if (*RExC_parse == ')') {
9289             RExC_parse++;
9290             vFAIL("Unmatched )");
9291         }
9292         else
9293             FAIL("Junk on end of regexp");      /* "Can't happen". */
9294         assert(0); /* NOTREACHED */
9295     }
9296
9297     if (RExC_in_lookbehind) {
9298         RExC_in_lookbehind--;
9299     }
9300     if (after_freeze > RExC_npar)
9301         RExC_npar = after_freeze;
9302     return(ret);
9303 }
9304
9305 /*
9306  - regbranch - one alternative of an | operator
9307  *
9308  * Implements the concatenation operator.
9309  */
9310 STATIC regnode *
9311 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9312 {
9313     dVAR;
9314     regnode *ret;
9315     regnode *chain = NULL;
9316     regnode *latest;
9317     I32 flags = 0, c = 0;
9318     GET_RE_DEBUG_FLAGS_DECL;
9319
9320     PERL_ARGS_ASSERT_REGBRANCH;
9321
9322     DEBUG_PARSE("brnc");
9323
9324     if (first)
9325         ret = NULL;
9326     else {
9327         if (!SIZE_ONLY && RExC_extralen)
9328             ret = reganode(pRExC_state, BRANCHJ,0);
9329         else {
9330             ret = reg_node(pRExC_state, BRANCH);
9331             Set_Node_Length(ret, 1);
9332         }
9333     }
9334
9335     if (!first && SIZE_ONLY)
9336         RExC_extralen += 1;                     /* BRANCHJ */
9337
9338     *flagp = WORST;                     /* Tentatively. */
9339
9340     RExC_parse--;
9341     nextchar(pRExC_state);
9342     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9343         flags &= ~TRYAGAIN;
9344         latest = regpiece(pRExC_state, &flags,depth+1);
9345         if (latest == NULL) {
9346             if (flags & TRYAGAIN)
9347                 continue;
9348             return(NULL);
9349         }
9350         else if (ret == NULL)
9351             ret = latest;
9352         *flagp |= flags&(HASWIDTH|POSTPONED);
9353         if (chain == NULL)      /* First piece. */
9354             *flagp |= flags&SPSTART;
9355         else {
9356             RExC_naughty++;
9357             REGTAIL(pRExC_state, chain, latest);
9358         }
9359         chain = latest;
9360         c++;
9361     }
9362     if (chain == NULL) {        /* Loop ran zero times. */
9363         chain = reg_node(pRExC_state, NOTHING);
9364         if (ret == NULL)
9365             ret = chain;
9366     }
9367     if (c == 1) {
9368         *flagp |= flags&SIMPLE;
9369     }
9370
9371     return ret;
9372 }
9373
9374 /*
9375  - regpiece - something followed by possible [*+?]
9376  *
9377  * Note that the branching code sequences used for ? and the general cases
9378  * of * and + are somewhat optimized:  they use the same NOTHING node as
9379  * both the endmarker for their branch list and the body of the last branch.
9380  * It might seem that this node could be dispensed with entirely, but the
9381  * endmarker role is not redundant.
9382  */
9383 STATIC regnode *
9384 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9385 {
9386     dVAR;
9387     regnode *ret;
9388     char op;
9389     char *next;
9390     I32 flags;
9391     const char * const origparse = RExC_parse;
9392     I32 min;
9393     I32 max = REG_INFTY;
9394 #ifdef RE_TRACK_PATTERN_OFFSETS
9395     char *parse_start;
9396 #endif
9397     const char *maxpos = NULL;
9398     GET_RE_DEBUG_FLAGS_DECL;
9399
9400     PERL_ARGS_ASSERT_REGPIECE;
9401
9402     DEBUG_PARSE("piec");
9403
9404     ret = regatom(pRExC_state, &flags,depth+1);
9405     if (ret == NULL) {
9406         if (flags & TRYAGAIN)
9407             *flagp |= TRYAGAIN;
9408         return(NULL);
9409     }
9410
9411     op = *RExC_parse;
9412
9413     if (op == '{' && regcurly(RExC_parse)) {
9414         maxpos = NULL;
9415 #ifdef RE_TRACK_PATTERN_OFFSETS
9416         parse_start = RExC_parse; /* MJD */
9417 #endif
9418         next = RExC_parse + 1;
9419         while (isDIGIT(*next) || *next == ',') {
9420             if (*next == ',') {
9421                 if (maxpos)
9422                     break;
9423                 else
9424                     maxpos = next;
9425             }
9426             next++;
9427         }
9428         if (*next == '}') {             /* got one */
9429             if (!maxpos)
9430                 maxpos = next;
9431             RExC_parse++;
9432             min = atoi(RExC_parse);
9433             if (*maxpos == ',')
9434                 maxpos++;
9435             else
9436                 maxpos = RExC_parse;
9437             max = atoi(maxpos);
9438             if (!max && *maxpos != '0')
9439                 max = REG_INFTY;                /* meaning "infinity" */
9440             else if (max >= REG_INFTY)
9441                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9442             RExC_parse = next;
9443             nextchar(pRExC_state);
9444
9445         do_curly:
9446             if ((flags&SIMPLE)) {
9447                 RExC_naughty += 2 + RExC_naughty / 2;
9448                 reginsert(pRExC_state, CURLY, ret, depth+1);
9449                 Set_Node_Offset(ret, parse_start+1); /* MJD */
9450                 Set_Node_Cur_Length(ret);
9451             }
9452             else {
9453                 regnode * const w = reg_node(pRExC_state, WHILEM);
9454
9455                 w->flags = 0;
9456                 REGTAIL(pRExC_state, ret, w);
9457                 if (!SIZE_ONLY && RExC_extralen) {
9458                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
9459                     reginsert(pRExC_state, NOTHING,ret, depth+1);
9460                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
9461                 }
9462                 reginsert(pRExC_state, CURLYX,ret, depth+1);
9463                                 /* MJD hk */
9464                 Set_Node_Offset(ret, parse_start+1);
9465                 Set_Node_Length(ret,
9466                                 op == '{' ? (RExC_parse - parse_start) : 1);
9467
9468                 if (!SIZE_ONLY && RExC_extralen)
9469                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
9470                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9471                 if (SIZE_ONLY)
9472                     RExC_whilem_seen++, RExC_extralen += 3;
9473                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
9474             }
9475             ret->flags = 0;
9476
9477             if (min > 0)
9478                 *flagp = WORST;
9479             if (max > 0)
9480                 *flagp |= HASWIDTH;
9481             if (max < min)
9482                 vFAIL("Can't do {n,m} with n > m");
9483             if (!SIZE_ONLY) {
9484                 ARG1_SET(ret, (U16)min);
9485                 ARG2_SET(ret, (U16)max);
9486             }
9487
9488             goto nest_check;
9489         }
9490     }
9491
9492     if (!ISMULT1(op)) {
9493         *flagp = flags;
9494         return(ret);
9495     }
9496
9497 #if 0                           /* Now runtime fix should be reliable. */
9498
9499     /* if this is reinstated, don't forget to put this back into perldiag:
9500
9501             =item Regexp *+ operand could be empty at {#} in regex m/%s/
9502
9503            (F) The part of the regexp subject to either the * or + quantifier
9504            could match an empty string. The {#} shows in the regular
9505            expression about where the problem was discovered.
9506
9507     */
9508
9509     if (!(flags&HASWIDTH) && op != '?')
9510       vFAIL("Regexp *+ operand could be empty");
9511 #endif
9512
9513 #ifdef RE_TRACK_PATTERN_OFFSETS
9514     parse_start = RExC_parse;
9515 #endif
9516     nextchar(pRExC_state);
9517
9518     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9519
9520     if (op == '*' && (flags&SIMPLE)) {
9521         reginsert(pRExC_state, STAR, ret, depth+1);
9522         ret->flags = 0;
9523         RExC_naughty += 4;
9524     }
9525     else if (op == '*') {
9526         min = 0;
9527         goto do_curly;
9528     }
9529     else if (op == '+' && (flags&SIMPLE)) {
9530         reginsert(pRExC_state, PLUS, ret, depth+1);
9531         ret->flags = 0;
9532         RExC_naughty += 3;
9533     }
9534     else if (op == '+') {
9535         min = 1;
9536         goto do_curly;
9537     }
9538     else if (op == '?') {
9539         min = 0; max = 1;
9540         goto do_curly;
9541     }
9542   nest_check:
9543     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9544         ckWARN3reg(RExC_parse,
9545                    "%.*s matches null string many times",
9546                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9547                    origparse);
9548     }
9549
9550     if (RExC_parse < RExC_end && *RExC_parse == '?') {
9551         nextchar(pRExC_state);
9552         reginsert(pRExC_state, MINMOD, ret, depth+1);
9553         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9554     }
9555 #ifndef REG_ALLOW_MINMOD_SUSPEND
9556     else
9557 #endif
9558     if (RExC_parse < RExC_end && *RExC_parse == '+') {
9559         regnode *ender;
9560         nextchar(pRExC_state);
9561         ender = reg_node(pRExC_state, SUCCEED);
9562         REGTAIL(pRExC_state, ret, ender);
9563         reginsert(pRExC_state, SUSPEND, ret, depth+1);
9564         ret->flags = 0;
9565         ender = reg_node(pRExC_state, TAIL);
9566         REGTAIL(pRExC_state, ret, ender);
9567         /*ret= ender;*/
9568     }
9569
9570     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9571         RExC_parse++;
9572         vFAIL("Nested quantifiers");
9573     }
9574
9575     return(ret);
9576 }
9577
9578 STATIC bool
9579 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9580 {
9581
9582  /* This is expected to be called by a parser routine that has recognized '\N'
9583    and needs to handle the rest. RExC_parse is expected to point at the first
9584    char following the N at the time of the call.  On successful return,
9585    RExC_parse has been updated to point to just after the sequence identified
9586    by this routine, and <*flagp> has been updated.
9587
9588    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9589    character class.
9590
9591    \N may begin either a named sequence, or if outside a character class, mean
9592    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9593    attempted to decide which, and in the case of a named sequence, converted it
9594    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9595    where c1... are the characters in the sequence.  For single-quoted regexes,
9596    the tokenizer passes the \N sequence through unchanged; this code will not
9597    attempt to determine this nor expand those, instead raising a syntax error.
9598    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9599    or there is no '}', it signals that this \N occurrence means to match a
9600    non-newline.
9601
9602    Only the \N{U+...} form should occur in a character class, for the same
9603    reason that '.' inside a character class means to just match a period: it
9604    just doesn't make sense.
9605
9606    The function raises an error (via vFAIL), and doesn't return for various
9607    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9608    success; it returns FALSE otherwise.
9609
9610    If <valuep> is non-null, it means the caller can accept an input sequence
9611    consisting of a just a single code point; <*valuep> is set to that value
9612    if the input is such.
9613
9614    If <node_p> is non-null it signifies that the caller can accept any other
9615    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9616    is set as follows:
9617     1) \N means not-a-NL: points to a newly created REG_ANY node;
9618     2) \N{}:              points to a new NOTHING node;
9619     3) otherwise:         points to a new EXACT node containing the resolved
9620                           string.
9621    Note that FALSE is returned for single code point sequences if <valuep> is
9622    null.
9623  */
9624
9625     char * endbrace;    /* '}' following the name */
9626     char* p;
9627     char *endchar;      /* Points to '.' or '}' ending cur char in the input
9628                            stream */
9629     bool has_multiple_chars; /* true if the input stream contains a sequence of
9630                                 more than one character */
9631
9632     GET_RE_DEBUG_FLAGS_DECL;
9633
9634     PERL_ARGS_ASSERT_GROK_BSLASH_N;
9635
9636     GET_RE_DEBUG_FLAGS;
9637
9638     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9639
9640     /* The [^\n] meaning of \N ignores spaces and comments under the /x
9641      * modifier.  The other meaning does not */
9642     p = (RExC_flags & RXf_PMf_EXTENDED)
9643         ? regwhite( pRExC_state, RExC_parse )
9644         : RExC_parse;
9645
9646     /* Disambiguate between \N meaning a named character versus \N meaning
9647      * [^\n].  The former is assumed when it can't be the latter. */
9648     if (*p != '{' || regcurly(p)) {
9649         RExC_parse = p;
9650         if (! node_p) {
9651             /* no bare \N in a charclass */
9652             if (in_char_class) {
9653                 vFAIL("\\N in a character class must be a named character: \\N{...}");
9654             }
9655             return FALSE;
9656         }
9657         nextchar(pRExC_state);
9658         *node_p = reg_node(pRExC_state, REG_ANY);
9659         *flagp |= HASWIDTH|SIMPLE;
9660         RExC_naughty++;
9661         RExC_parse--;
9662         Set_Node_Length(*node_p, 1); /* MJD */
9663         return TRUE;
9664     }
9665
9666     /* Here, we have decided it should be a named character or sequence */
9667
9668     /* The test above made sure that the next real character is a '{', but
9669      * under the /x modifier, it could be separated by space (or a comment and
9670      * \n) and this is not allowed (for consistency with \x{...} and the
9671      * tokenizer handling of \N{NAME}). */
9672     if (*RExC_parse != '{') {
9673         vFAIL("Missing braces on \\N{}");
9674     }
9675
9676     RExC_parse++;       /* Skip past the '{' */
9677
9678     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9679         || ! (endbrace == RExC_parse            /* nothing between the {} */
9680               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
9681                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9682     {
9683         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
9684         vFAIL("\\N{NAME} must be resolved by the lexer");
9685     }
9686
9687     if (endbrace == RExC_parse) {   /* empty: \N{} */
9688         bool ret = TRUE;
9689         if (node_p) {
9690             *node_p = reg_node(pRExC_state,NOTHING);
9691         }
9692         else if (in_char_class) {
9693             if (SIZE_ONLY && in_char_class) {
9694                 ckWARNreg(RExC_parse,
9695                         "Ignoring zero length \\N{} in character class"
9696                 );
9697             }
9698             ret = FALSE;
9699         }
9700         else {
9701             return FALSE;
9702         }
9703         nextchar(pRExC_state);
9704         return ret;
9705     }
9706
9707     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9708     RExC_parse += 2;    /* Skip past the 'U+' */
9709
9710     endchar = RExC_parse + strcspn(RExC_parse, ".}");
9711
9712     /* Code points are separated by dots.  If none, there is only one code
9713      * point, and is terminated by the brace */
9714     has_multiple_chars = (endchar < endbrace);
9715
9716     if (valuep && (! has_multiple_chars || in_char_class)) {
9717         /* We only pay attention to the first char of
9718         multichar strings being returned in char classes. I kinda wonder
9719         if this makes sense as it does change the behaviour
9720         from earlier versions, OTOH that behaviour was broken
9721         as well. XXX Solution is to recharacterize as
9722         [rest-of-class]|multi1|multi2... */
9723
9724         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9725         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9726             | PERL_SCAN_DISALLOW_PREFIX
9727             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9728
9729         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9730
9731         /* The tokenizer should have guaranteed validity, but it's possible to
9732          * bypass it by using single quoting, so check */
9733         if (length_of_hex == 0
9734             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9735         {
9736             RExC_parse += length_of_hex;        /* Includes all the valid */
9737             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
9738                             ? UTF8SKIP(RExC_parse)
9739                             : 1;
9740             /* Guard against malformed utf8 */
9741             if (RExC_parse >= endchar) {
9742                 RExC_parse = endchar;
9743             }
9744             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9745         }
9746
9747         if (in_char_class && has_multiple_chars) {
9748             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9749         }
9750         RExC_parse = endbrace + 1;
9751     }
9752     else if (! node_p || ! has_multiple_chars) {
9753
9754         /* Here, the input is legal, but not according to the caller's
9755          * options.  We fail without advancing the parse, so that the
9756          * caller can try again */
9757         RExC_parse = p;
9758         return FALSE;
9759     }
9760     else {
9761
9762         /* What is done here is to convert this to a sub-pattern of the form
9763          * (?:\x{char1}\x{char2}...)
9764          * and then call reg recursively.  That way, it retains its atomicness,
9765          * while not having to worry about special handling that some code
9766          * points may have.  toke.c has converted the original Unicode values
9767          * to native, so that we can just pass on the hex values unchanged.  We
9768          * do have to set a flag to keep recoding from happening in the
9769          * recursion */
9770
9771         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9772         STRLEN len;
9773         char *orig_end = RExC_end;
9774         I32 flags;
9775
9776         while (RExC_parse < endbrace) {
9777
9778             /* Convert to notation the rest of the code understands */
9779             sv_catpv(substitute_parse, "\\x{");
9780             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9781             sv_catpv(substitute_parse, "}");
9782
9783             /* Point to the beginning of the next character in the sequence. */
9784             RExC_parse = endchar + 1;
9785             endchar = RExC_parse + strcspn(RExC_parse, ".}");
9786         }
9787         sv_catpv(substitute_parse, ")");
9788
9789         RExC_parse = SvPV(substitute_parse, len);
9790
9791         /* Don't allow empty number */
9792         if (len < 8) {
9793             vFAIL("Invalid hexadecimal number in \\N{U+...}");
9794         }
9795         RExC_end = RExC_parse + len;
9796
9797         /* The values are Unicode, and therefore not subject to recoding */
9798         RExC_override_recoding = 1;
9799
9800         *node_p = reg(pRExC_state, 1, &flags, depth+1);
9801         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9802
9803         RExC_parse = endbrace;
9804         RExC_end = orig_end;
9805         RExC_override_recoding = 0;
9806
9807         nextchar(pRExC_state);
9808     }
9809
9810     return TRUE;
9811 }
9812
9813
9814 /*
9815  * reg_recode
9816  *
9817  * It returns the code point in utf8 for the value in *encp.
9818  *    value: a code value in the source encoding
9819  *    encp:  a pointer to an Encode object
9820  *
9821  * If the result from Encode is not a single character,
9822  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9823  */
9824 STATIC UV
9825 S_reg_recode(pTHX_ const char value, SV **encp)
9826 {
9827     STRLEN numlen = 1;
9828     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9829     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9830     const STRLEN newlen = SvCUR(sv);
9831     UV uv = UNICODE_REPLACEMENT;
9832
9833     PERL_ARGS_ASSERT_REG_RECODE;
9834
9835     if (newlen)
9836         uv = SvUTF8(sv)
9837              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9838              : *(U8*)s;
9839
9840     if (!newlen || numlen != newlen) {
9841         uv = UNICODE_REPLACEMENT;
9842         *encp = NULL;
9843     }
9844     return uv;
9845 }
9846
9847 PERL_STATIC_INLINE U8
9848 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9849 {
9850     U8 op;
9851
9852     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9853
9854     if (! FOLD) {
9855         return EXACT;
9856     }
9857
9858     op = get_regex_charset(RExC_flags);
9859     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9860         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9861                  been, so there is no hole */
9862     }
9863
9864     return op + EXACTF;
9865 }
9866
9867 PERL_STATIC_INLINE void
9868 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9869 {
9870     /* This knows the details about sizing an EXACTish node, setting flags for
9871      * it (by setting <*flagp>, and potentially populating it with a single
9872      * character.
9873      *
9874      * If <len> is non-zero, this function assumes that the node has already
9875      * been populated, and just does the sizing.  In this case <code_point>
9876      * should be the final code point that has already been placed into the
9877      * node.  This value will be ignored except that under some circumstances
9878      * <*flagp> is set based on it.
9879      *
9880      * If <len is zero, the function assumes that the node is to contain only
9881      * the single character given by <code_point> and calculates what <len>
9882      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
9883      * additionally will populate the node's STRING with <code_point>, if <len>
9884      * is 0.  In both cases <*flagp> is appropriately set
9885      *
9886      * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9887      * folded (the latter only when the rules indicate it can match 'ss') */
9888
9889     bool len_passed_in = cBOOL(len != 0);
9890     U8 character[UTF8_MAXBYTES_CASE+1];
9891
9892     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9893
9894     if (! len_passed_in) {
9895         if (UTF) {
9896             if (FOLD) {
9897                 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9898             }
9899             else {
9900                 uvchr_to_utf8( character, code_point);
9901                 len = UTF8SKIP(character);
9902             }
9903         }
9904         else if (! FOLD
9905                  || code_point != LATIN_SMALL_LETTER_SHARP_S
9906                  || ASCII_FOLD_RESTRICTED
9907                  || ! AT_LEAST_UNI_SEMANTICS)
9908         {
9909             *character = (U8) code_point;
9910             len = 1;
9911         }
9912         else {
9913             *character = 's';
9914             *(character + 1) = 's';
9915             len = 2;
9916         }
9917     }
9918
9919     if (SIZE_ONLY) {
9920         RExC_size += STR_SZ(len);
9921     }
9922     else {
9923         RExC_emit += STR_SZ(len);
9924         STR_LEN(node) = len;
9925         if (! len_passed_in) {
9926             Copy((char *) character, STRING(node), len, char);
9927         }
9928     }
9929
9930     *flagp |= HASWIDTH;
9931     if (len == 1 && UNI_IS_INVARIANT(code_point))
9932         *flagp |= SIMPLE;
9933 }
9934
9935 /*
9936  - regatom - the lowest level
9937
9938    Try to identify anything special at the start of the pattern. If there
9939    is, then handle it as required. This may involve generating a single regop,
9940    such as for an assertion; or it may involve recursing, such as to
9941    handle a () structure.
9942
9943    If the string doesn't start with something special then we gobble up
9944    as much literal text as we can.
9945
9946    Once we have been able to handle whatever type of thing started the
9947    sequence, we return.
9948
9949    Note: we have to be careful with escapes, as they can be both literal
9950    and special, and in the case of \10 and friends, context determines which.
9951
9952    A summary of the code structure is:
9953
9954    switch (first_byte) {
9955         cases for each special:
9956             handle this special;
9957             break;
9958         case '\\':
9959             switch (2nd byte) {
9960                 cases for each unambiguous special:
9961                     handle this special;
9962                     break;
9963                 cases for each ambigous special/literal:
9964                     disambiguate;
9965                     if (special)  handle here
9966                     else goto defchar;
9967                 default: // unambiguously literal:
9968                     goto defchar;
9969             }
9970         default:  // is a literal char
9971             // FALL THROUGH
9972         defchar:
9973             create EXACTish node for literal;
9974             while (more input and node isn't full) {
9975                 switch (input_byte) {
9976                    cases for each special;
9977                        make sure parse pointer is set so that the next call to
9978                            regatom will see this special first
9979                        goto loopdone; // EXACTish node terminated by prev. char
9980                    default:
9981                        append char to EXACTISH node;
9982                 }
9983                 get next input byte;
9984             }
9985         loopdone:
9986    }
9987    return the generated node;
9988
9989    Specifically there are two separate switches for handling
9990    escape sequences, with the one for handling literal escapes requiring
9991    a dummy entry for all of the special escapes that are actually handled
9992    by the other.
9993 */
9994
9995 STATIC regnode *
9996 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9997 {
9998     dVAR;
9999     regnode *ret = NULL;
10000     I32 flags;
10001     char *parse_start = RExC_parse;
10002     U8 op;
10003     GET_RE_DEBUG_FLAGS_DECL;
10004     DEBUG_PARSE("atom");
10005     *flagp = WORST;             /* Tentatively. */
10006
10007     PERL_ARGS_ASSERT_REGATOM;
10008
10009 tryagain:
10010     switch ((U8)*RExC_parse) {
10011     case '^':
10012         RExC_seen_zerolen++;
10013         nextchar(pRExC_state);
10014         if (RExC_flags & RXf_PMf_MULTILINE)
10015             ret = reg_node(pRExC_state, MBOL);
10016         else if (RExC_flags & RXf_PMf_SINGLELINE)
10017             ret = reg_node(pRExC_state, SBOL);
10018         else
10019             ret = reg_node(pRExC_state, BOL);
10020         Set_Node_Length(ret, 1); /* MJD */
10021         break;
10022     case '$':
10023         nextchar(pRExC_state);
10024         if (*RExC_parse)
10025             RExC_seen_zerolen++;
10026         if (RExC_flags & RXf_PMf_MULTILINE)
10027             ret = reg_node(pRExC_state, MEOL);
10028         else if (RExC_flags & RXf_PMf_SINGLELINE)
10029             ret = reg_node(pRExC_state, SEOL);
10030         else
10031             ret = reg_node(pRExC_state, EOL);
10032         Set_Node_Length(ret, 1); /* MJD */
10033         break;
10034     case '.':
10035         nextchar(pRExC_state);
10036         if (RExC_flags & RXf_PMf_SINGLELINE)
10037             ret = reg_node(pRExC_state, SANY);
10038         else
10039             ret = reg_node(pRExC_state, REG_ANY);
10040         *flagp |= HASWIDTH|SIMPLE;
10041         RExC_naughty++;
10042         Set_Node_Length(ret, 1); /* MJD */
10043         break;
10044     case '[':
10045     {
10046         char * const oregcomp_parse = ++RExC_parse;
10047         ret = regclass(pRExC_state, flagp,depth+1);
10048         if (*RExC_parse != ']') {
10049             RExC_parse = oregcomp_parse;
10050             vFAIL("Unmatched [");
10051         }
10052         nextchar(pRExC_state);
10053         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10054         break;
10055     }
10056     case '(':
10057         nextchar(pRExC_state);
10058         ret = reg(pRExC_state, 1, &flags,depth+1);
10059         if (ret == NULL) {
10060                 if (flags & TRYAGAIN) {
10061                     if (RExC_parse == RExC_end) {
10062                          /* Make parent create an empty node if needed. */
10063                         *flagp |= TRYAGAIN;
10064                         return(NULL);
10065                     }
10066                     goto tryagain;
10067                 }
10068                 return(NULL);
10069         }
10070         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10071         break;
10072     case '|':
10073     case ')':
10074         if (flags & TRYAGAIN) {
10075             *flagp |= TRYAGAIN;
10076             return NULL;
10077         }
10078         vFAIL("Internal urp");
10079                                 /* Supposed to be caught earlier. */
10080         break;
10081     case '?':
10082     case '+':
10083     case '*':
10084         RExC_parse++;
10085         vFAIL("Quantifier follows nothing");
10086         break;
10087     case '\\':
10088         /* Special Escapes
10089
10090            This switch handles escape sequences that resolve to some kind
10091            of special regop and not to literal text. Escape sequnces that
10092            resolve to literal text are handled below in the switch marked
10093            "Literal Escapes".
10094
10095            Every entry in this switch *must* have a corresponding entry
10096            in the literal escape switch. However, the opposite is not
10097            required, as the default for this switch is to jump to the
10098            literal text handling code.
10099         */
10100         switch ((U8)*++RExC_parse) {
10101         /* Special Escapes */
10102         case 'A':
10103             RExC_seen_zerolen++;
10104             ret = reg_node(pRExC_state, SBOL);
10105             *flagp |= SIMPLE;
10106             goto finish_meta_pat;
10107         case 'G':
10108             ret = reg_node(pRExC_state, GPOS);
10109             RExC_seen |= REG_SEEN_GPOS;
10110             *flagp |= SIMPLE;
10111             goto finish_meta_pat;
10112         case 'K':
10113             RExC_seen_zerolen++;
10114             ret = reg_node(pRExC_state, KEEPS);
10115             *flagp |= SIMPLE;
10116             /* XXX:dmq : disabling in-place substitution seems to
10117              * be necessary here to avoid cases of memory corruption, as
10118              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10119              */
10120             RExC_seen |= REG_SEEN_LOOKBEHIND;
10121             goto finish_meta_pat;
10122         case 'Z':
10123             ret = reg_node(pRExC_state, SEOL);
10124             *flagp |= SIMPLE;
10125             RExC_seen_zerolen++;                /* Do not optimize RE away */
10126             goto finish_meta_pat;
10127         case 'z':
10128             ret = reg_node(pRExC_state, EOS);
10129             *flagp |= SIMPLE;
10130             RExC_seen_zerolen++;                /* Do not optimize RE away */
10131             goto finish_meta_pat;
10132         case 'C':
10133             ret = reg_node(pRExC_state, CANY);
10134             RExC_seen |= REG_SEEN_CANY;
10135             *flagp |= HASWIDTH|SIMPLE;
10136             goto finish_meta_pat;
10137         case 'X':
10138             ret = reg_node(pRExC_state, CLUMP);
10139             *flagp |= HASWIDTH;
10140             goto finish_meta_pat;
10141         case 'w':
10142             op = ALNUM + get_regex_charset(RExC_flags);
10143             if (op > ALNUMA) {  /* /aa is same as /a */
10144                 op = ALNUMA;
10145             }
10146             ret = reg_node(pRExC_state, op);
10147             *flagp |= HASWIDTH|SIMPLE;
10148             goto finish_meta_pat;
10149         case 'W':
10150             op = NALNUM + get_regex_charset(RExC_flags);
10151             if (op > NALNUMA) { /* /aa is same as /a */
10152                 op = NALNUMA;
10153             }
10154             ret = reg_node(pRExC_state, op);
10155             *flagp |= HASWIDTH|SIMPLE;
10156             goto finish_meta_pat;
10157         case 'b':
10158             RExC_seen_zerolen++;
10159             RExC_seen |= REG_SEEN_LOOKBEHIND;
10160             op = BOUND + get_regex_charset(RExC_flags);
10161             if (op > BOUNDA) {  /* /aa is same as /a */
10162                 op = BOUNDA;
10163             }
10164             ret = reg_node(pRExC_state, op);
10165             FLAGS(ret) = get_regex_charset(RExC_flags);
10166             *flagp |= SIMPLE;
10167             goto finish_meta_pat;
10168         case 'B':
10169             RExC_seen_zerolen++;
10170             RExC_seen |= REG_SEEN_LOOKBEHIND;
10171             op = NBOUND + get_regex_charset(RExC_flags);
10172             if (op > NBOUNDA) { /* /aa is same as /a */
10173                 op = NBOUNDA;
10174             }
10175             ret = reg_node(pRExC_state, op);
10176             FLAGS(ret) = get_regex_charset(RExC_flags);
10177             *flagp |= SIMPLE;
10178             goto finish_meta_pat;
10179         case 's':
10180             op = SPACE + get_regex_charset(RExC_flags);
10181             if (op > SPACEA) {  /* /aa is same as /a */
10182                 op = SPACEA;
10183             }
10184             ret = reg_node(pRExC_state, op);
10185             *flagp |= HASWIDTH|SIMPLE;
10186             goto finish_meta_pat;
10187         case 'S':
10188             op = NSPACE + get_regex_charset(RExC_flags);
10189             if (op > NSPACEA) { /* /aa is same as /a */
10190                 op = NSPACEA;
10191             }
10192             ret = reg_node(pRExC_state, op);
10193             *flagp |= HASWIDTH|SIMPLE;
10194             goto finish_meta_pat;
10195         case 'D':
10196             op = NDIGIT;
10197             goto join_D_and_d;
10198         case 'd':
10199             op = DIGIT;
10200         join_D_and_d:
10201             {
10202                 U8 offset = get_regex_charset(RExC_flags);
10203                 if (offset == REGEX_UNICODE_CHARSET) {
10204                     offset = REGEX_DEPENDS_CHARSET;
10205                 }
10206                 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10207                     offset = REGEX_ASCII_RESTRICTED_CHARSET;
10208                 }
10209                 op += offset;
10210             }
10211             ret = reg_node(pRExC_state, op);
10212             *flagp |= HASWIDTH|SIMPLE;
10213             goto finish_meta_pat;
10214         case 'R':
10215             ret = reg_node(pRExC_state, LNBREAK);
10216             *flagp |= HASWIDTH|SIMPLE;
10217             goto finish_meta_pat;
10218         case 'h':
10219             ret = reg_node(pRExC_state, HORIZWS);
10220             *flagp |= HASWIDTH|SIMPLE;
10221             goto finish_meta_pat;
10222         case 'H':
10223             ret = reg_node(pRExC_state, NHORIZWS);
10224             *flagp |= HASWIDTH|SIMPLE;
10225             goto finish_meta_pat;
10226         case 'v':
10227             ret = reg_node(pRExC_state, VERTWS);
10228             *flagp |= HASWIDTH|SIMPLE;
10229             goto finish_meta_pat;
10230         case 'V':
10231             ret = reg_node(pRExC_state, NVERTWS);
10232             *flagp |= HASWIDTH|SIMPLE;
10233          finish_meta_pat:
10234             nextchar(pRExC_state);
10235             Set_Node_Length(ret, 2); /* MJD */
10236             break;
10237         case 'p':
10238         case 'P':
10239             {
10240                 char* const oldregxend = RExC_end;
10241 #ifdef DEBUGGING
10242                 char* parse_start = RExC_parse - 2;
10243 #endif
10244
10245                 if (RExC_parse[1] == '{') {
10246                   /* a lovely hack--pretend we saw [\pX] instead */
10247                     RExC_end = strchr(RExC_parse, '}');
10248                     if (!RExC_end) {
10249                         const U8 c = (U8)*RExC_parse;
10250                         RExC_parse += 2;
10251                         RExC_end = oldregxend;
10252                         vFAIL2("Missing right brace on \\%c{}", c);
10253                     }
10254                     RExC_end++;
10255                 }
10256                 else {
10257                     RExC_end = RExC_parse + 2;
10258                     if (RExC_end > oldregxend)
10259                         RExC_end = oldregxend;
10260                 }
10261                 RExC_parse--;
10262
10263                 ret = regclass(pRExC_state, flagp,depth+1);
10264
10265                 RExC_end = oldregxend;
10266                 RExC_parse--;
10267
10268                 Set_Node_Offset(ret, parse_start + 2);
10269                 Set_Node_Cur_Length(ret);
10270                 nextchar(pRExC_state);
10271             }
10272             break;
10273         case 'N':
10274             /* Handle \N and \N{NAME} with multiple code points here and not
10275              * below because it can be multicharacter. join_exact() will join
10276              * them up later on.  Also this makes sure that things like
10277              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10278              * The options to the grok function call causes it to fail if the
10279              * sequence is just a single code point.  We then go treat it as
10280              * just another character in the current EXACT node, and hence it
10281              * gets uniform treatment with all the other characters.  The
10282              * special treatment for quantifiers is not needed for such single
10283              * character sequences */
10284             ++RExC_parse;
10285             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10286                 RExC_parse--;
10287                 goto defchar;
10288             }
10289             break;
10290         case 'k':    /* Handle \k<NAME> and \k'NAME' */
10291         parse_named_seq:
10292         {
10293             char ch= RExC_parse[1];
10294             if (ch != '<' && ch != '\'' && ch != '{') {
10295                 RExC_parse++;
10296                 vFAIL2("Sequence %.2s... not terminated",parse_start);
10297             } else {
10298                 /* this pretty much dupes the code for (?P=...) in reg(), if
10299                    you change this make sure you change that */
10300                 char* name_start = (RExC_parse += 2);
10301                 U32 num = 0;
10302                 SV *sv_dat = reg_scan_name(pRExC_state,
10303                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10304                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10305                 if (RExC_parse == name_start || *RExC_parse != ch)
10306                     vFAIL2("Sequence %.3s... not terminated",parse_start);
10307
10308                 if (!SIZE_ONLY) {
10309                     num = add_data( pRExC_state, 1, "S" );
10310                     RExC_rxi->data->data[num]=(void*)sv_dat;
10311                     SvREFCNT_inc_simple_void(sv_dat);
10312                 }
10313
10314                 RExC_sawback = 1;
10315                 ret = reganode(pRExC_state,
10316                                ((! FOLD)
10317                                  ? NREF
10318                                  : (ASCII_FOLD_RESTRICTED)
10319                                    ? NREFFA
10320                                    : (AT_LEAST_UNI_SEMANTICS)
10321                                      ? NREFFU
10322                                      : (LOC)
10323                                        ? NREFFL
10324                                        : NREFF),
10325                                 num);
10326                 *flagp |= HASWIDTH;
10327
10328                 /* override incorrect value set in reganode MJD */
10329                 Set_Node_Offset(ret, parse_start+1);
10330                 Set_Node_Cur_Length(ret); /* MJD */
10331                 nextchar(pRExC_state);
10332
10333             }
10334             break;
10335         }
10336         case 'g':
10337         case '1': case '2': case '3': case '4':
10338         case '5': case '6': case '7': case '8': case '9':
10339             {
10340                 I32 num;
10341                 bool isg = *RExC_parse == 'g';
10342                 bool isrel = 0;
10343                 bool hasbrace = 0;
10344                 if (isg) {
10345                     RExC_parse++;
10346                     if (*RExC_parse == '{') {
10347                         RExC_parse++;
10348                         hasbrace = 1;
10349                     }
10350                     if (*RExC_parse == '-') {
10351                         RExC_parse++;
10352                         isrel = 1;
10353                     }
10354                     if (hasbrace && !isDIGIT(*RExC_parse)) {
10355                         if (isrel) RExC_parse--;
10356                         RExC_parse -= 2;
10357                         goto parse_named_seq;
10358                 }   }
10359                 num = atoi(RExC_parse);
10360                 if (isg && num == 0)
10361                     vFAIL("Reference to invalid group 0");
10362                 if (isrel) {
10363                     num = RExC_npar - num;
10364                     if (num < 1)
10365                         vFAIL("Reference to nonexistent or unclosed group");
10366                 }
10367                 if (!isg && num > 9 && num >= RExC_npar)
10368                     /* Probably a character specified in octal, e.g. \35 */
10369                     goto defchar;
10370                 else {
10371                     char * const parse_start = RExC_parse - 1; /* MJD */
10372                     while (isDIGIT(*RExC_parse))
10373                         RExC_parse++;
10374                     if (parse_start == RExC_parse - 1)
10375                         vFAIL("Unterminated \\g... pattern");
10376                     if (hasbrace) {
10377                         if (*RExC_parse != '}')
10378                             vFAIL("Unterminated \\g{...} pattern");
10379                         RExC_parse++;
10380                     }
10381                     if (!SIZE_ONLY) {
10382                         if (num > (I32)RExC_rx->nparens)
10383                             vFAIL("Reference to nonexistent group");
10384                     }
10385                     RExC_sawback = 1;
10386                     ret = reganode(pRExC_state,
10387                                    ((! FOLD)
10388                                      ? REF
10389                                      : (ASCII_FOLD_RESTRICTED)
10390                                        ? REFFA
10391                                        : (AT_LEAST_UNI_SEMANTICS)
10392                                          ? REFFU
10393                                          : (LOC)
10394                                            ? REFFL
10395                                            : REFF),
10396                                     num);
10397                     *flagp |= HASWIDTH;
10398
10399                     /* override incorrect value set in reganode MJD */
10400                     Set_Node_Offset(ret, parse_start+1);
10401                     Set_Node_Cur_Length(ret); /* MJD */
10402                     RExC_parse--;
10403                     nextchar(pRExC_state);
10404                 }
10405             }
10406             break;
10407         case '\0':
10408             if (RExC_parse >= RExC_end)
10409                 FAIL("Trailing \\");
10410             /* FALL THROUGH */
10411         default:
10412             /* Do not generate "unrecognized" warnings here, we fall
10413                back into the quick-grab loop below */
10414             parse_start--;
10415             goto defchar;
10416         }
10417         break;
10418
10419     case '#':
10420         if (RExC_flags & RXf_PMf_EXTENDED) {
10421             if ( reg_skipcomment( pRExC_state ) )
10422                 goto tryagain;
10423         }
10424         /* FALL THROUGH */
10425
10426     default:
10427
10428             parse_start = RExC_parse - 1;
10429
10430             RExC_parse++;
10431
10432         defchar: {
10433             STRLEN len = 0;
10434             UV ender;
10435             char *p;
10436             char *s;
10437 #define MAX_NODE_STRING_SIZE 127
10438             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10439             char *s0;
10440             U8 upper_parse = MAX_NODE_STRING_SIZE;
10441             STRLEN foldlen;
10442             U8 node_type;
10443             bool next_is_quantifier;
10444             char * oldp;
10445
10446             ender = 0;
10447             node_type = compute_EXACTish(pRExC_state);
10448             ret = reg_node(pRExC_state, node_type);
10449
10450             /* In pass1, folded, we use a temporary buffer instead of the
10451              * actual node, as the node doesn't exist yet */
10452             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10453
10454             s0 = s;
10455
10456         reparse:
10457
10458             /* XXX The node can hold up to 255 bytes, yet this only goes to
10459              * 127.  I (khw) do not know why.  Keeping it somewhat less than
10460              * 255 allows us to not have to worry about overflow due to
10461              * converting to utf8 and fold expansion, but that value is
10462              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10463              * split up by this limit into a single one using the real max of
10464              * 255.  Even at 127, this breaks under rare circumstances.  If
10465              * folding, we do not want to split a node at a character that is a
10466              * non-final in a multi-char fold, as an input string could just
10467              * happen to want to match across the node boundary.  The join
10468              * would solve that problem if the join actually happens.  But a
10469              * series of more than two nodes in a row each of 127 would cause
10470              * the first join to succeed to get to 254, but then there wouldn't
10471              * be room for the next one, which could at be one of those split
10472              * multi-char folds.  I don't know of any fool-proof solution.  One
10473              * could back off to end with only a code point that isn't such a
10474              * non-final, but it is possible for there not to be any in the
10475              * entire node. */
10476             for (p = RExC_parse - 1;
10477                  len < upper_parse && p < RExC_end;
10478                  len++)
10479             {
10480                 oldp = p;
10481
10482                 if (RExC_flags & RXf_PMf_EXTENDED)
10483                     p = regwhite( pRExC_state, p );
10484                 switch ((U8)*p) {
10485                 case '^':
10486                 case '$':
10487                 case '.':
10488                 case '[':
10489                 case '(':
10490                 case ')':
10491                 case '|':
10492                     goto loopdone;
10493                 case '\\':
10494                     /* Literal Escapes Switch
10495
10496                        This switch is meant to handle escape sequences that
10497                        resolve to a literal character.
10498
10499                        Every escape sequence that represents something
10500                        else, like an assertion or a char class, is handled
10501                        in the switch marked 'Special Escapes' above in this
10502                        routine, but also has an entry here as anything that
10503                        isn't explicitly mentioned here will be treated as
10504                        an unescaped equivalent literal.
10505                     */
10506
10507                     switch ((U8)*++p) {
10508                     /* These are all the special escapes. */
10509                     case 'A':             /* Start assertion */
10510                     case 'b': case 'B':   /* Word-boundary assertion*/
10511                     case 'C':             /* Single char !DANGEROUS! */
10512                     case 'd': case 'D':   /* digit class */
10513                     case 'g': case 'G':   /* generic-backref, pos assertion */
10514                     case 'h': case 'H':   /* HORIZWS */
10515                     case 'k': case 'K':   /* named backref, keep marker */
10516                     case 'p': case 'P':   /* Unicode property */
10517                               case 'R':   /* LNBREAK */
10518                     case 's': case 'S':   /* space class */
10519                     case 'v': case 'V':   /* VERTWS */
10520                     case 'w': case 'W':   /* word class */
10521                     case 'X':             /* eXtended Unicode "combining character sequence" */
10522                     case 'z': case 'Z':   /* End of line/string assertion */
10523                         --p;
10524                         goto loopdone;
10525
10526                     /* Anything after here is an escape that resolves to a
10527                        literal. (Except digits, which may or may not)
10528                      */
10529                     case 'n':
10530                         ender = '\n';
10531                         p++;
10532                         break;
10533                     case 'N': /* Handle a single-code point named character. */
10534                         /* The options cause it to fail if a multiple code
10535                          * point sequence.  Handle those in the switch() above
10536                          * */
10537                         RExC_parse = p + 1;
10538                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
10539                                             flagp, depth, FALSE))
10540                         {
10541                             RExC_parse = p = oldp;
10542                             goto loopdone;
10543                         }
10544                         p = RExC_parse;
10545                         if (ender > 0xff) {
10546                             REQUIRE_UTF8;
10547                         }
10548                         break;
10549                     case 'r':
10550                         ender = '\r';
10551                         p++;
10552                         break;
10553                     case 't':
10554                         ender = '\t';
10555                         p++;
10556                         break;
10557                     case 'f':
10558                         ender = '\f';
10559                         p++;
10560                         break;
10561                     case 'e':
10562                           ender = ASCII_TO_NATIVE('\033');
10563                         p++;
10564                         break;
10565                     case 'a':
10566                           ender = ASCII_TO_NATIVE('\007');
10567                         p++;
10568                         break;
10569                     case 'o':
10570                         {
10571                             STRLEN brace_len = len;
10572                             UV result;
10573                             const char* error_msg;
10574
10575                             bool valid = grok_bslash_o(p,
10576                                                        &result,
10577                                                        &brace_len,
10578                                                        &error_msg,
10579                                                        1);
10580                             p += brace_len;
10581                             if (! valid) {
10582                                 RExC_parse = p; /* going to die anyway; point
10583                                                    to exact spot of failure */
10584                                 vFAIL(error_msg);
10585                             }
10586                             else
10587                             {
10588                                 ender = result;
10589                             }
10590                             if (PL_encoding && ender < 0x100) {
10591                                 goto recode_encoding;
10592                             }
10593                             if (ender > 0xff) {
10594                                 REQUIRE_UTF8;
10595                             }
10596                             break;
10597                         }
10598                     case 'x':
10599                         {
10600                             STRLEN brace_len = len;
10601                             UV result;
10602                             const char* error_msg;
10603
10604                             bool valid = grok_bslash_x(p,
10605                                                        &result,
10606                                                        &brace_len,
10607                                                        &error_msg,
10608                                                        1);
10609                             p += brace_len;
10610                             if (! valid) {
10611                                 RExC_parse = p; /* going to die anyway; point
10612                                                    to exact spot of failure */
10613                                 vFAIL(error_msg);
10614                             }
10615                             else {
10616                                 ender = result;
10617                             }
10618                             if (PL_encoding && ender < 0x100) {
10619                                 goto recode_encoding;
10620                             }
10621                             if (ender > 0xff) {
10622                                 REQUIRE_UTF8;
10623                             }
10624                             break;
10625                         }
10626                     case 'c':
10627                         p++;
10628                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10629                         break;
10630                     case '0': case '1': case '2': case '3':case '4':
10631                     case '5': case '6': case '7':
10632                         if (*p == '0' ||
10633                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10634                         {
10635                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10636                             STRLEN numlen = 3;
10637                             ender = grok_oct(p, &numlen, &flags, NULL);
10638                             if (ender > 0xff) {
10639                                 REQUIRE_UTF8;
10640                             }
10641                             p += numlen;
10642                         }
10643                         else {
10644                             --p;
10645                             goto loopdone;
10646                         }
10647                         if (PL_encoding && ender < 0x100)
10648                             goto recode_encoding;
10649                         break;
10650                     recode_encoding:
10651                         if (! RExC_override_recoding) {
10652                             SV* enc = PL_encoding;
10653                             ender = reg_recode((const char)(U8)ender, &enc);
10654                             if (!enc && SIZE_ONLY)
10655                                 ckWARNreg(p, "Invalid escape in the specified encoding");
10656                             REQUIRE_UTF8;
10657                         }
10658                         break;
10659                     case '\0':
10660                         if (p >= RExC_end)
10661                             FAIL("Trailing \\");
10662                         /* FALL THROUGH */
10663                     default:
10664                         if (!SIZE_ONLY&& isALNUMC(*p)) {
10665                             ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10666                         }
10667                         goto normal_default;
10668                     }
10669                     break;
10670                 case '{':
10671                     /* Currently we don't warn when the lbrace is at the start
10672                      * of a construct.  This catches it in the middle of a
10673                      * literal string, or when its the first thing after
10674                      * something like "\b" */
10675                     if (! SIZE_ONLY
10676                         && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10677                     {
10678                         ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10679                     }
10680                     /*FALLTHROUGH*/
10681                 default:
10682                   normal_default:
10683                     if (UTF8_IS_START(*p) && UTF) {
10684                         STRLEN numlen;
10685                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10686                                                &numlen, UTF8_ALLOW_DEFAULT);
10687                         p += numlen;
10688                     }
10689                     else
10690                         ender = (U8) *p++;
10691                     break;
10692                 } /* End of switch on the literal */
10693
10694                 /* Here, have looked at the literal character and <ender>
10695                  * contains its ordinal, <p> points to the character after it
10696                  */
10697
10698                 if ( RExC_flags & RXf_PMf_EXTENDED)
10699                     p = regwhite( pRExC_state, p );
10700
10701                 /* If the next thing is a quantifier, it applies to this
10702                  * character only, which means that this character has to be in
10703                  * its own node and can't just be appended to the string in an
10704                  * existing node, so if there are already other characters in
10705                  * the node, close the node with just them, and set up to do
10706                  * this character again next time through, when it will be the
10707                  * only thing in its new node */
10708                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10709                 {
10710                     p = oldp;
10711                     goto loopdone;
10712                 }
10713
10714                 if (FOLD) {
10715                     if (UTF
10716                             /* See comments for join_exact() as to why we fold
10717                              * this non-UTF at compile time */
10718                         || (node_type == EXACTFU
10719                             && ender == LATIN_SMALL_LETTER_SHARP_S))
10720                     {
10721
10722
10723                         /* Prime the casefolded buffer.  Locale rules, which
10724                          * apply only to code points < 256, aren't known until
10725                          * execution, so for them, just output the original
10726                          * character using utf8.  If we start to fold non-UTF
10727                          * patterns, be sure to update join_exact() */
10728                         if (LOC && ender < 256) {
10729                             if (UNI_IS_INVARIANT(ender)) {
10730                                 *s = (U8) ender;
10731                                 foldlen = 1;
10732                             } else {
10733                                 *s = UTF8_TWO_BYTE_HI(ender);
10734                                 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10735                                 foldlen = 2;
10736                             }
10737                         }
10738                         else {
10739                             ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
10740                                     FOLD_FLAGS_FULL
10741                                      | ((LOC) ?  FOLD_FLAGS_LOCALE
10742                                               : (ASCII_FOLD_RESTRICTED)
10743                                                 ? FOLD_FLAGS_NOMIX_ASCII
10744                                                 : 0)
10745                                 );
10746                         }
10747                         s += foldlen;
10748
10749                         /* The loop increments <len> each time, as all but this
10750                          * path (and the one just below for UTF) through it add
10751                          * a single byte to the EXACTish node.  But this one
10752                          * has changed len to be the correct final value, so
10753                          * subtract one to cancel out the increment that
10754                          * follows */
10755                         len += foldlen - 1;
10756                     }
10757                     else {
10758                         *(s++) = ender;
10759                     }
10760                 }
10761                 else if (UTF) {
10762                     const STRLEN unilen = reguni(pRExC_state, ender, s);
10763                     if (unilen > 0) {
10764                        s   += unilen;
10765                        len += unilen;
10766                     }
10767
10768                     /* See comment just above for - 1 */
10769                     len--;
10770                 }
10771                 else {
10772                     REGC((char)ender, s++);
10773                 }
10774
10775                 if (next_is_quantifier) {
10776
10777                     /* Here, the next input is a quantifier, and to get here,
10778                      * the current character is the only one in the node.
10779                      * Also, here <len> doesn't include the final byte for this
10780                      * character */
10781                     len++;
10782                     goto loopdone;
10783                 }
10784
10785             } /* End of loop through literal characters */
10786
10787             /* Here we have either exhausted the input or ran out of room in
10788              * the node.  (If we encountered a character that can't be in the
10789              * node, transfer is made directly to <loopdone>, and so we
10790              * wouldn't have fallen off the end of the loop.)  In the latter
10791              * case, we artificially have to split the node into two, because
10792              * we just don't have enough space to hold everything.  This
10793              * creates a problem if the final character participates in a
10794              * multi-character fold in the non-final position, as a match that
10795              * should have occurred won't, due to the way nodes are matched,
10796              * and our artificial boundary.  So back off until we find a non-
10797              * problematic character -- one that isn't at the beginning or
10798              * middle of such a fold.  (Either it doesn't participate in any
10799              * folds, or appears only in the final position of all the folds it
10800              * does participate in.)  A better solution with far fewer false
10801              * positives, and that would fill the nodes more completely, would
10802              * be to actually have available all the multi-character folds to
10803              * test against, and to back-off only far enough to be sure that
10804              * this node isn't ending with a partial one.  <upper_parse> is set
10805              * further below (if we need to reparse the node) to include just
10806              * up through that final non-problematic character that this code
10807              * identifies, so when it is set to less than the full node, we can
10808              * skip the rest of this */
10809             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10810
10811                 const STRLEN full_len = len;
10812
10813                 assert(len >= MAX_NODE_STRING_SIZE);
10814
10815                 /* Here, <s> points to the final byte of the final character.
10816                  * Look backwards through the string until find a non-
10817                  * problematic character */
10818
10819                 if (! UTF) {
10820
10821                     /* These two have no multi-char folds to non-UTF characters
10822                      */
10823                     if (ASCII_FOLD_RESTRICTED || LOC) {
10824                         goto loopdone;
10825                     }
10826
10827                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10828                     len = s - s0 + 1;
10829                 }
10830                 else {
10831                     if (!  PL_NonL1NonFinalFold) {
10832                         PL_NonL1NonFinalFold = _new_invlist_C_array(
10833                                         NonL1_Perl_Non_Final_Folds_invlist);
10834                     }
10835
10836                     /* Point to the first byte of the final character */
10837                     s = (char *) utf8_hop((U8 *) s, -1);
10838
10839                     while (s >= s0) {   /* Search backwards until find
10840                                            non-problematic char */
10841                         if (UTF8_IS_INVARIANT(*s)) {
10842
10843                             /* There are no ascii characters that participate
10844                              * in multi-char folds under /aa.  In EBCDIC, the
10845                              * non-ascii invariants are all control characters,
10846                              * so don't ever participate in any folds. */
10847                             if (ASCII_FOLD_RESTRICTED
10848                                 || ! IS_NON_FINAL_FOLD(*s))
10849                             {
10850                                 break;
10851                             }
10852                         }
10853                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10854
10855                             /* No Latin1 characters participate in multi-char
10856                              * folds under /l */
10857                             if (LOC
10858                                 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10859                                                                 *s, *(s+1))))
10860                             {
10861                                 break;
10862                             }
10863                         }
10864                         else if (! _invlist_contains_cp(
10865                                         PL_NonL1NonFinalFold,
10866                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
10867                         {
10868                             break;
10869                         }
10870
10871                         /* Here, the current character is problematic in that
10872                          * it does occur in the non-final position of some
10873                          * fold, so try the character before it, but have to
10874                          * special case the very first byte in the string, so
10875                          * we don't read outside the string */
10876                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
10877                     } /* End of loop backwards through the string */
10878
10879                     /* If there were only problematic characters in the string,
10880                      * <s> will point to before s0, in which case the length
10881                      * should be 0, otherwise include the length of the
10882                      * non-problematic character just found */
10883                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
10884                 }
10885
10886                 /* Here, have found the final character, if any, that is
10887                  * non-problematic as far as ending the node without splitting
10888                  * it across a potential multi-char fold.  <len> contains the
10889                  * number of bytes in the node up-to and including that
10890                  * character, or is 0 if there is no such character, meaning
10891                  * the whole node contains only problematic characters.  In
10892                  * this case, give up and just take the node as-is.  We can't
10893                  * do any better */
10894                 if (len == 0) {
10895                     len = full_len;
10896                 } else {
10897
10898                     /* Here, the node does contain some characters that aren't
10899                      * problematic.  If one such is the final character in the
10900                      * node, we are done */
10901                     if (len == full_len) {
10902                         goto loopdone;
10903                     }
10904                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
10905
10906                         /* If the final character is problematic, but the
10907                          * penultimate is not, back-off that last character to
10908                          * later start a new node with it */
10909                         p = oldp;
10910                         goto loopdone;
10911                     }
10912
10913                     /* Here, the final non-problematic character is earlier
10914                      * in the input than the penultimate character.  What we do
10915                      * is reparse from the beginning, going up only as far as
10916                      * this final ok one, thus guaranteeing that the node ends
10917                      * in an acceptable character.  The reason we reparse is
10918                      * that we know how far in the character is, but we don't
10919                      * know how to correlate its position with the input parse.
10920                      * An alternate implementation would be to build that
10921                      * correlation as we go along during the original parse,
10922                      * but that would entail extra work for every node, whereas
10923                      * this code gets executed only when the string is too
10924                      * large for the node, and the final two characters are
10925                      * problematic, an infrequent occurrence.  Yet another
10926                      * possible strategy would be to save the tail of the
10927                      * string, and the next time regatom is called, initialize
10928                      * with that.  The problem with this is that unless you
10929                      * back off one more character, you won't be guaranteed
10930                      * regatom will get called again, unless regbranch,
10931                      * regpiece ... are also changed.  If you do back off that
10932                      * extra character, so that there is input guaranteed to
10933                      * force calling regatom, you can't handle the case where
10934                      * just the first character in the node is acceptable.  I
10935                      * (khw) decided to try this method which doesn't have that
10936                      * pitfall; if performance issues are found, we can do a
10937                      * combination of the current approach plus that one */
10938                     upper_parse = len;
10939                     len = 0;
10940                     s = s0;
10941                     goto reparse;
10942                 }
10943             }   /* End of verifying node ends with an appropriate char */
10944
10945         loopdone:   /* Jumped to when encounters something that shouldn't be in
10946                        the node */
10947
10948             /* I (khw) don't know if you can get here with zero length, but the
10949              * old code handled this situation by creating a zero-length EXACT
10950              * node.  Might as well be NOTHING instead */
10951             if (len == 0) {
10952                 OP(ret) = NOTHING;
10953             }
10954             else{
10955                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
10956             }
10957
10958             RExC_parse = p - 1;
10959             Set_Node_Cur_Length(ret); /* MJD */
10960             nextchar(pRExC_state);
10961             {
10962                 /* len is STRLEN which is unsigned, need to copy to signed */
10963                 IV iv = len;
10964                 if (iv < 0)
10965                     vFAIL("Internal disaster");
10966             }
10967
10968         } /* End of label 'defchar:' */
10969         break;
10970     } /* End of giant switch on input character */
10971
10972     return(ret);
10973 }
10974
10975 STATIC char *
10976 S_regwhite( RExC_state_t *pRExC_state, char *p )
10977 {
10978     const char *e = RExC_end;
10979
10980     PERL_ARGS_ASSERT_REGWHITE;
10981
10982     while (p < e) {
10983         if (isSPACE(*p))
10984             ++p;
10985         else if (*p == '#') {
10986             bool ended = 0;
10987             do {
10988                 if (*p++ == '\n') {
10989                     ended = 1;
10990                     break;
10991                 }
10992             } while (p < e);
10993             if (!ended)
10994                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10995         }
10996         else
10997             break;
10998     }
10999     return p;
11000 }
11001
11002 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11003    Character classes ([:foo:]) can also be negated ([:^foo:]).
11004    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11005    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11006    but trigger failures because they are currently unimplemented. */
11007
11008 #define POSIXCC_DONE(c)   ((c) == ':')
11009 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11010 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11011
11012 STATIC I32
11013 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
11014 {
11015     dVAR;
11016     I32 namedclass = OOB_NAMEDCLASS;
11017
11018     PERL_ARGS_ASSERT_REGPPOSIXCC;
11019
11020     if (value == '[' && RExC_parse + 1 < RExC_end &&
11021         /* I smell either [: or [= or [. -- POSIX has been here, right? */
11022         POSIXCC(UCHARAT(RExC_parse))) {
11023         const char c = UCHARAT(RExC_parse);
11024         char* const s = RExC_parse++;
11025
11026         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11027             RExC_parse++;
11028         if (RExC_parse == RExC_end)
11029             /* Grandfather lone [:, [=, [. */
11030             RExC_parse = s;
11031         else {
11032             const char* const t = RExC_parse++; /* skip over the c */
11033             assert(*t == c);
11034
11035             if (UCHARAT(RExC_parse) == ']') {
11036                 const char *posixcc = s + 1;
11037                 RExC_parse++; /* skip over the ending ] */
11038
11039                 if (*s == ':') {
11040                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11041                     const I32 skip = t - posixcc;
11042
11043                     /* Initially switch on the length of the name.  */
11044                     switch (skip) {
11045                     case 4:
11046                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11047                             namedclass = ANYOF_ALNUM;
11048                         break;
11049                     case 5:
11050                         /* Names all of length 5.  */
11051                         /* alnum alpha ascii blank cntrl digit graph lower
11052                            print punct space upper  */
11053                         /* Offset 4 gives the best switch position.  */
11054                         switch (posixcc[4]) {
11055                         case 'a':
11056                             if (memEQ(posixcc, "alph", 4)) /* alpha */
11057                                 namedclass = ANYOF_ALPHA;
11058                             break;
11059                         case 'e':
11060                             if (memEQ(posixcc, "spac", 4)) /* space */
11061                                 namedclass = ANYOF_PSXSPC;
11062                             break;
11063                         case 'h':
11064                             if (memEQ(posixcc, "grap", 4)) /* graph */
11065                                 namedclass = ANYOF_GRAPH;
11066                             break;
11067                         case 'i':
11068                             if (memEQ(posixcc, "asci", 4)) /* ascii */
11069                                 namedclass = ANYOF_ASCII;
11070                             break;
11071                         case 'k':
11072                             if (memEQ(posixcc, "blan", 4)) /* blank */
11073                                 namedclass = ANYOF_BLANK;
11074                             break;
11075                         case 'l':
11076                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11077                                 namedclass = ANYOF_CNTRL;
11078                             break;
11079                         case 'm':
11080                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
11081                                 namedclass = ANYOF_ALNUMC;
11082                             break;
11083                         case 'r':
11084                             if (memEQ(posixcc, "lowe", 4)) /* lower */
11085                                 namedclass = ANYOF_LOWER;
11086                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
11087                                 namedclass = ANYOF_UPPER;
11088                             break;
11089                         case 't':
11090                             if (memEQ(posixcc, "digi", 4)) /* digit */
11091                                 namedclass = ANYOF_DIGIT;
11092                             else if (memEQ(posixcc, "prin", 4)) /* print */
11093                                 namedclass = ANYOF_PRINT;
11094                             else if (memEQ(posixcc, "punc", 4)) /* punct */
11095                                 namedclass = ANYOF_PUNCT;
11096                             break;
11097                         }
11098                         break;
11099                     case 6:
11100                         if (memEQ(posixcc, "xdigit", 6))
11101                             namedclass = ANYOF_XDIGIT;
11102                         break;
11103                     }
11104
11105                     if (namedclass == OOB_NAMEDCLASS)
11106                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11107                                       t - s - 1, s + 1);
11108
11109                     /* The #defines are structured so each complement is +1 to
11110                      * the normal one */
11111                     if (complement) {
11112                         namedclass++;
11113                     }
11114                     assert (posixcc[skip] == ':');
11115                     assert (posixcc[skip+1] == ']');
11116                 } else if (!SIZE_ONLY) {
11117                     /* [[=foo=]] and [[.foo.]] are still future. */
11118
11119                     /* adjust RExC_parse so the warning shows after
11120                        the class closes */
11121                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11122                         RExC_parse++;
11123                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11124                 }
11125             } else {
11126                 /* Maternal grandfather:
11127                  * "[:" ending in ":" but not in ":]" */
11128                 RExC_parse = s;
11129             }
11130         }
11131     }
11132
11133     return namedclass;
11134 }
11135
11136 STATIC void
11137 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
11138 {
11139     dVAR;
11140
11141     PERL_ARGS_ASSERT_CHECKPOSIXCC;
11142
11143     if (POSIXCC(UCHARAT(RExC_parse))) {
11144         const char *s = RExC_parse;
11145         const char  c = *s++;
11146
11147         while (isALNUM(*s))
11148             s++;
11149         if (*s && c == *s && s[1] == ']') {
11150             ckWARN3reg(s+2,
11151                        "POSIX syntax [%c %c] belongs inside character classes",
11152                        c, c);
11153
11154             /* [[=foo=]] and [[.foo.]] are still future. */
11155             if (POSIXCC_NOTYET(c)) {
11156                 /* adjust RExC_parse so the error shows after
11157                    the class closes */
11158                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
11159                     NOOP;
11160                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11161             }
11162         }
11163     }
11164 }
11165
11166 /* Generate the code to add a full posix character <class> to the bracketed
11167  * character class given by <node>.  (<node> is needed only under locale rules)
11168  * destlist     is the inversion list for non-locale rules that this class is
11169  *              to be added to
11170  * sourcelist   is the ASCII-range inversion list to add under /a rules
11171  * Xsourcelist  is the full Unicode range list to use otherwise. */
11172 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist)           \
11173     if (LOC) {                                                             \
11174         SV* scratch_list = NULL;                                           \
11175                                                                            \
11176         /* Set this class in the node for runtime matching */              \
11177         ANYOF_CLASS_SET(node, class);                                      \
11178                                                                            \
11179         /* For above Latin1 code points, we use the full Unicode range */  \
11180         _invlist_intersection(PL_AboveLatin1,                              \
11181                               Xsourcelist,                                 \
11182                               &scratch_list);                              \
11183         /* And set the output to it, adding instead if there already is an \
11184          * output.  Checking if <destlist> is NULL first saves an extra    \
11185          * clone.  Its reference count will be decremented at the next     \
11186          * union, etc, or if this is the only instance, at the end of the  \
11187          * routine */                                                      \
11188         if (! destlist) {                                                  \
11189             destlist = scratch_list;                                       \
11190         }                                                                  \
11191         else {                                                             \
11192             _invlist_union(destlist, scratch_list, &destlist);             \
11193             SvREFCNT_dec(scratch_list);                                    \
11194         }                                                                  \
11195     }                                                                      \
11196     else {                                                                 \
11197         /* For non-locale, just add it to any existing list */             \
11198         _invlist_union(destlist,                                           \
11199                        (AT_LEAST_ASCII_RESTRICTED)                         \
11200                            ? sourcelist                                    \
11201                            : Xsourcelist,                                  \
11202                        &destlist);                                         \
11203     }
11204
11205 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11206  */
11207 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist)         \
11208     if (LOC) {                                                             \
11209         SV* scratch_list = NULL;                                           \
11210         ANYOF_CLASS_SET(node, class);                                      \
11211         _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list);     \
11212         if (! destlist) {                                                  \
11213             destlist = scratch_list;                                       \
11214         }                                                                  \
11215         else {                                                             \
11216             _invlist_union(destlist, scratch_list, &destlist);             \
11217             SvREFCNT_dec(scratch_list);                                    \
11218         }                                                                  \
11219     }                                                                      \
11220     else {                                                                 \
11221         _invlist_union_complement_2nd(destlist,                            \
11222                                     (AT_LEAST_ASCII_RESTRICTED)            \
11223                                         ? sourcelist                       \
11224                                         : Xsourcelist,                     \
11225                                     &destlist);                            \
11226         /* Under /d, everything in the upper half of the Latin1 range      \
11227          * matches this complement */                                      \
11228         if (DEPENDS_SEMANTICS) {                                           \
11229             ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;                \
11230         }                                                                  \
11231     }
11232
11233 /* Generate the code to add a posix character <class> to the bracketed
11234  * character class given by <node>.  (<node> is needed only under locale rules)
11235  * destlist       is the inversion list for non-locale rules that this class is
11236  *                to be added to
11237  * sourcelist     is the ASCII-range inversion list to add under /a rules
11238  * l1_sourcelist  is the Latin1 range list to use otherwise.
11239  * Xpropertyname  is the name to add to <run_time_list> of the property to
11240  *                specify the code points above Latin1 that will have to be
11241  *                determined at run-time
11242  * run_time_list  is a SV* that contains text names of properties that are to
11243  *                be computed at run time.  This concatenates <Xpropertyname>
11244  *                to it, appropriately
11245  * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11246  * time */
11247 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,      \
11248                               l1_sourcelist, Xpropertyname, run_time_list) \
11249         /* First, resolve whether to use the ASCII-only list or the L1     \
11250          * list */                                                         \
11251         DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist,      \
11252                 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11253                 Xpropertyname, run_time_list)
11254
11255 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11256                 Xpropertyname, run_time_list)                              \
11257     /* If not /a matching, there are going to be code points we will have  \
11258      * to defer to runtime to look-up */                                   \
11259     if (! AT_LEAST_ASCII_RESTRICTED) {                                     \
11260         Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11261     }                                                                      \
11262     if (LOC) {                                                             \
11263         ANYOF_CLASS_SET(node, class);                                      \
11264     }                                                                      \
11265     else {                                                                 \
11266         _invlist_union(destlist, sourcelist, &destlist);                   \
11267     }
11268
11269 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement.  A combination of
11270  * this and DO_N_POSIX.  Sets <matches_above_unicode> only if it can; unchanged
11271  * otherwise */
11272 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist,    \
11273        l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11274     if (AT_LEAST_ASCII_RESTRICTED) {                                       \
11275         _invlist_union_complement_2nd(destlist, sourcelist, &destlist);    \
11276     }                                                                      \
11277     else {                                                                 \
11278         Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11279         matches_above_unicode = TRUE;                                      \
11280         if (LOC) {                                                         \
11281             ANYOF_CLASS_SET(node, namedclass);                             \
11282         }                                                                  \
11283         else {                                                             \
11284             SV* scratch_list = NULL;                                       \
11285             _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list);    \
11286             if (! destlist) {                                              \
11287                 destlist = scratch_list;                                   \
11288             }                                                              \
11289             else {                                                         \
11290                 _invlist_union(destlist, scratch_list, &destlist);         \
11291                 SvREFCNT_dec(scratch_list);                                \
11292             }                                                              \
11293             if (DEPENDS_SEMANTICS) {                                       \
11294                 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL;            \
11295             }                                                              \
11296         }                                                                  \
11297     }
11298
11299 STATIC void
11300 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
11301 {
11302     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
11303      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
11304      * the multi-character folds of characters in the node */
11305     SV *sv;
11306
11307     PERL_ARGS_ASSERT_ADD_ALTERNATE;
11308
11309     if (! *alternate_ptr) {
11310         *alternate_ptr = newAV();
11311     }
11312     sv = newSVpvn_utf8((char*)string, len, TRUE);
11313     av_push(*alternate_ptr, sv);
11314     return;
11315 }
11316
11317 /* The names of properties whose definitions are not known at compile time are
11318  * stored in this SV, after a constant heading.  So if the length has been
11319  * changed since initialization, then there is a run-time definition. */
11320 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11321
11322 /* This converts the named class defined in regcomp.h to its equivalent class
11323  * number defined in handy.h. */
11324 #define namedclass_to_classnum(class)  ((class) / 2)
11325
11326 /*
11327    parse a class specification and produce either an ANYOF node that
11328    matches the pattern or perhaps will be optimized into an EXACTish node
11329    instead. The node contains a bit map for the first 256 characters, with the
11330    corresponding bit set if that character is in the list.  For characters
11331    above 255, a range list is used */
11332
11333 STATIC regnode *
11334 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11335 {
11336     dVAR;
11337     UV nextvalue;
11338     UV prevvalue = OOB_UNICODE;
11339     IV range = 0;
11340     UV value = 0;
11341     regnode *ret;
11342     STRLEN numlen;
11343     IV namedclass = OOB_NAMEDCLASS;
11344     char *rangebegin = NULL;
11345     bool need_class = 0;
11346     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
11347     SV *listsv = NULL;
11348     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11349                                       than just initialized.  */
11350     SV* properties = NULL;    /* Code points that match \p{} \P{} */
11351     SV* posixes = NULL;     /* Code points that match classes like, [:word:],
11352                                extended beyond the Latin1 range */
11353     UV element_count = 0;   /* Number of distinct elements in the class.
11354                                Optimizations may be possible if this is tiny */
11355     UV n;
11356
11357     /* Unicode properties are stored in a swash; this holds the current one
11358      * being parsed.  If this swash is the only above-latin1 component of the
11359      * character class, an optimization is to pass it directly on to the
11360      * execution engine.  Otherwise, it is set to NULL to indicate that there
11361      * are other things in the class that have to be dealt with at execution
11362      * time */
11363     SV* swash = NULL;           /* Code points that match \p{} \P{} */
11364
11365     /* Set if a component of this character class is user-defined; just passed
11366      * on to the engine */
11367     bool has_user_defined_property = FALSE;
11368
11369     /* inversion list of code points this node matches only when the target
11370      * string is in UTF-8.  (Because is under /d) */
11371     SV* depends_list = NULL;
11372
11373     /* inversion list of code points this node matches.  For much of the
11374      * function, it includes only those that match regardless of the utf8ness
11375      * of the target string */
11376     SV* cp_list = NULL;
11377
11378     /* List of multi-character folds that are matched by this node */
11379     AV* unicode_alternate  = NULL;
11380 #ifdef EBCDIC
11381     /* In a range, counts how many 0-2 of the ends of it came from literals,
11382      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
11383     UV literal_endpoint = 0;
11384 #endif
11385     bool invert = FALSE;    /* Is this class to be complemented */
11386
11387     /* Is there any thing like \W or [:^digit:] that matches above the legal
11388      * Unicode range? */
11389     bool runtime_posix_matches_above_Unicode = FALSE;
11390
11391     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11392         case we need to change the emitted regop to an EXACT. */
11393     const char * orig_parse = RExC_parse;
11394     const I32 orig_size = RExC_size;
11395     GET_RE_DEBUG_FLAGS_DECL;
11396
11397     PERL_ARGS_ASSERT_REGCLASS;
11398 #ifndef DEBUGGING
11399     PERL_UNUSED_ARG(depth);
11400 #endif
11401
11402     DEBUG_PARSE("clas");
11403
11404     /* Assume we are going to generate an ANYOF node. */
11405     ret = reganode(pRExC_state, ANYOF, 0);
11406
11407
11408     if (!SIZE_ONLY) {
11409         ANYOF_FLAGS(ret) = 0;
11410     }
11411
11412     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
11413         RExC_naughty++;
11414         RExC_parse++;
11415         invert = TRUE;
11416
11417         /* We have decided to not allow multi-char folds in inverted character
11418          * classes, due to the confusion that can happen, especially with
11419          * classes that are designed for a non-Unicode world:  You have the
11420          * peculiar case that:
11421             "s s" =~ /^[^\xDF]+$/i => Y
11422             "ss"  =~ /^[^\xDF]+$/i => N
11423          *
11424          * See [perl #89750] */
11425         allow_full_fold = FALSE;
11426     }
11427
11428     if (SIZE_ONLY) {
11429         RExC_size += ANYOF_SKIP;
11430         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11431     }
11432     else {
11433         RExC_emit += ANYOF_SKIP;
11434         if (LOC) {
11435             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11436         }
11437         listsv = newSVpvs("# comment\n");
11438         initial_listsv_len = SvCUR(listsv);
11439     }
11440
11441     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11442
11443     if (!SIZE_ONLY && POSIXCC(nextvalue))
11444         checkposixcc(pRExC_state);
11445
11446     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11447     if (UCHARAT(RExC_parse) == ']')
11448         goto charclassloop;
11449
11450 parseit:
11451     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11452
11453     charclassloop:
11454
11455         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11456
11457         if (!range) {
11458             rangebegin = RExC_parse;
11459             element_count++;
11460         }
11461         if (UTF) {
11462             value = utf8n_to_uvchr((U8*)RExC_parse,
11463                                    RExC_end - RExC_parse,
11464                                    &numlen, UTF8_ALLOW_DEFAULT);
11465             RExC_parse += numlen;
11466         }
11467         else
11468             value = UCHARAT(RExC_parse++);
11469
11470         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11471         if (value == '[' && POSIXCC(nextvalue))
11472             namedclass = regpposixcc(pRExC_state, value);
11473         else if (value == '\\') {
11474             if (UTF) {
11475                 value = utf8n_to_uvchr((U8*)RExC_parse,
11476                                    RExC_end - RExC_parse,
11477                                    &numlen, UTF8_ALLOW_DEFAULT);
11478                 RExC_parse += numlen;
11479             }
11480             else
11481                 value = UCHARAT(RExC_parse++);
11482             /* Some compilers cannot handle switching on 64-bit integer
11483              * values, therefore value cannot be an UV.  Yes, this will
11484              * be a problem later if we want switch on Unicode.
11485              * A similar issue a little bit later when switching on
11486              * namedclass. --jhi */
11487             switch ((I32)value) {
11488             case 'w':   namedclass = ANYOF_ALNUM;       break;
11489             case 'W':   namedclass = ANYOF_NALNUM;      break;
11490             case 's':   namedclass = ANYOF_SPACE;       break;
11491             case 'S':   namedclass = ANYOF_NSPACE;      break;
11492             case 'd':   namedclass = ANYOF_DIGIT;       break;
11493             case 'D':   namedclass = ANYOF_NDIGIT;      break;
11494             case 'v':   namedclass = ANYOF_VERTWS;      break;
11495             case 'V':   namedclass = ANYOF_NVERTWS;     break;
11496             case 'h':   namedclass = ANYOF_HORIZWS;     break;
11497             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
11498             case 'N':  /* Handle \N{NAME} in class */
11499                 {
11500                     /* We only pay attention to the first char of
11501                     multichar strings being returned. I kinda wonder
11502                     if this makes sense as it does change the behaviour
11503                     from earlier versions, OTOH that behaviour was broken
11504                     as well. */
11505                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11506                                       TRUE /* => charclass */))
11507                     {
11508                         goto parseit;
11509                     }
11510                 }
11511                 break;
11512             case 'p':
11513             case 'P':
11514                 {
11515                 char *e;
11516                 if (RExC_parse >= RExC_end)
11517                     vFAIL2("Empty \\%c{}", (U8)value);
11518                 if (*RExC_parse == '{') {
11519                     const U8 c = (U8)value;
11520                     e = strchr(RExC_parse++, '}');
11521                     if (!e)
11522                         vFAIL2("Missing right brace on \\%c{}", c);
11523                     while (isSPACE(UCHARAT(RExC_parse)))
11524                         RExC_parse++;
11525                     if (e == RExC_parse)
11526                         vFAIL2("Empty \\%c{}", c);
11527                     n = e - RExC_parse;
11528                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11529                         n--;
11530                 }
11531                 else {
11532                     e = RExC_parse;
11533                     n = 1;
11534                 }
11535                 if (!SIZE_ONLY) {
11536                     SV* invlist;
11537                     char* name;
11538
11539                     if (UCHARAT(RExC_parse) == '^') {
11540                          RExC_parse++;
11541                          n--;
11542                          value = value == 'p' ? 'P' : 'p'; /* toggle */
11543                          while (isSPACE(UCHARAT(RExC_parse))) {
11544                               RExC_parse++;
11545                               n--;
11546                          }
11547                     }
11548                     /* Try to get the definition of the property into
11549                      * <invlist>.  If /i is in effect, the effective property
11550                      * will have its name be <__NAME_i>.  The design is
11551                      * discussed in commit
11552                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11553                     Newx(name, n + sizeof("_i__\n"), char);
11554
11555                     sprintf(name, "%s%.*s%s\n",
11556                                     (FOLD) ? "__" : "",
11557                                     (int)n,
11558                                     RExC_parse,
11559                                     (FOLD) ? "_i" : ""
11560                     );
11561
11562                     /* Look up the property name, and get its swash and
11563                      * inversion list, if the property is found  */
11564                     if (swash) {
11565                         SvREFCNT_dec(swash);
11566                     }
11567                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
11568                                              1, /* binary */
11569                                              0, /* not tr/// */
11570                                              TRUE, /* this routine will handle
11571                                                       undefined properties */
11572                                              NULL, FALSE /* No inversion list */
11573                                             );
11574                     if (   ! swash
11575                         || ! SvROK(swash)
11576                         || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11577                         || ! (invlist = _get_swash_invlist(swash)))
11578                     {
11579                         if (swash) {
11580                             SvREFCNT_dec(swash);
11581                             swash = NULL;
11582                         }
11583
11584                         /* Here didn't find it.  It could be a user-defined
11585                          * property that will be available at run-time.  Add it
11586                          * to the list to look up then */
11587                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11588                                         (value == 'p' ? '+' : '!'),
11589                                         name);
11590                         has_user_defined_property = TRUE;
11591
11592                         /* We don't know yet, so have to assume that the
11593                          * property could match something in the Latin1 range,
11594                          * hence something that isn't utf8.  Note that this
11595                          * would cause things in <depends_list> to match
11596                          * inappropriately, except that any \p{}, including
11597                          * this one forces Unicode semantics, which means there
11598                          * is <no depends_list> */
11599                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11600                     }
11601                     else {
11602
11603                         /* Here, did get the swash and its inversion list.  If
11604                          * the swash is from a user-defined property, then this
11605                          * whole character class should be regarded as such */
11606                         has_user_defined_property =
11607                                                 _is_swash_user_defined(swash);
11608
11609                         /* Invert if asking for the complement */
11610                         if (value == 'P') {
11611                             _invlist_union_complement_2nd(properties,
11612                                                           invlist,
11613                                                           &properties);
11614
11615                             /* The swash can't be used as-is, because we've
11616                              * inverted things; delay removing it to here after
11617                              * have copied its invlist above */
11618                             SvREFCNT_dec(swash);
11619                             swash = NULL;
11620                         }
11621                         else {
11622                             _invlist_union(properties, invlist, &properties);
11623                         }
11624                     }
11625                     Safefree(name);
11626                 }
11627                 RExC_parse = e + 1;
11628                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
11629
11630                 /* \p means they want Unicode semantics */
11631                 RExC_uni_semantics = 1;
11632                 }
11633                 break;
11634             case 'n':   value = '\n';                   break;
11635             case 'r':   value = '\r';                   break;
11636             case 't':   value = '\t';                   break;
11637             case 'f':   value = '\f';                   break;
11638             case 'b':   value = '\b';                   break;
11639             case 'e':   value = ASCII_TO_NATIVE('\033');break;
11640             case 'a':   value = ASCII_TO_NATIVE('\007');break;
11641             case 'o':
11642                 RExC_parse--;   /* function expects to be pointed at the 'o' */
11643                 {
11644                     const char* error_msg;
11645                     bool valid = grok_bslash_o(RExC_parse,
11646                                                &value,
11647                                                &numlen,
11648                                                &error_msg,
11649                                                SIZE_ONLY);
11650                     RExC_parse += numlen;
11651                     if (! valid) {
11652                         vFAIL(error_msg);
11653                     }
11654                 }
11655                 if (PL_encoding && value < 0x100) {
11656                     goto recode_encoding;
11657                 }
11658                 break;
11659             case 'x':
11660                 RExC_parse--;   /* function expects to be pointed at the 'x' */
11661                 {
11662                     const char* error_msg;
11663                     bool valid = grok_bslash_x(RExC_parse,
11664                                                &value,
11665                                                &numlen,
11666                                                &error_msg,
11667                                                1);
11668                     RExC_parse += numlen;
11669                     if (! valid) {
11670                         vFAIL(error_msg);
11671                     }
11672                 }
11673                 if (PL_encoding && value < 0x100)
11674                     goto recode_encoding;
11675                 break;
11676             case 'c':
11677                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11678                 break;
11679             case '0': case '1': case '2': case '3': case '4':
11680             case '5': case '6': case '7':
11681                 {
11682                     /* Take 1-3 octal digits */
11683                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11684                     numlen = 3;
11685                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11686                     RExC_parse += numlen;
11687                     if (PL_encoding && value < 0x100)
11688                         goto recode_encoding;
11689                     break;
11690                 }
11691             recode_encoding:
11692                 if (! RExC_override_recoding) {
11693                     SV* enc = PL_encoding;
11694                     value = reg_recode((const char)(U8)value, &enc);
11695                     if (!enc && SIZE_ONLY)
11696                         ckWARNreg(RExC_parse,
11697                                   "Invalid escape in the specified encoding");
11698                     break;
11699                 }
11700             default:
11701                 /* Allow \_ to not give an error */
11702                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11703                     ckWARN2reg(RExC_parse,
11704                                "Unrecognized escape \\%c in character class passed through",
11705                                (int)value);
11706                 }
11707                 break;
11708             }
11709         } /* end of \blah */
11710 #ifdef EBCDIC
11711         else
11712             literal_endpoint++;
11713 #endif
11714
11715             /* What matches in a locale is not known until runtime.  This
11716              * includes what the Posix classes (like \w, [:space:]) match.
11717              * Room must be reserved (one time per class) to store such
11718              * classes, either if Perl is compiled so that locale nodes always
11719              * should have this space, or if there is such class info to be
11720              * stored.  The space will contain a bit for each named class that
11721              * is to be matched against.  This isn't needed for \p{} and
11722              * pseudo-classes, as they are not affected by locale, and hence
11723              * are dealt with separately */
11724             if (LOC
11725                 && ! need_class
11726                 && (ANYOF_LOCALE == ANYOF_CLASS
11727                     || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11728             {
11729                 need_class = 1;
11730                 if (SIZE_ONLY) {
11731                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11732                 }
11733                 else {
11734                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11735                     ANYOF_CLASS_ZERO(ret);
11736                 }
11737                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11738             }
11739
11740         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11741
11742             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
11743              * literal, as is the character that began the false range, i.e.
11744              * the 'a' in the examples */
11745             if (range) {
11746                 if (!SIZE_ONLY) {
11747                     const int w =
11748                         RExC_parse >= rangebegin ?
11749                         RExC_parse - rangebegin : 0;
11750                     ckWARN4reg(RExC_parse,
11751                                "False [] range \"%*.*s\"",
11752                                w, w, rangebegin);
11753                     cp_list = add_cp_to_invlist(cp_list, '-');
11754                     cp_list = add_cp_to_invlist(cp_list, prevvalue);
11755                 }
11756
11757                 range = 0; /* this was not a true range */
11758                 element_count += 2; /* So counts for three values */
11759             }
11760
11761             if (! SIZE_ONLY) {
11762                 switch ((I32)namedclass) {
11763
11764                 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11765                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11766                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11767                     break;
11768                 case ANYOF_NALNUMC:
11769                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11770                         PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11771                         runtime_posix_matches_above_Unicode);
11772                     break;
11773                 case ANYOF_ALPHA:
11774                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11775                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11776                     break;
11777                 case ANYOF_NALPHA:
11778                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11779                         PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11780                         runtime_posix_matches_above_Unicode);
11781                     break;
11782                 case ANYOF_ASCII:
11783                     if (LOC) {
11784                         ANYOF_CLASS_SET(ret, namedclass);
11785                     }
11786                     else {
11787                         _invlist_union(posixes, PL_ASCII, &posixes);
11788                     }
11789                     break;
11790                 case ANYOF_NASCII:
11791                     if (LOC) {
11792                         ANYOF_CLASS_SET(ret, namedclass);
11793                     }
11794                     else {
11795                         _invlist_union_complement_2nd(posixes,
11796                                                     PL_ASCII, &posixes);
11797                         if (DEPENDS_SEMANTICS) {
11798                             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11799                         }
11800                     }
11801                     break;
11802                 case ANYOF_BLANK:
11803                     DO_POSIX(ret, namedclass, posixes,
11804                                             PL_PosixBlank, PL_XPosixBlank);
11805                     break;
11806                 case ANYOF_NBLANK:
11807                     DO_N_POSIX(ret, namedclass, posixes,
11808                                             PL_PosixBlank, PL_XPosixBlank);
11809                     break;
11810                 case ANYOF_CNTRL:
11811                     DO_POSIX(ret, namedclass, posixes,
11812                                             PL_PosixCntrl, PL_XPosixCntrl);
11813                     break;
11814                 case ANYOF_NCNTRL:
11815                     DO_N_POSIX(ret, namedclass, posixes,
11816                                             PL_PosixCntrl, PL_XPosixCntrl);
11817                     break;
11818                 case ANYOF_DIGIT:
11819                     /* There are no digits in the Latin1 range outside of
11820                      * ASCII, so call the macro that doesn't have to resolve
11821                      * them */
11822                     DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
11823                         PL_PosixDigit, "XPosixDigit", listsv);
11824                     break;
11825                 case ANYOF_NDIGIT:
11826                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11827                         PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
11828                         runtime_posix_matches_above_Unicode);
11829                     break;
11830                 case ANYOF_GRAPH:
11831                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11832                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11833                     break;
11834                 case ANYOF_NGRAPH:
11835                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11836                         PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
11837                         runtime_posix_matches_above_Unicode);
11838                     break;
11839                 case ANYOF_HORIZWS:
11840                     /* For these, we use the cp_list, as /d doesn't make a
11841                      * difference in what these match.  There would be problems
11842                      * if these characters had folds other than themselves, as
11843                      * cp_list is subject to folding.  It turns out that \h
11844                      * is just a synonym for XPosixBlank */
11845                     _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
11846                     break;
11847                 case ANYOF_NHORIZWS:
11848                     _invlist_union_complement_2nd(cp_list,
11849                                                  PL_XPosixBlank, &cp_list);
11850                     break;
11851                 case ANYOF_LOWER:
11852                 case ANYOF_NLOWER:
11853                 {   /* These require special handling, as they differ under
11854                        folding, matching Cased there (which in the ASCII range
11855                        is the same as Alpha */
11856
11857                     SV* ascii_source;
11858                     SV* l1_source;
11859                     const char *Xname;
11860
11861                     if (FOLD && ! LOC) {
11862                         ascii_source = PL_PosixAlpha;
11863                         l1_source = PL_L1Cased;
11864                         Xname = "Cased";
11865                     }
11866                     else {
11867                         ascii_source = PL_PosixLower;
11868                         l1_source = PL_L1PosixLower;
11869                         Xname = "XPosixLower";
11870                     }
11871                     if (namedclass == ANYOF_LOWER) {
11872                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11873                                     ascii_source, l1_source, Xname, listsv);
11874                     }
11875                     else {
11876                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11877                             posixes, ascii_source, l1_source, Xname, listsv,
11878                             runtime_posix_matches_above_Unicode);
11879                     }
11880                     break;
11881                 }
11882                 case ANYOF_PRINT:
11883                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11884                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11885                     break;
11886                 case ANYOF_NPRINT:
11887                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11888                         PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
11889                         runtime_posix_matches_above_Unicode);
11890                     break;
11891                 case ANYOF_PUNCT:
11892                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11893                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11894                     break;
11895                 case ANYOF_NPUNCT:
11896                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11897                         PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
11898                         runtime_posix_matches_above_Unicode);
11899                     break;
11900                 case ANYOF_PSXSPC:
11901                     DO_POSIX(ret, namedclass, posixes,
11902                                             PL_PosixSpace, PL_XPosixSpace);
11903                     break;
11904                 case ANYOF_NPSXSPC:
11905                     DO_N_POSIX(ret, namedclass, posixes,
11906                                             PL_PosixSpace, PL_XPosixSpace);
11907                     break;
11908                 case ANYOF_SPACE:
11909                     DO_POSIX(ret, namedclass, posixes,
11910                                             PL_PerlSpace, PL_XPerlSpace);
11911                     break;
11912                 case ANYOF_NSPACE:
11913                     DO_N_POSIX(ret, namedclass, posixes,
11914                                             PL_PerlSpace, PL_XPerlSpace);
11915                     break;
11916                 case ANYOF_UPPER:   /* Same as LOWER, above */
11917                 case ANYOF_NUPPER:
11918                 {
11919                     SV* ascii_source;
11920                     SV* l1_source;
11921                     const char *Xname;
11922
11923                     if (FOLD && ! LOC) {
11924                         ascii_source = PL_PosixAlpha;
11925                         l1_source = PL_L1Cased;
11926                         Xname = "Cased";
11927                     }
11928                     else {
11929                         ascii_source = PL_PosixUpper;
11930                         l1_source = PL_L1PosixUpper;
11931                         Xname = "XPosixUpper";
11932                     }
11933                     if (namedclass == ANYOF_UPPER) {
11934                         DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11935                                     ascii_source, l1_source, Xname, listsv);
11936                     }
11937                     else {
11938                         DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11939                         posixes, ascii_source, l1_source, Xname, listsv,
11940                         runtime_posix_matches_above_Unicode);
11941                     }
11942                     break;
11943                 }
11944                 case ANYOF_ALNUM:   /* Really is 'Word' */
11945                     DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11946                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11947                     break;
11948                 case ANYOF_NALNUM:
11949                     DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11950                             PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
11951                             runtime_posix_matches_above_Unicode);
11952                     break;
11953                 case ANYOF_VERTWS:
11954                     /* For these, we use the cp_list, as /d doesn't make a
11955                      * difference in what these match.  There would be problems
11956                      * if these characters had folds other than themselves, as
11957                      * cp_list is subject to folding */
11958                     _invlist_union(cp_list, PL_VertSpace, &cp_list);
11959                     break;
11960                 case ANYOF_NVERTWS:
11961                     _invlist_union_complement_2nd(cp_list,
11962                                                     PL_VertSpace, &cp_list);
11963                     break;
11964                 case ANYOF_XDIGIT:
11965                     DO_POSIX(ret, namedclass, posixes,
11966                                             PL_PosixXDigit, PL_XPosixXDigit);
11967                     break;
11968                 case ANYOF_NXDIGIT:
11969                     DO_N_POSIX(ret, namedclass, posixes,
11970                                             PL_PosixXDigit, PL_XPosixXDigit);
11971                     break;
11972                 case ANYOF_MAX:
11973                     /* this is to handle \p and \P */
11974                     break;
11975                 default:
11976                     vFAIL("Invalid [::] class");
11977                     break;
11978                 }
11979
11980                 continue;   /* Go get next character */
11981             }
11982         } /* end of namedclass \blah */
11983
11984         if (range) {
11985             if (prevvalue > value) /* b-a */ {
11986                 const int w = RExC_parse - rangebegin;
11987                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11988                 range = 0; /* not a valid range */
11989             }
11990         }
11991         else {
11992             prevvalue = value; /* save the beginning of the potential range */
11993             if (RExC_parse+1 < RExC_end
11994                 && *RExC_parse == '-'
11995                 && RExC_parse[1] != ']')
11996             {
11997                 RExC_parse++;
11998
11999                 /* a bad range like \w-, [:word:]- ? */
12000                 if (namedclass > OOB_NAMEDCLASS) {
12001                     if (ckWARN(WARN_REGEXP)) {
12002                         const int w =
12003                             RExC_parse >= rangebegin ?
12004                             RExC_parse - rangebegin : 0;
12005                         vWARN4(RExC_parse,
12006                                "False [] range \"%*.*s\"",
12007                                w, w, rangebegin);
12008                     }
12009                     if (!SIZE_ONLY) {
12010                         cp_list = add_cp_to_invlist(cp_list, '-');
12011                     }
12012                     element_count++;
12013                 } else
12014                     range = 1;  /* yeah, it's a range! */
12015                 continue;       /* but do it the next time */
12016             }
12017         }
12018
12019         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12020          * if not */
12021
12022         /* non-Latin1 code point implies unicode semantics.  Must be set in
12023          * pass1 so is there for the whole of pass 2 */
12024         if (value > 255) {
12025             RExC_uni_semantics = 1;
12026         }
12027
12028         /* Ready to process either the single value, or the completed range */
12029         if (!SIZE_ONLY) {
12030 #ifndef EBCDIC
12031             cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12032 #else
12033             UV* this_range = _new_invlist(1);
12034             _append_range_to_invlist(this_range, prevvalue, value);
12035
12036             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12037              * If this range was specified using something like 'i-j', we want
12038              * to include only the 'i' and the 'j', and not anything in
12039              * between, so exclude non-ASCII, non-alphabetics from it.
12040              * However, if the range was specified with something like
12041              * [\x89-\x91] or [\x89-j], all code points within it should be
12042              * included.  literal_endpoint==2 means both ends of the range used
12043              * a literal character, not \x{foo} */
12044             if (literal_endpoint == 2
12045                 && (prevvalue >= 'a' && value <= 'z')
12046                     || (prevvalue >= 'A' && value <= 'Z'))
12047             {
12048                 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12049                 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12050             }
12051             _invlist_union(cp_list, this_range, &cp_list);
12052             literal_endpoint = 0;
12053 #endif
12054         }
12055
12056         range = 0; /* this range (if it was one) is done now */
12057     } /* End of loop through all the text within the brackets */
12058
12059     /* If the character class contains only a single element, it may be
12060      * optimizable into another node type which is smaller and runs faster.
12061      * Check if this is the case for this class */
12062     if (element_count == 1) {
12063         U8 op = END;
12064         U8 arg = 0;
12065
12066         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12067                                               [:digit:] or \p{foo} */
12068
12069             /* Certain named classes have equivalents that can appear outside a
12070              * character class, e.g. \w, \H.  We use these instead of a
12071              * character class. */
12072             switch ((I32)namedclass) {
12073                 U8 offset;
12074
12075                 /* The first group is for node types that depend on the charset
12076                  * modifier to the regex.  We first calculate the base node
12077                  * type, and if it should be inverted */
12078
12079                 case ANYOF_NALNUM:
12080                     invert = ! invert;
12081                     /* FALLTHROUGH */
12082                 case ANYOF_ALNUM:
12083                     op = ALNUM;
12084                     goto join_charset_classes;
12085
12086                 case ANYOF_NSPACE:
12087                     invert = ! invert;
12088                     /* FALLTHROUGH */
12089                 case ANYOF_SPACE:
12090                     op = SPACE;
12091                     goto join_charset_classes;
12092
12093                 case ANYOF_NDIGIT:
12094                     invert = ! invert;
12095                     /* FALLTHROUGH */
12096                 case ANYOF_DIGIT:
12097                     op = DIGIT;
12098
12099                   join_charset_classes:
12100
12101                     /* Now that we have the base node type, we take advantage
12102                      * of the enum ordering of the charset modifiers to get the
12103                      * exact node type,  For example the base SPACE also has
12104                      * SPACEL, SPACEU, and SPACEA */
12105
12106                     offset = get_regex_charset(RExC_flags);
12107
12108                     /* /aa is the same as /a for these */
12109                     if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12110                         offset = REGEX_ASCII_RESTRICTED_CHARSET;
12111                     }
12112                     else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12113                         offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12114                     }
12115
12116                     op += offset;
12117
12118                     /* The number of varieties of each of these is the same,
12119                      * hence, so is the delta between the normal and
12120                      * complemented nodes */
12121                     if (invert) {
12122                         op += NALNUM - ALNUM;
12123                     }
12124                     *flagp |= HASWIDTH|SIMPLE;
12125                     break;
12126
12127                 /* The second group doesn't depend of the charset modifiers.
12128                  * We just have normal and complemented */
12129                 case ANYOF_NHORIZWS:
12130                     invert = ! invert;
12131                     /* FALLTHROUGH */
12132                 case ANYOF_HORIZWS:
12133                   is_horizws:
12134                     op = (invert) ? NHORIZWS : HORIZWS;
12135                     *flagp |= HASWIDTH|SIMPLE;
12136                     break;
12137
12138                 case ANYOF_NVERTWS:
12139                     invert = ! invert;
12140                     /* FALLTHROUGH */
12141                 case ANYOF_VERTWS:
12142                     op = (invert) ? NVERTWS : VERTWS;
12143                     *flagp |= HASWIDTH|SIMPLE;
12144                     break;
12145
12146                 case ANYOF_MAX:
12147                     break;
12148
12149                 case ANYOF_NBLANK:
12150                     invert = ! invert;
12151                     /* FALLTHROUGH */
12152                 case ANYOF_BLANK:
12153                     if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12154                         goto is_horizws;
12155                     }
12156                     /* FALLTHROUGH */
12157                 default:
12158                     /* A generic posix class.  All the /a ones can be handled
12159                      * by the POSIXA opcode.  And all are closed under folding
12160                      * in the ASCII range, so FOLD doesn't matter */
12161                     if (AT_LEAST_ASCII_RESTRICTED
12162                         || (! LOC && namedclass == ANYOF_ASCII))
12163                     {
12164                         /* The odd numbered ones are the complements of the
12165                          * next-lower even number one */
12166                         if (namedclass % 2 == 1) {
12167                             invert = ! invert;
12168                             namedclass--;
12169                         }
12170                         arg = namedclass_to_classnum(namedclass);
12171                         op = (invert) ? NPOSIXA : POSIXA;
12172                     }
12173                     break;
12174             }
12175         }
12176         else if (value == prevvalue) {
12177
12178             /* Here, the class consists of just a single code point */
12179
12180             if (invert) {
12181                 if (! LOC && value == '\n') {
12182                     op = REG_ANY; /* Optimize [^\n] */
12183                     *flagp |= HASWIDTH|SIMPLE;
12184                     RExC_naughty++;
12185                 }
12186             }
12187             else if (value < 256 || UTF) {
12188
12189                 /* Optimize a single value into an EXACTish node, but not if it
12190                  * would require converting the pattern to UTF-8. */
12191                 op = compute_EXACTish(pRExC_state);
12192             }
12193         } /* Otherwise is a range */
12194         else if (! LOC) {   /* locale could vary these */
12195             if (prevvalue == '0') {
12196                 if (value == '9') {
12197                     op = (invert) ? NDIGITA : DIGITA;
12198                     *flagp |= HASWIDTH|SIMPLE;
12199                 }
12200             }
12201         }
12202
12203         /* Here, we have changed <op> away from its initial value iff we found
12204          * an optimization */
12205         if (op != END) {
12206
12207             /* Throw away this ANYOF regnode, and emit the calculated one,
12208              * which should correspond to the beginning, not current, state of
12209              * the parse */
12210             const char * cur_parse = RExC_parse;
12211             RExC_parse = (char *)orig_parse;
12212             if ( SIZE_ONLY) {
12213                 if (! LOC) {
12214
12215                     /* To get locale nodes to not use the full ANYOF size would
12216                      * require moving the code above that writes the portions
12217                      * of it that aren't in other nodes to after this point.
12218                      * e.g.  ANYOF_CLASS_SET */
12219                     RExC_size = orig_size;
12220                 }
12221             }
12222             else {
12223                 RExC_emit = (regnode *)orig_emit;
12224             }
12225
12226             ret = reg_node(pRExC_state, op);
12227
12228             if (PL_regkind[op] == POSIXD) {
12229                 if (! SIZE_ONLY) {
12230                     FLAGS(ret) = arg;
12231                 }
12232                 *flagp |= HASWIDTH|SIMPLE;
12233             }
12234             else if (PL_regkind[op] == EXACT) {
12235                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12236             }
12237
12238             RExC_parse = (char *) cur_parse;
12239
12240             SvREFCNT_dec(listsv);
12241             return ret;
12242         }
12243     }
12244
12245     if (SIZE_ONLY)
12246         return ret;
12247     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12248
12249     /* If folding, we calculate all characters that could fold to or from the
12250      * ones already on the list */
12251     if (FOLD && cp_list) {
12252         UV start, end;  /* End points of code point ranges */
12253
12254         SV* fold_intersection = NULL;
12255
12256         /* In the Latin1 range, the characters that can be folded-to or -from
12257          * are precisely the alphabetic characters.  If the highest code point
12258          * is within Latin1, we can use the compiled-in list, and not have to
12259          * go out to disk. */
12260         if (invlist_highest(cp_list) < 256) {
12261             _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12262         }
12263         else {
12264
12265             /* Here, there are non-Latin1 code points, so we will have to go
12266              * fetch the list of all the characters that participate in folds
12267              */
12268             if (! PL_utf8_foldable) {
12269                 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12270                                        &PL_sv_undef, 1, 0);
12271                 PL_utf8_foldable = _get_swash_invlist(swash);
12272                 SvREFCNT_dec(swash);
12273             }
12274
12275             /* This is a hash that for a particular fold gives all characters
12276              * that are involved in it */
12277             if (! PL_utf8_foldclosures) {
12278
12279                 /* If we were unable to find any folds, then we likely won't be
12280                  * able to find the closures.  So just create an empty list.
12281                  * Folding will effectively be restricted to the non-Unicode
12282                  * rules hard-coded into Perl.  (This case happens legitimately
12283                  * during compilation of Perl itself before the Unicode tables
12284                  * are generated) */
12285                 if (invlist_len(PL_utf8_foldable) == 0) {
12286                     PL_utf8_foldclosures = newHV();
12287                 }
12288                 else {
12289                     /* If the folds haven't been read in, call a fold function
12290                      * to force that */
12291                     if (! PL_utf8_tofold) {
12292                         U8 dummy[UTF8_MAXBYTES+1];
12293                         STRLEN dummy_len;
12294
12295                         /* This particular string is above \xff in both UTF-8
12296                          * and UTFEBCDIC */
12297                         to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
12298                         assert(PL_utf8_tofold); /* Verify that worked */
12299                     }
12300                     PL_utf8_foldclosures =
12301                                         _swash_inversion_hash(PL_utf8_tofold);
12302                 }
12303             }
12304
12305             /* Only the characters in this class that participate in folds need
12306              * be checked.  Get the intersection of this class and all the
12307              * possible characters that are foldable.  This can quickly narrow
12308              * down a large class */
12309             _invlist_intersection(PL_utf8_foldable, cp_list,
12310                                   &fold_intersection);
12311         }
12312
12313         /* Now look at the foldable characters in this class individually */
12314         invlist_iterinit(fold_intersection);
12315         while (invlist_iternext(fold_intersection, &start, &end)) {
12316             UV j;
12317
12318             /* Locale folding for Latin1 characters is deferred until runtime */
12319             if (LOC && start < 256) {
12320                 start = 256;
12321             }
12322
12323             /* Look at every character in the range */
12324             for (j = start; j <= end; j++) {
12325
12326                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12327                 STRLEN foldlen;
12328                 UV f;
12329
12330                 if (j < 256) {
12331
12332                     /* We have the latin1 folding rules hard-coded here so that
12333                      * an innocent-looking character class, like /[ks]/i won't
12334                      * have to go out to disk to find the possible matches.
12335                      * XXX It would be better to generate these via regen, in
12336                      * case a new version of the Unicode standard adds new
12337                      * mappings, though that is not really likely, and may be
12338                      * caught by the default: case of the switch below. */
12339
12340                     if (PL_fold_latin1[j] != j) {
12341
12342                         /* ASCII is always matched; non-ASCII is matched only
12343                          * under Unicode rules */
12344                         if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12345                             cp_list =
12346                                 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12347                         }
12348                         else {
12349                             depends_list =
12350                              add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12351                         }
12352                     }
12353
12354                     if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12355                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12356                     {
12357                         /* Certain Latin1 characters have matches outside
12358                          * Latin1, or are multi-character.  To get here, 'j' is
12359                          * one of those characters.   None of these matches is
12360                          * valid for ASCII characters under /aa, which is why
12361                          * the 'if' just above excludes those.  The matches
12362                          * fall into three categories:
12363                          * 1) They are singly folded-to or -from an above 255
12364                          *    character, e.g., LATIN SMALL LETTER Y WITH
12365                          *    DIAERESIS and LATIN CAPITAL LETTER Y WITH
12366                          *    DIAERESIS;
12367                          * 2) They are part of a multi-char fold with another
12368                          *    latin1 character; only LATIN SMALL LETTER
12369                          *    SHARP S => "ss" fits this;
12370                          * 3) They are part of a multi-char fold with a
12371                          *    character outside of Latin1, such as various
12372                          *    ligatures.
12373                         * We aren't dealing fully with multi-char folds, except
12374                         * we do deal with the pattern containing a character
12375                         * that has a multi-char fold (not so much the inverse).
12376                         * For types 1) and 3), the matches only happen when the
12377                         * target string is utf8; that's not true for 2), and we
12378                         * set a flag for it.
12379                         *
12380                         * The code below adds the single fold closures for 'j'
12381                         * to the inversion list. */
12382                         switch (j) {
12383                             case 'k':
12384                             case 'K':
12385                                 cp_list =
12386                                     add_cp_to_invlist(cp_list, KELVIN_SIGN);
12387                                 break;
12388                             case 's':
12389                             case 'S':
12390                                 cp_list = add_cp_to_invlist(cp_list,
12391                                                     LATIN_SMALL_LETTER_LONG_S);
12392                                 break;
12393                             case MICRO_SIGN:
12394                                 cp_list = add_cp_to_invlist(cp_list,
12395                                                     GREEK_CAPITAL_LETTER_MU);
12396                                 cp_list = add_cp_to_invlist(cp_list,
12397                                                     GREEK_SMALL_LETTER_MU);
12398                                 break;
12399                             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12400                             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12401                                 cp_list =
12402                                     add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12403                                 break;
12404                             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12405                                 cp_list = add_cp_to_invlist(cp_list,
12406                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12407                                 break;
12408                             case LATIN_SMALL_LETTER_SHARP_S:
12409                                 cp_list = add_cp_to_invlist(cp_list,
12410                                                 LATIN_CAPITAL_LETTER_SHARP_S);
12411
12412                                 /* Under /a, /d, and /u, this can match the two
12413                                  * chars "ss" */
12414                                 if (! ASCII_FOLD_RESTRICTED) {
12415                                     add_alternate(&unicode_alternate,
12416                                                   (U8 *) "ss", 2);
12417
12418                                     /* And under /u or /a, it can match even if
12419                                      * the target is not utf8 */
12420                                     if (AT_LEAST_UNI_SEMANTICS) {
12421                                         ANYOF_FLAGS(ret) |=
12422                                                     ANYOF_NONBITMAP_NON_UTF8;
12423                                     }
12424                                 }
12425                                 break;
12426                             case 'F': case 'f':
12427                             case 'I': case 'i':
12428                             case 'L': case 'l':
12429                             case 'T': case 't':
12430                             case 'A': case 'a':
12431                             case 'H': case 'h':
12432                             case 'J': case 'j':
12433                             case 'N': case 'n':
12434                             case 'W': case 'w':
12435                             case 'Y': case 'y':
12436                                 /* These all are targets of multi-character
12437                                  * folds from code points that require UTF8 to
12438                                  * express, so they can't match unless the
12439                                  * target string is in UTF-8, so no action here
12440                                  * is necessary, as regexec.c properly handles
12441                                  * the general case for UTF-8 matching */
12442                                 break;
12443                             default:
12444                                 /* Use deprecated warning to increase the
12445                                  * chances of this being output */
12446                                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12447                                 break;
12448                         }
12449                     }
12450                     continue;
12451                 }
12452
12453                 /* Here is an above Latin1 character.  We don't have the rules
12454                  * hard-coded for it.  First, get its fold */
12455                 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
12456                                     ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
12457                                     | ((LOC)
12458                                         ? FOLD_FLAGS_LOCALE
12459                                         : (ASCII_FOLD_RESTRICTED)
12460                                             ? FOLD_FLAGS_NOMIX_ASCII
12461                                             : 0));
12462
12463                 if (foldlen > (STRLEN)UNISKIP(f)) {
12464
12465                     /* Any multicharacter foldings (disallowed in lookbehind
12466                      * patterns) require the following transform: [ABCDEF] ->
12467                      * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
12468                      * folds into "rst", all other characters fold to single
12469                      * characters.  We save away these multicharacter foldings,
12470                      * to be later saved as part of the additional "s" data. */
12471                     if (! RExC_in_lookbehind) {
12472                         U8* loc = foldbuf;
12473                         U8* e = foldbuf + foldlen;
12474
12475                         /* If any of the folded characters of this are in the
12476                          * Latin1 range, tell the regex engine that this can
12477                          * match a non-utf8 target string.  */
12478                         while (loc < e) {
12479                             if (UTF8_IS_INVARIANT(*loc)
12480                                 || UTF8_IS_DOWNGRADEABLE_START(*loc))
12481                             {
12482                                 ANYOF_FLAGS(ret)
12483                                         |= ANYOF_NONBITMAP_NON_UTF8;
12484                                 break;
12485                             }
12486                             loc += UTF8SKIP(loc);
12487                         }
12488
12489                         add_alternate(&unicode_alternate, foldbuf, foldlen);
12490                     }
12491                 }
12492                 else {
12493                     /* Single character fold of above Latin1.  Add everything
12494                      * in its fold closure to the list that this node should
12495                      * match */
12496                     SV** listp;
12497
12498                     /* The fold closures data structure is a hash with the keys
12499                      * being every character that is folded to, like 'k', and
12500                      * the values each an array of everything that folds to its
12501                      * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
12502                     if ((listp = hv_fetch(PL_utf8_foldclosures,
12503                                     (char *) foldbuf, foldlen, FALSE)))
12504                     {
12505                         AV* list = (AV*) *listp;
12506                         IV k;
12507                         for (k = 0; k <= av_len(list); k++) {
12508                             SV** c_p = av_fetch(list, k, FALSE);
12509                             UV c;
12510                             if (c_p == NULL) {
12511                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12512                             }
12513                             c = SvUV(*c_p);
12514
12515                             /* /aa doesn't allow folds between ASCII and non-;
12516                              * /l doesn't allow them between above and below
12517                              * 256 */
12518                             if ((ASCII_FOLD_RESTRICTED
12519                                       && (isASCII(c) != isASCII(j)))
12520                                 || (LOC && ((c < 256) != (j < 256))))
12521                             {
12522                                 continue;
12523                             }
12524
12525                             /* Folds involving non-ascii Latin1 characters
12526                              * under /d are added to a separate list */
12527                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12528                             {
12529                                 cp_list = add_cp_to_invlist(cp_list, c);
12530                             }
12531                             else {
12532                               depends_list = add_cp_to_invlist(depends_list, c);
12533                             }
12534                         }
12535                     }
12536                 }
12537             }
12538         }
12539         SvREFCNT_dec(fold_intersection);
12540     }
12541
12542     /* And combine the result (if any) with any inversion list from posix
12543      * classes.  The lists are kept separate up to now because we don't want to
12544      * fold the classes (folding of those is automatically handled by the swash
12545      * fetching code) */
12546     if (posixes) {
12547         if (! DEPENDS_SEMANTICS) {
12548             if (cp_list) {
12549                 _invlist_union(cp_list, posixes, &cp_list);
12550                 SvREFCNT_dec(posixes);
12551             }
12552             else {
12553                 cp_list = posixes;
12554             }
12555         }
12556         else {
12557             /* Under /d, we put into a separate list the Latin1 things that
12558              * match only when the target string is utf8 */
12559             SV* nonascii_but_latin1_properties = NULL;
12560             _invlist_intersection(posixes, PL_Latin1,
12561                                   &nonascii_but_latin1_properties);
12562             _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12563                               &nonascii_but_latin1_properties);
12564             _invlist_subtract(posixes, nonascii_but_latin1_properties,
12565                               &posixes);
12566             if (cp_list) {
12567                 _invlist_union(cp_list, posixes, &cp_list);
12568                 SvREFCNT_dec(posixes);
12569             }
12570             else {
12571                 cp_list = posixes;
12572             }
12573
12574             if (depends_list) {
12575                 _invlist_union(depends_list, nonascii_but_latin1_properties,
12576                                &depends_list);
12577                 SvREFCNT_dec(nonascii_but_latin1_properties);
12578             }
12579             else {
12580                 depends_list = nonascii_but_latin1_properties;
12581             }
12582         }
12583     }
12584
12585     /* And combine the result (if any) with any inversion list from properties.
12586      * The lists are kept separate up to now so that we can distinguish the two
12587      * in regards to matching above-Unicode.  A run-time warning is generated
12588      * if a Unicode property is matched against a non-Unicode code point. But,
12589      * we allow user-defined properties to match anything, without any warning,
12590      * and we also suppress the warning if there is a portion of the character
12591      * class that isn't a Unicode property, and which matches above Unicode, \W
12592      * or [\x{110000}] for example.
12593      * (Note that in this case, unlike the Posix one above, there is no
12594      * <depends_list>, because having a Unicode property forces Unicode
12595      * semantics */
12596     if (properties) {
12597         bool warn_super = ! has_user_defined_property;
12598         if (cp_list) {
12599
12600             /* If it matters to the final outcome, see if a non-property
12601              * component of the class matches above Unicode.  If so, the
12602              * warning gets suppressed.  This is true even if just a single
12603              * such code point is specified, as though not strictly correct if
12604              * another such code point is matched against, the fact that they
12605              * are using above-Unicode code points indicates they should know
12606              * the issues involved */
12607             if (warn_super) {
12608                 bool non_prop_matches_above_Unicode =
12609                             runtime_posix_matches_above_Unicode
12610                             | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12611                 if (invert) {
12612                     non_prop_matches_above_Unicode =
12613                                             !  non_prop_matches_above_Unicode;
12614                 }
12615                 warn_super = ! non_prop_matches_above_Unicode;
12616             }
12617
12618             _invlist_union(properties, cp_list, &cp_list);
12619             SvREFCNT_dec(properties);
12620         }
12621         else {
12622             cp_list = properties;
12623         }
12624
12625         if (warn_super) {
12626             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12627         }
12628     }
12629
12630     /* Here, we have calculated what code points should be in the character
12631      * class.
12632      *
12633      * Now we can see about various optimizations.  Fold calculation (which we
12634      * did above) needs to take place before inversion.  Otherwise /[^k]/i
12635      * would invert to include K, which under /i would match k, which it
12636      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
12637      * folded until runtime */
12638
12639     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12640      * at compile time.  Besides not inverting folded locale now, we can't invert
12641      * if there are things such as \w, which aren't known until runtime */
12642     if (invert
12643         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12644         && ! depends_list
12645         && ! unicode_alternate
12646         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12647     {
12648         _invlist_invert(cp_list);
12649
12650         /* Any swash can't be used as-is, because we've inverted things */
12651         if (swash) {
12652             SvREFCNT_dec(swash);
12653             swash = NULL;
12654         }
12655
12656         /* Clear the invert flag since have just done it here */
12657         invert = FALSE;
12658     }
12659
12660     /* If we didn't do folding, it's because some information isn't available
12661      * until runtime; set the run-time fold flag for these.  (We don't have to
12662      * worry about properties folding, as that is taken care of by the swash
12663      * fetching) */
12664     if (FOLD && (LOC || unicode_alternate))
12665     {
12666        ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12667     }
12668
12669     /* Some character classes are equivalent to other nodes.  Such nodes take
12670      * up less room and generally fewer operations to execute than ANYOF nodes.
12671      * Above, we checked for and optimized into some such equivalents for
12672      * certain common classes that are easy to test.  Getting to this point in
12673      * the code means that the class didn't get optimized there.  Since this
12674      * code is only executed in Pass 2, it is too late to save space--it has
12675      * been allocated in Pass 1, and currently isn't given back.  But turning
12676      * things into an EXACTish node can allow the optimizer to join it to any
12677      * adjacent such nodes.  And if the class is equivalent to things like /./,
12678      * expensive run-time swashes can be avoided.  Now that we have more
12679      * complete information, we can find things necessarily missed by the
12680      * earlier code.  I (khw) am not sure how much to look for here.  It would
12681      * be easy, but perhaps too slow, to check any candidates against all the
12682      * node types they could possibly match using _invlistEQ(). */
12683
12684     if (cp_list
12685         && ! unicode_alternate
12686         && ! invert
12687         && ! depends_list
12688         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12689         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12690     {
12691        UV start, end;
12692        U8 op = END;  /* The optimzation node-type */
12693         const char * cur_parse= RExC_parse;
12694
12695        invlist_iterinit(cp_list);
12696        if (! invlist_iternext(cp_list, &start, &end)) {
12697
12698             /* Here, the list is empty.  This happens, for example, when a
12699              * Unicode property is the only thing in the character class, and
12700              * it doesn't match anything.  (perluniprops.pod notes such
12701              * properties) */
12702             op = OPFAIL;
12703             *flagp |= HASWIDTH|SIMPLE;
12704         }
12705         else if (start == end) {    /* The range is a single code point */
12706             if (! invlist_iternext(cp_list, &start, &end)
12707
12708                     /* Don't do this optimization if it would require changing
12709                      * the pattern to UTF-8 */
12710                 && (start < 256 || UTF))
12711             {
12712                 /* Here, the list contains a single code point.  Can optimize
12713                  * into an EXACT node */
12714
12715                 value = start;
12716
12717                 if (! FOLD) {
12718                     op = EXACT;
12719                 }
12720                 else if (LOC) {
12721
12722                     /* A locale node under folding with one code point can be
12723                      * an EXACTFL, as its fold won't be calculated until
12724                      * runtime */
12725                     op = EXACTFL;
12726                 }
12727                 else {
12728
12729                     /* Here, we are generally folding, but there is only one
12730                      * code point to match.  If we have to, we use an EXACT
12731                      * node, but it would be better for joining with adjacent
12732                      * nodes in the optimization pass if we used the same
12733                      * EXACTFish node that any such are likely to be.  We can
12734                      * do this iff the code point doesn't participate in any
12735                      * folds.  For example, an EXACTF of a colon is the same as
12736                      * an EXACT one, since nothing folds to or from a colon.
12737                      * In the Latin1 range, being an alpha means that the
12738                      * character participates in a fold (except for the
12739                      * feminine and masculine ordinals, which I (khw) don't
12740                      * think are worrying about optimizing for). */
12741                     if (value < 256) {
12742                         if (isALPHA_L1(value)) {
12743                             op = EXACT;
12744                         }
12745                     }
12746                     else {
12747                         if (! PL_utf8_foldable) {
12748                             SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12749                                                 &PL_sv_undef, 1, 0);
12750                             PL_utf8_foldable = _get_swash_invlist(swash);
12751                             SvREFCNT_dec(swash);
12752                         }
12753                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
12754                             op = EXACT;
12755                         }
12756                     }
12757
12758                     /* If we haven't found the node type, above, it means we
12759                      * can use the prevailing one */
12760                     if (op == END) {
12761                         op = compute_EXACTish(pRExC_state);
12762                     }
12763                 }
12764             }
12765         }
12766         else if (start == 0) {
12767             if (end == UV_MAX) {
12768                 op = SANY;
12769                 *flagp |= HASWIDTH|SIMPLE;
12770                 RExC_naughty++;
12771             }
12772             else if (end == '\n' - 1
12773                     && invlist_iternext(cp_list, &start, &end)
12774                     && start == '\n' + 1 && end == UV_MAX)
12775             {
12776                 op = REG_ANY;
12777                 *flagp |= HASWIDTH|SIMPLE;
12778                 RExC_naughty++;
12779             }
12780         }
12781
12782         if (op != END) {
12783             RExC_parse = (char *)orig_parse;
12784             RExC_emit = (regnode *)orig_emit;
12785
12786             ret = reg_node(pRExC_state, op);
12787
12788             RExC_parse = (char *)cur_parse;
12789
12790             if (PL_regkind[op] == EXACT) {
12791                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12792             }
12793
12794             SvREFCNT_dec(listsv);
12795             return ret;
12796         }
12797     }
12798
12799     /* Here, <cp_list> contains all the code points we can determine at
12800      * compile time that match under all conditions.  Go through it, and
12801      * for things that belong in the bitmap, put them there, and delete from
12802      * <cp_list>.  While we are at it, see if everything above 255 is in the
12803      * list, and if so, set a flag to speed up execution */
12804     ANYOF_BITMAP_ZERO(ret);
12805     if (cp_list) {
12806
12807         /* This gets set if we actually need to modify things */
12808         bool change_invlist = FALSE;
12809
12810         UV start, end;
12811
12812         /* Start looking through <cp_list> */
12813         invlist_iterinit(cp_list);
12814         while (invlist_iternext(cp_list, &start, &end)) {
12815             UV high;
12816             int i;
12817
12818             if (end == UV_MAX && start <= 256) {
12819                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
12820             }
12821
12822             /* Quit if are above what we should change */
12823             if (start > 255) {
12824                 break;
12825             }
12826
12827             change_invlist = TRUE;
12828
12829             /* Set all the bits in the range, up to the max that we are doing */
12830             high = (end < 255) ? end : 255;
12831             for (i = start; i <= (int) high; i++) {
12832                 if (! ANYOF_BITMAP_TEST(ret, i)) {
12833                     ANYOF_BITMAP_SET(ret, i);
12834                     prevvalue = value;
12835                     value = i;
12836                 }
12837             }
12838         }
12839
12840         /* Done with loop; remove any code points that are in the bitmap from
12841          * <cp_list> */
12842         if (change_invlist) {
12843             _invlist_subtract(cp_list, PL_Latin1, &cp_list);
12844         }
12845
12846         /* If have completely emptied it, remove it completely */
12847         if (invlist_len(cp_list) == 0) {
12848             SvREFCNT_dec(cp_list);
12849             cp_list = NULL;
12850         }
12851     }
12852
12853     if (invert) {
12854         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
12855     }
12856
12857     /* Here, the bitmap has been populated with all the Latin1 code points that
12858      * always match.  Can now add to the overall list those that match only
12859      * when the target string is UTF-8 (<depends_list>). */
12860     if (depends_list) {
12861         if (cp_list) {
12862             _invlist_union(cp_list, depends_list, &cp_list);
12863             SvREFCNT_dec(depends_list);
12864         }
12865         else {
12866             cp_list = depends_list;
12867         }
12868     }
12869
12870     /* If there is a swash and more than one element, we can't use the swash in
12871      * the optimization below. */
12872     if (swash && element_count > 1) {
12873         SvREFCNT_dec(swash);
12874         swash = NULL;
12875     }
12876
12877     if (! cp_list
12878         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12879         && ! unicode_alternate)
12880     {
12881         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12882         SvREFCNT_dec(listsv);
12883         SvREFCNT_dec(unicode_alternate);
12884     }
12885     else {
12886         /* av[0] stores the character class description in its textual form:
12887          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
12888          *       appropriate swash, and is also useful for dumping the regnode.
12889          * av[1] if NULL, is a placeholder to later contain the swash computed
12890          *       from av[0].  But if no further computation need be done, the
12891          *       swash is stored there now.
12892          * av[2] stores the multicharacter foldings, used later in
12893          *       regexec.c:S_reginclass().
12894          * av[3] stores the cp_list inversion list for use in addition or
12895          *       instead of av[0]; used only if av[1] is NULL
12896          * av[4] is set if any component of the class is from a user-defined
12897          *       property; used only if av[1] is NULL */
12898         AV * const av = newAV();
12899         SV *rv;
12900
12901         av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12902                         ? listsv
12903                         : &PL_sv_undef);
12904         if (swash) {
12905             av_store(av, 1, swash);
12906             SvREFCNT_dec(cp_list);
12907         }
12908         else {
12909             av_store(av, 1, NULL);
12910             if (cp_list) {
12911                 av_store(av, 3, cp_list);
12912                 av_store(av, 4, newSVuv(has_user_defined_property));
12913             }
12914         }
12915
12916         /* Store any computed multi-char folds only if we are allowing
12917          * them */
12918         if (allow_full_fold) {
12919             av_store(av, 2, MUTABLE_SV(unicode_alternate));
12920             if (unicode_alternate) { /* This node is variable length */
12921                 OP(ret) = ANYOFV;
12922             }
12923         }
12924         else {
12925             av_store(av, 2, NULL);
12926         }
12927         rv = newRV_noinc(MUTABLE_SV(av));
12928         n = add_data(pRExC_state, 1, "s");
12929         RExC_rxi->data->data[n] = (void*)rv;
12930         ARG_SET(ret, n);
12931     }
12932
12933     *flagp |= HASWIDTH|SIMPLE;
12934     return ret;
12935 }
12936 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12937
12938
12939 /* reg_skipcomment()
12940
12941    Absorbs an /x style # comments from the input stream.
12942    Returns true if there is more text remaining in the stream.
12943    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12944    terminates the pattern without including a newline.
12945
12946    Note its the callers responsibility to ensure that we are
12947    actually in /x mode
12948
12949 */
12950
12951 STATIC bool
12952 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12953 {
12954     bool ended = 0;
12955
12956     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12957
12958     while (RExC_parse < RExC_end)
12959         if (*RExC_parse++ == '\n') {
12960             ended = 1;
12961             break;
12962         }
12963     if (!ended) {
12964         /* we ran off the end of the pattern without ending
12965            the comment, so we have to add an \n when wrapping */
12966         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12967         return 0;
12968     } else
12969         return 1;
12970 }
12971
12972 /* nextchar()
12973
12974    Advances the parse position, and optionally absorbs
12975    "whitespace" from the inputstream.
12976
12977    Without /x "whitespace" means (?#...) style comments only,
12978    with /x this means (?#...) and # comments and whitespace proper.
12979
12980    Returns the RExC_parse point from BEFORE the scan occurs.
12981
12982    This is the /x friendly way of saying RExC_parse++.
12983 */
12984
12985 STATIC char*
12986 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12987 {
12988     char* const retval = RExC_parse++;
12989
12990     PERL_ARGS_ASSERT_NEXTCHAR;
12991
12992     for (;;) {
12993         if (RExC_end - RExC_parse >= 3
12994             && *RExC_parse == '('
12995             && RExC_parse[1] == '?'
12996             && RExC_parse[2] == '#')
12997         {
12998             while (*RExC_parse != ')') {
12999                 if (RExC_parse == RExC_end)
13000                     FAIL("Sequence (?#... not terminated");
13001                 RExC_parse++;
13002             }
13003             RExC_parse++;
13004             continue;
13005         }
13006         if (RExC_flags & RXf_PMf_EXTENDED) {
13007             if (isSPACE(*RExC_parse)) {
13008                 RExC_parse++;
13009                 continue;
13010             }
13011             else if (*RExC_parse == '#') {
13012                 if ( reg_skipcomment( pRExC_state ) )
13013                     continue;
13014             }
13015         }
13016         return retval;
13017     }
13018 }
13019
13020 /*
13021 - reg_node - emit a node
13022 */
13023 STATIC regnode *                        /* Location. */
13024 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13025 {
13026     dVAR;
13027     regnode *ptr;
13028     regnode * const ret = RExC_emit;
13029     GET_RE_DEBUG_FLAGS_DECL;
13030
13031     PERL_ARGS_ASSERT_REG_NODE;
13032
13033     if (SIZE_ONLY) {
13034         SIZE_ALIGN(RExC_size);
13035         RExC_size += 1;
13036         return(ret);
13037     }
13038     if (RExC_emit >= RExC_emit_bound)
13039         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13040                    op, RExC_emit, RExC_emit_bound);
13041
13042     NODE_ALIGN_FILL(ret);
13043     ptr = ret;
13044     FILL_ADVANCE_NODE(ptr, op);
13045     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
13046 #ifdef RE_TRACK_PATTERN_OFFSETS
13047     if (RExC_offsets) {         /* MJD */
13048         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13049               "reg_node", __LINE__,
13050               PL_reg_name[op],
13051               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13052                 ? "Overwriting end of array!\n" : "OK",
13053               (UV)(RExC_emit - RExC_emit_start),
13054               (UV)(RExC_parse - RExC_start),
13055               (UV)RExC_offsets[0]));
13056         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13057     }
13058 #endif
13059     RExC_emit = ptr;
13060     return(ret);
13061 }
13062
13063 /*
13064 - reganode - emit a node with an argument
13065 */
13066 STATIC regnode *                        /* Location. */
13067 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13068 {
13069     dVAR;
13070     regnode *ptr;
13071     regnode * const ret = RExC_emit;
13072     GET_RE_DEBUG_FLAGS_DECL;
13073
13074     PERL_ARGS_ASSERT_REGANODE;
13075
13076     if (SIZE_ONLY) {
13077         SIZE_ALIGN(RExC_size);
13078         RExC_size += 2;
13079         /*
13080            We can't do this:
13081
13082            assert(2==regarglen[op]+1);
13083
13084            Anything larger than this has to allocate the extra amount.
13085            If we changed this to be:
13086
13087            RExC_size += (1 + regarglen[op]);
13088
13089            then it wouldn't matter. Its not clear what side effect
13090            might come from that so its not done so far.
13091            -- dmq
13092         */
13093         return(ret);
13094     }
13095     if (RExC_emit >= RExC_emit_bound)
13096         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13097                    op, RExC_emit, RExC_emit_bound);
13098
13099     NODE_ALIGN_FILL(ret);
13100     ptr = ret;
13101     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13102     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
13103 #ifdef RE_TRACK_PATTERN_OFFSETS
13104     if (RExC_offsets) {         /* MJD */
13105         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13106               "reganode",
13107               __LINE__,
13108               PL_reg_name[op],
13109               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13110               "Overwriting end of array!\n" : "OK",
13111               (UV)(RExC_emit - RExC_emit_start),
13112               (UV)(RExC_parse - RExC_start),
13113               (UV)RExC_offsets[0]));
13114         Set_Cur_Node_Offset;
13115     }
13116 #endif
13117     RExC_emit = ptr;
13118     return(ret);
13119 }
13120
13121 /*
13122 - reguni - emit (if appropriate) a Unicode character
13123 */
13124 STATIC STRLEN
13125 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13126 {
13127     dVAR;
13128
13129     PERL_ARGS_ASSERT_REGUNI;
13130
13131     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13132 }
13133
13134 /*
13135 - reginsert - insert an operator in front of already-emitted operand
13136 *
13137 * Means relocating the operand.
13138 */
13139 STATIC void
13140 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13141 {
13142     dVAR;
13143     regnode *src;
13144     regnode *dst;
13145     regnode *place;
13146     const int offset = regarglen[(U8)op];
13147     const int size = NODE_STEP_REGNODE + offset;
13148     GET_RE_DEBUG_FLAGS_DECL;
13149
13150     PERL_ARGS_ASSERT_REGINSERT;
13151     PERL_UNUSED_ARG(depth);
13152 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13153     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13154     if (SIZE_ONLY) {
13155         RExC_size += size;
13156         return;
13157     }
13158
13159     src = RExC_emit;
13160     RExC_emit += size;
13161     dst = RExC_emit;
13162     if (RExC_open_parens) {
13163         int paren;
13164         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13165         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13166             if ( RExC_open_parens[paren] >= opnd ) {
13167                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13168                 RExC_open_parens[paren] += size;
13169             } else {
13170                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13171             }
13172             if ( RExC_close_parens[paren] >= opnd ) {
13173                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13174                 RExC_close_parens[paren] += size;
13175             } else {
13176                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13177             }
13178         }
13179     }
13180
13181     while (src > opnd) {
13182         StructCopy(--src, --dst, regnode);
13183 #ifdef RE_TRACK_PATTERN_OFFSETS
13184         if (RExC_offsets) {     /* MJD 20010112 */
13185             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13186                   "reg_insert",
13187                   __LINE__,
13188                   PL_reg_name[op],
13189                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13190                     ? "Overwriting end of array!\n" : "OK",
13191                   (UV)(src - RExC_emit_start),
13192                   (UV)(dst - RExC_emit_start),
13193                   (UV)RExC_offsets[0]));
13194             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13195             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13196         }
13197 #endif
13198     }
13199
13200
13201     place = opnd;               /* Op node, where operand used to be. */
13202 #ifdef RE_TRACK_PATTERN_OFFSETS
13203     if (RExC_offsets) {         /* MJD */
13204         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13205               "reginsert",
13206               __LINE__,
13207               PL_reg_name[op],
13208               (UV)(place - RExC_emit_start) > RExC_offsets[0]
13209               ? "Overwriting end of array!\n" : "OK",
13210               (UV)(place - RExC_emit_start),
13211               (UV)(RExC_parse - RExC_start),
13212               (UV)RExC_offsets[0]));
13213         Set_Node_Offset(place, RExC_parse);
13214         Set_Node_Length(place, 1);
13215     }
13216 #endif
13217     src = NEXTOPER(place);
13218     FILL_ADVANCE_NODE(place, op);
13219     REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
13220     Zero(src, offset, regnode);
13221 }
13222
13223 /*
13224 - regtail - set the next-pointer at the end of a node chain of p to val.
13225 - SEE ALSO: regtail_study
13226 */
13227 /* TODO: All three parms should be const */
13228 STATIC void
13229 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13230 {
13231     dVAR;
13232     regnode *scan;
13233     GET_RE_DEBUG_FLAGS_DECL;
13234
13235     PERL_ARGS_ASSERT_REGTAIL;
13236 #ifndef DEBUGGING
13237     PERL_UNUSED_ARG(depth);
13238 #endif
13239
13240     if (SIZE_ONLY)
13241         return;
13242
13243     /* Find last node. */
13244     scan = p;
13245     for (;;) {
13246         regnode * const temp = regnext(scan);
13247         DEBUG_PARSE_r({
13248             SV * const mysv=sv_newmortal();
13249             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13250             regprop(RExC_rx, mysv, scan);
13251             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13252                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13253                     (temp == NULL ? "->" : ""),
13254                     (temp == NULL ? PL_reg_name[OP(val)] : "")
13255             );
13256         });
13257         if (temp == NULL)
13258             break;
13259         scan = temp;
13260     }
13261
13262     if (reg_off_by_arg[OP(scan)]) {
13263         ARG_SET(scan, val - scan);
13264     }
13265     else {
13266         NEXT_OFF(scan) = val - scan;
13267     }
13268 }
13269
13270 #ifdef DEBUGGING
13271 /*
13272 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13273 - Look for optimizable sequences at the same time.
13274 - currently only looks for EXACT chains.
13275
13276 This is experimental code. The idea is to use this routine to perform
13277 in place optimizations on branches and groups as they are constructed,
13278 with the long term intention of removing optimization from study_chunk so
13279 that it is purely analytical.
13280
13281 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13282 to control which is which.
13283
13284 */
13285 /* TODO: All four parms should be const */
13286
13287 STATIC U8
13288 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13289 {
13290     dVAR;
13291     regnode *scan;
13292     U8 exact = PSEUDO;
13293 #ifdef EXPERIMENTAL_INPLACESCAN
13294     I32 min = 0;
13295 #endif
13296     GET_RE_DEBUG_FLAGS_DECL;
13297
13298     PERL_ARGS_ASSERT_REGTAIL_STUDY;
13299
13300
13301     if (SIZE_ONLY)
13302         return exact;
13303
13304     /* Find last node. */
13305
13306     scan = p;
13307     for (;;) {
13308         regnode * const temp = regnext(scan);
13309 #ifdef EXPERIMENTAL_INPLACESCAN
13310         if (PL_regkind[OP(scan)] == EXACT) {
13311             bool has_exactf_sharp_s;    /* Unexamined in this routine */
13312             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13313                 return EXACT;
13314         }
13315 #endif
13316         if ( exact ) {
13317             switch (OP(scan)) {
13318                 case EXACT:
13319                 case EXACTF:
13320                 case EXACTFA:
13321                 case EXACTFU:
13322                 case EXACTFU_SS:
13323                 case EXACTFU_TRICKYFOLD:
13324                 case EXACTFL:
13325                         if( exact == PSEUDO )
13326                             exact= OP(scan);
13327                         else if ( exact != OP(scan) )
13328                             exact= 0;
13329                 case NOTHING:
13330                     break;
13331                 default:
13332                     exact= 0;
13333             }
13334         }
13335         DEBUG_PARSE_r({
13336             SV * const mysv=sv_newmortal();
13337             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13338             regprop(RExC_rx, mysv, scan);
13339             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13340                 SvPV_nolen_const(mysv),
13341                 REG_NODE_NUM(scan),
13342                 PL_reg_name[exact]);
13343         });
13344         if (temp == NULL)
13345             break;
13346         scan = temp;
13347     }
13348     DEBUG_PARSE_r({
13349         SV * const mysv_val=sv_newmortal();
13350         DEBUG_PARSE_MSG("");
13351         regprop(RExC_rx, mysv_val, val);
13352         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13353                       SvPV_nolen_const(mysv_val),
13354                       (IV)REG_NODE_NUM(val),
13355                       (IV)(val - scan)
13356         );
13357     });
13358     if (reg_off_by_arg[OP(scan)]) {
13359         ARG_SET(scan, val - scan);
13360     }
13361     else {
13362         NEXT_OFF(scan) = val - scan;
13363     }
13364
13365     return exact;
13366 }
13367 #endif
13368
13369 /*
13370  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13371  */
13372 #ifdef DEBUGGING
13373 static void
13374 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13375 {
13376     int bit;
13377     int set=0;
13378     regex_charset cs;
13379
13380     for (bit=0; bit<32; bit++) {
13381         if (flags & (1<<bit)) {
13382             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
13383                 continue;
13384             }
13385             if (!set++ && lead)
13386                 PerlIO_printf(Perl_debug_log, "%s",lead);
13387             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13388         }
13389     }
13390     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13391             if (!set++ && lead) {
13392                 PerlIO_printf(Perl_debug_log, "%s",lead);
13393             }
13394             switch (cs) {
13395                 case REGEX_UNICODE_CHARSET:
13396                     PerlIO_printf(Perl_debug_log, "UNICODE");
13397                     break;
13398                 case REGEX_LOCALE_CHARSET:
13399                     PerlIO_printf(Perl_debug_log, "LOCALE");
13400                     break;
13401                 case REGEX_ASCII_RESTRICTED_CHARSET:
13402                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13403                     break;
13404                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13405                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13406                     break;
13407                 default:
13408                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13409                     break;
13410             }
13411     }
13412     if (lead)  {
13413         if (set)
13414             PerlIO_printf(Perl_debug_log, "\n");
13415         else
13416             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13417     }
13418 }
13419 #endif
13420
13421 void
13422 Perl_regdump(pTHX_ const regexp *r)
13423 {
13424 #ifdef DEBUGGING
13425     dVAR;
13426     SV * const sv = sv_newmortal();
13427     SV *dsv= sv_newmortal();
13428     RXi_GET_DECL(r,ri);
13429     GET_RE_DEBUG_FLAGS_DECL;
13430
13431     PERL_ARGS_ASSERT_REGDUMP;
13432
13433     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13434
13435     /* Header fields of interest. */
13436     if (r->anchored_substr) {
13437         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13438             RE_SV_DUMPLEN(r->anchored_substr), 30);
13439         PerlIO_printf(Perl_debug_log,
13440                       "anchored %s%s at %"IVdf" ",
13441                       s, RE_SV_TAIL(r->anchored_substr),
13442                       (IV)r->anchored_offset);
13443     } else if (r->anchored_utf8) {
13444         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13445             RE_SV_DUMPLEN(r->anchored_utf8), 30);
13446         PerlIO_printf(Perl_debug_log,
13447                       "anchored utf8 %s%s at %"IVdf" ",
13448                       s, RE_SV_TAIL(r->anchored_utf8),
13449                       (IV)r->anchored_offset);
13450     }
13451     if (r->float_substr) {
13452         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13453             RE_SV_DUMPLEN(r->float_substr), 30);
13454         PerlIO_printf(Perl_debug_log,
13455                       "floating %s%s at %"IVdf"..%"UVuf" ",
13456                       s, RE_SV_TAIL(r->float_substr),
13457                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13458     } else if (r->float_utf8) {
13459         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13460             RE_SV_DUMPLEN(r->float_utf8), 30);
13461         PerlIO_printf(Perl_debug_log,
13462                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13463                       s, RE_SV_TAIL(r->float_utf8),
13464                       (IV)r->float_min_offset, (UV)r->float_max_offset);
13465     }
13466     if (r->check_substr || r->check_utf8)
13467         PerlIO_printf(Perl_debug_log,
13468                       (const char *)
13469                       (r->check_substr == r->float_substr
13470                        && r->check_utf8 == r->float_utf8
13471                        ? "(checking floating" : "(checking anchored"));
13472     if (r->extflags & RXf_NOSCAN)
13473         PerlIO_printf(Perl_debug_log, " noscan");
13474     if (r->extflags & RXf_CHECK_ALL)
13475         PerlIO_printf(Perl_debug_log, " isall");
13476     if (r->check_substr || r->check_utf8)
13477         PerlIO_printf(Perl_debug_log, ") ");
13478
13479     if (ri->regstclass) {
13480         regprop(r, sv, ri->regstclass);
13481         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13482     }
13483     if (r->extflags & RXf_ANCH) {
13484         PerlIO_printf(Perl_debug_log, "anchored");
13485         if (r->extflags & RXf_ANCH_BOL)
13486             PerlIO_printf(Perl_debug_log, "(BOL)");
13487         if (r->extflags & RXf_ANCH_MBOL)
13488             PerlIO_printf(Perl_debug_log, "(MBOL)");
13489         if (r->extflags & RXf_ANCH_SBOL)
13490             PerlIO_printf(Perl_debug_log, "(SBOL)");
13491         if (r->extflags & RXf_ANCH_GPOS)
13492             PerlIO_printf(Perl_debug_log, "(GPOS)");
13493         PerlIO_putc(Perl_debug_log, ' ');
13494     }
13495     if (r->extflags & RXf_GPOS_SEEN)
13496         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13497     if (r->intflags & PREGf_SKIP)
13498         PerlIO_printf(Perl_debug_log, "plus ");
13499     if (r->intflags & PREGf_IMPLICIT)
13500         PerlIO_printf(Perl_debug_log, "implicit ");
13501     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13502     if (r->extflags & RXf_EVAL_SEEN)
13503         PerlIO_printf(Perl_debug_log, "with eval ");
13504     PerlIO_printf(Perl_debug_log, "\n");
13505     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13506 #else
13507     PERL_ARGS_ASSERT_REGDUMP;
13508     PERL_UNUSED_CONTEXT;
13509     PERL_UNUSED_ARG(r);
13510 #endif  /* DEBUGGING */
13511 }
13512
13513 /*
13514 - regprop - printable representation of opcode
13515 */
13516 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13517 STMT_START { \
13518         if (do_sep) {                           \
13519             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13520             if (flags & ANYOF_INVERT)           \
13521                 /*make sure the invert info is in each */ \
13522                 sv_catpvs(sv, "^");             \
13523             do_sep = 0;                         \
13524         }                                       \
13525 } STMT_END
13526
13527 void
13528 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13529 {
13530 #ifdef DEBUGGING
13531     dVAR;
13532     int k;
13533
13534     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13535     static const char * const anyofs[] = {
13536         "\\w",
13537         "\\W",
13538         "\\s",
13539         "\\S",
13540         "\\d",
13541         "\\D",
13542         "[:alnum:]",
13543         "[:^alnum:]",
13544         "[:alpha:]",
13545         "[:^alpha:]",
13546         "[:ascii:]",
13547         "[:^ascii:]",
13548         "[:cntrl:]",
13549         "[:^cntrl:]",
13550         "[:graph:]",
13551         "[:^graph:]",
13552         "[:lower:]",
13553         "[:^lower:]",
13554         "[:print:]",
13555         "[:^print:]",
13556         "[:punct:]",
13557         "[:^punct:]",
13558         "[:upper:]",
13559         "[:^upper:]",
13560         "[:xdigit:]",
13561         "[:^xdigit:]",
13562         "[:space:]",
13563         "[:^space:]",
13564         "[:blank:]",
13565         "[:^blank:]"
13566     };
13567     RXi_GET_DECL(prog,progi);
13568     GET_RE_DEBUG_FLAGS_DECL;
13569
13570     PERL_ARGS_ASSERT_REGPROP;
13571
13572     sv_setpvs(sv, "");
13573
13574     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
13575         /* It would be nice to FAIL() here, but this may be called from
13576            regexec.c, and it would be hard to supply pRExC_state. */
13577         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13578     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13579
13580     k = PL_regkind[OP(o)];
13581
13582     if (k == EXACT) {
13583         sv_catpvs(sv, " ");
13584         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13585          * is a crude hack but it may be the best for now since
13586          * we have no flag "this EXACTish node was UTF-8"
13587          * --jhi */
13588         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13589                   PERL_PV_ESCAPE_UNI_DETECT |
13590                   PERL_PV_ESCAPE_NONASCII   |
13591                   PERL_PV_PRETTY_ELLIPSES   |
13592                   PERL_PV_PRETTY_LTGT       |
13593                   PERL_PV_PRETTY_NOCLEAR
13594                   );
13595     } else if (k == TRIE) {
13596         /* print the details of the trie in dumpuntil instead, as
13597          * progi->data isn't available here */
13598         const char op = OP(o);
13599         const U32 n = ARG(o);
13600         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13601                (reg_ac_data *)progi->data->data[n] :
13602                NULL;
13603         const reg_trie_data * const trie
13604             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13605
13606         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13607         DEBUG_TRIE_COMPILE_r(
13608             Perl_sv_catpvf(aTHX_ sv,
13609                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13610                 (UV)trie->startstate,
13611                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13612                 (UV)trie->wordcount,
13613                 (UV)trie->minlen,
13614                 (UV)trie->maxlen,
13615                 (UV)TRIE_CHARCOUNT(trie),
13616                 (UV)trie->uniquecharcount
13617             )
13618         );
13619         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13620             int i;
13621             int rangestart = -1;
13622             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13623             sv_catpvs(sv, "[");
13624             for (i = 0; i <= 256; i++) {
13625                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13626                     if (rangestart == -1)
13627                         rangestart = i;
13628                 } else if (rangestart != -1) {
13629                     if (i <= rangestart + 3)
13630                         for (; rangestart < i; rangestart++)
13631                             put_byte(sv, rangestart);
13632                     else {
13633                         put_byte(sv, rangestart);
13634                         sv_catpvs(sv, "-");
13635                         put_byte(sv, i - 1);
13636                     }
13637                     rangestart = -1;
13638                 }
13639             }
13640             sv_catpvs(sv, "]");
13641         }
13642
13643     } else if (k == CURLY) {
13644         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13645             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13646         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13647     }
13648     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
13649         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13650     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13651         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
13652         if ( RXp_PAREN_NAMES(prog) ) {
13653             if ( k != REF || (OP(o) < NREF)) {
13654                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13655                 SV **name= av_fetch(list, ARG(o), 0 );
13656                 if (name)
13657                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13658             }
13659             else {
13660                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13661                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13662                 I32 *nums=(I32*)SvPVX(sv_dat);
13663                 SV **name= av_fetch(list, nums[0], 0 );
13664                 I32 n;
13665                 if (name) {
13666                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
13667                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13668                                     (n ? "," : ""), (IV)nums[n]);
13669                     }
13670                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13671                 }
13672             }
13673         }
13674     } else if (k == GOSUB)
13675         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13676     else if (k == VERB) {
13677         if (!o->flags)
13678             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13679                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13680     } else if (k == LOGICAL)
13681         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
13682     else if (k == ANYOF) {
13683         int i, rangestart = -1;
13684         const U8 flags = ANYOF_FLAGS(o);
13685         int do_sep = 0;
13686
13687
13688         if (flags & ANYOF_LOCALE)
13689             sv_catpvs(sv, "{loc}");
13690         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13691             sv_catpvs(sv, "{i}");
13692         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13693         if (flags & ANYOF_INVERT)
13694             sv_catpvs(sv, "^");
13695
13696         /* output what the standard cp 0-255 bitmap matches */
13697         for (i = 0; i <= 256; i++) {
13698             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13699                 if (rangestart == -1)
13700                     rangestart = i;
13701             } else if (rangestart != -1) {
13702                 if (i <= rangestart + 3)
13703                     for (; rangestart < i; rangestart++)
13704                         put_byte(sv, rangestart);
13705                 else {
13706                     put_byte(sv, rangestart);
13707                     sv_catpvs(sv, "-");
13708                     put_byte(sv, i - 1);
13709                 }
13710                 do_sep = 1;
13711                 rangestart = -1;
13712             }
13713         }
13714
13715         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13716         /* output any special charclass tests (used entirely under use locale) */
13717         if (ANYOF_CLASS_TEST_ANY_SET(o))
13718             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13719                 if (ANYOF_CLASS_TEST(o,i)) {
13720                     sv_catpv(sv, anyofs[i]);
13721                     do_sep = 1;
13722                 }
13723
13724         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13725
13726         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13727             sv_catpvs(sv, "{non-utf8-latin1-all}");
13728         }
13729
13730         /* output information about the unicode matching */
13731         if (flags & ANYOF_UNICODE_ALL)
13732             sv_catpvs(sv, "{unicode_all}");
13733         else if (ANYOF_NONBITMAP(o))
13734             sv_catpvs(sv, "{unicode}");
13735         if (flags & ANYOF_NONBITMAP_NON_UTF8)
13736             sv_catpvs(sv, "{outside bitmap}");
13737
13738         if (ANYOF_NONBITMAP(o)) {
13739             SV *lv; /* Set if there is something outside the bit map */
13740             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13741             bool byte_output = FALSE;   /* If something in the bitmap has been
13742                                            output */
13743
13744             if (lv && lv != &PL_sv_undef) {
13745                 if (sw) {
13746                     U8 s[UTF8_MAXBYTES_CASE+1];
13747
13748                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13749                         uvchr_to_utf8(s, i);
13750
13751                         if (i < 256
13752                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
13753                                                                things already
13754                                                                output as part
13755                                                                of the bitmap */
13756                             && swash_fetch(sw, s, TRUE))
13757                         {
13758                             if (rangestart == -1)
13759                                 rangestart = i;
13760                         } else if (rangestart != -1) {
13761                             byte_output = TRUE;
13762                             if (i <= rangestart + 3)
13763                                 for (; rangestart < i; rangestart++) {
13764                                     put_byte(sv, rangestart);
13765                                 }
13766                             else {
13767                                 put_byte(sv, rangestart);
13768                                 sv_catpvs(sv, "-");
13769                                 put_byte(sv, i-1);
13770                             }
13771                             rangestart = -1;
13772                         }
13773                     }
13774                 }
13775
13776                 {
13777                     char *s = savesvpv(lv);
13778                     char * const origs = s;
13779
13780                     while (*s && *s != '\n')
13781                         s++;
13782
13783                     if (*s == '\n') {
13784                         const char * const t = ++s;
13785
13786                         if (byte_output) {
13787                             sv_catpvs(sv, " ");
13788                         }
13789
13790                         while (*s) {
13791                             if (*s == '\n') {
13792
13793                                 /* Truncate very long output */
13794                                 if (s - origs > 256) {
13795                                     Perl_sv_catpvf(aTHX_ sv,
13796                                                    "%.*s...",
13797                                                    (int) (s - origs - 1),
13798                                                    t);
13799                                     goto out_dump;
13800                                 }
13801                                 *s = ' ';
13802                             }
13803                             else if (*s == '\t') {
13804                                 *s = '-';
13805                             }
13806                             s++;
13807                         }
13808                         if (s[-1] == ' ')
13809                             s[-1] = 0;
13810
13811                         sv_catpv(sv, t);
13812                     }
13813
13814                 out_dump:
13815
13816                     Safefree(origs);
13817                 }
13818                 SvREFCNT_dec(lv);
13819             }
13820         }
13821
13822         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13823     }
13824     else if (k == POSIXD) {
13825         U8 index = FLAGS(o) * 2;
13826         if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
13827             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
13828         }
13829         else {
13830             sv_catpv(sv, anyofs[index]);
13831         }
13832     }
13833     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13834         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13835 #else
13836     PERL_UNUSED_CONTEXT;
13837     PERL_UNUSED_ARG(sv);
13838     PERL_UNUSED_ARG(o);
13839     PERL_UNUSED_ARG(prog);
13840 #endif  /* DEBUGGING */
13841 }
13842
13843 SV *
13844 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13845 {                               /* Assume that RE_INTUIT is set */
13846     dVAR;
13847     struct regexp *const prog = (struct regexp *)SvANY(r);
13848     GET_RE_DEBUG_FLAGS_DECL;
13849
13850     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13851     PERL_UNUSED_CONTEXT;
13852
13853     DEBUG_COMPILE_r(
13854         {
13855             const char * const s = SvPV_nolen_const(prog->check_substr
13856                       ? prog->check_substr : prog->check_utf8);
13857
13858             if (!PL_colorset) reginitcolors();
13859             PerlIO_printf(Perl_debug_log,
13860                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13861                       PL_colors[4],
13862                       prog->check_substr ? "" : "utf8 ",
13863                       PL_colors[5],PL_colors[0],
13864                       s,
13865                       PL_colors[1],
13866                       (strlen(s) > 60 ? "..." : ""));
13867         } );
13868
13869     return prog->check_substr ? prog->check_substr : prog->check_utf8;
13870 }
13871
13872 /*
13873    pregfree()
13874
13875    handles refcounting and freeing the perl core regexp structure. When
13876    it is necessary to actually free the structure the first thing it
13877    does is call the 'free' method of the regexp_engine associated to
13878    the regexp, allowing the handling of the void *pprivate; member
13879    first. (This routine is not overridable by extensions, which is why
13880    the extensions free is called first.)
13881
13882    See regdupe and regdupe_internal if you change anything here.
13883 */
13884 #ifndef PERL_IN_XSUB_RE
13885 void
13886 Perl_pregfree(pTHX_ REGEXP *r)
13887 {
13888     SvREFCNT_dec(r);
13889 }
13890
13891 void
13892 Perl_pregfree2(pTHX_ REGEXP *rx)
13893 {
13894     dVAR;
13895     struct regexp *const r = (struct regexp *)SvANY(rx);
13896     GET_RE_DEBUG_FLAGS_DECL;
13897
13898     PERL_ARGS_ASSERT_PREGFREE2;
13899
13900     if (r->mother_re) {
13901         ReREFCNT_dec(r->mother_re);
13902     } else {
13903         CALLREGFREE_PVT(rx); /* free the private data */
13904         SvREFCNT_dec(RXp_PAREN_NAMES(r));
13905     }
13906     if (r->substrs) {
13907         SvREFCNT_dec(r->anchored_substr);
13908         SvREFCNT_dec(r->anchored_utf8);
13909         SvREFCNT_dec(r->float_substr);
13910         SvREFCNT_dec(r->float_utf8);
13911         Safefree(r->substrs);
13912     }
13913     RX_MATCH_COPY_FREE(rx);
13914 #ifdef PERL_OLD_COPY_ON_WRITE
13915     SvREFCNT_dec(r->saved_copy);
13916 #endif
13917     Safefree(r->offs);
13918     SvREFCNT_dec(r->qr_anoncv);
13919 }
13920
13921 /*  reg_temp_copy()
13922
13923     This is a hacky workaround to the structural issue of match results
13924     being stored in the regexp structure which is in turn stored in
13925     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13926     could be PL_curpm in multiple contexts, and could require multiple
13927     result sets being associated with the pattern simultaneously, such
13928     as when doing a recursive match with (??{$qr})
13929
13930     The solution is to make a lightweight copy of the regexp structure
13931     when a qr// is returned from the code executed by (??{$qr}) this
13932     lightweight copy doesn't actually own any of its data except for
13933     the starp/end and the actual regexp structure itself.
13934
13935 */
13936
13937
13938 REGEXP *
13939 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13940 {
13941     struct regexp *ret;
13942     struct regexp *const r = (struct regexp *)SvANY(rx);
13943
13944     PERL_ARGS_ASSERT_REG_TEMP_COPY;
13945
13946     if (!ret_x)
13947         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13948     ret = (struct regexp *)SvANY(ret_x);
13949
13950     (void)ReREFCNT_inc(rx);
13951     /* We can take advantage of the existing "copied buffer" mechanism in SVs
13952        by pointing directly at the buffer, but flagging that the allocated
13953        space in the copy is zero. As we've just done a struct copy, it's now
13954        a case of zero-ing that, rather than copying the current length.  */
13955     SvPV_set(ret_x, RX_WRAPPED(rx));
13956     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
13957     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13958            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13959     SvLEN_set(ret_x, 0);
13960     SvSTASH_set(ret_x, NULL);
13961     SvMAGIC_set(ret_x, NULL);
13962     if (r->offs) {
13963         const I32 npar = r->nparens+1;
13964         Newx(ret->offs, npar, regexp_paren_pair);
13965         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13966     }
13967     if (r->substrs) {
13968         Newx(ret->substrs, 1, struct reg_substr_data);
13969         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13970
13971         SvREFCNT_inc_void(ret->anchored_substr);
13972         SvREFCNT_inc_void(ret->anchored_utf8);
13973         SvREFCNT_inc_void(ret->float_substr);
13974         SvREFCNT_inc_void(ret->float_utf8);
13975
13976         /* check_substr and check_utf8, if non-NULL, point to either their
13977            anchored or float namesakes, and don't hold a second reference.  */
13978     }
13979     RX_MATCH_COPIED_off(ret_x);
13980 #ifdef PERL_OLD_COPY_ON_WRITE
13981     ret->saved_copy = NULL;
13982 #endif
13983     ret->mother_re = rx;
13984     SvREFCNT_inc_void(ret->qr_anoncv);
13985
13986     return ret_x;
13987 }
13988 #endif
13989
13990 /* regfree_internal()
13991
13992    Free the private data in a regexp. This is overloadable by
13993    extensions. Perl takes care of the regexp structure in pregfree(),
13994    this covers the *pprivate pointer which technically perl doesn't
13995    know about, however of course we have to handle the
13996    regexp_internal structure when no extension is in use.
13997
13998    Note this is called before freeing anything in the regexp
13999    structure.
14000  */
14001
14002 void
14003 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14004 {
14005     dVAR;
14006     struct regexp *const r = (struct regexp *)SvANY(rx);
14007     RXi_GET_DECL(r,ri);
14008     GET_RE_DEBUG_FLAGS_DECL;
14009
14010     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14011
14012     DEBUG_COMPILE_r({
14013         if (!PL_colorset)
14014             reginitcolors();
14015         {
14016             SV *dsv= sv_newmortal();
14017             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14018                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14019             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14020                 PL_colors[4],PL_colors[5],s);
14021         }
14022     });
14023 #ifdef RE_TRACK_PATTERN_OFFSETS
14024     if (ri->u.offsets)
14025         Safefree(ri->u.offsets);             /* 20010421 MJD */
14026 #endif
14027     if (ri->code_blocks) {
14028         int n;
14029         for (n = 0; n < ri->num_code_blocks; n++)
14030             SvREFCNT_dec(ri->code_blocks[n].src_regex);
14031         Safefree(ri->code_blocks);
14032     }
14033
14034     if (ri->data) {
14035         int n = ri->data->count;
14036
14037         while (--n >= 0) {
14038           /* If you add a ->what type here, update the comment in regcomp.h */
14039             switch (ri->data->what[n]) {
14040             case 'a':
14041             case 'r':
14042             case 's':
14043             case 'S':
14044             case 'u':
14045                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14046                 break;
14047             case 'f':
14048                 Safefree(ri->data->data[n]);
14049                 break;
14050             case 'l':
14051             case 'L':
14052                 break;
14053             case 'T':
14054                 { /* Aho Corasick add-on structure for a trie node.
14055                      Used in stclass optimization only */
14056                     U32 refcount;
14057                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14058                     OP_REFCNT_LOCK;
14059                     refcount = --aho->refcount;
14060                     OP_REFCNT_UNLOCK;
14061                     if ( !refcount ) {
14062                         PerlMemShared_free(aho->states);
14063                         PerlMemShared_free(aho->fail);
14064                          /* do this last!!!! */
14065                         PerlMemShared_free(ri->data->data[n]);
14066                         PerlMemShared_free(ri->regstclass);
14067                     }
14068                 }
14069                 break;
14070             case 't':
14071                 {
14072                     /* trie structure. */
14073                     U32 refcount;
14074                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14075                     OP_REFCNT_LOCK;
14076                     refcount = --trie->refcount;
14077                     OP_REFCNT_UNLOCK;
14078                     if ( !refcount ) {
14079                         PerlMemShared_free(trie->charmap);
14080                         PerlMemShared_free(trie->states);
14081                         PerlMemShared_free(trie->trans);
14082                         if (trie->bitmap)
14083                             PerlMemShared_free(trie->bitmap);
14084                         if (trie->jump)
14085                             PerlMemShared_free(trie->jump);
14086                         PerlMemShared_free(trie->wordinfo);
14087                         /* do this last!!!! */
14088                         PerlMemShared_free(ri->data->data[n]);
14089                     }
14090                 }
14091                 break;
14092             default:
14093                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14094             }
14095         }
14096         Safefree(ri->data->what);
14097         Safefree(ri->data);
14098     }
14099
14100     Safefree(ri);
14101 }
14102
14103 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14104 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14105 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
14106
14107 /*
14108    re_dup - duplicate a regexp.
14109
14110    This routine is expected to clone a given regexp structure. It is only
14111    compiled under USE_ITHREADS.
14112
14113    After all of the core data stored in struct regexp is duplicated
14114    the regexp_engine.dupe method is used to copy any private data
14115    stored in the *pprivate pointer. This allows extensions to handle
14116    any duplication it needs to do.
14117
14118    See pregfree() and regfree_internal() if you change anything here.
14119 */
14120 #if defined(USE_ITHREADS)
14121 #ifndef PERL_IN_XSUB_RE
14122 void
14123 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14124 {
14125     dVAR;
14126     I32 npar;
14127     const struct regexp *r = (const struct regexp *)SvANY(sstr);
14128     struct regexp *ret = (struct regexp *)SvANY(dstr);
14129
14130     PERL_ARGS_ASSERT_RE_DUP_GUTS;
14131
14132     npar = r->nparens+1;
14133     Newx(ret->offs, npar, regexp_paren_pair);
14134     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14135     if(ret->swap) {
14136         /* no need to copy these */
14137         Newx(ret->swap, npar, regexp_paren_pair);
14138     }
14139
14140     if (ret->substrs) {
14141         /* Do it this way to avoid reading from *r after the StructCopy().
14142            That way, if any of the sv_dup_inc()s dislodge *r from the L1
14143            cache, it doesn't matter.  */
14144         const bool anchored = r->check_substr
14145             ? r->check_substr == r->anchored_substr
14146             : r->check_utf8 == r->anchored_utf8;
14147         Newx(ret->substrs, 1, struct reg_substr_data);
14148         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14149
14150         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14151         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14152         ret->float_substr = sv_dup_inc(ret->float_substr, param);
14153         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14154
14155         /* check_substr and check_utf8, if non-NULL, point to either their
14156            anchored or float namesakes, and don't hold a second reference.  */
14157
14158         if (ret->check_substr) {
14159             if (anchored) {
14160                 assert(r->check_utf8 == r->anchored_utf8);
14161                 ret->check_substr = ret->anchored_substr;
14162                 ret->check_utf8 = ret->anchored_utf8;
14163             } else {
14164                 assert(r->check_substr == r->float_substr);
14165                 assert(r->check_utf8 == r->float_utf8);
14166                 ret->check_substr = ret->float_substr;
14167                 ret->check_utf8 = ret->float_utf8;
14168             }
14169         } else if (ret->check_utf8) {
14170             if (anchored) {
14171                 ret->check_utf8 = ret->anchored_utf8;
14172             } else {
14173                 ret->check_utf8 = ret->float_utf8;
14174             }
14175         }
14176     }
14177
14178     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14179     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14180
14181     if (ret->pprivate)
14182         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14183
14184     if (RX_MATCH_COPIED(dstr))
14185         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
14186     else
14187         ret->subbeg = NULL;
14188 #ifdef PERL_OLD_COPY_ON_WRITE
14189     ret->saved_copy = NULL;
14190 #endif
14191
14192     if (ret->mother_re) {
14193         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
14194             /* Our storage points directly to our mother regexp, but that's
14195                1: a buffer in a different thread
14196                2: something we no longer hold a reference on
14197                so we need to copy it locally.  */
14198             /* Note we need to use SvCUR(), rather than
14199                SvLEN(), on our mother_re, because it, in
14200                turn, may well be pointing to its own mother_re.  */
14201             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
14202                                    SvCUR(ret->mother_re)+1));
14203             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
14204         }
14205         ret->mother_re      = NULL;
14206     }
14207     ret->gofs = 0;
14208 }
14209 #endif /* PERL_IN_XSUB_RE */
14210
14211 /*
14212    regdupe_internal()
14213
14214    This is the internal complement to regdupe() which is used to copy
14215    the structure pointed to by the *pprivate pointer in the regexp.
14216    This is the core version of the extension overridable cloning hook.
14217    The regexp structure being duplicated will be copied by perl prior
14218    to this and will be provided as the regexp *r argument, however
14219    with the /old/ structures pprivate pointer value. Thus this routine
14220    may override any copying normally done by perl.
14221
14222    It returns a pointer to the new regexp_internal structure.
14223 */
14224
14225 void *
14226 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14227 {
14228     dVAR;
14229     struct regexp *const r = (struct regexp *)SvANY(rx);
14230     regexp_internal *reti;
14231     int len;
14232     RXi_GET_DECL(r,ri);
14233
14234     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14235
14236     len = ProgLen(ri);
14237
14238     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14239     Copy(ri->program, reti->program, len+1, regnode);
14240
14241     reti->num_code_blocks = ri->num_code_blocks;
14242     if (ri->code_blocks) {
14243         int n;
14244         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14245                 struct reg_code_block);
14246         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14247                 struct reg_code_block);
14248         for (n = 0; n < ri->num_code_blocks; n++)
14249              reti->code_blocks[n].src_regex = (REGEXP*)
14250                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14251     }
14252     else
14253         reti->code_blocks = NULL;
14254
14255     reti->regstclass = NULL;
14256
14257     if (ri->data) {
14258         struct reg_data *d;
14259         const int count = ri->data->count;
14260         int i;
14261
14262         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14263                 char, struct reg_data);
14264         Newx(d->what, count, U8);
14265
14266         d->count = count;
14267         for (i = 0; i < count; i++) {
14268             d->what[i] = ri->data->what[i];
14269             switch (d->what[i]) {
14270                 /* see also regcomp.h and regfree_internal() */
14271             case 'a': /* actually an AV, but the dup function is identical.  */
14272             case 'r':
14273             case 's':
14274             case 'S':
14275             case 'u': /* actually an HV, but the dup function is identical.  */
14276                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14277                 break;
14278             case 'f':
14279                 /* This is cheating. */
14280                 Newx(d->data[i], 1, struct regnode_charclass_class);
14281                 StructCopy(ri->data->data[i], d->data[i],
14282                             struct regnode_charclass_class);
14283                 reti->regstclass = (regnode*)d->data[i];
14284                 break;
14285             case 'T':
14286                 /* Trie stclasses are readonly and can thus be shared
14287                  * without duplication. We free the stclass in pregfree
14288                  * when the corresponding reg_ac_data struct is freed.
14289                  */
14290                 reti->regstclass= ri->regstclass;
14291                 /* Fall through */
14292             case 't':
14293                 OP_REFCNT_LOCK;
14294                 ((reg_trie_data*)ri->data->data[i])->refcount++;
14295                 OP_REFCNT_UNLOCK;
14296                 /* Fall through */
14297             case 'l':
14298             case 'L':
14299                 d->data[i] = ri->data->data[i];
14300                 break;
14301             default:
14302                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14303             }
14304         }
14305
14306         reti->data = d;
14307     }
14308     else
14309         reti->data = NULL;
14310
14311     reti->name_list_idx = ri->name_list_idx;
14312
14313 #ifdef RE_TRACK_PATTERN_OFFSETS
14314     if (ri->u.offsets) {
14315         Newx(reti->u.offsets, 2*len+1, U32);
14316         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14317     }
14318 #else
14319     SetProgLen(reti,len);
14320 #endif
14321
14322     return (void*)reti;
14323 }
14324
14325 #endif    /* USE_ITHREADS */
14326
14327 #ifndef PERL_IN_XSUB_RE
14328
14329 /*
14330  - regnext - dig the "next" pointer out of a node
14331  */
14332 regnode *
14333 Perl_regnext(pTHX_ register regnode *p)
14334 {
14335     dVAR;
14336     I32 offset;
14337
14338     if (!p)
14339         return(NULL);
14340
14341     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
14342         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14343     }
14344
14345     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14346     if (offset == 0)
14347         return(NULL);
14348
14349     return(p+offset);
14350 }
14351 #endif
14352
14353 STATIC void
14354 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14355 {
14356     va_list args;
14357     STRLEN l1 = strlen(pat1);
14358     STRLEN l2 = strlen(pat2);
14359     char buf[512];
14360     SV *msv;
14361     const char *message;
14362
14363     PERL_ARGS_ASSERT_RE_CROAK2;
14364
14365     if (l1 > 510)
14366         l1 = 510;
14367     if (l1 + l2 > 510)
14368         l2 = 510 - l1;
14369     Copy(pat1, buf, l1 , char);
14370     Copy(pat2, buf + l1, l2 , char);
14371     buf[l1 + l2] = '\n';
14372     buf[l1 + l2 + 1] = '\0';
14373 #ifdef I_STDARG
14374     /* ANSI variant takes additional second argument */
14375     va_start(args, pat2);
14376 #else
14377     va_start(args);
14378 #endif
14379     msv = vmess(buf, &args);
14380     va_end(args);
14381     message = SvPV_const(msv,l1);
14382     if (l1 > 512)
14383         l1 = 512;
14384     Copy(message, buf, l1 , char);
14385     buf[l1-1] = '\0';                   /* Overwrite \n */
14386     Perl_croak(aTHX_ "%s", buf);
14387 }
14388
14389 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
14390
14391 #ifndef PERL_IN_XSUB_RE
14392 void
14393 Perl_save_re_context(pTHX)
14394 {
14395     dVAR;
14396
14397     struct re_save_state *state;
14398
14399     SAVEVPTR(PL_curcop);
14400     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14401
14402     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14403     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14404     SSPUSHUV(SAVEt_RE_STATE);
14405
14406     Copy(&PL_reg_state, state, 1, struct re_save_state);
14407
14408     PL_reg_oldsaved = NULL;
14409     PL_reg_oldsavedlen = 0;
14410     PL_reg_maxiter = 0;
14411     PL_reg_leftiter = 0;
14412     PL_reg_poscache = NULL;
14413     PL_reg_poscache_size = 0;
14414 #ifdef PERL_OLD_COPY_ON_WRITE
14415     PL_nrs = NULL;
14416 #endif
14417
14418     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14419     if (PL_curpm) {
14420         const REGEXP * const rx = PM_GETRE(PL_curpm);
14421         if (rx) {
14422             U32 i;
14423             for (i = 1; i <= RX_NPARENS(rx); i++) {
14424                 char digits[TYPE_CHARS(long)];
14425                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14426                 GV *const *const gvp
14427                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14428
14429                 if (gvp) {
14430                     GV * const gv = *gvp;
14431                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14432                         save_scalar(gv);
14433                 }
14434             }
14435         }
14436     }
14437 }
14438 #endif
14439
14440 static void
14441 clear_re(pTHX_ void *r)
14442 {
14443     dVAR;
14444     ReREFCNT_dec((REGEXP *)r);
14445 }
14446
14447 #ifdef DEBUGGING
14448
14449 STATIC void
14450 S_put_byte(pTHX_ SV *sv, int c)
14451 {
14452     PERL_ARGS_ASSERT_PUT_BYTE;
14453
14454     /* Our definition of isPRINT() ignores locales, so only bytes that are
14455        not part of UTF-8 are considered printable. I assume that the same
14456        holds for UTF-EBCDIC.
14457        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14458        which Wikipedia says:
14459
14460        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14461        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14462        identical, to the ASCII delete (DEL) or rubout control character.
14463        ) So the old condition can be simplified to !isPRINT(c)  */
14464     if (!isPRINT(c)) {
14465         if (c < 256) {
14466             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14467         }
14468         else {
14469             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14470         }
14471     }
14472     else {
14473         const char string = c;
14474         if (c == '-' || c == ']' || c == '\\' || c == '^')
14475             sv_catpvs(sv, "\\");
14476         sv_catpvn(sv, &string, 1);
14477     }
14478 }
14479
14480
14481 #define CLEAR_OPTSTART \
14482     if (optstart) STMT_START { \
14483             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14484             optstart=NULL; \
14485     } STMT_END
14486
14487 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14488
14489 STATIC const regnode *
14490 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14491             const regnode *last, const regnode *plast,
14492             SV* sv, I32 indent, U32 depth)
14493 {
14494     dVAR;
14495     U8 op = PSEUDO;     /* Arbitrary non-END op. */
14496     const regnode *next;
14497     const regnode *optstart= NULL;
14498
14499     RXi_GET_DECL(r,ri);
14500     GET_RE_DEBUG_FLAGS_DECL;
14501
14502     PERL_ARGS_ASSERT_DUMPUNTIL;
14503
14504 #ifdef DEBUG_DUMPUNTIL
14505     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14506         last ? last-start : 0,plast ? plast-start : 0);
14507 #endif
14508
14509     if (plast && plast < last)
14510         last= plast;
14511
14512     while (PL_regkind[op] != END && (!last || node < last)) {
14513         /* While that wasn't END last time... */
14514         NODE_ALIGN(node);
14515         op = OP(node);
14516         if (op == CLOSE || op == WHILEM)
14517             indent--;
14518         next = regnext((regnode *)node);
14519
14520         /* Where, what. */
14521         if (OP(node) == OPTIMIZED) {
14522             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14523                 optstart = node;
14524             else
14525                 goto after_print;
14526         } else
14527             CLEAR_OPTSTART;
14528
14529         regprop(r, sv, node);
14530         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14531                       (int)(2*indent + 1), "", SvPVX_const(sv));
14532
14533         if (OP(node) != OPTIMIZED) {
14534             if (next == NULL)           /* Next ptr. */
14535                 PerlIO_printf(Perl_debug_log, " (0)");
14536             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14537                 PerlIO_printf(Perl_debug_log, " (FAIL)");
14538             else
14539                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14540             (void)PerlIO_putc(Perl_debug_log, '\n');
14541         }
14542
14543       after_print:
14544         if (PL_regkind[(U8)op] == BRANCHJ) {
14545             assert(next);
14546             {
14547                 const regnode *nnode = (OP(next) == LONGJMP
14548                                        ? regnext((regnode *)next)
14549                                        : next);
14550                 if (last && nnode > last)
14551                     nnode = last;
14552                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14553             }
14554         }
14555         else if (PL_regkind[(U8)op] == BRANCH) {
14556             assert(next);
14557             DUMPUNTIL(NEXTOPER(node), next);
14558         }
14559         else if ( PL_regkind[(U8)op]  == TRIE ) {
14560             const regnode *this_trie = node;
14561             const char op = OP(node);
14562             const U32 n = ARG(node);
14563             const reg_ac_data * const ac = op>=AHOCORASICK ?
14564                (reg_ac_data *)ri->data->data[n] :
14565                NULL;
14566             const reg_trie_data * const trie =
14567                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14568 #ifdef DEBUGGING
14569             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14570 #endif
14571             const regnode *nextbranch= NULL;
14572             I32 word_idx;
14573             sv_setpvs(sv, "");
14574             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14575                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14576
14577                 PerlIO_printf(Perl_debug_log, "%*s%s ",
14578                    (int)(2*(indent+3)), "",
14579                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14580                             PL_colors[0], PL_colors[1],
14581                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14582                             PERL_PV_PRETTY_ELLIPSES    |
14583                             PERL_PV_PRETTY_LTGT
14584                             )
14585                             : "???"
14586                 );
14587                 if (trie->jump) {
14588                     U16 dist= trie->jump[word_idx+1];
14589                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14590                                   (UV)((dist ? this_trie + dist : next) - start));
14591                     if (dist) {
14592                         if (!nextbranch)
14593                             nextbranch= this_trie + trie->jump[0];
14594                         DUMPUNTIL(this_trie + dist, nextbranch);
14595                     }
14596                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14597                         nextbranch= regnext((regnode *)nextbranch);
14598                 } else {
14599                     PerlIO_printf(Perl_debug_log, "\n");
14600                 }
14601             }
14602             if (last && next > last)
14603                 node= last;
14604             else
14605                 node= next;
14606         }
14607         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
14608             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14609                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14610         }
14611         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14612             assert(next);
14613             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14614         }
14615         else if ( op == PLUS || op == STAR) {
14616             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14617         }
14618         else if (PL_regkind[(U8)op] == ANYOF) {
14619             /* arglen 1 + class block */
14620             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14621                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14622             node = NEXTOPER(node);
14623         }
14624         else if (PL_regkind[(U8)op] == EXACT) {
14625             /* Literal string, where present. */
14626             node += NODE_SZ_STR(node) - 1;
14627             node = NEXTOPER(node);
14628         }
14629         else {
14630             node = NEXTOPER(node);
14631             node += regarglen[(U8)op];
14632         }
14633         if (op == CURLYX || op == OPEN)
14634             indent++;
14635     }
14636     CLEAR_OPTSTART;
14637 #ifdef DEBUG_DUMPUNTIL
14638     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14639 #endif
14640     return node;
14641 }
14642
14643 #endif  /* DEBUGGING */
14644
14645 /*
14646  * Local variables:
14647  * c-indentation-style: bsd
14648  * c-basic-offset: 4
14649  * indent-tabs-mode: nil
14650  * End:
14651  *
14652  * ex: set ts=8 sts=4 sw=4 et:
14653  */