src/5020002/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  98
  99 #ifndef STATIC
 100 #define STATIC  static
 101 #endif
 102
 103
 104 struct RExC_state_t {
 105     U32         flags;                  /* RXf_* are we folding, multilining? */
 106     U32         pm_flags;               /* PMf_* stuff from the calling PMOP */
 107     char        *precomp;               /* uncompiled string. */
 108     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 109     regexp      *rx;                    /* perl core regexp structure */
 110     regexp_internal     *rxi;           /* internal data for regexp object
 111                                            pprivate field */
 112     char        *start;                 /* Start of input for compile */
 113     char        *end;                   /* End of input for compile */
 114     char        *parse;                 /* Input-scan pointer. */
 115     SSize_t     whilem_seen;            /* number of WHILEM in this expr */
 116     regnode     *emit_start;            /* Start of emitted-code area */
 117     regnode     *emit_bound;            /* First regnode outside of the
 118                                            allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; if = &emit_dummy,
 120                                            implies compiling, so don't emit */
 121     regnode_ssc emit_dummy;             /* placeholder for emit to point to;
 122                                            large enough for the largest
 123                                            non-EXACTish node, so can use it as
 124                                            scratch in pass1 */
 125     I32         naughty;                /* How bad is this pattern? */
 126     I32         sawback;                /* Did we see \1, ...? */
 127     U32         seen;
 128     SSize_t     size;                   /* Code size. */
 129     I32                npar;            /* Capture buffer count, (OPEN) plus
 130                                            one. ("par" 0 is the whole
 131                                            pattern)*/
 132     I32         nestroot;               /* root parens we are in - used by
 133                                            accept */
 134     I32         extralen;
 135     I32         seen_zerolen;
 136     regnode     **open_parens;          /* pointers to open parens */
 137     regnode     **close_parens;         /* pointers to close parens */
 138     regnode     *opend;                 /* END node in program */
 139     I32         utf8;           /* whether the pattern is utf8 or not */
 140     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 141                                 /* XXX use this for future optimisation of case
 142                                  * where pattern must be upgraded to utf8. */
 143     I32         uni_semantics;  /* If a d charset modifier should use unicode
 144                                    rules, even if the pattern is not in
 145                                    utf8 */
 146     HV          *paren_names;           /* Paren names */
 147
 148     regnode     **recurse;              /* Recurse regops */
 149     I32         recurse_count;          /* Number of recurse regops */
 150     U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 151                                            through */
 152     U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 153     I32         in_lookbehind;
 154     I32         contains_locale;
 155     I32         contains_i;
 156     I32         override_recoding;
 157     I32         in_multi_char_class;
 158     struct reg_code_block *code_blocks; /* positions of literal (?{})
 159                                             within pattern */
 160     int         num_code_blocks;        /* size of code_blocks[] */
 161     int         code_index;             /* next code_blocks[] slot */
 162     SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 163 #ifdef ADD_TO_REGEXEC
 164     char        *starttry;              /* -Dr: where regtry was called. */
 165 #define RExC_starttry   (pRExC_state->starttry)
 166 #endif
 167     SV          *runtime_code_qr;       /* qr with the runtime code blocks */
 168 #ifdef DEBUGGING
 169     const char  *lastparse;
 170     I32         lastnum;
 171     AV          *paren_name_list;       /* idx -> name */
 172 #define RExC_lastparse  (pRExC_state->lastparse)
 173 #define RExC_lastnum    (pRExC_state->lastnum)
 174 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 175 #endif
 176 };
 177
 178 #define RExC_flags      (pRExC_state->flags)
 179 #define RExC_pm_flags   (pRExC_state->pm_flags)
 180 #define RExC_precomp    (pRExC_state->precomp)
 181 #define RExC_rx_sv      (pRExC_state->rx_sv)
 182 #define RExC_rx         (pRExC_state->rx)
 183 #define RExC_rxi        (pRExC_state->rxi)
 184 #define RExC_start      (pRExC_state->start)
 185 #define RExC_end        (pRExC_state->end)
 186 #define RExC_parse      (pRExC_state->parse)
 187 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 188 #ifdef RE_TRACK_PATTERN_OFFSETS
 189 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the
 190                                                          others */
 191 #endif
 192 #define RExC_emit       (pRExC_state->emit)
 193 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 194 #define RExC_emit_start (pRExC_state->emit_start)
 195 #define RExC_emit_bound (pRExC_state->emit_bound)
 196 #define RExC_naughty    (pRExC_state->naughty)
 197 #define RExC_sawback    (pRExC_state->sawback)
 198 #define RExC_seen       (pRExC_state->seen)
 199 #define RExC_size       (pRExC_state->size)
 200 #define RExC_maxlen        (pRExC_state->maxlen)
 201 #define RExC_npar       (pRExC_state->npar)
 202 #define RExC_nestroot   (pRExC_state->nestroot)
 203 #define RExC_extralen   (pRExC_state->extralen)
 204 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 205 #define RExC_utf8       (pRExC_state->utf8)
 206 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 207 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 208 #define RExC_open_parens        (pRExC_state->open_parens)
 209 #define RExC_close_parens       (pRExC_state->close_parens)
 210 #define RExC_opend      (pRExC_state->opend)
 211 #define RExC_paren_names        (pRExC_state->paren_names)
 212 #define RExC_recurse    (pRExC_state->recurse)
 213 #define RExC_recurse_count      (pRExC_state->recurse_count)
 214 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 215 #define RExC_study_chunk_recursed_bytes  \
 216                                    (pRExC_state->study_chunk_recursed_bytes)
 217 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 218 #define RExC_contains_locale    (pRExC_state->contains_locale)
 219 #define RExC_contains_i (pRExC_state->contains_i)
 220 #define RExC_override_recoding (pRExC_state->override_recoding)
 221 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 222
 223
 224 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 225 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 226         ((*s) == '{' && regcurly(s, FALSE)))
 227
 228 /*
 229  * Flags to be passed up and down.
 230  */
 231 #define WORST           0       /* Worst case. */
 232 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 233
 234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 235  * character.  (There needs to be a case: in the switch statement in regexec.c
 236  * for any node marked SIMPLE.)  Note that this is not the same thing as
 237  * REGNODE_SIMPLE */
 238 #define SIMPLE          0x02
 239 #define SPSTART         0x04    /* Starts with * or + */
 240 #define POSTPONED       0x08    /* (?1),(?&name), (??{...}) or similar */
 241 #define TRYAGAIN        0x10    /* Weeded out a declaration. */
 242 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 243
 244 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 245
 246 /* whether trie related optimizations are enabled */
 247 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 248 #define TRIE_STUDY_OPT
 249 #define FULL_TRIE_STUDY
 250 #define TRIE_STCLASS
 251 #endif
 252
 253
 254
 255 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 256 #define PBITVAL(paren) (1 << ((paren) & 7))
 257 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 258 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 259 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 260
 261 #define REQUIRE_UTF8    STMT_START {                                       \
 262                                      if (!UTF) {                           \
 263                                          *flagp = RESTART_UTF8;            \
 264                                          return NULL;                      \
 265                                      }                                     \
 266                         } STMT_END
 267
 268 /* This converts the named class defined in regcomp.h to its equivalent class
 269  * number defined in handy.h. */
 270 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 271 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 272
 273 #define _invlist_union_complement_2nd(a, b, output) \
 274                         _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 275 #define _invlist_intersection_complement_2nd(a, b, output) \
 276                  _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 277
 278 /* About scan_data_t.
 279
 280   During optimisation we recurse through the regexp program performing
 281   various inplace (keyhole style) optimisations. In addition study_chunk
 282   and scan_commit populate this data structure with information about
 283   what strings MUST appear in the pattern. We look for the longest
 284   string that must appear at a fixed location, and we look for the
 285   longest string that may appear at a floating location. So for instance
 286   in the pattern:
 287
 288     /FOO[xX]A.*B[xX]BAR/
 289
 290   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 291   strings (because they follow a .* construct). study_chunk will identify
 292   both FOO and BAR as being the longest fixed and floating strings respectively.
 293
 294   The strings can be composites, for instance
 295
 296      /(f)(o)(o)/
 297
 298   will result in a composite fixed substring 'foo'.
 299
 300   For each string some basic information is maintained:
 301
 302   - offset or min_offset
 303     This is the position the string must appear at, or not before.
 304     It also implicitly (when combined with minlenp) tells us how many
 305     characters must match before the string we are searching for.
 306     Likewise when combined with minlenp and the length of the string it
 307     tells us how many characters must appear after the string we have
 308     found.
 309
 310   - max_offset
 311     Only used for floating strings. This is the rightmost point that
 312     the string can appear at. If set to SSize_t_MAX it indicates that the
 313     string can occur infinitely far to the right.
 314
 315   - minlenp
 316     A pointer to the minimum number of characters of the pattern that the
 317     string was found inside. This is important as in the case of positive
 318     lookahead or positive lookbehind we can have multiple patterns
 319     involved. Consider
 320
 321     /(?=FOO).*F/
 322
 323     The minimum length of the pattern overall is 3, the minimum length
 324     of the lookahead part is 3, but the minimum length of the part that
 325     will actually match is 1. So 'FOO's minimum length is 3, but the
 326     minimum length for the F is 1. This is important as the minimum length
 327     is used to determine offsets in front of and behind the string being
 328     looked for.  Since strings can be composites this is the length of the
 329     pattern at the time it was committed with a scan_commit. Note that
 330     the length is calculated by study_chunk, so that the minimum lengths
 331     are not known until the full pattern has been compiled, thus the
 332     pointer to the value.
 333
 334   - lookbehind
 335
 336     In the case of lookbehind the string being searched for can be
 337     offset past the start point of the final matching string.
 338     If this value was just blithely removed from the min_offset it would
 339     invalidate some of the calculations for how many chars must match
 340     before or after (as they are derived from min_offset and minlen and
 341     the length of the string being searched for).
 342     When the final pattern is compiled and the data is moved from the
 343     scan_data_t structure into the regexp structure the information
 344     about lookbehind is factored in, with the information that would
 345     have been lost precalculated in the end_shift field for the
 346     associated string.
 347
 348   The fields pos_min and pos_delta are used to store the minimum offset
 349   and the delta to the maximum offset at the current point in the pattern.
 350
 351 */
 352
 353 typedef struct scan_data_t {
 354     /*I32 len_min;      unused */
 355     /*I32 len_delta;    unused */
 356     SSize_t pos_min;
 357     SSize_t pos_delta;
 358     SV *last_found;
 359     SSize_t last_end;       /* min value, <0 unless valid. */
 360     SSize_t last_start_min;
 361     SSize_t last_start_max;
 362     SV **longest;           /* Either &l_fixed, or &l_float. */
 363     SV *longest_fixed;      /* longest fixed string found in pattern */
 364     SSize_t offset_fixed;   /* offset where it starts */
 365     SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 366     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 367     SV *longest_float;      /* longest floating string found in pattern */
 368     SSize_t offset_float_min; /* earliest point in string it can appear */
 369     SSize_t offset_float_max; /* latest point in string it can appear */
 370     SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 371     SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 372     I32 flags;
 373     I32 whilem_c;
 374     SSize_t *last_closep;
 375     regnode_ssc *start_class;
 376 } scan_data_t;
 377
 378 /* The below is perhaps overboard, but this allows us to save a test at the
 379  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 380  * and 'a' differ by a single bit; the same with the upper and lower case of
 381  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 382  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 383  * then inverts it to form a mask, with just a single 0, in the bit position
 384  * where the upper- and lowercase differ.  XXX There are about 40 other
 385  * instances in the Perl core where this micro-optimization could be used.
 386  * Should decide if maintenance cost is worse, before changing those
 387  *
 388  * Returns a boolean as to whether or not 'v' is either a lowercase or
 389  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 390  * compile-time constant, the generated code is better than some optimizing
 391  * compilers figure out, amounting to a mask and test.  The results are
 392  * meaningless if 'c' is not one of [A-Za-z] */
 393 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 394                               (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 395
 396 /*
 397  * Forward declarations for pregcomp()'s friends.
 398  */
 399
 400 static const scan_data_t zero_scan_data =
 401   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 402
 403 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 404 #define SF_BEFORE_SEOL          0x0001
 405 #define SF_BEFORE_MEOL          0x0002
 406 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 407 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 408
 409 #define SF_FIX_SHIFT_EOL        (+2)
 410 #define SF_FL_SHIFT_EOL         (+4)
 411
 412 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 413 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 414
 415 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 416 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 417 #define SF_IS_INF               0x0040
 418 #define SF_HAS_PAR              0x0080
 419 #define SF_IN_PAR               0x0100
 420 #define SF_HAS_EVAL             0x0200
 421 #define SCF_DO_SUBSTR           0x0400
 422 #define SCF_DO_STCLASS_AND      0x0800
 423 #define SCF_DO_STCLASS_OR       0x1000
 424 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 425 #define SCF_WHILEM_VISITED_POS  0x2000
 426
 427 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 428 #define SCF_SEEN_ACCEPT         0x8000
 429 #define SCF_TRIE_DOING_RESTUDY 0x10000
 430
 431 #define UTF cBOOL(RExC_utf8)
 432
 433 /* The enums for all these are ordered so things work out correctly */
 434 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 435 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 436                                                      == REGEX_DEPENDS_CHARSET)
 437 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 438 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 439                                                      >= REGEX_UNICODE_CHARSET)
 440 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 441                                             == REGEX_ASCII_RESTRICTED_CHARSET)
 442 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 443                                             >= REGEX_ASCII_RESTRICTED_CHARSET)
 444 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 445                                         == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 446
 447 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 448
 449 /* For programs that want to be strictly Unicode compatible by dying if any
 450  * attempt is made to match a non-Unicode code point against a Unicode
 451  * property.  */
 452 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 453
 454 #define OOB_NAMEDCLASS          -1
 455
 456 /* There is no code point that is out-of-bounds, so this is problematic.  But
 457  * its only current use is to initialize a variable that is always set before
 458  * looked at. */
 459 #define OOB_UNICODE             0xDEADBEEF
 460
 461 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 462 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 463
 464
 465 /* length of regex to show in messages that don't mark a position within */
 466 #define RegexLengthToShowInErrorMessages 127
 467
 468 /*
 469  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 470  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 471  * op/pragma/warn/regcomp.
 472  */
 473 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 474 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 475
 476 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 477                         " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 478
 479 #define REPORT_LOCATION_ARGS(offset)            \
 480                 UTF8fARG(UTF, offset, RExC_precomp), \
 481                 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 482
 483 /*
 484  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 485  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 486  * "...".
 487  */
 488 #define _FAIL(code) STMT_START {                                        \
 489     const char *ellipses = "";                                          \
 490     IV len = RExC_end - RExC_precomp;                                   \
 491                                                                         \
 492     if (!SIZE_ONLY)                                                     \
 493         SAVEFREESV(RExC_rx_sv);                                         \
 494     if (len > RegexLengthToShowInErrorMessages) {                       \
 495         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 496         len = RegexLengthToShowInErrorMessages - 10;                    \
 497         ellipses = "...";                                               \
 498     }                                                                   \
 499     code;                                                               \
 500 } STMT_END
 501
 502 #define FAIL(msg) _FAIL(                            \
 503     Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",           \
 504             msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 505
 506 #define FAIL2(msg,arg) _FAIL(                       \
 507     Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",         \
 508             arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 509
 510 /*
 511  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 512  */
 513 #define Simple_vFAIL(m) STMT_START {                                    \
 514     const IV offset = RExC_parse - RExC_precomp;                        \
 515     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 516             m, REPORT_LOCATION_ARGS(offset));   \
 517 } STMT_END
 518
 519 /*
 520  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 521  */
 522 #define vFAIL(m) STMT_START {                           \
 523     if (!SIZE_ONLY)                                     \
 524         SAVEFREESV(RExC_rx_sv);                         \
 525     Simple_vFAIL(m);                                    \
 526 } STMT_END
 527
 528 /*
 529  * Like Simple_vFAIL(), but accepts two arguments.
 530  */
 531 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 532     const IV offset = RExC_parse - RExC_precomp;                        \
 533     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,                      \
 534                       REPORT_LOCATION_ARGS(offset));    \
 535 } STMT_END
 536
 537 /*
 538  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 539  */
 540 #define vFAIL2(m,a1) STMT_START {                       \
 541     if (!SIZE_ONLY)                                     \
 542         SAVEFREESV(RExC_rx_sv);                         \
 543     Simple_vFAIL2(m, a1);                               \
 544 } STMT_END
 545
 546
 547 /*
 548  * Like Simple_vFAIL(), but accepts three arguments.
 549  */
 550 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 551     const IV offset = RExC_parse - RExC_precomp;                \
 552     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,          \
 553             REPORT_LOCATION_ARGS(offset));      \
 554 } STMT_END
 555
 556 /*
 557  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 558  */
 559 #define vFAIL3(m,a1,a2) STMT_START {                    \
 560     if (!SIZE_ONLY)                                     \
 561         SAVEFREESV(RExC_rx_sv);                         \
 562     Simple_vFAIL3(m, a1, a2);                           \
 563 } STMT_END
 564
 565 /*
 566  * Like Simple_vFAIL(), but accepts four arguments.
 567  */
 568 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 569     const IV offset = RExC_parse - RExC_precomp;                \
 570     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,              \
 571             REPORT_LOCATION_ARGS(offset));      \
 572 } STMT_END
 573
 574 #define vFAIL4(m,a1,a2,a3) STMT_START {                 \
 575     if (!SIZE_ONLY)                                     \
 576         SAVEFREESV(RExC_rx_sv);                         \
 577     Simple_vFAIL4(m, a1, a2, a3);                       \
 578 } STMT_END
 579
 580 /* A specialized version of vFAIL2 that works with UTF8f */
 581 #define vFAIL2utf8f(m, a1) STMT_START { \
 582     const IV offset = RExC_parse - RExC_precomp;   \
 583     if (!SIZE_ONLY)                                \
 584         SAVEFREESV(RExC_rx_sv);                    \
 585     S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 586             REPORT_LOCATION_ARGS(offset));         \
 587 } STMT_END
 588
 589
 590 /* m is not necessarily a "literal string", in this macro */
 591 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 592     const IV offset = loc - RExC_precomp;                               \
 593     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 594             m, REPORT_LOCATION_ARGS(offset));       \
 595 } STMT_END
 596
 597 #define ckWARNreg(loc,m) STMT_START {                                   \
 598     const IV offset = loc - RExC_precomp;                               \
 599     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 600             REPORT_LOCATION_ARGS(offset));              \
 601 } STMT_END
 602
 603 #define vWARN_dep(loc, m) STMT_START {                                  \
 604     const IV offset = loc - RExC_precomp;                               \
 605     Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION,     \
 606             REPORT_LOCATION_ARGS(offset));              \
 607 } STMT_END
 608
 609 #define ckWARNdep(loc,m) STMT_START {                                   \
 610     const IV offset = loc - RExC_precomp;                               \
 611     Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                   \
 612             m REPORT_LOCATION,                                          \
 613             REPORT_LOCATION_ARGS(offset));              \
 614 } STMT_END
 615
 616 #define ckWARNregdep(loc,m) STMT_START {                                \
 617     const IV offset = loc - RExC_precomp;                               \
 618     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 619             m REPORT_LOCATION,                                          \
 620             REPORT_LOCATION_ARGS(offset));              \
 621 } STMT_END
 622
 623 #define ckWARN2reg_d(loc,m, a1) STMT_START {                            \
 624     const IV offset = loc - RExC_precomp;                               \
 625     Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),                       \
 626             m REPORT_LOCATION,                                          \
 627             a1, REPORT_LOCATION_ARGS(offset));  \
 628 } STMT_END
 629
 630 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 631     const IV offset = loc - RExC_precomp;                               \
 632     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 633             a1, REPORT_LOCATION_ARGS(offset));  \
 634 } STMT_END
 635
 636 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 637     const IV offset = loc - RExC_precomp;                               \
 638     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 639             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 640 } STMT_END
 641
 642 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 643     const IV offset = loc - RExC_precomp;                               \
 644     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 645             a1, a2, REPORT_LOCATION_ARGS(offset));      \
 646 } STMT_END
 647
 648 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 649     const IV offset = loc - RExC_precomp;                               \
 650     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 651             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 652 } STMT_END
 653
 654 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 655     const IV offset = loc - RExC_precomp;                               \
 656     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 657             a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 658 } STMT_END
 659
 660 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 661     const IV offset = loc - RExC_precomp;                               \
 662     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 663             a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 664 } STMT_END
 665
 666
 667 /* Allow for side effects in s */
 668 #define REGC(c,s) STMT_START {                  \
 669     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 670 } STMT_END
 671
 672 /* Macros for recording node offsets.   20001227 mjd@plover.com
 673  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 674  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 675  * Element 0 holds the number n.
 676  * Position is 1 indexed.
 677  */
 678 #ifndef RE_TRACK_PATTERN_OFFSETS
 679 #define Set_Node_Offset_To_R(node,byte)
 680 #define Set_Node_Offset(node,byte)
 681 #define Set_Cur_Node_Offset
 682 #define Set_Node_Length_To_R(node,len)
 683 #define Set_Node_Length(node,len)
 684 #define Set_Node_Cur_Length(node,start)
 685 #define Node_Offset(n)
 686 #define Node_Length(n)
 687 #define Set_Node_Offset_Length(node,offset,len)
 688 #define ProgLen(ri) ri->u.proglen
 689 #define SetProgLen(ri,x) ri->u.proglen = x
 690 #else
 691 #define ProgLen(ri) ri->u.offsets[0]
 692 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 693 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 694     if (! SIZE_ONLY) {                                                  \
 695         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 696                     __LINE__, (int)(node), (int)(byte)));               \
 697         if((node) < 0) {                                                \
 698             Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 699                                          (int)(node));                  \
 700         } else {                                                        \
 701             RExC_offsets[2*(node)-1] = (byte);                          \
 702         }                                                               \
 703     }                                                                   \
 704 } STMT_END
 705
 706 #define Set_Node_Offset(node,byte) \
 707     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 708 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 709
 710 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 711     if (! SIZE_ONLY) {                                                  \
 712         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 713                 __LINE__, (int)(node), (int)(len)));                    \
 714         if((node) < 0) {                                                \
 715             Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 716                                          (int)(node));                  \
 717         } else {                                                        \
 718             RExC_offsets[2*(node)] = (len);                             \
 719         }                                                               \
 720     }                                                                   \
 721 } STMT_END
 722
 723 #define Set_Node_Length(node,len) \
 724     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 725 #define Set_Node_Cur_Length(node, start)                \
 726     Set_Node_Length(node, RExC_parse - start)
 727
 728 /* Get offsets and lengths */
 729 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 730 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 731
 732 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 733     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 734     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 735 } STMT_END
 736 #endif
 737
 738 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 739 #define EXPERIMENTAL_INPLACESCAN
 740 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 741
 742 #define DEBUG_RExC_seen() \
 743         DEBUG_OPTIMISE_MORE_r({                                             \
 744             PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 745                                                                             \
 746             if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 747                 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 748                                                                             \
 749             if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 750                 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 751                                                                             \
 752             if (RExC_seen & REG_GPOS_SEEN)                                  \
 753                 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 754                                                                             \
 755             if (RExC_seen & REG_CANY_SEEN)                                  \
 756                 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 757                                                                             \
 758             if (RExC_seen & REG_RECURSE_SEEN)                               \
 759                 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 760                                                                             \
 761             if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 762                 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 763                                                                             \
 764             if (RExC_seen & REG_VERBARG_SEEN)                               \
 765                 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 766                                                                             \
 767             if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 768                 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 769                                                                             \
 770             if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 771                 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 772                                                                             \
 773             if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 774                 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 775                                                                             \
 776             if (RExC_seen & REG_GOSTART_SEEN)                               \
 777                 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 778                                                                             \
 779             if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 780                 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 781                                                                             \
 782             PerlIO_printf(Perl_debug_log,"\n");                             \
 783         });
 784
 785 #define DEBUG_STUDYDATA(str,data,depth)                              \
 786 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 787     PerlIO_printf(Perl_debug_log,                                    \
 788         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 789         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 790         (int)(depth)*2, "",                                          \
 791         (IV)((data)->pos_min),                                       \
 792         (IV)((data)->pos_delta),                                     \
 793         (UV)((data)->flags),                                         \
 794         (IV)((data)->whilem_c),                                      \
 795         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 796         is_inf ? "INF " : ""                                         \
 797     );                                                               \
 798     if ((data)->last_found)                                          \
 799         PerlIO_printf(Perl_debug_log,                                \
 800             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 801             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 802             SvPVX_const((data)->last_found),                         \
 803             (IV)((data)->last_end),                                  \
 804             (IV)((data)->last_start_min),                            \
 805             (IV)((data)->last_start_max),                            \
 806             ((data)->longest &&                                      \
 807              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 808             SvPVX_const((data)->longest_fixed),                      \
 809             (IV)((data)->offset_fixed),                              \
 810             ((data)->longest &&                                      \
 811              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 812             SvPVX_const((data)->longest_float),                      \
 813             (IV)((data)->offset_float_min),                          \
 814             (IV)((data)->offset_float_max)                           \
 815         );                                                           \
 816     PerlIO_printf(Perl_debug_log,"\n");                              \
 817 });
 818
 819 /* Mark that we cannot extend a found fixed substring at this point.
 820    Update the longest found anchored substring and the longest found
 821    floating substrings if needed. */
 822
 823 STATIC void
 824 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 825                     SSize_t *minlenp, int is_inf)
 826 {
 827     const STRLEN l = CHR_SVLEN(data->last_found);
 828     const STRLEN old_l = CHR_SVLEN(*data->longest);
 829     GET_RE_DEBUG_FLAGS_DECL;
 830
 831     PERL_ARGS_ASSERT_SCAN_COMMIT;
 832
 833     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 834         SvSetMagicSV(*data->longest, data->last_found);
 835         if (*data->longest == data->longest_fixed) {
 836             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 837             if (data->flags & SF_BEFORE_EOL)
 838                 data->flags
 839                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 840             else
 841                 data->flags &= ~SF_FIX_BEFORE_EOL;
 842             data->minlen_fixed=minlenp;
 843             data->lookbehind_fixed=0;
 844         }
 845         else { /* *data->longest == data->longest_float */
 846             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 847             data->offset_float_max = (l
 848                                       ? data->last_start_max
 849                                       : (data->pos_delta == SSize_t_MAX
 850                                          ? SSize_t_MAX
 851                                          : data->pos_min + data->pos_delta));
 852             if (is_inf
 853                  || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 854                 data->offset_float_max = SSize_t_MAX;
 855             if (data->flags & SF_BEFORE_EOL)
 856                 data->flags
 857                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 858             else
 859                 data->flags &= ~SF_FL_BEFORE_EOL;
 860             data->minlen_float=minlenp;
 861             data->lookbehind_float=0;
 862         }
 863     }
 864     SvCUR_set(data->last_found, 0);
 865     {
 866         SV * const sv = data->last_found;
 867         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 868             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 869             if (mg)
 870                 mg->mg_len = 0;
 871         }
 872     }
 873     data->last_end = -1;
 874     data->flags &= ~SF_BEFORE_EOL;
 875     DEBUG_STUDYDATA("commit: ",data,0);
 876 }
 877
 878 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 879  * list that describes which code points it matches */
 880
 881 STATIC void
 882 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 883 {
 884     /* Set the SSC 'ssc' to match an empty string or any code point */
 885
 886     PERL_ARGS_ASSERT_SSC_ANYTHING;
 887
 888     assert(is_ANYOF_SYNTHETIC(ssc));
 889
 890     ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 891     _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 892     ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 893 }
 894
 895 STATIC int
 896 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 897 {
 898     /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 899      * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 900      * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 901      * in any way, so there's no point in using it */
 902
 903     UV start, end;
 904     bool ret;
 905
 906     PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 907
 908     assert(is_ANYOF_SYNTHETIC(ssc));
 909
 910     if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 911         return FALSE;
 912     }
 913
 914     /* See if the list consists solely of the range 0 - Infinity */
 915     invlist_iterinit(ssc->invlist);
 916     ret = invlist_iternext(ssc->invlist, &start, &end)
 917           && start == 0
 918           && end == UV_MAX;
 919
 920     invlist_iterfinish(ssc->invlist);
 921
 922     if (ret) {
 923         return TRUE;
 924     }
 925
 926     /* If e.g., both \w and \W are set, matches everything */
 927     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 928         int i;
 929         for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 930             if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 931                 return TRUE;
 932             }
 933         }
 934     }
 935
 936     return FALSE;
 937 }
 938
 939 STATIC void
 940 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 941 {
 942     /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 943      * string, any code point, or any posix class under locale */
 944
 945     PERL_ARGS_ASSERT_SSC_INIT;
 946
 947     Zero(ssc, 1, regnode_ssc);
 948     set_ANYOF_SYNTHETIC(ssc);
 949     ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 950     ssc_anything(ssc);
 951
 952     /* If any portion of the regex is to operate under locale rules,
 953      * initialization includes it.  The reason this isn't done for all regexes
 954      * is that the optimizer was written under the assumption that locale was
 955      * all-or-nothing.  Given the complexity and lack of documentation in the
 956      * optimizer, and that there are inadequate test cases for locale, many
 957      * parts of it may not work properly, it is safest to avoid locale unless
 958      * necessary. */
 959     if (RExC_contains_locale) {
 960         ANYOF_POSIXL_SETALL(ssc);
 961     }
 962     else {
 963         ANYOF_POSIXL_ZERO(ssc);
 964     }
 965 }
 966
 967 STATIC int
 968 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 969                               const regnode_ssc *ssc)
 970 {
 971     /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 972      * to the list of code points matched, and locale posix classes; hence does
 973      * not check its flags) */
 974
 975     UV start, end;
 976     bool ret;
 977
 978     PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 979
 980     assert(is_ANYOF_SYNTHETIC(ssc));
 981
 982     invlist_iterinit(ssc->invlist);
 983     ret = invlist_iternext(ssc->invlist, &start, &end)
 984           && start == 0
 985           && end == UV_MAX;
 986
 987     invlist_iterfinish(ssc->invlist);
 988
 989     if (! ret) {
 990         return FALSE;
 991     }
 992
 993     if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 994         return FALSE;
 995     }
 996
 997     return TRUE;
 998 }
 999
1000 STATIC SV*
1001 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1002                                const regnode_charclass* const node)
1003 {
1004     /* Returns a mortal inversion list defining which code points are matched
1005      * by 'node', which is of type ANYOF.  Handles complementing the result if
1006      * appropriate.  If some code points aren't knowable at this time, the
1007      * returned list must, and will, contain every code point that is a
1008      * possibility. */
1009
1010     SV* invlist = sv_2mortal(_new_invlist(0));
1011     SV* only_utf8_locale_invlist = NULL;
1012     unsigned int i;
1013     const U32 n = ARG(node);
1014     bool new_node_has_latin1 = FALSE;
1015
1016     PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1017
1018     /* Look at the data structure created by S_set_ANYOF_arg() */
1019     if (n != ANYOF_NONBITMAP_EMPTY) {
1020         SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1021         AV * const av = MUTABLE_AV(SvRV(rv));
1022         SV **const ary = AvARRAY(av);
1023         assert(RExC_rxi->data->what[n] == 's');
1024
1025         if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1026             invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1027         }
1028         else if (ary[0] && ary[0] != &PL_sv_undef) {
1029
1030             /* Here, no compile-time swash, and there are things that won't be
1031              * known until runtime -- we have to assume it could be anything */
1032             return _add_range_to_invlist(invlist, 0, UV_MAX);
1033         }
1034         else if (ary[3] && ary[3] != &PL_sv_undef) {
1035
1036             /* Here no compile-time swash, and no run-time only data.  Use the
1037              * node's inversion list */
1038             invlist = sv_2mortal(invlist_clone(ary[3]));
1039         }
1040
1041         /* Get the code points valid only under UTF-8 locales */
1042         if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1043             && ary[2] && ary[2] != &PL_sv_undef)
1044         {
1045             only_utf8_locale_invlist = ary[2];
1046         }
1047     }
1048
1049     /* An ANYOF node contains a bitmap for the first 256 code points, and an
1050      * inversion list for the others, but if there are code points that should
1051      * match only conditionally on the target string being UTF-8, those are
1052      * placed in the inversion list, and not the bitmap.  Since there are
1053      * circumstances under which they could match, they are included in the
1054      * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1055      * here, so that when we invert below, the end result actually does include
1056      * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1057      * before we add the unconditionally matched code points */
1058     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1059         _invlist_intersection_complement_2nd(invlist,
1060                                              PL_UpperLatin1,
1061                                              &invlist);
1062     }
1063
1064     /* Add in the points from the bit map */
1065     for (i = 0; i < 256; i++) {
1066         if (ANYOF_BITMAP_TEST(node, i)) {
1067             invlist = add_cp_to_invlist(invlist, i);
1068             new_node_has_latin1 = TRUE;
1069         }
1070     }
1071
1072     /* If this can match all upper Latin1 code points, have to add them
1073      * as well */
1074     if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1075         _invlist_union(invlist, PL_UpperLatin1, &invlist);
1076     }
1077
1078     /* Similarly for these */
1079     if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1080         invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1081     }
1082
1083     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1084         _invlist_invert(invlist);
1085     }
1086     else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1087
1088         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1089          * locale.  We can skip this if there are no 0-255 at all. */
1090         _invlist_union(invlist, PL_Latin1, &invlist);
1091     }
1092
1093     /* Similarly add the UTF-8 locale possible matches.  These have to be
1094      * deferred until after the non-UTF-8 locale ones are taken care of just
1095      * above, or it leads to wrong results under ANYOF_INVERT */
1096     if (only_utf8_locale_invlist) {
1097         _invlist_union_maybe_complement_2nd(invlist,
1098                                             only_utf8_locale_invlist,
1099                                             ANYOF_FLAGS(node) & ANYOF_INVERT,
1100                                             &invlist);
1101     }
1102
1103     return invlist;
1104 }
1105
1106 /* These two functions currently do the exact same thing */
1107 #define ssc_init_zero           ssc_init
1108
1109 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1110 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1111
1112 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1113  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1114  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1115
1116 STATIC void
1117 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1118                 const regnode_charclass *and_with)
1119 {
1120     /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1121      * another SSC or a regular ANYOF class.  Can create false positives. */
1122
1123     SV* anded_cp_list;
1124     U8  anded_flags;
1125
1126     PERL_ARGS_ASSERT_SSC_AND;
1127
1128     assert(is_ANYOF_SYNTHETIC(ssc));
1129
1130     /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1131      * the code point inversion list and just the relevant flags */
1132     if (is_ANYOF_SYNTHETIC(and_with)) {
1133         anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1134         anded_flags = ANYOF_FLAGS(and_with);
1135
1136         /* XXX This is a kludge around what appears to be deficiencies in the
1137          * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1138          * there are paths through the optimizer where it doesn't get weeded
1139          * out when it should.  And if we don't make some extra provision for
1140          * it like the code just below, it doesn't get added when it should.
1141          * This solution is to add it only when AND'ing, which is here, and
1142          * only when what is being AND'ed is the pristine, original node
1143          * matching anything.  Thus it is like adding it to ssc_anything() but
1144          * only when the result is to be AND'ed.  Probably the same solution
1145          * could be adopted for the same problem we have with /l matching,
1146          * which is solved differently in S_ssc_init(), and that would lead to
1147          * fewer false positives than that solution has.  But if this solution
1148          * creates bugs, the consequences are only that a warning isn't raised
1149          * that should be; while the consequences for having /l bugs is
1150          * incorrect matches */
1151         if (ssc_is_anything((regnode_ssc *)and_with)) {
1152             anded_flags |= ANYOF_WARN_SUPER;
1153         }
1154     }
1155     else {
1156         anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1157         anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1158     }
1159
1160     ANYOF_FLAGS(ssc) &= anded_flags;
1161
1162     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1163      * C2 is the list of code points in 'and-with'; P2, its posix classes.
1164      * 'and_with' may be inverted.  When not inverted, we have the situation of
1165      * computing:
1166      *  (C1 | P1) & (C2 | P2)
1167      *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1168      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1169      *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1170      *                    <=  ((C1 & C2) | P1 | P2)
1171      * Alternatively, the last few steps could be:
1172      *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1173      *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1174      *                    <=  (C1 | C2 | (P1 & P2))
1175      * We favor the second approach if either P1 or P2 is non-empty.  This is
1176      * because these components are a barrier to doing optimizations, as what
1177      * they match cannot be known until the moment of matching as they are
1178      * dependent on the current locale, 'AND"ing them likely will reduce or
1179      * eliminate them.
1180      * But we can do better if we know that C1,P1 are in their initial state (a
1181      * frequent occurrence), each matching everything:
1182      *  (<everything>) & (C2 | P2) =  C2 | P2
1183      * Similarly, if C2,P2 are in their initial state (again a frequent
1184      * occurrence), the result is a no-op
1185      *  (C1 | P1) & (<everything>) =  C1 | P1
1186      *
1187      * Inverted, we have
1188      *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1189      *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1190      *                         <=  (C1 & ~C2) | (P1 & ~P2)
1191      * */
1192
1193     if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1194         && ! is_ANYOF_SYNTHETIC(and_with))
1195     {
1196         unsigned int i;
1197
1198         ssc_intersection(ssc,
1199                          anded_cp_list,
1200                          FALSE /* Has already been inverted */
1201                          );
1202
1203         /* If either P1 or P2 is empty, the intersection will be also; can skip
1204          * the loop */
1205         if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1206             ANYOF_POSIXL_ZERO(ssc);
1207         }
1208         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1209
1210             /* Note that the Posix class component P from 'and_with' actually
1211              * looks like:
1212              *      P = Pa | Pb | ... | Pn
1213              * where each component is one posix class, such as in [\w\s].
1214              * Thus
1215              *      ~P = ~(Pa | Pb | ... | Pn)
1216              *         = ~Pa & ~Pb & ... & ~Pn
1217              *        <= ~Pa | ~Pb | ... | ~Pn
1218              * The last is something we can easily calculate, but unfortunately
1219              * is likely to have many false positives.  We could do better
1220              * in some (but certainly not all) instances if two classes in
1221              * P have known relationships.  For example
1222              *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1223              * So
1224              *      :lower: & :print: = :lower:
1225              * And similarly for classes that must be disjoint.  For example,
1226              * since \s and \w can have no elements in common based on rules in
1227              * the POSIX standard,
1228              *      \w & ^\S = nothing
1229              * Unfortunately, some vendor locales do not meet the Posix
1230              * standard, in particular almost everything by Microsoft.
1231              * The loop below just changes e.g., \w into \W and vice versa */
1232
1233             regnode_charclass_posixl temp;
1234             int add = 1;    /* To calculate the index of the complement */
1235
1236             ANYOF_POSIXL_ZERO(&temp);
1237             for (i = 0; i < ANYOF_MAX; i++) {
1238                 assert(i % 2 != 0
1239                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1240                        || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1241
1242                 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1243                     ANYOF_POSIXL_SET(&temp, i + add);
1244                 }
1245                 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1246             }
1247             ANYOF_POSIXL_AND(&temp, ssc);
1248
1249         } /* else ssc already has no posixes */
1250     } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1251          in its initial state */
1252     else if (! is_ANYOF_SYNTHETIC(and_with)
1253              || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1254     {
1255         /* But if 'ssc' is in its initial state, the result is just 'and_with';
1256          * copy it over 'ssc' */
1257         if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1258             if (is_ANYOF_SYNTHETIC(and_with)) {
1259                 StructCopy(and_with, ssc, regnode_ssc);
1260             }
1261             else {
1262                 ssc->invlist = anded_cp_list;
1263                 ANYOF_POSIXL_ZERO(ssc);
1264                 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1265                     ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1266                 }
1267             }
1268         }
1269         else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1270                  || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1271         {
1272             /* One or the other of P1, P2 is non-empty. */
1273             if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1274                 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1275             }
1276             ssc_union(ssc, anded_cp_list, FALSE);
1277         }
1278         else { /* P1 = P2 = empty */
1279             ssc_intersection(ssc, anded_cp_list, FALSE);
1280         }
1281     }
1282 }
1283
1284 STATIC void
1285 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1286                const regnode_charclass *or_with)
1287 {
1288     /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1289      * another SSC or a regular ANYOF class.  Can create false positives if
1290      * 'or_with' is to be inverted. */
1291
1292     SV* ored_cp_list;
1293     U8 ored_flags;
1294
1295     PERL_ARGS_ASSERT_SSC_OR;
1296
1297     assert(is_ANYOF_SYNTHETIC(ssc));
1298
1299     /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1300      * the code point inversion list and just the relevant flags */
1301     if (is_ANYOF_SYNTHETIC(or_with)) {
1302         ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1303         ored_flags = ANYOF_FLAGS(or_with);
1304     }
1305     else {
1306         ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1307         ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1308     }
1309
1310     ANYOF_FLAGS(ssc) |= ored_flags;
1311
1312     /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1313      * C2 is the list of code points in 'or-with'; P2, its posix classes.
1314      * 'or_with' may be inverted.  When not inverted, we have the simple
1315      * situation of computing:
1316      *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1317      * If P1|P2 yields a situation with both a class and its complement are
1318      * set, like having both \w and \W, this matches all code points, and we
1319      * can delete these from the P component of the ssc going forward.  XXX We
1320      * might be able to delete all the P components, but I (khw) am not certain
1321      * about this, and it is better to be safe.
1322      *
1323      * Inverted, we have
1324      *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1325      *                         <=  (C1 | P1) | ~C2
1326      *                         <=  (C1 | ~C2) | P1
1327      * (which results in actually simpler code than the non-inverted case)
1328      * */
1329
1330     if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1331         && ! is_ANYOF_SYNTHETIC(or_with))
1332     {
1333         /* We ignore P2, leaving P1 going forward */
1334     }   /* else  Not inverted */
1335     else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1336         ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1337         if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1338             unsigned int i;
1339             for (i = 0; i < ANYOF_MAX; i += 2) {
1340                 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1341                 {
1342                     ssc_match_all_cp(ssc);
1343                     ANYOF_POSIXL_CLEAR(ssc, i);
1344                     ANYOF_POSIXL_CLEAR(ssc, i+1);
1345                 }
1346             }
1347         }
1348     }
1349
1350     ssc_union(ssc,
1351               ored_cp_list,
1352               FALSE /* Already has been inverted */
1353               );
1354 }
1355
1356 PERL_STATIC_INLINE void
1357 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1358 {
1359     PERL_ARGS_ASSERT_SSC_UNION;
1360
1361     assert(is_ANYOF_SYNTHETIC(ssc));
1362
1363     _invlist_union_maybe_complement_2nd(ssc->invlist,
1364                                         invlist,
1365                                         invert2nd,
1366                                         &ssc->invlist);
1367 }
1368
1369 PERL_STATIC_INLINE void
1370 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1371                          SV* const invlist,
1372                          const bool invert2nd)
1373 {
1374     PERL_ARGS_ASSERT_SSC_INTERSECTION;
1375
1376     assert(is_ANYOF_SYNTHETIC(ssc));
1377
1378     _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1379                                                invlist,
1380                                                invert2nd,
1381                                                &ssc->invlist);
1382 }
1383
1384 PERL_STATIC_INLINE void
1385 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1386 {
1387     PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1388
1389     assert(is_ANYOF_SYNTHETIC(ssc));
1390
1391     ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1392 }
1393
1394 PERL_STATIC_INLINE void
1395 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1396 {
1397     /* AND just the single code point 'cp' into the SSC 'ssc' */
1398
1399     SV* cp_list = _new_invlist(2);
1400
1401     PERL_ARGS_ASSERT_SSC_CP_AND;
1402
1403     assert(is_ANYOF_SYNTHETIC(ssc));
1404
1405     cp_list = add_cp_to_invlist(cp_list, cp);
1406     ssc_intersection(ssc, cp_list,
1407                      FALSE /* Not inverted */
1408                      );
1409     SvREFCNT_dec_NN(cp_list);
1410 }
1411
1412 PERL_STATIC_INLINE void
1413 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1414 {
1415     /* Set the SSC 'ssc' to not match any locale things */
1416
1417     PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1418
1419     assert(is_ANYOF_SYNTHETIC(ssc));
1420
1421     ANYOF_POSIXL_ZERO(ssc);
1422     ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1423 }
1424
1425 STATIC void
1426 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1427 {
1428     /* The inversion list in the SSC is marked mortal; now we need a more
1429      * permanent copy, which is stored the same way that is done in a regular
1430      * ANYOF node, with the first 256 code points in a bit map */
1431
1432     SV* invlist = invlist_clone(ssc->invlist);
1433
1434     PERL_ARGS_ASSERT_SSC_FINALIZE;
1435
1436     assert(is_ANYOF_SYNTHETIC(ssc));
1437
1438     /* The code in this file assumes that all but these flags aren't relevant
1439      * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1440      * time we reach here */
1441     assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1442
1443     populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1444
1445     set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1446                                 NULL, NULL, NULL, FALSE);
1447
1448     /* Make sure is clone-safe */
1449     ssc->invlist = NULL;
1450
1451     if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1452         ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1453     }
1454
1455     assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1456 }
1457
1458 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1459 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1460 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1461 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1462                                ? (TRIE_LIST_CUR( idx ) - 1)           \
1463                                : 0 )
1464
1465
1466 #ifdef DEBUGGING
1467 /*
1468    dump_trie(trie,widecharmap,revcharmap)
1469    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1470    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1471
1472    These routines dump out a trie in a somewhat readable format.
1473    The _interim_ variants are used for debugging the interim
1474    tables that are used to generate the final compressed
1475    representation which is what dump_trie expects.
1476
1477    Part of the reason for their existence is to provide a form
1478    of documentation as to how the different representations function.
1479
1480 */
1481
1482 /*
1483   Dumps the final compressed table form of the trie to Perl_debug_log.
1484   Used for debugging make_trie().
1485 */
1486
1487 STATIC void
1488 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1489             AV *revcharmap, U32 depth)
1490 {
1491     U32 state;
1492     SV *sv=sv_newmortal();
1493     int colwidth= widecharmap ? 6 : 4;
1494     U16 word;
1495     GET_RE_DEBUG_FLAGS_DECL;
1496
1497     PERL_ARGS_ASSERT_DUMP_TRIE;
1498
1499     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1500         (int)depth * 2 + 2,"",
1501         "Match","Base","Ofs" );
1502
1503     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1504         SV ** const tmp = av_fetch( revcharmap, state, 0);
1505         if ( tmp ) {
1506             PerlIO_printf( Perl_debug_log, "%*s",
1507                 colwidth,
1508                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1509                             PL_colors[0], PL_colors[1],
1510                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1511                             PERL_PV_ESCAPE_FIRSTCHAR
1512                 )
1513             );
1514         }
1515     }
1516     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1517         (int)depth * 2 + 2,"");
1518
1519     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1520         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1521     PerlIO_printf( Perl_debug_log, "\n");
1522
1523     for( state = 1 ; state < trie->statecount ; state++ ) {
1524         const U32 base = trie->states[ state ].trans.base;
1525
1526         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1527                                        (int)depth * 2 + 2,"", (UV)state);
1528
1529         if ( trie->states[ state ].wordnum ) {
1530             PerlIO_printf( Perl_debug_log, " W%4X",
1531                                            trie->states[ state ].wordnum );
1532         } else {
1533             PerlIO_printf( Perl_debug_log, "%6s", "" );
1534         }
1535
1536         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1537
1538         if ( base ) {
1539             U32 ofs = 0;
1540
1541             while( ( base + ofs  < trie->uniquecharcount ) ||
1542                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1543                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1544                                                                     != state))
1545                     ofs++;
1546
1547             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1548
1549             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1550                 if ( ( base + ofs >= trie->uniquecharcount )
1551                         && ( base + ofs - trie->uniquecharcount
1552                                                         < trie->lasttrans )
1553                         && trie->trans[ base + ofs
1554                                     - trie->uniquecharcount ].check == state )
1555                 {
1556                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1557                     colwidth,
1558                     (UV)trie->trans[ base + ofs
1559                                              - trie->uniquecharcount ].next );
1560                 } else {
1561                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1562                 }
1563             }
1564
1565             PerlIO_printf( Perl_debug_log, "]");
1566
1567         }
1568         PerlIO_printf( Perl_debug_log, "\n" );
1569     }
1570     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1571                                 (int)depth*2, "");
1572     for (word=1; word <= trie->wordcount; word++) {
1573         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1574             (int)word, (int)(trie->wordinfo[word].prev),
1575             (int)(trie->wordinfo[word].len));
1576     }
1577     PerlIO_printf(Perl_debug_log, "\n" );
1578 }
1579 /*
1580   Dumps a fully constructed but uncompressed trie in list form.
1581   List tries normally only are used for construction when the number of
1582   possible chars (trie->uniquecharcount) is very high.
1583   Used for debugging make_trie().
1584 */
1585 STATIC void
1586 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1587                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1588                          U32 depth)
1589 {
1590     U32 state;
1591     SV *sv=sv_newmortal();
1592     int colwidth= widecharmap ? 6 : 4;
1593     GET_RE_DEBUG_FLAGS_DECL;
1594
1595     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1596
1597     /* print out the table precompression.  */
1598     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1599         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1600         "------:-----+-----------------\n" );
1601
1602     for( state=1 ; state < next_alloc ; state ++ ) {
1603         U16 charid;
1604
1605         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1606             (int)depth * 2 + 2,"", (UV)state  );
1607         if ( ! trie->states[ state ].wordnum ) {
1608             PerlIO_printf( Perl_debug_log, "%5s| ","");
1609         } else {
1610             PerlIO_printf( Perl_debug_log, "W%4x| ",
1611                 trie->states[ state ].wordnum
1612             );
1613         }
1614         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1615             SV ** const tmp = av_fetch( revcharmap,
1616                                         TRIE_LIST_ITEM(state,charid).forid, 0);
1617             if ( tmp ) {
1618                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1619                     colwidth,
1620                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1621                               colwidth,
1622                               PL_colors[0], PL_colors[1],
1623                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1624                               | PERL_PV_ESCAPE_FIRSTCHAR
1625                     ) ,
1626                     TRIE_LIST_ITEM(state,charid).forid,
1627                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1628                 );
1629                 if (!(charid % 10))
1630                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1631                         (int)((depth * 2) + 14), "");
1632             }
1633         }
1634         PerlIO_printf( Perl_debug_log, "\n");
1635     }
1636 }
1637
1638 /*
1639   Dumps a fully constructed but uncompressed trie in table form.
1640   This is the normal DFA style state transition table, with a few
1641   twists to facilitate compression later.
1642   Used for debugging make_trie().
1643 */
1644 STATIC void
1645 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1646                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1647                           U32 depth)
1648 {
1649     U32 state;
1650     U16 charid;
1651     SV *sv=sv_newmortal();
1652     int colwidth= widecharmap ? 6 : 4;
1653     GET_RE_DEBUG_FLAGS_DECL;
1654
1655     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1656
1657     /*
1658        print out the table precompression so that we can do a visual check
1659        that they are identical.
1660      */
1661
1662     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1663
1664     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1665         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1666         if ( tmp ) {
1667             PerlIO_printf( Perl_debug_log, "%*s",
1668                 colwidth,
1669                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1670                             PL_colors[0], PL_colors[1],
1671                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1672                             PERL_PV_ESCAPE_FIRSTCHAR
1673                 )
1674             );
1675         }
1676     }
1677
1678     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1679
1680     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1681         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1682     }
1683
1684     PerlIO_printf( Perl_debug_log, "\n" );
1685
1686     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1687
1688         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1689             (int)depth * 2 + 2,"",
1690             (UV)TRIE_NODENUM( state ) );
1691
1692         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1693             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1694             if (v)
1695                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1696             else
1697                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1698         }
1699         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1700             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1701                                             (UV)trie->trans[ state ].check );
1702         } else {
1703             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1704                                             (UV)trie->trans[ state ].check,
1705             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1706         }
1707     }
1708 }
1709
1710 #endif
1711
1712
1713 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1714   startbranch: the first branch in the whole branch sequence
1715   first      : start branch of sequence of branch-exact nodes.
1716                May be the same as startbranch
1717   last       : Thing following the last branch.
1718                May be the same as tail.
1719   tail       : item following the branch sequence
1720   count      : words in the sequence
1721   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1722   depth      : indent depth
1723
1724 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1725
1726 A trie is an N'ary tree where the branches are determined by digital
1727 decomposition of the key. IE, at the root node you look up the 1st character and
1728 follow that branch repeat until you find the end of the branches. Nodes can be
1729 marked as "accepting" meaning they represent a complete word. Eg:
1730
1731   /he|she|his|hers/
1732
1733 would convert into the following structure. Numbers represent states, letters
1734 following numbers represent valid transitions on the letter from that state, if
1735 the number is in square brackets it represents an accepting state, otherwise it
1736 will be in parenthesis.
1737
1738       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1739       |    |
1740       |   (2)
1741       |    |
1742      (1)   +-i->(6)-+-s->[7]
1743       |
1744       +-s->(3)-+-h->(4)-+-e->[5]
1745
1746       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1747
1748 This shows that when matching against the string 'hers' we will begin at state 1
1749 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1750 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1751 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1752 single traverse. We store a mapping from accepting to state to which word was
1753 matched, and then when we have multiple possibilities we try to complete the
1754 rest of the regex in the order in which they occured in the alternation.
1755
1756 The only prior NFA like behaviour that would be changed by the TRIE support is
1757 the silent ignoring of duplicate alternations which are of the form:
1758
1759  / (DUPE|DUPE) X? (?{ ... }) Y /x
1760
1761 Thus EVAL blocks following a trie may be called a different number of times with
1762 and without the optimisation. With the optimisations dupes will be silently
1763 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1764 the following demonstrates:
1765
1766  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1767
1768 which prints out 'word' three times, but
1769
1770  'words'=~/(word|word|word)(?{ print $1 })S/
1771
1772 which doesnt print it out at all. This is due to other optimisations kicking in.
1773
1774 Example of what happens on a structural level:
1775
1776 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1777
1778    1: CURLYM[1] {1,32767}(18)
1779    5:   BRANCH(8)
1780    6:     EXACT <ac>(16)
1781    8:   BRANCH(11)
1782    9:     EXACT <ad>(16)
1783   11:   BRANCH(14)
1784   12:     EXACT <ab>(16)
1785   16:   SUCCEED(0)
1786   17:   NOTHING(18)
1787   18: END(0)
1788
1789 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1790 and should turn into:
1791
1792    1: CURLYM[1] {1,32767}(18)
1793    5:   TRIE(16)
1794         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1795           <ac>
1796           <ad>
1797           <ab>
1798   16:   SUCCEED(0)
1799   17:   NOTHING(18)
1800   18: END(0)
1801
1802 Cases where tail != last would be like /(?foo|bar)baz/:
1803
1804    1: BRANCH(4)
1805    2:   EXACT <foo>(8)
1806    4: BRANCH(7)
1807    5:   EXACT <bar>(8)
1808    7: TAIL(8)
1809    8: EXACT <baz>(10)
1810   10: END(0)
1811
1812 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1813 and would end up looking like:
1814
1815     1: TRIE(8)
1816       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1817         <foo>
1818         <bar>
1819    7: TAIL(8)
1820    8: EXACT <baz>(10)
1821   10: END(0)
1822
1823     d = uvchr_to_utf8_flags(d, uv, 0);
1824
1825 is the recommended Unicode-aware way of saying
1826
1827     *(d++) = uv;
1828 */
1829
1830 #define TRIE_STORE_REVCHAR(val)                                            \
1831     STMT_START {                                                           \
1832         if (UTF) {                                                         \
1833             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1834             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1835             unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1836             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1837             SvPOK_on(zlopp);                                               \
1838             SvUTF8_on(zlopp);                                              \
1839             av_push(revcharmap, zlopp);                                    \
1840         } else {                                                           \
1841             char ooooff = (char)val;                                           \
1842             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1843         }                                                                  \
1844         } STMT_END
1845
1846 /* This gets the next character from the input, folding it if not already
1847  * folded. */
1848 #define TRIE_READ_CHAR STMT_START {                                           \
1849     wordlen++;                                                                \
1850     if ( UTF ) {                                                              \
1851         /* if it is UTF then it is either already folded, or does not need    \
1852          * folding */                                                         \
1853         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1854     }                                                                         \
1855     else if (folder == PL_fold_latin1) {                                      \
1856         /* This folder implies Unicode rules, which in the range expressible  \
1857          *  by not UTF is the lower case, with the two exceptions, one of     \
1858          *  which should have been taken care of before calling this */       \
1859         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1860         uvc = toLOWER_L1(*uc);                                                \
1861         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1862         len = 1;                                                              \
1863     } else {                                                                  \
1864         /* raw data, will be folded later if needed */                        \
1865         uvc = (U32)*uc;                                                       \
1866         len = 1;                                                              \
1867     }                                                                         \
1868 } STMT_END
1869
1870
1871
1872 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1873     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1874         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1875         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1876     }                                                           \
1877     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1878     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1879     TRIE_LIST_CUR( state )++;                                   \
1880 } STMT_END
1881
1882 #define TRIE_LIST_NEW(state) STMT_START {                       \
1883     Newxz( trie->states[ state ].trans.list,               \
1884         4, reg_trie_trans_le );                                 \
1885      TRIE_LIST_CUR( state ) = 1;                                \
1886      TRIE_LIST_LEN( state ) = 4;                                \
1887 } STMT_END
1888
1889 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1890     U16 dupe= trie->states[ state ].wordnum;                    \
1891     regnode * const noper_next = regnext( noper );              \
1892                                                                 \
1893     DEBUG_r({                                                   \
1894         /* store the word for dumping */                        \
1895         SV* tmp;                                                \
1896         if (OP(noper) != NOTHING)                               \
1897             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1898         else                                                    \
1899             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1900         av_push( trie_words, tmp );                             \
1901     });                                                         \
1902                                                                 \
1903     curword++;                                                  \
1904     trie->wordinfo[curword].prev   = 0;                         \
1905     trie->wordinfo[curword].len    = wordlen;                   \
1906     trie->wordinfo[curword].accept = state;                     \
1907                                                                 \
1908     if ( noper_next < tail ) {                                  \
1909         if (!trie->jump)                                        \
1910             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1911                                                  sizeof(U16) ); \
1912         trie->jump[curword] = (U16)(noper_next - convert);      \
1913         if (!jumper)                                            \
1914             jumper = noper_next;                                \
1915         if (!nextbranch)                                        \
1916             nextbranch= regnext(cur);                           \
1917     }                                                           \
1918                                                                 \
1919     if ( dupe ) {                                               \
1920         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1921         /* chain, so that when the bits of chain are later    */\
1922         /* linked together, the dups appear in the chain      */\
1923         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1924         trie->wordinfo[dupe].prev = curword;                    \
1925     } else {                                                    \
1926         /* we haven't inserted this word yet.                */ \
1927         trie->states[ state ].wordnum = curword;                \
1928     }                                                           \
1929 } STMT_END
1930
1931
1932 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1933      ( ( base + charid >=  ucharcount                                   \
1934          && base + charid < ubound                                      \
1935          && state == trie->trans[ base - ucharcount + charid ].check    \
1936          && trie->trans[ base - ucharcount + charid ].next )            \
1937            ? trie->trans[ base - ucharcount + charid ].next             \
1938            : ( state==1 ? special : 0 )                                 \
1939       )
1940
1941 #define MADE_TRIE       1
1942 #define MADE_JUMP_TRIE  2
1943 #define MADE_EXACT_TRIE 4
1944
1945 STATIC I32
1946 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1947                   regnode *first, regnode *last, regnode *tail,
1948                   U32 word_count, U32 flags, U32 depth)
1949 {
1950     dVAR;
1951     /* first pass, loop through and scan words */
1952     reg_trie_data *trie;
1953     HV *widecharmap = NULL;
1954     AV *revcharmap = newAV();
1955     regnode *cur;
1956     STRLEN len = 0;
1957     UV uvc = 0;
1958     U16 curword = 0;
1959     U32 next_alloc = 0;
1960     regnode *jumper = NULL;
1961     regnode *nextbranch = NULL;
1962     regnode *convert = NULL;
1963     U32 *prev_states; /* temp array mapping each state to previous one */
1964     /* we just use folder as a flag in utf8 */
1965     const U8 * folder = NULL;
1966
1967 #ifdef DEBUGGING
1968     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1969     AV *trie_words = NULL;
1970     /* along with revcharmap, this only used during construction but both are
1971      * useful during debugging so we store them in the struct when debugging.
1972      */
1973 #else
1974     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1975     STRLEN trie_charcount=0;
1976 #endif
1977     SV *re_trie_maxbuff;
1978     GET_RE_DEBUG_FLAGS_DECL;
1979
1980     PERL_ARGS_ASSERT_MAKE_TRIE;
1981 #ifndef DEBUGGING
1982     PERL_UNUSED_ARG(depth);
1983 #endif
1984
1985     switch (flags) {
1986         case EXACT: break;
1987         case EXACTFA:
1988         case EXACTFU_SS:
1989         case EXACTFU: folder = PL_fold_latin1; break;
1990         case EXACTF:  folder = PL_fold; break;
1991         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1992     }
1993
1994     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1995     trie->refcount = 1;
1996     trie->startstate = 1;
1997     trie->wordcount = word_count;
1998     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1999     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2000     if (flags == EXACT)
2001         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2002     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2003                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
2004
2005     DEBUG_r({
2006         trie_words = newAV();
2007     });
2008
2009     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2010     if (!SvIOK(re_trie_maxbuff)) {
2011         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2012     }
2013     DEBUG_TRIE_COMPILE_r({
2014         PerlIO_printf( Perl_debug_log,
2015           "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2016           (int)depth * 2 + 2, "",
2017           REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2018           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2019     });
2020
2021    /* Find the node we are going to overwrite */
2022     if ( first == startbranch && OP( last ) != BRANCH ) {
2023         /* whole branch chain */
2024         convert = first;
2025     } else {
2026         /* branch sub-chain */
2027         convert = NEXTOPER( first );
2028     }
2029
2030     /*  -- First loop and Setup --
2031
2032        We first traverse the branches and scan each word to determine if it
2033        contains widechars, and how many unique chars there are, this is
2034        important as we have to build a table with at least as many columns as we
2035        have unique chars.
2036
2037        We use an array of integers to represent the character codes 0..255
2038        (trie->charmap) and we use a an HV* to store Unicode characters. We use
2039        the native representation of the character value as the key and IV's for
2040        the coded index.
2041
2042        *TODO* If we keep track of how many times each character is used we can
2043        remap the columns so that the table compression later on is more
2044        efficient in terms of memory by ensuring the most common value is in the
2045        middle and the least common are on the outside.  IMO this would be better
2046        than a most to least common mapping as theres a decent chance the most
2047        common letter will share a node with the least common, meaning the node
2048        will not be compressible. With a middle is most common approach the worst
2049        case is when we have the least common nodes twice.
2050
2051      */
2052
2053     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2054         regnode *noper = NEXTOPER( cur );
2055         const U8 *uc = (U8*)STRING( noper );
2056         const U8 *e  = uc + STR_LEN( noper );
2057         int foldlen = 0;
2058         U32 wordlen      = 0;         /* required init */
2059         STRLEN minchars = 0;
2060         STRLEN maxchars = 0;
2061         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2062                                                bitmap?*/
2063
2064         if (OP(noper) == NOTHING) {
2065             regnode *noper_next= regnext(noper);
2066             if (noper_next != tail && OP(noper_next) == flags) {
2067                 noper = noper_next;
2068                 uc= (U8*)STRING(noper);
2069                 e= uc + STR_LEN(noper);
2070                 trie->minlen= STR_LEN(noper);
2071             } else {
2072                 trie->minlen= 0;
2073                 continue;
2074             }
2075         }
2076
2077         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2078             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2079                                           regardless of encoding */
2080             if (OP( noper ) == EXACTFU_SS) {
2081                 /* false positives are ok, so just set this */
2082                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2083             }
2084         }
2085         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2086                                            branch */
2087             TRIE_CHARCOUNT(trie)++;
2088             TRIE_READ_CHAR;
2089
2090             /* TRIE_READ_CHAR returns the current character, or its fold if /i
2091              * is in effect.  Under /i, this character can match itself, or
2092              * anything that folds to it.  If not under /i, it can match just
2093              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2094              * all fold to k, and all are single characters.   But some folds
2095              * expand to more than one character, so for example LATIN SMALL
2096              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2097              * the string beginning at 'uc' is 'ffi', it could be matched by
2098              * three characters, or just by the one ligature character. (It
2099              * could also be matched by two characters: LATIN SMALL LIGATURE FF
2100              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2101              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2102              * match.)  The trie needs to know the minimum and maximum number
2103              * of characters that could match so that it can use size alone to
2104              * quickly reject many match attempts.  The max is simple: it is
2105              * the number of folded characters in this branch (since a fold is
2106              * never shorter than what folds to it. */
2107
2108             maxchars++;
2109
2110             /* And the min is equal to the max if not under /i (indicated by
2111              * 'folder' being NULL), or there are no multi-character folds.  If
2112              * there is a multi-character fold, the min is incremented just
2113              * once, for the character that folds to the sequence.  Each
2114              * character in the sequence needs to be added to the list below of
2115              * characters in the trie, but we count only the first towards the
2116              * min number of characters needed.  This is done through the
2117              * variable 'foldlen', which is returned by the macros that look
2118              * for these sequences as the number of bytes the sequence
2119              * occupies.  Each time through the loop, we decrement 'foldlen' by
2120              * how many bytes the current char occupies.  Only when it reaches
2121              * 0 do we increment 'minchars' or look for another multi-character
2122              * sequence. */
2123             if (folder == NULL) {
2124                 minchars++;
2125             }
2126             else if (foldlen > 0) {
2127                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2128             }
2129             else {
2130                 minchars++;
2131
2132                 /* See if *uc is the beginning of a multi-character fold.  If
2133                  * so, we decrement the length remaining to look at, to account
2134                  * for the current character this iteration.  (We can use 'uc'
2135                  * instead of the fold returned by TRIE_READ_CHAR because for
2136                  * non-UTF, the latin1_safe macro is smart enough to account
2137                  * for all the unfolded characters, and because for UTF, the
2138                  * string will already have been folded earlier in the
2139                  * compilation process */
2140                 if (UTF) {
2141                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2142                         foldlen -= UTF8SKIP(uc);
2143                     }
2144                 }
2145                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2146                     foldlen--;
2147                 }
2148             }
2149
2150             /* The current character (and any potential folds) should be added
2151              * to the possible matching characters for this position in this
2152              * branch */
2153             if ( uvc < 256 ) {
2154                 if ( folder ) {
2155                     U8 folded= folder[ (U8) uvc ];
2156                     if ( !trie->charmap[ folded ] ) {
2157                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
2158                         TRIE_STORE_REVCHAR( folded );
2159                     }
2160                 }
2161                 if ( !trie->charmap[ uvc ] ) {
2162                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2163                     TRIE_STORE_REVCHAR( uvc );
2164                 }
2165                 if ( set_bit ) {
2166                     /* store the codepoint in the bitmap, and its folded
2167                      * equivalent. */
2168                     TRIE_BITMAP_SET(trie, uvc);
2169
2170                     /* store the folded codepoint */
2171                     if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2172
2173                     if ( !UTF ) {
2174                         /* store first byte of utf8 representation of
2175                            variant codepoints */
2176                         if (! UVCHR_IS_INVARIANT(uvc)) {
2177                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2178                         }
2179                     }
2180                     set_bit = 0; /* We've done our bit :-) */
2181                 }
2182             } else {
2183
2184                 /* XXX We could come up with the list of code points that fold
2185                  * to this using PL_utf8_foldclosures, except not for
2186                  * multi-char folds, as there may be multiple combinations
2187                  * there that could work, which needs to wait until runtime to
2188                  * resolve (The comment about LIGATURE FFI above is such an
2189                  * example */
2190
2191                 SV** svpp;
2192                 if ( !widecharmap )
2193                     widecharmap = newHV();
2194
2195                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2196
2197                 if ( !svpp )
2198                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2199
2200                 if ( !SvTRUE( *svpp ) ) {
2201                     sv_setiv( *svpp, ++trie->uniquecharcount );
2202                     TRIE_STORE_REVCHAR(uvc);
2203                 }
2204             }
2205         } /* end loop through characters in this branch of the trie */
2206
2207         /* We take the min and max for this branch and combine to find the min
2208          * and max for all branches processed so far */
2209         if( cur == first ) {
2210             trie->minlen = minchars;
2211             trie->maxlen = maxchars;
2212         } else if (minchars < trie->minlen) {
2213             trie->minlen = minchars;
2214         } else if (maxchars > trie->maxlen) {
2215             trie->maxlen = maxchars;
2216         }
2217     } /* end first pass */
2218     DEBUG_TRIE_COMPILE_r(
2219         PerlIO_printf( Perl_debug_log,
2220                 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2221                 (int)depth * 2 + 2,"",
2222                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2223                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2224                 (int)trie->minlen, (int)trie->maxlen )
2225     );
2226
2227     /*
2228         We now know what we are dealing with in terms of unique chars and
2229         string sizes so we can calculate how much memory a naive
2230         representation using a flat table  will take. If it's over a reasonable
2231         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2232         conservative but potentially much slower representation using an array
2233         of lists.
2234
2235         At the end we convert both representations into the same compressed
2236         form that will be used in regexec.c for matching with. The latter
2237         is a form that cannot be used to construct with but has memory
2238         properties similar to the list form and access properties similar
2239         to the table form making it both suitable for fast searches and
2240         small enough that its feasable to store for the duration of a program.
2241
2242         See the comment in the code where the compressed table is produced
2243         inplace from the flat tabe representation for an explanation of how
2244         the compression works.
2245
2246     */
2247
2248
2249     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2250     prev_states[1] = 0;
2251
2252     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2253                                                     > SvIV(re_trie_maxbuff) )
2254     {
2255         /*
2256             Second Pass -- Array Of Lists Representation
2257
2258             Each state will be represented by a list of charid:state records
2259             (reg_trie_trans_le) the first such element holds the CUR and LEN
2260             points of the allocated array. (See defines above).
2261
2262             We build the initial structure using the lists, and then convert
2263             it into the compressed table form which allows faster lookups
2264             (but cant be modified once converted).
2265         */
2266
2267         STRLEN transcount = 1;
2268
2269         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2270             "%*sCompiling trie using list compiler\n",
2271             (int)depth * 2 + 2, ""));
2272
2273         trie->states = (reg_trie_state *)
2274             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2275                                   sizeof(reg_trie_state) );
2276         TRIE_LIST_NEW(1);
2277         next_alloc = 2;
2278
2279         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2280
2281             regnode *noper   = NEXTOPER( cur );
2282             U8 *uc           = (U8*)STRING( noper );
2283             const U8 *e      = uc + STR_LEN( noper );
2284             U32 state        = 1;         /* required init */
2285             U16 charid       = 0;         /* sanity init */
2286             U32 wordlen      = 0;         /* required init */
2287
2288             if (OP(noper) == NOTHING) {
2289                 regnode *noper_next= regnext(noper);
2290                 if (noper_next != tail && OP(noper_next) == flags) {
2291                     noper = noper_next;
2292                     uc= (U8*)STRING(noper);
2293                     e= uc + STR_LEN(noper);
2294                 }
2295             }
2296
2297             if (OP(noper) != NOTHING) {
2298                 for ( ; uc < e ; uc += len ) {
2299
2300                     TRIE_READ_CHAR;
2301
2302                     if ( uvc < 256 ) {
2303                         charid = trie->charmap[ uvc ];
2304                     } else {
2305                         SV** const svpp = hv_fetch( widecharmap,
2306                                                     (char*)&uvc,
2307                                                     sizeof( UV ),
2308                                                     0);
2309                         if ( !svpp ) {
2310                             charid = 0;
2311                         } else {
2312                             charid=(U16)SvIV( *svpp );
2313                         }
2314                     }
2315                     /* charid is now 0 if we dont know the char read, or
2316                      * nonzero if we do */
2317                     if ( charid ) {
2318
2319                         U16 check;
2320                         U32 newstate = 0;
2321
2322                         charid--;
2323                         if ( !trie->states[ state ].trans.list ) {
2324                             TRIE_LIST_NEW( state );
2325                         }
2326                         for ( check = 1;
2327                               check <= TRIE_LIST_USED( state );
2328                               check++ )
2329                         {
2330                             if ( TRIE_LIST_ITEM( state, check ).forid
2331                                                                     == charid )
2332                             {
2333                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2334                                 break;
2335                             }
2336                         }
2337                         if ( ! newstate ) {
2338                             newstate = next_alloc++;
2339                             prev_states[newstate] = state;
2340                             TRIE_LIST_PUSH( state, charid, newstate );
2341                             transcount++;
2342                         }
2343                         state = newstate;
2344                     } else {
2345                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2346                     }
2347                 }
2348             }
2349             TRIE_HANDLE_WORD(state);
2350
2351         } /* end second pass */
2352
2353         /* next alloc is the NEXT state to be allocated */
2354         trie->statecount = next_alloc;
2355         trie->states = (reg_trie_state *)
2356             PerlMemShared_realloc( trie->states,
2357                                    next_alloc
2358                                    * sizeof(reg_trie_state) );
2359
2360         /* and now dump it out before we compress it */
2361         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2362                                                          revcharmap, next_alloc,
2363                                                          depth+1)
2364         );
2365
2366         trie->trans = (reg_trie_trans *)
2367             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2368         {
2369             U32 state;
2370             U32 tp = 0;
2371             U32 zp = 0;
2372
2373
2374             for( state=1 ; state < next_alloc ; state ++ ) {
2375                 U32 base=0;
2376
2377                 /*
2378                 DEBUG_TRIE_COMPILE_MORE_r(
2379                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2380                 );
2381                 */
2382
2383                 if (trie->states[state].trans.list) {
2384                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2385                     U16 maxid=minid;
2386                     U16 idx;
2387
2388                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2389                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2390                         if ( forid < minid ) {
2391                             minid=forid;
2392                         } else if ( forid > maxid ) {
2393                             maxid=forid;
2394                         }
2395                     }
2396                     if ( transcount < tp + maxid - minid + 1) {
2397                         transcount *= 2;
2398                         trie->trans = (reg_trie_trans *)
2399                             PerlMemShared_realloc( trie->trans,
2400                                                      transcount
2401                                                      * sizeof(reg_trie_trans) );
2402                         Zero( trie->trans + (transcount / 2),
2403                               transcount / 2,
2404                               reg_trie_trans );
2405                     }
2406                     base = trie->uniquecharcount + tp - minid;
2407                     if ( maxid == minid ) {
2408                         U32 set = 0;
2409                         for ( ; zp < tp ; zp++ ) {
2410                             if ( ! trie->trans[ zp ].next ) {
2411                                 base = trie->uniquecharcount + zp - minid;
2412                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2413                                                                    1).newstate;
2414                                 trie->trans[ zp ].check = state;
2415                                 set = 1;
2416                                 break;
2417                             }
2418                         }
2419                         if ( !set ) {
2420                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2421                                                                    1).newstate;
2422                             trie->trans[ tp ].check = state;
2423                             tp++;
2424                             zp = tp;
2425                         }
2426                     } else {
2427                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2428                             const U32 tid = base
2429                                            - trie->uniquecharcount
2430                                            + TRIE_LIST_ITEM( state, idx ).forid;
2431                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2432                                                                 idx ).newstate;
2433                             trie->trans[ tid ].check = state;
2434                         }
2435                         tp += ( maxid - minid + 1 );
2436                     }
2437                     Safefree(trie->states[ state ].trans.list);
2438                 }
2439                 /*
2440                 DEBUG_TRIE_COMPILE_MORE_r(
2441                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2442                 );
2443                 */
2444                 trie->states[ state ].trans.base=base;
2445             }
2446             trie->lasttrans = tp + 1;
2447         }
2448     } else {
2449         /*
2450            Second Pass -- Flat Table Representation.
2451
2452            we dont use the 0 slot of either trans[] or states[] so we add 1 to
2453            each.  We know that we will need Charcount+1 trans at most to store
2454            the data (one row per char at worst case) So we preallocate both
2455            structures assuming worst case.
2456
2457            We then construct the trie using only the .next slots of the entry
2458            structs.
2459
2460            We use the .check field of the first entry of the node temporarily
2461            to make compression both faster and easier by keeping track of how
2462            many non zero fields are in the node.
2463
2464            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2465            transition.
2466
2467            There are two terms at use here: state as a TRIE_NODEIDX() which is
2468            a number representing the first entry of the node, and state as a
2469            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2470            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2471            if there are 2 entrys per node. eg:
2472
2473              A B       A B
2474           1. 2 4    1. 3 7
2475           2. 0 3    3. 0 5
2476           3. 0 0    5. 0 0
2477           4. 0 0    7. 0 0
2478
2479            The table is internally in the right hand, idx form. However as we
2480            also have to deal with the states array which is indexed by nodenum
2481            we have to use TRIE_NODENUM() to convert.
2482
2483         */
2484         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2485             "%*sCompiling trie using table compiler\n",
2486             (int)depth * 2 + 2, ""));
2487
2488         trie->trans = (reg_trie_trans *)
2489             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2490                                   * trie->uniquecharcount + 1,
2491                                   sizeof(reg_trie_trans) );
2492         trie->states = (reg_trie_state *)
2493             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2494                                   sizeof(reg_trie_state) );
2495         next_alloc = trie->uniquecharcount + 1;
2496
2497
2498         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2499
2500             regnode *noper   = NEXTOPER( cur );
2501             const U8 *uc     = (U8*)STRING( noper );
2502             const U8 *e      = uc + STR_LEN( noper );
2503
2504             U32 state        = 1;         /* required init */
2505
2506             U16 charid       = 0;         /* sanity init */
2507             U32 accept_state = 0;         /* sanity init */
2508
2509             U32 wordlen      = 0;         /* required init */
2510
2511             if (OP(noper) == NOTHING) {
2512                 regnode *noper_next= regnext(noper);
2513                 if (noper_next != tail && OP(noper_next) == flags) {
2514                     noper = noper_next;
2515                     uc= (U8*)STRING(noper);
2516                     e= uc + STR_LEN(noper);
2517                 }
2518             }
2519
2520             if ( OP(noper) != NOTHING ) {
2521                 for ( ; uc < e ; uc += len ) {
2522
2523                     TRIE_READ_CHAR;
2524
2525                     if ( uvc < 256 ) {
2526                         charid = trie->charmap[ uvc ];
2527                     } else {
2528                         SV* const * const svpp = hv_fetch( widecharmap,
2529                                                            (char*)&uvc,
2530                                                            sizeof( UV ),
2531                                                            0);
2532                         charid = svpp ? (U16)SvIV(*svpp) : 0;
2533                     }
2534                     if ( charid ) {
2535                         charid--;
2536                         if ( !trie->trans[ state + charid ].next ) {
2537                             trie->trans[ state + charid ].next = next_alloc;
2538                             trie->trans[ state ].check++;
2539                             prev_states[TRIE_NODENUM(next_alloc)]
2540                                     = TRIE_NODENUM(state);
2541                             next_alloc += trie->uniquecharcount;
2542                         }
2543                         state = trie->trans[ state + charid ].next;
2544                     } else {
2545                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2546                     }
2547                     /* charid is now 0 if we dont know the char read, or
2548                      * nonzero if we do */
2549                 }
2550             }
2551             accept_state = TRIE_NODENUM( state );
2552             TRIE_HANDLE_WORD(accept_state);
2553
2554         } /* end second pass */
2555
2556         /* and now dump it out before we compress it */
2557         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2558                                                           revcharmap,
2559                                                           next_alloc, depth+1));
2560
2561         {
2562         /*
2563            * Inplace compress the table.*
2564
2565            For sparse data sets the table constructed by the trie algorithm will
2566            be mostly 0/FAIL transitions or to put it another way mostly empty.
2567            (Note that leaf nodes will not contain any transitions.)
2568
2569            This algorithm compresses the tables by eliminating most such
2570            transitions, at the cost of a modest bit of extra work during lookup:
2571
2572            - Each states[] entry contains a .base field which indicates the
2573            index in the state[] array wheres its transition data is stored.
2574
2575            - If .base is 0 there are no valid transitions from that node.
2576
2577            - If .base is nonzero then charid is added to it to find an entry in
2578            the trans array.
2579
2580            -If trans[states[state].base+charid].check!=state then the
2581            transition is taken to be a 0/Fail transition. Thus if there are fail
2582            transitions at the front of the node then the .base offset will point
2583            somewhere inside the previous nodes data (or maybe even into a node
2584            even earlier), but the .check field determines if the transition is
2585            valid.
2586
2587            XXX - wrong maybe?
2588            The following process inplace converts the table to the compressed
2589            table: We first do not compress the root node 1,and mark all its
2590            .check pointers as 1 and set its .base pointer as 1 as well. This
2591            allows us to do a DFA construction from the compressed table later,
2592            and ensures that any .base pointers we calculate later are greater
2593            than 0.
2594
2595            - We set 'pos' to indicate the first entry of the second node.
2596
2597            - We then iterate over the columns of the node, finding the first and
2598            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2599            and set the .check pointers accordingly, and advance pos
2600            appropriately and repreat for the next node. Note that when we copy
2601            the next pointers we have to convert them from the original
2602            NODEIDX form to NODENUM form as the former is not valid post
2603            compression.
2604
2605            - If a node has no transitions used we mark its base as 0 and do not
2606            advance the pos pointer.
2607
2608            - If a node only has one transition we use a second pointer into the
2609            structure to fill in allocated fail transitions from other states.
2610            This pointer is independent of the main pointer and scans forward
2611            looking for null transitions that are allocated to a state. When it
2612            finds one it writes the single transition into the "hole".  If the
2613            pointer doesnt find one the single transition is appended as normal.
2614
2615            - Once compressed we can Renew/realloc the structures to release the
2616            excess space.
2617
2618            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2619            specifically Fig 3.47 and the associated pseudocode.
2620
2621            demq
2622         */
2623         const U32 laststate = TRIE_NODENUM( next_alloc );
2624         U32 state, charid;
2625         U32 pos = 0, zp=0;
2626         trie->statecount = laststate;
2627
2628         for ( state = 1 ; state < laststate ; state++ ) {
2629             U8 flag = 0;
2630             const U32 stateidx = TRIE_NODEIDX( state );
2631             const U32 o_used = trie->trans[ stateidx ].check;
2632             U32 used = trie->trans[ stateidx ].check;
2633             trie->trans[ stateidx ].check = 0;
2634
2635             for ( charid = 0;
2636                   used && charid < trie->uniquecharcount;
2637                   charid++ )
2638             {
2639                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2640                     if ( trie->trans[ stateidx + charid ].next ) {
2641                         if (o_used == 1) {
2642                             for ( ; zp < pos ; zp++ ) {
2643                                 if ( ! trie->trans[ zp ].next ) {
2644                                     break;
2645                                 }
2646                             }
2647                             trie->states[ state ].trans.base
2648                                                     = zp
2649                                                       + trie->uniquecharcount
2650                                                       - charid ;
2651                             trie->trans[ zp ].next
2652                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2653                                                              + charid ].next );
2654                             trie->trans[ zp ].check = state;
2655                             if ( ++zp > pos ) pos = zp;
2656                             break;
2657                         }
2658                         used--;
2659                     }
2660                     if ( !flag ) {
2661                         flag = 1;
2662                         trie->states[ state ].trans.base
2663                                        = pos + trie->uniquecharcount - charid ;
2664                     }
2665                     trie->trans[ pos ].next
2666                         = SAFE_TRIE_NODENUM(
2667                                        trie->trans[ stateidx + charid ].next );
2668                     trie->trans[ pos ].check = state;
2669                     pos++;
2670                 }
2671             }
2672         }
2673         trie->lasttrans = pos + 1;
2674         trie->states = (reg_trie_state *)
2675             PerlMemShared_realloc( trie->states, laststate
2676                                    * sizeof(reg_trie_state) );
2677         DEBUG_TRIE_COMPILE_MORE_r(
2678             PerlIO_printf( Perl_debug_log,
2679                 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2680                 (int)depth * 2 + 2,"",
2681                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2682                        + 1 ),
2683                 (IV)next_alloc,
2684                 (IV)pos,
2685                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2686             );
2687
2688         } /* end table compress */
2689     }
2690     DEBUG_TRIE_COMPILE_MORE_r(
2691             PerlIO_printf(Perl_debug_log,
2692                 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2693                 (int)depth * 2 + 2, "",
2694                 (UV)trie->statecount,
2695                 (UV)trie->lasttrans)
2696     );
2697     /* resize the trans array to remove unused space */
2698     trie->trans = (reg_trie_trans *)
2699         PerlMemShared_realloc( trie->trans, trie->lasttrans
2700                                * sizeof(reg_trie_trans) );
2701
2702     {   /* Modify the program and insert the new TRIE node */
2703         U8 nodetype =(U8)(flags & 0xFF);
2704         char *str=NULL;
2705
2706 #ifdef DEBUGGING
2707         regnode *optimize = NULL;
2708 #ifdef RE_TRACK_PATTERN_OFFSETS
2709
2710         U32 mjd_offset = 0;
2711         U32 mjd_nodelen = 0;
2712 #endif /* RE_TRACK_PATTERN_OFFSETS */
2713 #endif /* DEBUGGING */
2714         /*
2715            This means we convert either the first branch or the first Exact,
2716            depending on whether the thing following (in 'last') is a branch
2717            or not and whther first is the startbranch (ie is it a sub part of
2718            the alternation or is it the whole thing.)
2719            Assuming its a sub part we convert the EXACT otherwise we convert
2720            the whole branch sequence, including the first.
2721          */
2722         /* Find the node we are going to overwrite */
2723         if ( first != startbranch || OP( last ) == BRANCH ) {
2724             /* branch sub-chain */
2725             NEXT_OFF( first ) = (U16)(last - first);
2726 #ifdef RE_TRACK_PATTERN_OFFSETS
2727             DEBUG_r({
2728                 mjd_offset= Node_Offset((convert));
2729                 mjd_nodelen= Node_Length((convert));
2730             });
2731 #endif
2732             /* whole branch chain */
2733         }
2734 #ifdef RE_TRACK_PATTERN_OFFSETS
2735         else {
2736             DEBUG_r({
2737                 const  regnode *nop = NEXTOPER( convert );
2738                 mjd_offset= Node_Offset((nop));
2739                 mjd_nodelen= Node_Length((nop));
2740             });
2741         }
2742         DEBUG_OPTIMISE_r(
2743             PerlIO_printf(Perl_debug_log,
2744                 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2745                 (int)depth * 2 + 2, "",
2746                 (UV)mjd_offset, (UV)mjd_nodelen)
2747         );
2748 #endif
2749         /* But first we check to see if there is a common prefix we can
2750            split out as an EXACT and put in front of the TRIE node.  */
2751         trie->startstate= 1;
2752         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2753             U32 state;
2754             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2755                 U32 ofs = 0;
2756                 I32 idx = -1;
2757                 U32 count = 0;
2758                 const U32 base = trie->states[ state ].trans.base;
2759
2760                 if ( trie->states[state].wordnum )
2761                         count = 1;
2762
2763                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2764                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2765                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2766                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2767                     {
2768                         if ( ++count > 1 ) {
2769                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2770                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2771                             if ( state == 1 ) break;
2772                             if ( count == 2 ) {
2773                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2774                                 DEBUG_OPTIMISE_r(
2775                                     PerlIO_printf(Perl_debug_log,
2776                                         "%*sNew Start State=%"UVuf" Class: [",
2777                                         (int)depth * 2 + 2, "",
2778                                         (UV)state));
2779                                 if (idx >= 0) {
2780                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2781                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2782
2783                                     TRIE_BITMAP_SET(trie,*ch);
2784                                     if ( folder )
2785                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2786                                     DEBUG_OPTIMISE_r(
2787                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2788                                     );
2789                                 }
2790                             }
2791                             TRIE_BITMAP_SET(trie,*ch);
2792                             if ( folder )
2793                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2794                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2795                         }
2796                         idx = ofs;
2797                     }
2798                 }
2799                 if ( count == 1 ) {
2800                     SV **tmp = av_fetch( revcharmap, idx, 0);
2801                     STRLEN len;
2802                     char *ch = SvPV( *tmp, len );
2803                     DEBUG_OPTIMISE_r({
2804                         SV *sv=sv_newmortal();
2805                         PerlIO_printf( Perl_debug_log,
2806                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2807                             (int)depth * 2 + 2, "",
2808                             (UV)state, (UV)idx,
2809                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2810                                 PL_colors[0], PL_colors[1],
2811                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2812                                 PERL_PV_ESCAPE_FIRSTCHAR
2813                             )
2814                         );
2815                     });
2816                     if ( state==1 ) {
2817                         OP( convert ) = nodetype;
2818                         str=STRING(convert);
2819                         STR_LEN(convert)=0;
2820                     }
2821                     STR_LEN(convert) += len;
2822                     while (len--)
2823                         *str++ = *ch++;
2824                 } else {
2825 #ifdef DEBUGGING
2826                     if (state>1)
2827                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2828 #endif
2829                     break;
2830                 }
2831             }
2832             trie->prefixlen = (state-1);
2833             if (str) {
2834                 regnode *n = convert+NODE_SZ_STR(convert);
2835                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2836                 trie->startstate = state;
2837                 trie->minlen -= (state - 1);
2838                 trie->maxlen -= (state - 1);
2839 #ifdef DEBUGGING
2840                /* At least the UNICOS C compiler choked on this
2841                 * being argument to DEBUG_r(), so let's just have
2842                 * it right here. */
2843                if (
2844 #ifdef PERL_EXT_RE_BUILD
2845                    1
2846 #else
2847                    DEBUG_r_TEST
2848 #endif
2849                    ) {
2850                    regnode *fix = convert;
2851                    U32 word = trie->wordcount;
2852                    mjd_nodelen++;
2853                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2854                    while( ++fix < n ) {
2855                        Set_Node_Offset_Length(fix, 0, 0);
2856                    }
2857                    while (word--) {
2858                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2859                        if (tmp) {
2860                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2861                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2862                            else
2863                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2864                        }
2865                    }
2866                }
2867 #endif
2868                 if (trie->maxlen) {
2869                     convert = n;
2870                 } else {
2871                     NEXT_OFF(convert) = (U16)(tail - convert);
2872                     DEBUG_r(optimize= n);
2873                 }
2874             }
2875         }
2876         if (!jumper)
2877             jumper = last;
2878         if ( trie->maxlen ) {
2879             NEXT_OFF( convert ) = (U16)(tail - convert);
2880             ARG_SET( convert, data_slot );
2881             /* Store the offset to the first unabsorbed branch in
2882                jump[0], which is otherwise unused by the jump logic.
2883                We use this when dumping a trie and during optimisation. */
2884             if (trie->jump)
2885                 trie->jump[0] = (U16)(nextbranch - convert);
2886
2887             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2888              *   and there is a bitmap
2889              *   and the first "jump target" node we found leaves enough room
2890              * then convert the TRIE node into a TRIEC node, with the bitmap
2891              * embedded inline in the opcode - this is hypothetically faster.
2892              */
2893             if ( !trie->states[trie->startstate].wordnum
2894                  && trie->bitmap
2895                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2896             {
2897                 OP( convert ) = TRIEC;
2898                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2899                 PerlMemShared_free(trie->bitmap);
2900                 trie->bitmap= NULL;
2901             } else
2902                 OP( convert ) = TRIE;
2903
2904             /* store the type in the flags */
2905             convert->flags = nodetype;
2906             DEBUG_r({
2907             optimize = convert
2908                       + NODE_STEP_REGNODE
2909                       + regarglen[ OP( convert ) ];
2910             });
2911             /* XXX We really should free up the resource in trie now,
2912                    as we won't use them - (which resources?) dmq */
2913         }
2914         /* needed for dumping*/
2915         DEBUG_r(if (optimize) {
2916             regnode *opt = convert;
2917
2918             while ( ++opt < optimize) {
2919                 Set_Node_Offset_Length(opt,0,0);
2920             }
2921             /*
2922                 Try to clean up some of the debris left after the
2923                 optimisation.
2924              */
2925             while( optimize < jumper ) {
2926                 mjd_nodelen += Node_Length((optimize));
2927                 OP( optimize ) = OPTIMIZED;
2928                 Set_Node_Offset_Length(optimize,0,0);
2929                 optimize++;
2930             }
2931             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2932         });
2933     } /* end node insert */
2934
2935     /*  Finish populating the prev field of the wordinfo array.  Walk back
2936      *  from each accept state until we find another accept state, and if
2937      *  so, point the first word's .prev field at the second word. If the
2938      *  second already has a .prev field set, stop now. This will be the
2939      *  case either if we've already processed that word's accept state,
2940      *  or that state had multiple words, and the overspill words were
2941      *  already linked up earlier.
2942      */
2943     {
2944         U16 word;
2945         U32 state;
2946         U16 prev;
2947
2948         for (word=1; word <= trie->wordcount; word++) {
2949             prev = 0;
2950             if (trie->wordinfo[word].prev)
2951                 continue;
2952             state = trie->wordinfo[word].accept;
2953             while (state) {
2954                 state = prev_states[state];
2955                 if (!state)
2956                     break;
2957                 prev = trie->states[state].wordnum;
2958                 if (prev)
2959                     break;
2960             }
2961             trie->wordinfo[word].prev = prev;
2962         }
2963         Safefree(prev_states);
2964     }
2965
2966
2967     /* and now dump out the compressed format */
2968     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2969
2970     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2971 #ifdef DEBUGGING
2972     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2973     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2974 #else
2975     SvREFCNT_dec_NN(revcharmap);
2976 #endif
2977     return trie->jump
2978            ? MADE_JUMP_TRIE
2979            : trie->startstate>1
2980              ? MADE_EXACT_TRIE
2981              : MADE_TRIE;
2982 }
2983
2984 STATIC void
2985 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2986 {
2987 /* The Trie is constructed and compressed now so we can build a fail array if
2988  * it's needed
2989
2990    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2991    3.32 in the
2992    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2993    Ullman 1985/88
2994    ISBN 0-201-10088-6
2995
2996    We find the fail state for each state in the trie, this state is the longest
2997    proper suffix of the current state's 'word' that is also a proper prefix of
2998    another word in our trie. State 1 represents the word '' and is thus the
2999    default fail state. This allows the DFA not to have to restart after its
3000    tried and failed a word at a given point, it simply continues as though it
3001    had been matching the other word in the first place.
3002    Consider
3003       'abcdgu'=~/abcdefg|cdgu/
3004    When we get to 'd' we are still matching the first word, we would encounter
3005    'g' which would fail, which would bring us to the state representing 'd' in
3006    the second word where we would try 'g' and succeed, proceeding to match
3007    'cdgu'.
3008  */
3009  /* add a fail transition */
3010     const U32 trie_offset = ARG(source);
3011     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3012     U32 *q;
3013     const U32 ucharcount = trie->uniquecharcount;
3014     const U32 numstates = trie->statecount;
3015     const U32 ubound = trie->lasttrans + ucharcount;
3016     U32 q_read = 0;
3017     U32 q_write = 0;
3018     U32 charid;
3019     U32 base = trie->states[ 1 ].trans.base;
3020     U32 *fail;
3021     reg_ac_data *aho;
3022     const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3023     GET_RE_DEBUG_FLAGS_DECL;
3024
3025     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
3026 #ifndef DEBUGGING
3027     PERL_UNUSED_ARG(depth);
3028 #endif
3029
3030
3031     ARG_SET( stclass, data_slot );
3032     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3033     RExC_rxi->data->data[ data_slot ] = (void*)aho;
3034     aho->trie=trie_offset;
3035     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3036     Copy( trie->states, aho->states, numstates, reg_trie_state );
3037     Newxz( q, numstates, U32);
3038     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3039     aho->refcount = 1;
3040     fail = aho->fail;
3041     /* initialize fail[0..1] to be 1 so that we always have
3042        a valid final fail state */
3043     fail[ 0 ] = fail[ 1 ] = 1;
3044
3045     for ( charid = 0; charid < ucharcount ; charid++ ) {
3046         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3047         if ( newstate ) {
3048             q[ q_write ] = newstate;
3049             /* set to point at the root */
3050             fail[ q[ q_write++ ] ]=1;
3051         }
3052     }
3053     while ( q_read < q_write) {
3054         const U32 cur = q[ q_read++ % numstates ];
3055         base = trie->states[ cur ].trans.base;
3056
3057         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3058             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3059             if (ch_state) {
3060                 U32 fail_state = cur;
3061                 U32 fail_base;
3062                 do {
3063                     fail_state = fail[ fail_state ];
3064                     fail_base = aho->states[ fail_state ].trans.base;
3065                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3066
3067                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3068                 fail[ ch_state ] = fail_state;
3069                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3070                 {
3071                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3072                 }
3073                 q[ q_write++ % numstates] = ch_state;
3074             }
3075         }
3076     }
3077     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3078        when we fail in state 1, this allows us to use the
3079        charclass scan to find a valid start char. This is based on the principle
3080        that theres a good chance the string being searched contains lots of stuff
3081        that cant be a start char.
3082      */
3083     fail[ 0 ] = fail[ 1 ] = 0;
3084     DEBUG_TRIE_COMPILE_r({
3085         PerlIO_printf(Perl_debug_log,
3086                       "%*sStclass Failtable (%"UVuf" states): 0",
3087                       (int)(depth * 2), "", (UV)numstates
3088         );
3089         for( q_read=1; q_read<numstates; q_read++ ) {
3090             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3091         }
3092         PerlIO_printf(Perl_debug_log, "\n");
3093     });
3094     Safefree(q);
3095     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3096 }
3097
3098
3099 #define DEBUG_PEEP(str,scan,depth) \
3100     DEBUG_OPTIMISE_r({if (scan){ \
3101        SV * const mysv=sv_newmortal(); \
3102        regnode *Next = regnext(scan); \
3103        regprop(RExC_rx, mysv, scan, NULL); \
3104        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3105        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3106        Next ? (REG_NODE_NUM(Next)) : 0 ); \
3107    }});
3108
3109
3110 /* The below joins as many adjacent EXACTish nodes as possible into a single
3111  * one.  The regop may be changed if the node(s) contain certain sequences that
3112  * require special handling.  The joining is only done if:
3113  * 1) there is room in the current conglomerated node to entirely contain the
3114  *    next one.
3115  * 2) they are the exact same node type
3116  *
3117  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3118  * these get optimized out
3119  *
3120  * If a node is to match under /i (folded), the number of characters it matches
3121  * can be different than its character length if it contains a multi-character
3122  * fold.  *min_subtract is set to the total delta number of characters of the
3123  * input nodes.
3124  *
3125  * And *unfolded_multi_char is set to indicate whether or not the node contains
3126  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3127  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3128  * SMALL LETTER SHARP S, as only if the target string being matched against
3129  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3130  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3131  * whose components are all above the Latin1 range are not run-time locale
3132  * dependent, and have already been folded by the time this function is
3133  * called.)
3134  *
3135  * This is as good a place as any to discuss the design of handling these
3136  * multi-character fold sequences.  It's been wrong in Perl for a very long
3137  * time.  There are three code points in Unicode whose multi-character folds
3138  * were long ago discovered to mess things up.  The previous designs for
3139  * dealing with these involved assigning a special node for them.  This
3140  * approach doesn't always work, as evidenced by this example:
3141  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3142  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3143  * would match just the \xDF, it won't be able to handle the case where a
3144  * successful match would have to cross the node's boundary.  The new approach
3145  * that hopefully generally solves the problem generates an EXACTFU_SS node
3146  * that is "sss" in this case.
3147  *
3148  * It turns out that there are problems with all multi-character folds, and not
3149  * just these three.  Now the code is general, for all such cases.  The
3150  * approach taken is:
3151  * 1)   This routine examines each EXACTFish node that could contain multi-
3152  *      character folded sequences.  Since a single character can fold into
3153  *      such a sequence, the minimum match length for this node is less than
3154  *      the number of characters in the node.  This routine returns in
3155  *      *min_subtract how many characters to subtract from the the actual
3156  *      length of the string to get a real minimum match length; it is 0 if
3157  *      there are no multi-char foldeds.  This delta is used by the caller to
3158  *      adjust the min length of the match, and the delta between min and max,
3159  *      so that the optimizer doesn't reject these possibilities based on size
3160  *      constraints.
3161  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3162  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3163  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3164  *      there is a possible fold length change.  That means that a regular
3165  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3166  *      with length changes, and so can be processed faster.  regexec.c takes
3167  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3168  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3169  *      known until runtime).  This saves effort in regex matching.  However,
3170  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3171  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3172  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3173  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3174  *      possibilities for the non-UTF8 patterns are quite simple, except for
3175  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3176  *      members of a fold-pair, and arrays are set up for all of them so that
3177  *      the other member of the pair can be found quickly.  Code elsewhere in
3178  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3179  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3180  *      described in the next item.
3181  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3182  *      validity of the fold won't be known until runtime, and so must remain
3183  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3184  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3185  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3186  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3187  *      The reason this is a problem is that the optimizer part of regexec.c
3188  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3189  *      that a character in the pattern corresponds to at most a single
3190  *      character in the target string.  (And I do mean character, and not byte
3191  *      here, unlike other parts of the documentation that have never been
3192  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3193  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3194  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3195  *      nodes, violate the assumption, and they are the only instances where it
3196  *      is violated.  I'm reluctant to try to change the assumption, as the
3197  *      code involved is impenetrable to me (khw), so instead the code here
3198  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3199  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3200  *      boolean indicating whether or not the node contains such a fold.  When
3201  *      it is true, the caller sets a flag that later causes the optimizer in
3202  *      this file to not set values for the floating and fixed string lengths,
3203  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3204  *      assumption.  Thus, there is no optimization based on string lengths for
3205  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3206  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3207  *      assumption is wrong only in these cases is that all other non-UTF-8
3208  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3209  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3210  *      EXACTF nodes because we don't know at compile time if it actually
3211  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3212  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3213  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3214  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3215  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3216  *      string would require the pattern to be forced into UTF-8, the overhead
3217  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3218  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3219  *      locale.)
3220  *
3221  *      Similarly, the code that generates tries doesn't currently handle
3222  *      not-already-folded multi-char folds, and it looks like a pain to change
3223  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3224  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3225  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3226  *      using /iaa matching will be doing so almost entirely with ASCII
3227  *      strings, so this should rarely be encountered in practice */
3228
3229 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3230     if (PL_regkind[OP(scan)] == EXACT) \
3231         join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3232
3233 STATIC U32
3234 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3235                    UV *min_subtract, bool *unfolded_multi_char,
3236                    U32 flags,regnode *val, U32 depth)
3237 {
3238     /* Merge several consecutive EXACTish nodes into one. */
3239     regnode *n = regnext(scan);
3240     U32 stringok = 1;
3241     regnode *next = scan + NODE_SZ_STR(scan);
3242     U32 merged = 0;
3243     U32 stopnow = 0;
3244 #ifdef DEBUGGING
3245     regnode *stop = scan;
3246     GET_RE_DEBUG_FLAGS_DECL;
3247 #else
3248     PERL_UNUSED_ARG(depth);
3249 #endif
3250
3251     PERL_ARGS_ASSERT_JOIN_EXACT;
3252 #ifndef EXPERIMENTAL_INPLACESCAN
3253     PERL_UNUSED_ARG(flags);
3254     PERL_UNUSED_ARG(val);
3255 #endif
3256     DEBUG_PEEP("join",scan,depth);
3257
3258     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3259      * EXACT ones that are mergeable to the current one. */
3260     while (n
3261            && (PL_regkind[OP(n)] == NOTHING
3262                || (stringok && OP(n) == OP(scan)))
3263            && NEXT_OFF(n)
3264            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3265     {
3266
3267         if (OP(n) == TAIL || n > next)
3268             stringok = 0;
3269         if (PL_regkind[OP(n)] == NOTHING) {
3270             DEBUG_PEEP("skip:",n,depth);
3271             NEXT_OFF(scan) += NEXT_OFF(n);
3272             next = n + NODE_STEP_REGNODE;
3273 #ifdef DEBUGGING
3274             if (stringok)
3275                 stop = n;
3276 #endif
3277             n = regnext(n);
3278         }
3279         else if (stringok) {
3280             const unsigned int oldl = STR_LEN(scan);
3281             regnode * const nnext = regnext(n);
3282
3283             /* XXX I (khw) kind of doubt that this works on platforms (should
3284              * Perl ever run on one) where U8_MAX is above 255 because of lots
3285              * of other assumptions */
3286             /* Don't join if the sum can't fit into a single node */
3287             if (oldl + STR_LEN(n) > U8_MAX)
3288                 break;
3289
3290             DEBUG_PEEP("merg",n,depth);
3291             merged++;
3292
3293             NEXT_OFF(scan) += NEXT_OFF(n);
3294             STR_LEN(scan) += STR_LEN(n);
3295             next = n + NODE_SZ_STR(n);
3296             /* Now we can overwrite *n : */
3297             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3298 #ifdef DEBUGGING
3299             stop = next - 1;
3300 #endif
3301             n = nnext;
3302             if (stopnow) break;
3303         }
3304
3305 #ifdef EXPERIMENTAL_INPLACESCAN
3306         if (flags && !NEXT_OFF(n)) {
3307             DEBUG_PEEP("atch", val, depth);
3308             if (reg_off_by_arg[OP(n)]) {
3309                 ARG_SET(n, val - n);
3310             }
3311             else {
3312                 NEXT_OFF(n) = val - n;
3313             }
3314             stopnow = 1;
3315         }
3316 #endif
3317     }
3318
3319     *min_subtract = 0;
3320     *unfolded_multi_char = FALSE;
3321
3322     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3323      * can now analyze for sequences of problematic code points.  (Prior to
3324      * this final joining, sequences could have been split over boundaries, and
3325      * hence missed).  The sequences only happen in folding, hence for any
3326      * non-EXACT EXACTish node */
3327     if (OP(scan) != EXACT) {
3328         U8* s0 = (U8*) STRING(scan);
3329         U8* s = s0;
3330         U8* s_end = s0 + STR_LEN(scan);
3331
3332         int total_count_delta = 0;  /* Total delta number of characters that
3333                                        multi-char folds expand to */
3334
3335         /* One pass is made over the node's string looking for all the
3336          * possibilities.  To avoid some tests in the loop, there are two main
3337          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3338          * non-UTF-8 */
3339         if (UTF) {
3340             U8* folded = NULL;
3341
3342             if (OP(scan) == EXACTFL) {
3343                 U8 *d;
3344
3345                 /* An EXACTFL node would already have been changed to another
3346                  * node type unless there is at least one character in it that
3347                  * is problematic; likely a character whose fold definition
3348                  * won't be known until runtime, and so has yet to be folded.
3349                  * For all but the UTF-8 locale, folds are 1-1 in length, but
3350                  * to handle the UTF-8 case, we need to create a temporary
3351                  * folded copy using UTF-8 locale rules in order to analyze it.
3352                  * This is because our macros that look to see if a sequence is
3353                  * a multi-char fold assume everything is folded (otherwise the
3354                  * tests in those macros would be too complicated and slow).
3355                  * Note that here, the non-problematic folds will have already
3356                  * been done, so we can just copy such characters.  We actually
3357                  * don't completely fold the EXACTFL string.  We skip the
3358                  * unfolded multi-char folds, as that would just create work
3359                  * below to figure out the size they already are */
3360
3361                 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3362                 d = folded;
3363                 while (s < s_end) {
3364                     STRLEN s_len = UTF8SKIP(s);
3365                     if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3366                         Copy(s, d, s_len, U8);
3367                         d += s_len;
3368                     }
3369                     else if (is_FOLDS_TO_MULTI_utf8(s)) {
3370                         *unfolded_multi_char = TRUE;
3371                         Copy(s, d, s_len, U8);
3372                         d += s_len;
3373                     }
3374                     else if (isASCII(*s)) {
3375                         *(d++) = toFOLD(*s);
3376                     }
3377                     else {
3378                         STRLEN len;
3379                         _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3380                         d += len;
3381                     }
3382                     s += s_len;
3383                 }
3384
3385                 /* Point the remainder of the routine to look at our temporary
3386                  * folded copy */
3387                 s = folded;
3388                 s_end = d;
3389             } /* End of creating folded copy of EXACTFL string */
3390
3391             /* Examine the string for a multi-character fold sequence.  UTF-8
3392              * patterns have all characters pre-folded by the time this code is
3393              * executed */
3394             while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3395                                      length sequence we are looking for is 2 */
3396             {
3397                 int count = 0;  /* How many characters in a multi-char fold */
3398                 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3399                 if (! len) {    /* Not a multi-char fold: get next char */
3400                     s += UTF8SKIP(s);
3401                     continue;
3402                 }
3403
3404                 /* Nodes with 'ss' require special handling, except for
3405                  * EXACTFA-ish for which there is no multi-char fold to this */
3406                 if (len == 2 && *s == 's' && *(s+1) == 's'
3407                     && OP(scan) != EXACTFA
3408                     && OP(scan) != EXACTFA_NO_TRIE)
3409                 {
3410                     count = 2;
3411                     if (OP(scan) != EXACTFL) {
3412                         OP(scan) = EXACTFU_SS;
3413                     }
3414                     s += 2;
3415                 }
3416                 else { /* Here is a generic multi-char fold. */
3417                     U8* multi_end  = s + len;
3418
3419                     /* Count how many characters in it.  In the case of /aa, no
3420                      * folds which contain ASCII code points are allowed, so
3421                      * check for those, and skip if found. */
3422                     if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3423                         count = utf8_length(s, multi_end);
3424                         s = multi_end;
3425                     }
3426                     else {
3427                         while (s < multi_end) {
3428                             if (isASCII(*s)) {
3429                                 s++;
3430                                 goto next_iteration;
3431                             }
3432                             else {
3433                                 s += UTF8SKIP(s);
3434                             }
3435                             count++;
3436                         }
3437                     }
3438                 }
3439
3440                 /* The delta is how long the sequence is minus 1 (1 is how long
3441                  * the character that folds to the sequence is) */
3442                 total_count_delta += count - 1;
3443               next_iteration: ;
3444             }
3445
3446             /* We created a temporary folded copy of the string in EXACTFL
3447              * nodes.  Therefore we need to be sure it doesn't go below zero,
3448              * as the real string could be shorter */
3449             if (OP(scan) == EXACTFL) {
3450                 int total_chars = utf8_length((U8*) STRING(scan),
3451                                            (U8*) STRING(scan) + STR_LEN(scan));
3452                 if (total_count_delta > total_chars) {
3453                     total_count_delta = total_chars;
3454                 }
3455             }
3456
3457             *min_subtract += total_count_delta;
3458             Safefree(folded);
3459         }
3460         else if (OP(scan) == EXACTFA) {
3461
3462             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3463              * fold to the ASCII range (and there are no existing ones in the
3464              * upper latin1 range).  But, as outlined in the comments preceding
3465              * this function, we need to flag any occurrences of the sharp s.
3466              * This character forbids trie formation (because of added
3467              * complexity) */
3468             while (s < s_end) {
3469                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3470                     OP(scan) = EXACTFA_NO_TRIE;
3471                     *unfolded_multi_char = TRUE;
3472                     break;
3473                 }
3474                 s++;
3475                 continue;
3476             }
3477         }
3478         else {
3479
3480             /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3481              * folds that are all Latin1.  As explained in the comments
3482              * preceding this function, we look also for the sharp s in EXACTF
3483              * and EXACTFL nodes; it can be in the final position.  Otherwise
3484              * we can stop looking 1 byte earlier because have to find at least
3485              * two characters for a multi-fold */
3486             const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3487                               ? s_end
3488                               : s_end -1;
3489
3490             while (s < upper) {
3491                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3492                 if (! len) {    /* Not a multi-char fold. */
3493                     if (*s == LATIN_SMALL_LETTER_SHARP_S
3494                         && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3495                     {
3496                         *unfolded_multi_char = TRUE;
3497                     }
3498                     s++;
3499                     continue;
3500                 }
3501
3502                 if (len == 2
3503                     && isARG2_lower_or_UPPER_ARG1('s', *s)
3504                     && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3505                 {
3506
3507                     /* EXACTF nodes need to know that the minimum length
3508                      * changed so that a sharp s in the string can match this
3509                      * ss in the pattern, but they remain EXACTF nodes, as they
3510                      * won't match this unless the target string is is UTF-8,
3511                      * which we don't know until runtime.  EXACTFL nodes can't
3512                      * transform into EXACTFU nodes */
3513                     if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3514                         OP(scan) = EXACTFU_SS;
3515                     }
3516                 }
3517
3518                 *min_subtract += len - 1;
3519                 s += len;
3520             }
3521         }
3522     }
3523
3524 #ifdef DEBUGGING
3525     /* Allow dumping but overwriting the collection of skipped
3526      * ops and/or strings with fake optimized ops */
3527     n = scan + NODE_SZ_STR(scan);
3528     while (n <= stop) {
3529         OP(n) = OPTIMIZED;
3530         FLAGS(n) = 0;
3531         NEXT_OFF(n) = 0;
3532         n++;
3533     }
3534 #endif
3535     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3536     return stopnow;
3537 }
3538
3539 /* REx optimizer.  Converts nodes into quicker variants "in place".
3540    Finds fixed substrings.  */
3541
3542 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3543    to the position after last scanned or to NULL. */
3544
3545 #define INIT_AND_WITHP \
3546     assert(!and_withp); \
3547     Newx(and_withp,1, regnode_ssc); \
3548     SAVEFREEPV(and_withp)
3549
3550 /* this is a chain of data about sub patterns we are processing that
3551    need to be handled separately/specially in study_chunk. Its so
3552    we can simulate recursion without losing state.  */
3553 struct scan_frame;
3554 typedef struct scan_frame {
3555     regnode *last;  /* last node to process in this frame */
3556     regnode *next;  /* next node to process when last is reached */
3557     struct scan_frame *prev; /*previous frame*/
3558     U32 prev_recursed_depth;
3559     I32 stop; /* what stopparen do we use */
3560 } scan_frame;
3561
3562
3563 STATIC SSize_t
3564 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3565                         SSize_t *minlenp, SSize_t *deltap,
3566                         regnode *last,
3567                         scan_data_t *data,
3568                         I32 stopparen,
3569                         U32 recursed_depth,
3570                         regnode_ssc *and_withp,
3571                         U32 flags, U32 depth)
3572                         /* scanp: Start here (read-write). */
3573                         /* deltap: Write maxlen-minlen here. */
3574                         /* last: Stop before this one. */
3575                         /* data: string data about the pattern */
3576                         /* stopparen: treat close N as END */
3577                         /* recursed: which subroutines have we recursed into */
3578                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3579 {
3580     dVAR;
3581     /* There must be at least this number of characters to match */
3582     SSize_t min = 0;
3583     I32 pars = 0, code;
3584     regnode *scan = *scanp, *next;
3585     SSize_t delta = 0;
3586     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3587     int is_inf_internal = 0;            /* The studied chunk is infinite */
3588     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3589     scan_data_t data_fake;
3590     SV *re_trie_maxbuff = NULL;
3591     regnode *first_non_open = scan;
3592     SSize_t stopmin = SSize_t_MAX;
3593     scan_frame *frame = NULL;
3594     GET_RE_DEBUG_FLAGS_DECL;
3595
3596     PERL_ARGS_ASSERT_STUDY_CHUNK;
3597
3598 #ifdef DEBUGGING
3599     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3600 #endif
3601     if ( depth == 0 ) {
3602         while (first_non_open && OP(first_non_open) == OPEN)
3603             first_non_open=regnext(first_non_open);
3604     }
3605
3606
3607   fake_study_recurse:
3608     while ( scan && OP(scan) != END && scan < last ){
3609         UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3610                                    node length to get a real minimum (because
3611                                    the folded version may be shorter) */
3612         bool unfolded_multi_char = FALSE;
3613         /* Peephole optimizer: */
3614         DEBUG_OPTIMISE_MORE_r(
3615         {
3616             PerlIO_printf(Perl_debug_log,
3617                 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3618                 ((int) depth*2), "", (long)stopparen,
3619                 (unsigned long)depth, (unsigned long)recursed_depth);
3620             if (recursed_depth) {
3621                 U32 i;
3622                 U32 j;
3623                 for ( j = 0 ; j < recursed_depth ; j++ ) {
3624                     PerlIO_printf(Perl_debug_log,"[");
3625                     for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3626                         PerlIO_printf(Perl_debug_log,"%d",
3627                             PAREN_TEST(RExC_study_chunk_recursed +
3628                                        (j * RExC_study_chunk_recursed_bytes), i)
3629                             ? 1 : 0
3630                         );
3631                     PerlIO_printf(Perl_debug_log,"]");
3632                 }
3633             }
3634             PerlIO_printf(Perl_debug_log,"\n");
3635         }
3636         );
3637         DEBUG_STUDYDATA("Peep:", data, depth);
3638         DEBUG_PEEP("Peep", scan, depth);
3639
3640
3641         /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3642          * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3643          * by a different invocation of reg() -- Yves
3644          */
3645         JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3646
3647         /* Follow the next-chain of the current node and optimize
3648            away all the NOTHINGs from it.  */
3649         if (OP(scan) != CURLYX) {
3650             const int max = (reg_off_by_arg[OP(scan)]
3651                        ? I32_MAX
3652                        /* I32 may be smaller than U16 on CRAYs! */
3653                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3654             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3655             int noff;
3656             regnode *n = scan;
3657
3658             /* Skip NOTHING and LONGJMP. */
3659             while ((n = regnext(n))
3660                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3661                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3662                    && off + noff < max)
3663                 off += noff;
3664             if (reg_off_by_arg[OP(scan)])
3665                 ARG(scan) = off;
3666             else
3667                 NEXT_OFF(scan) = off;
3668         }
3669
3670
3671
3672         /* The principal pseudo-switch.  Cannot be a switch, since we
3673            look into several different things.  */
3674         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3675                    || OP(scan) == IFTHEN) {
3676             next = regnext(scan);
3677             code = OP(scan);
3678             /* demq: the op(next)==code check is to see if we have
3679              * "branch-branch" AFAICT */
3680
3681             if (OP(next) == code || code == IFTHEN) {
3682                 /* NOTE - There is similar code to this block below for
3683                  * handling TRIE nodes on a re-study.  If you change stuff here
3684                  * check there too. */
3685                 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3686                 regnode_ssc accum;
3687                 regnode * const startbranch=scan;
3688
3689                 if (flags & SCF_DO_SUBSTR) {
3690                     /* Cannot merge strings after this. */
3691                     scan_commit(pRExC_state, data, minlenp, is_inf);
3692                 }
3693
3694                 if (flags & SCF_DO_STCLASS)
3695                     ssc_init_zero(pRExC_state, &accum);
3696
3697                 while (OP(scan) == code) {
3698                     SSize_t deltanext, minnext, fake;
3699                     I32 f = 0;
3700                     regnode_ssc this_class;
3701
3702                     num++;
3703                     data_fake.flags = 0;
3704                     if (data) {
3705                         data_fake.whilem_c = data->whilem_c;
3706                         data_fake.last_closep = data->last_closep;
3707                     }
3708                     else
3709                         data_fake.last_closep = &fake;
3710
3711                     data_fake.pos_delta = delta;
3712                     next = regnext(scan);
3713                     scan = NEXTOPER(scan);
3714                     if (code != BRANCH)
3715                         scan = NEXTOPER(scan);
3716                     if (flags & SCF_DO_STCLASS) {
3717                         ssc_init(pRExC_state, &this_class);
3718                         data_fake.start_class = &this_class;
3719                         f = SCF_DO_STCLASS_AND;
3720                     }
3721                     if (flags & SCF_WHILEM_VISITED_POS)
3722                         f |= SCF_WHILEM_VISITED_POS;
3723
3724                     /* we suppose the run is continuous, last=next...*/
3725                     minnext = study_chunk(pRExC_state, &scan, minlenp,
3726                                       &deltanext, next, &data_fake, stopparen,
3727                                       recursed_depth, NULL, f,depth+1);
3728                     if (min1 > minnext)
3729                         min1 = minnext;
3730                     if (deltanext == SSize_t_MAX) {
3731                         is_inf = is_inf_internal = 1;
3732                         max1 = SSize_t_MAX;
3733                     } else if (max1 < minnext + deltanext)
3734                         max1 = minnext + deltanext;
3735                     scan = next;
3736                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3737                         pars++;
3738                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3739                         if ( stopmin > minnext)
3740                             stopmin = min + min1;
3741                         flags &= ~SCF_DO_SUBSTR;
3742                         if (data)
3743                             data->flags |= SCF_SEEN_ACCEPT;
3744                     }
3745                     if (data) {
3746                         if (data_fake.flags & SF_HAS_EVAL)
3747                             data->flags |= SF_HAS_EVAL;
3748                         data->whilem_c = data_fake.whilem_c;
3749                     }
3750                     if (flags & SCF_DO_STCLASS)
3751                         ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3752                 }
3753                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3754                     min1 = 0;
3755                 if (flags & SCF_DO_SUBSTR) {
3756                     data->pos_min += min1;
3757                     if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3758                         data->pos_delta = SSize_t_MAX;
3759                     else
3760                         data->pos_delta += max1 - min1;
3761                     if (max1 != min1 || is_inf)
3762                         data->longest = &(data->longest_float);
3763                 }
3764                 min += min1;
3765                 if (delta == SSize_t_MAX
3766                  || SSize_t_MAX - delta - (max1 - min1) < 0)
3767                     delta = SSize_t_MAX;
3768                 else
3769                     delta += max1 - min1;
3770                 if (flags & SCF_DO_STCLASS_OR) {
3771                     ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3772                     if (min1) {
3773                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3774                         flags &= ~SCF_DO_STCLASS;
3775                     }
3776                 }
3777                 else if (flags & SCF_DO_STCLASS_AND) {
3778                     if (min1) {
3779                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3780                         flags &= ~SCF_DO_STCLASS;
3781                     }
3782                     else {
3783                         /* Switch to OR mode: cache the old value of
3784                          * data->start_class */
3785                         INIT_AND_WITHP;
3786                         StructCopy(data->start_class, and_withp, regnode_ssc);
3787                         flags &= ~SCF_DO_STCLASS_AND;
3788                         StructCopy(&accum, data->start_class, regnode_ssc);
3789                         flags |= SCF_DO_STCLASS_OR;
3790                     }
3791                 }
3792
3793                 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3794                         OP( startbranch ) == BRANCH )
3795                 {
3796                 /* demq.
3797
3798                    Assuming this was/is a branch we are dealing with: 'scan'
3799                    now points at the item that follows the branch sequence,
3800                    whatever it is. We now start at the beginning of the
3801                    sequence and look for subsequences of
3802
3803                    BRANCH->EXACT=>x1
3804                    BRANCH->EXACT=>x2
3805                    tail
3806
3807                    which would be constructed from a pattern like
3808                    /A|LIST|OF|WORDS/
3809
3810                    If we can find such a subsequence we need to turn the first
3811                    element into a trie and then add the subsequent branch exact
3812                    strings to the trie.
3813
3814                    We have two cases
3815
3816                      1. patterns where the whole set of branches can be
3817                         converted.
3818
3819                      2. patterns where only a subset can be converted.
3820
3821                    In case 1 we can replace the whole set with a single regop
3822                    for the trie. In case 2 we need to keep the start and end
3823                    branches so
3824
3825                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3826                      becomes BRANCH TRIE; BRANCH X;
3827
3828                   There is an additional case, that being where there is a
3829                   common prefix, which gets split out into an EXACT like node
3830                   preceding the TRIE node.
3831
3832                   If x(1..n)==tail then we can do a simple trie, if not we make
3833                   a "jump" trie, such that when we match the appropriate word
3834                   we "jump" to the appropriate tail node. Essentially we turn
3835                   a nested if into a case structure of sorts.
3836
3837                 */
3838
3839                     int made=0;
3840                     if (!re_trie_maxbuff) {
3841                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3842                         if (!SvIOK(re_trie_maxbuff))
3843                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3844                     }
3845                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3846                         regnode *cur;
3847                         regnode *first = (regnode *)NULL;
3848                         regnode *last = (regnode *)NULL;
3849                         regnode *tail = scan;
3850                         U8 trietype = 0;
3851                         U32 count=0;
3852
3853 #ifdef DEBUGGING
3854                         SV * const mysv = sv_newmortal();   /* for dumping */
3855 #endif
3856                         /* var tail is used because there may be a TAIL
3857                            regop in the way. Ie, the exacts will point to the
3858                            thing following the TAIL, but the last branch will
3859                            point at the TAIL. So we advance tail. If we
3860                            have nested (?:) we may have to move through several
3861                            tails.
3862                          */
3863
3864                         while ( OP( tail ) == TAIL ) {
3865                             /* this is the TAIL generated by (?:) */
3866                             tail = regnext( tail );
3867                         }
3868
3869
3870                         DEBUG_TRIE_COMPILE_r({
3871                             regprop(RExC_rx, mysv, tail, NULL);
3872                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3873                               (int)depth * 2 + 2, "",
3874                               "Looking for TRIE'able sequences. Tail node is: ",
3875                               SvPV_nolen_const( mysv )
3876                             );
3877                         });
3878
3879                         /*
3880
3881                             Step through the branches
3882                                 cur represents each branch,
3883                                 noper is the first thing to be matched as part
3884                                       of that branch
3885                                 noper_next is the regnext() of that node.
3886
3887                             We normally handle a case like this
3888                             /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3889                             support building with NOJUMPTRIE, which restricts
3890                             the trie logic to structures like /FOO|BAR/.
3891
3892                             If noper is a trieable nodetype then the branch is
3893                             a possible optimization target. If we are building
3894                             under NOJUMPTRIE then we require that noper_next is
3895                             the same as scan (our current position in the regex
3896                             program).
3897
3898                             Once we have two or more consecutive such branches
3899                             we can create a trie of the EXACT's contents and
3900                             stitch it in place into the program.
3901
3902                             If the sequence represents all of the branches in
3903                             the alternation we replace the entire thing with a
3904                             single TRIE node.
3905
3906                             Otherwise when it is a subsequence we need to
3907                             stitch it in place and replace only the relevant
3908                             branches. This means the first branch has to remain
3909                             as it is used by the alternation logic, and its
3910                             next pointer, and needs to be repointed at the item
3911                             on the branch chain following the last branch we
3912                             have optimized away.
3913
3914                             This could be either a BRANCH, in which case the
3915                             subsequence is internal, or it could be the item
3916                             following the branch sequence in which case the
3917                             subsequence is at the end (which does not
3918                             necessarily mean the first node is the start of the
3919                             alternation).
3920
3921                             TRIE_TYPE(X) is a define which maps the optype to a
3922                             trietype.
3923
3924                                 optype          |  trietype
3925                                 ----------------+-----------
3926                                 NOTHING         | NOTHING
3927                                 EXACT           | EXACT
3928                                 EXACTFU         | EXACTFU
3929                                 EXACTFU_SS      | EXACTFU
3930                                 EXACTFA         | EXACTFA
3931
3932
3933                         */
3934 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3935                        ( EXACT == (X) )   ? EXACT :        \
3936                        ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3937                        ( EXACTFA == (X) ) ? EXACTFA :        \
3938                        0 )
3939
3940                         /* dont use tail as the end marker for this traverse */
3941                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3942                             regnode * const noper = NEXTOPER( cur );
3943                             U8 noper_type = OP( noper );
3944                             U8 noper_trietype = TRIE_TYPE( noper_type );
3945 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3946                             regnode * const noper_next = regnext( noper );
3947                             U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3948                             U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3949 #endif
3950
3951                             DEBUG_TRIE_COMPILE_r({
3952                                 regprop(RExC_rx, mysv, cur, NULL);
3953                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3954                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3955
3956                                 regprop(RExC_rx, mysv, noper, NULL);
3957                                 PerlIO_printf( Perl_debug_log, " -> %s",
3958                                     SvPV_nolen_const(mysv));
3959
3960                                 if ( noper_next ) {
3961                                   regprop(RExC_rx, mysv, noper_next, NULL);
3962                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3963                                     SvPV_nolen_const(mysv));
3964                                 }
3965                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3966                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3967                                    PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3968                                 );
3969                             });
3970
3971                             /* Is noper a trieable nodetype that can be merged
3972                              * with the current trie (if there is one)? */
3973                             if ( noper_trietype
3974                                   &&
3975                                   (
3976                                         ( noper_trietype == NOTHING)
3977                                         || ( trietype == NOTHING )
3978                                         || ( trietype == noper_trietype )
3979                                   )
3980 #ifdef NOJUMPTRIE
3981                                   && noper_next == tail
3982 #endif
3983                                   && count < U16_MAX)
3984                             {
3985                                 /* Handle mergable triable node Either we are
3986                                  * the first node in a new trieable sequence,
3987                                  * in which case we do some bookkeeping,
3988                                  * otherwise we update the end pointer. */
3989                                 if ( !first ) {
3990                                     first = cur;
3991                                     if ( noper_trietype == NOTHING ) {
3992 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3993                                         regnode * const noper_next = regnext( noper );
3994                                         U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3995                                         U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3996 #endif
3997
3998                                         if ( noper_next_trietype ) {
3999                                             trietype = noper_next_trietype;
4000                                         } else if (noper_next_type)  {
4001                                             /* a NOTHING regop is 1 regop wide.
4002                                              * We need at least two for a trie
4003                                              * so we can't merge this in */
4004                                             first = NULL;
4005                                         }
4006                                     } else {
4007                                         trietype = noper_trietype;
4008                                     }
4009                                 } else {
4010                                     if ( trietype == NOTHING )
4011                                         trietype = noper_trietype;
4012                                     last = cur;
4013                                 }
4014                                 if (first)
4015                                     count++;
4016                             } /* end handle mergable triable node */
4017                             else {
4018                                 /* handle unmergable node -
4019                                  * noper may either be a triable node which can
4020                                  * not be tried together with the current trie,
4021                                  * or a non triable node */
4022                                 if ( last ) {
4023                                     /* If last is set and trietype is not
4024                                      * NOTHING then we have found at least two
4025                                      * triable branch sequences in a row of a
4026                                      * similar trietype so we can turn them
4027                                      * into a trie. If/when we allow NOTHING to
4028                                      * start a trie sequence this condition
4029                                      * will be required, and it isn't expensive
4030                                      * so we leave it in for now. */
4031                                     if ( trietype && trietype != NOTHING )
4032                                         make_trie( pRExC_state,
4033                                                 startbranch, first, cur, tail,
4034                                                 count, trietype, depth+1 );
4035                                     last = NULL; /* note: we clear/update
4036                                                     first, trietype etc below,
4037                                                     so we dont do it here */
4038                                 }
4039                                 if ( noper_trietype
4040 #ifdef NOJUMPTRIE
4041                                      && noper_next == tail
4042 #endif
4043                                 ){
4044                                     /* noper is triable, so we can start a new
4045                                      * trie sequence */
4046                                     count = 1;
4047                                     first = cur;
4048                                     trietype = noper_trietype;
4049                                 } else if (first) {
4050                                     /* if we already saw a first but the
4051                                      * current node is not triable then we have
4052                                      * to reset the first information. */
4053                                     count = 0;
4054                                     first = NULL;
4055                                     trietype = 0;
4056                                 }
4057                             } /* end handle unmergable node */
4058                         } /* loop over branches */
4059                         DEBUG_TRIE_COMPILE_r({
4060                             regprop(RExC_rx, mysv, cur, NULL);
4061                             PerlIO_printf( Perl_debug_log,
4062                               "%*s- %s (%d) <SCAN FINISHED>\n",
4063                               (int)depth * 2 + 2,
4064                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4065
4066                         });
4067                         if ( last && trietype ) {
4068                             if ( trietype != NOTHING ) {
4069                                 /* the last branch of the sequence was part of
4070                                  * a trie, so we have to construct it here
4071                                  * outside of the loop */
4072                                 made= make_trie( pRExC_state, startbranch,
4073                                                  first, scan, tail, count,
4074                                                  trietype, depth+1 );
4075 #ifdef TRIE_STUDY_OPT
4076                                 if ( ((made == MADE_EXACT_TRIE &&
4077                                      startbranch == first)
4078                                      || ( first_non_open == first )) &&
4079                                      depth==0 ) {
4080                                     flags |= SCF_TRIE_RESTUDY;
4081                                     if ( startbranch == first
4082                                          && scan == tail )
4083                                     {
4084                                         RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4085                                     }
4086                                 }
4087 #endif
4088                             } else {
4089                                 /* at this point we know whatever we have is a
4090                                  * NOTHING sequence/branch AND if 'startbranch'
4091                                  * is 'first' then we can turn the whole thing
4092                                  * into a NOTHING
4093                                  */
4094                                 if ( startbranch == first ) {
4095                                     regnode *opt;
4096                                     /* the entire thing is a NOTHING sequence,
4097                                      * something like this: (?:|) So we can
4098                                      * turn it into a plain NOTHING op. */
4099                                     DEBUG_TRIE_COMPILE_r({
4100                                         regprop(RExC_rx, mysv, cur, NULL);
4101                                         PerlIO_printf( Perl_debug_log,
4102                                           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4103                                           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4104
4105                                     });
4106                                     OP(startbranch)= NOTHING;
4107                                     NEXT_OFF(startbranch)= tail - startbranch;
4108                                     for ( opt= startbranch + 1; opt < tail ; opt++ )
4109                                         OP(opt)= OPTIMIZED;
4110                                 }
4111                             }
4112                         } /* end if ( last) */
4113                     } /* TRIE_MAXBUF is non zero */
4114
4115                 } /* do trie */
4116
4117             }
4118             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4119                 scan = NEXTOPER(NEXTOPER(scan));
4120             } else                      /* single branch is optimized. */
4121                 scan = NEXTOPER(scan);
4122             continue;
4123         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4124             scan_frame *newframe = NULL;
4125             I32 paren;
4126             regnode *start;
4127             regnode *end;
4128             U32 my_recursed_depth= recursed_depth;
4129
4130             if (OP(scan) != SUSPEND) {
4131                 /* set the pointer */
4132                 if (OP(scan) == GOSUB) {
4133                     paren = ARG(scan);
4134                     RExC_recurse[ARG2L(scan)] = scan;
4135                     start = RExC_open_parens[paren-1];
4136                     end   = RExC_close_parens[paren-1];
4137                 } else {
4138                     paren = 0;
4139                     start = RExC_rxi->program + 1;
4140                     end   = RExC_opend;
4141                 }
4142                 if (!recursed_depth
4143                     ||
4144                     !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4145                 ) {
4146                     if (!recursed_depth) {
4147                         Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4148                     } else {
4149                         Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4150                              RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4151                              RExC_study_chunk_recursed_bytes, U8);
4152                     }
4153                     /* we havent recursed into this paren yet, so recurse into it */
4154                     DEBUG_STUDYDATA("set:", data,depth);
4155                     PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4156                     my_recursed_depth= recursed_depth + 1;
4157                     Newx(newframe,1,scan_frame);
4158                 } else {
4159                     DEBUG_STUDYDATA("inf:", data,depth);
4160                     /* some form of infinite recursion, assume infinite length
4161                      * */
4162                     if (flags & SCF_DO_SUBSTR) {
4163                         scan_commit(pRExC_state, data, minlenp, is_inf);
4164                         data->longest = &(data->longest_float);
4165                     }
4166                     is_inf = is_inf_internal = 1;
4167                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4168                         ssc_anything(data->start_class);
4169                     flags &= ~SCF_DO_STCLASS;
4170                 }
4171             } else {
4172                 Newx(newframe,1,scan_frame);
4173                 paren = stopparen;
4174                 start = scan+2;
4175                 end = regnext(scan);
4176             }
4177             if (newframe) {
4178                 assert(start);
4179                 assert(end);
4180                 SAVEFREEPV(newframe);
4181                 newframe->next = regnext(scan);
4182                 newframe->last = last;
4183                 newframe->stop = stopparen;
4184                 newframe->prev = frame;
4185                 newframe->prev_recursed_depth = recursed_depth;
4186
4187                 DEBUG_STUDYDATA("frame-new:",data,depth);
4188                 DEBUG_PEEP("fnew", scan, depth);
4189
4190                 frame = newframe;
4191                 scan =  start;
4192                 stopparen = paren;
4193                 last = end;
4194                 depth = depth + 1;
4195                 recursed_depth= my_recursed_depth;
4196
4197                 continue;
4198             }
4199         }
4200         else if (OP(scan) == EXACT) {
4201             SSize_t l = STR_LEN(scan);
4202             UV uc;
4203             if (UTF) {
4204                 const U8 * const s = (U8*)STRING(scan);
4205                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4206                 l = utf8_length(s, s + l);
4207             } else {
4208                 uc = *((U8*)STRING(scan));
4209             }
4210             min += l;
4211             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4212                 /* The code below prefers earlier match for fixed
4213                    offset, later match for variable offset.  */
4214                 if (data->last_end == -1) { /* Update the start info. */
4215                     data->last_start_min = data->pos_min;
4216                     data->last_start_max = is_inf
4217                         ? SSize_t_MAX : data->pos_min + data->pos_delta;
4218                 }
4219                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4220                 if (UTF)
4221                     SvUTF8_on(data->last_found);
4222                 {
4223                     SV * const sv = data->last_found;
4224                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4225                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4226                     if (mg && mg->mg_len >= 0)
4227                         mg->mg_len += utf8_length((U8*)STRING(scan),
4228                                               (U8*)STRING(scan)+STR_LEN(scan));
4229                 }
4230                 data->last_end = data->pos_min + l;
4231                 data->pos_min += l; /* As in the first entry. */
4232                 data->flags &= ~SF_BEFORE_EOL;
4233             }
4234
4235             /* ANDing the code point leaves at most it, and not in locale, and
4236              * can't match null string */
4237             if (flags & SCF_DO_STCLASS_AND) {
4238                 ssc_cp_and(data->start_class, uc);
4239                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4240                 ssc_clear_locale(data->start_class);
4241             }
4242             else if (flags & SCF_DO_STCLASS_OR) {
4243                 ssc_add_cp(data->start_class, uc);
4244                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4245
4246                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4247                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4248             }
4249             flags &= ~SCF_DO_STCLASS;
4250         }
4251         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4252             SSize_t l = STR_LEN(scan);
4253             UV uc = *((U8*)STRING(scan));
4254             SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4255                                                      separate code points */
4256
4257             /* Search for fixed substrings supports EXACT only. */
4258             if (flags & SCF_DO_SUBSTR) {
4259                 assert(data);
4260                 scan_commit(pRExC_state, data, minlenp, is_inf);
4261             }
4262             if (UTF) {
4263                 const U8 * const s = (U8 *)STRING(scan);
4264                 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4265                 l = utf8_length(s, s + l);
4266             }
4267             if (unfolded_multi_char) {
4268                 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4269             }
4270             min += l - min_subtract;
4271             assert (min >= 0);
4272             delta += min_subtract;
4273             if (flags & SCF_DO_SUBSTR) {
4274                 data->pos_min += l - min_subtract;
4275                 if (data->pos_min < 0) {
4276                     data->pos_min = 0;
4277                 }
4278                 data->pos_delta += min_subtract;
4279                 if (min_subtract) {
4280                     data->longest = &(data->longest_float);
4281                 }
4282             }
4283             if (OP(scan) == EXACTFL) {
4284
4285                 /* We don't know what the folds are; it could be anything. XXX
4286                  * Actually, we only support UTF-8 encoding for code points
4287                  * above Latin1, so we could know what those folds are. */
4288                 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4289                                                        0,
4290                                                        UV_MAX);
4291             }
4292             else {  /* Non-locale EXACTFish */
4293                 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4294                 if (flags & SCF_DO_STCLASS_AND) {
4295                     ssc_clear_locale(data->start_class);
4296                 }
4297                 if (uc < 256) { /* We know what the Latin1 folds are ... */
4298                     if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
4299                                                        know if anything folds
4300                                                        with this */
4301                         EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4302                                                            PL_fold_latin1[uc]);
4303                         if (OP(scan) != EXACTFA) { /* The folds below aren't
4304                                                       legal under /iaa */
4305                             if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4306                                 EXACTF_invlist
4307                                     = add_cp_to_invlist(EXACTF_invlist,
4308                                                 LATIN_SMALL_LETTER_SHARP_S);
4309                             }
4310                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4311                                 EXACTF_invlist
4312                                     = add_cp_to_invlist(EXACTF_invlist, 's');
4313                                 EXACTF_invlist
4314                                     = add_cp_to_invlist(EXACTF_invlist, 'S');
4315                             }
4316                         }
4317
4318                         /* We also know if there are above-Latin1 code points
4319                          * that fold to this (none legal for ASCII and /iaa) */
4320                         if ((! isASCII(uc) || OP(scan) != EXACTFA)
4321                             && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4322                         {
4323                             /* XXX We could know exactly what does fold to this
4324                              * if the reverse folds are loaded, as currently in
4325                              * S_regclass() */
4326                             _invlist_union(EXACTF_invlist,
4327                                            PL_AboveLatin1,
4328                                            &EXACTF_invlist);
4329                         }
4330                     }
4331                 }
4332                 else {  /* Non-locale, above Latin1.  XXX We don't currently
4333                            know what participates in folds with this, so have
4334                            to assume anything could */
4335
4336                     /* XXX We could know exactly what does fold to this if the
4337                      * reverse folds are loaded, as currently in S_regclass().
4338                      * But we do know that under /iaa nothing in the ASCII
4339                      * range can participate */
4340                     if (OP(scan) == EXACTFA) {
4341                         _invlist_union_complement_2nd(EXACTF_invlist,
4342                                                       PL_XPosix_ptrs[_CC_ASCII],
4343                                                       &EXACTF_invlist);
4344                     }
4345                     else {
4346                         EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4347                                                                0, UV_MAX);
4348                     }
4349                 }
4350             }
4351             if (flags & SCF_DO_STCLASS_AND) {
4352                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4353                 ANYOF_POSIXL_ZERO(data->start_class);
4354                 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4355             }
4356             else if (flags & SCF_DO_STCLASS_OR) {
4357                 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4358                 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4359
4360                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4361                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4362             }
4363             flags &= ~SCF_DO_STCLASS;
4364             SvREFCNT_dec(EXACTF_invlist);
4365         }
4366         else if (REGNODE_VARIES(OP(scan))) {
4367             SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4368             I32 fl = 0, f = flags;
4369             regnode * const oscan = scan;
4370             regnode_ssc this_class;
4371             regnode_ssc *oclass = NULL;
4372             I32 next_is_eval = 0;
4373
4374             switch (PL_regkind[OP(scan)]) {
4375             case WHILEM:                /* End of (?:...)* . */
4376                 scan = NEXTOPER(scan);
4377                 goto finish;
4378             case PLUS:
4379                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4380                     next = NEXTOPER(scan);
4381                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4382                         mincount = 1;
4383                         maxcount = REG_INFTY;
4384                         next = regnext(scan);
4385                         scan = NEXTOPER(scan);
4386                         goto do_curly;
4387                     }
4388                 }
4389                 if (flags & SCF_DO_SUBSTR)
4390                     data->pos_min++;
4391                 min++;
4392                 /* Fall through. */
4393             case STAR:
4394                 if (flags & SCF_DO_STCLASS) {
4395                     mincount = 0;
4396                     maxcount = REG_INFTY;
4397                     next = regnext(scan);
4398                     scan = NEXTOPER(scan);
4399                     goto do_curly;
4400                 }
4401                 if (flags & SCF_DO_SUBSTR) {
4402                     scan_commit(pRExC_state, data, minlenp, is_inf);
4403                     /* Cannot extend fixed substrings */
4404                     data->longest = &(data->longest_float);
4405                 }
4406                 is_inf = is_inf_internal = 1;
4407                 scan = regnext(scan);
4408                 goto optimize_curly_tail;
4409             case CURLY:
4410                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4411                     && (scan->flags == stopparen))
4412                 {
4413                     mincount = 1;
4414                     maxcount = 1;
4415                 } else {
4416                     mincount = ARG1(scan);
4417                     maxcount = ARG2(scan);
4418                 }
4419                 next = regnext(scan);
4420                 if (OP(scan) == CURLYX) {
4421                     I32 lp = (data ? *(data->last_closep) : 0);
4422                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4423                 }
4424                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4425                 next_is_eval = (OP(scan) == EVAL);
4426               do_curly:
4427                 if (flags & SCF_DO_SUBSTR) {
4428                     if (mincount == 0)
4429                         scan_commit(pRExC_state, data, minlenp, is_inf);
4430                     /* Cannot extend fixed substrings */
4431                     pos_before = data->pos_min;
4432                 }
4433                 if (data) {
4434                     fl = data->flags;
4435                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4436                     if (is_inf)
4437                         data->flags |= SF_IS_INF;
4438                 }
4439                 if (flags & SCF_DO_STCLASS) {
4440                     ssc_init(pRExC_state, &this_class);
4441                     oclass = data->start_class;
4442                     data->start_class = &this_class;
4443                     f |= SCF_DO_STCLASS_AND;
4444                     f &= ~SCF_DO_STCLASS_OR;
4445                 }
4446                 /* Exclude from super-linear cache processing any {n,m}
4447                    regops for which the combination of input pos and regex
4448                    pos is not enough information to determine if a match
4449                    will be possible.
4450
4451                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
4452                    regex pos at the \s*, the prospects for a match depend not
4453                    only on the input position but also on how many (bar\s*)
4454                    repeats into the {4,8} we are. */
4455                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4456                     f &= ~SCF_WHILEM_VISITED_POS;
4457
4458                 /* This will finish on WHILEM, setting scan, or on NULL: */
4459                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4460                                   last, data, stopparen, recursed_depth, NULL,
4461                                   (mincount == 0
4462                                    ? (f & ~SCF_DO_SUBSTR)
4463                                    : f)
4464                                   ,depth+1);
4465
4466                 if (flags & SCF_DO_STCLASS)
4467                     data->start_class = oclass;
4468                 if (mincount == 0 || minnext == 0) {
4469                     if (flags & SCF_DO_STCLASS_OR) {
4470                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4471                     }
4472                     else if (flags & SCF_DO_STCLASS_AND) {
4473                         /* Switch to OR mode: cache the old value of
4474                          * data->start_class */
4475                         INIT_AND_WITHP;
4476                         StructCopy(data->start_class, and_withp, regnode_ssc);
4477                         flags &= ~SCF_DO_STCLASS_AND;
4478                         StructCopy(&this_class, data->start_class, regnode_ssc);
4479                         flags |= SCF_DO_STCLASS_OR;
4480                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4481                     }
4482                 } else {                /* Non-zero len */
4483                     if (flags & SCF_DO_STCLASS_OR) {
4484                         ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4485                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4486                     }
4487                     else if (flags & SCF_DO_STCLASS_AND)
4488                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4489                     flags &= ~SCF_DO_STCLASS;
4490                 }
4491                 if (!scan)              /* It was not CURLYX, but CURLY. */
4492                     scan = next;
4493                 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4494                     /* ? quantifier ok, except for (?{ ... }) */
4495                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
4496                     && (minnext == 0) && (deltanext == 0)
4497                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4498                     && maxcount <= REG_INFTY/3) /* Complement check for big
4499                                                    count */
4500                 {
4501                     /* Fatal warnings may leak the regexp without this: */
4502                     SAVEFREESV(RExC_rx_sv);
4503                     ckWARNreg(RExC_parse,
4504                             "Quantifier unexpected on zero-length expression");
4505                     (void)ReREFCNT_inc(RExC_rx_sv);
4506                 }
4507
4508                 min += minnext * mincount;
4509                 is_inf_internal |= deltanext == SSize_t_MAX
4510                          || (maxcount == REG_INFTY && minnext + deltanext > 0);
4511                 is_inf |= is_inf_internal;
4512                 if (is_inf) {
4513                     delta = SSize_t_MAX;
4514                 } else {
4515                     delta += (minnext + deltanext) * maxcount
4516                              - minnext * mincount;
4517                 }
4518                 /* Try powerful optimization CURLYX => CURLYN. */
4519                 if (  OP(oscan) == CURLYX && data
4520                       && data->flags & SF_IN_PAR
4521                       && !(data->flags & SF_HAS_EVAL)
4522                       && !deltanext && minnext == 1 ) {
4523                     /* Try to optimize to CURLYN.  */
4524                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4525                     regnode * const nxt1 = nxt;
4526 #ifdef DEBUGGING
4527                     regnode *nxt2;
4528 #endif
4529
4530                     /* Skip open. */
4531                     nxt = regnext(nxt);
4532                     if (!REGNODE_SIMPLE(OP(nxt))
4533                         && !(PL_regkind[OP(nxt)] == EXACT
4534                              && STR_LEN(nxt) == 1))
4535                         goto nogo;
4536 #ifdef DEBUGGING
4537                     nxt2 = nxt;
4538 #endif
4539                     nxt = regnext(nxt);
4540                     if (OP(nxt) != CLOSE)
4541                         goto nogo;
4542                     if (RExC_open_parens) {
4543                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4544                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4545                     }
4546                     /* Now we know that nxt2 is the only contents: */
4547                     oscan->flags = (U8)ARG(nxt);
4548                     OP(oscan) = CURLYN;
4549                     OP(nxt1) = NOTHING; /* was OPEN. */
4550
4551 #ifdef DEBUGGING
4552                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4553                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4554                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4555                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
4556                     OP(nxt + 1) = OPTIMIZED; /* was count. */
4557                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4558 #endif
4559                 }
4560               nogo:
4561
4562                 /* Try optimization CURLYX => CURLYM. */
4563                 if (  OP(oscan) == CURLYX && data
4564                       && !(data->flags & SF_HAS_PAR)
4565                       && !(data->flags & SF_HAS_EVAL)
4566                       && !deltanext     /* atom is fixed width */
4567                       && minnext != 0   /* CURLYM can't handle zero width */
4568
4569                          /* Nor characters whose fold at run-time may be
4570                           * multi-character */
4571                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4572                 ) {
4573                     /* XXXX How to optimize if data == 0? */
4574                     /* Optimize to a simpler form.  */
4575                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4576                     regnode *nxt2;
4577
4578                     OP(oscan) = CURLYM;
4579                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4580                             && (OP(nxt2) != WHILEM))
4581                         nxt = nxt2;
4582                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4583                     /* Need to optimize away parenths. */
4584                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4585                         /* Set the parenth number.  */
4586                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4587
4588                         oscan->flags = (U8)ARG(nxt);
4589                         if (RExC_open_parens) {
4590                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4591                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4592                         }
4593                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
4594                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
4595
4596 #ifdef DEBUGGING
4597                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4598                         OP(nxt + 1) = OPTIMIZED; /* was count. */
4599                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4600                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4601 #endif
4602 #if 0
4603                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
4604                             regnode *nnxt = regnext(nxt1);
4605                             if (nnxt == nxt) {
4606                                 if (reg_off_by_arg[OP(nxt1)])
4607                                     ARG_SET(nxt1, nxt2 - nxt1);
4608                                 else if (nxt2 - nxt1 < U16_MAX)
4609                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
4610                                 else
4611                                     OP(nxt) = NOTHING;  /* Cannot beautify */
4612                             }
4613                             nxt1 = nnxt;
4614                         }
4615 #endif
4616                         /* Optimize again: */
4617                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4618                                     NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4619                     }
4620                     else
4621                         oscan->flags = 0;
4622                 }
4623                 else if ((OP(oscan) == CURLYX)
4624                          && (flags & SCF_WHILEM_VISITED_POS)
4625                          /* See the comment on a similar expression above.
4626                             However, this time it's not a subexpression
4627                             we care about, but the expression itself. */
4628                          && (maxcount == REG_INFTY)
4629                          && data && ++data->whilem_c < 16) {
4630                     /* This stays as CURLYX, we can put the count/of pair. */
4631                     /* Find WHILEM (as in regexec.c) */
4632                     regnode *nxt = oscan + NEXT_OFF(oscan);
4633
4634                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4635                         nxt += ARG(nxt);
4636                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
4637                         | (RExC_whilem_seen << 4)); /* On WHILEM */
4638                 }
4639                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4640                     pars++;
4641                 if (flags & SCF_DO_SUBSTR) {
4642                     SV *last_str = NULL;
4643                     STRLEN last_chrs = 0;
4644                     int counted = mincount != 0;
4645
4646                     if (data->last_end > 0 && mincount != 0) { /* Ends with a
4647                                                                   string. */
4648                         SSize_t b = pos_before >= data->last_start_min
4649                             ? pos_before : data->last_start_min;
4650                         STRLEN l;
4651                         const char * const s = SvPV_const(data->last_found, l);
4652                         SSize_t old = b - data->last_start_min;
4653
4654                         if (UTF)
4655                             old = utf8_hop((U8*)s, old) - (U8*)s;
4656                         l -= old;
4657                         /* Get the added string: */
4658                         last_str = newSVpvn_utf8(s  + old, l, UTF);
4659                         last_chrs = UTF ? utf8_length((U8*)(s + old),
4660                                             (U8*)(s + old + l)) : l;
4661                         if (deltanext == 0 && pos_before == b) {
4662                             /* What was added is a constant string */
4663                             if (mincount > 1) {
4664
4665                                 SvGROW(last_str, (mincount * l) + 1);
4666                                 repeatcpy(SvPVX(last_str) + l,
4667                                           SvPVX_const(last_str), l,
4668                                           mincount - 1);
4669                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4670                                 /* Add additional parts. */
4671                                 SvCUR_set(data->last_found,
4672                                           SvCUR(data->last_found) - l);
4673                                 sv_catsv(data->last_found, last_str);
4674                                 {
4675                                     SV * sv = data->last_found;
4676                                     MAGIC *mg =
4677                                         SvUTF8(sv) && SvMAGICAL(sv) ?
4678                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4679                                     if (mg && mg->mg_len >= 0)
4680                                         mg->mg_len += last_chrs * (mincount-1);
4681                                 }
4682                                 last_chrs *= mincount;
4683                                 data->last_end += l * (mincount - 1);
4684                             }
4685                         } else {
4686                             /* start offset must point into the last copy */
4687                             data->last_start_min += minnext * (mincount - 1);
4688                             data->last_start_max += is_inf ? SSize_t_MAX
4689                                 : (maxcount - 1) * (minnext + data->pos_delta);
4690                         }
4691                     }
4692                     /* It is counted once already... */
4693                     data->pos_min += minnext * (mincount - counted);
4694 #if 0
4695 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4696                               " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4697                               " maxcount=%"UVdf" mincount=%"UVdf"\n",
4698     (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4699     (UV)mincount);
4700 if (deltanext != SSize_t_MAX)
4701 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4702     (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4703           - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4704 #endif
4705                     if (deltanext == SSize_t_MAX
4706                         || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4707                         data->pos_delta = SSize_t_MAX;
4708                     else
4709                         data->pos_delta += - counted * deltanext +
4710                         (minnext + deltanext) * maxcount - minnext * mincount;
4711                     if (mincount != maxcount) {
4712                          /* Cannot extend fixed substrings found inside
4713                             the group.  */
4714                         scan_commit(pRExC_state, data, minlenp, is_inf);
4715                         if (mincount && last_str) {
4716                             SV * const sv = data->last_found;
4717                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4718                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4719
4720                             if (mg)
4721                                 mg->mg_len = -1;
4722                             sv_setsv(sv, last_str);
4723                             data->last_end = data->pos_min;
4724                             data->last_start_min = data->pos_min - last_chrs;
4725                             data->last_start_max = is_inf
4726                                 ? SSize_t_MAX
4727                                 : data->pos_min + data->pos_delta - last_chrs;
4728                         }
4729                         data->longest = &(data->longest_float);
4730                     }
4731                     SvREFCNT_dec(last_str);
4732                 }
4733                 if (data && (fl & SF_HAS_EVAL))
4734                     data->flags |= SF_HAS_EVAL;
4735               optimize_curly_tail:
4736                 if (OP(oscan) != CURLYX) {
4737                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4738                            && NEXT_OFF(next))
4739                         NEXT_OFF(oscan) += NEXT_OFF(next);
4740                 }
4741                 continue;
4742
4743             default:
4744 #ifdef DEBUGGING
4745                 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4746                                                                     OP(scan));
4747 #endif
4748             case REF:
4749             case CLUMP:
4750                 if (flags & SCF_DO_SUBSTR) {
4751                     /* Cannot expect anything... */
4752                     scan_commit(pRExC_state, data, minlenp, is_inf);
4753                     data->longest = &(data->longest_float);
4754                 }
4755                 is_inf = is_inf_internal = 1;
4756                 if (flags & SCF_DO_STCLASS_OR) {
4757                     if (OP(scan) == CLUMP) {
4758                         /* Actually is any start char, but very few code points
4759                          * aren't start characters */
4760                         ssc_match_all_cp(data->start_class);
4761                     }
4762                     else {
4763                         ssc_anything(data->start_class);
4764                     }
4765                 }
4766                 flags &= ~SCF_DO_STCLASS;
4767                 break;
4768             }
4769         }
4770         else if (OP(scan) == LNBREAK) {
4771             if (flags & SCF_DO_STCLASS) {
4772                 if (flags & SCF_DO_STCLASS_AND) {
4773                     ssc_intersection(data->start_class,
4774                                     PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4775                     ssc_clear_locale(data->start_class);
4776                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4777                 }
4778                 else if (flags & SCF_DO_STCLASS_OR) {
4779                     ssc_union(data->start_class,
4780                               PL_XPosix_ptrs[_CC_VERTSPACE],
4781                               FALSE);
4782                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4783
4784                     /* See commit msg for
4785                      * 749e076fceedeb708a624933726e7989f2302f6a */
4786                     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4787                 }
4788                 flags &= ~SCF_DO_STCLASS;
4789             }
4790             min++;
4791             delta++;    /* Because of the 2 char string cr-lf */
4792             if (flags & SCF_DO_SUBSTR) {
4793                 /* Cannot expect anything... */
4794                 scan_commit(pRExC_state, data, minlenp, is_inf);
4795                 data->pos_min += 1;
4796                 data->pos_delta += 1;
4797                 data->longest = &(data->longest_float);
4798             }
4799         }
4800         else if (REGNODE_SIMPLE(OP(scan))) {
4801
4802             if (flags & SCF_DO_SUBSTR) {
4803                 scan_commit(pRExC_state, data, minlenp, is_inf);
4804                 data->pos_min++;
4805             }
4806             min++;
4807             if (flags & SCF_DO_STCLASS) {
4808                 bool invert = 0;
4809                 SV* my_invlist = NULL;
4810                 U8 namedclass;
4811
4812                 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4813                 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4814
4815                 /* Some of the logic below assumes that switching
4816                    locale on will only add false positives. */
4817                 switch (OP(scan)) {
4818
4819                 default:
4820 #ifdef DEBUGGING
4821                    Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4822                                                                      OP(scan));
4823 #endif
4824                 case CANY:
4825                 case SANY:
4826                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4827                         ssc_match_all_cp(data->start_class);
4828                     break;
4829
4830                 case REG_ANY:
4831                     {
4832                         SV* REG_ANY_invlist = _new_invlist(2);
4833                         REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4834                                                             '\n');
4835                         if (flags & SCF_DO_STCLASS_OR) {
4836                             ssc_union(data->start_class,
4837                                       REG_ANY_invlist,
4838                                       TRUE /* TRUE => invert, hence all but \n
4839                                             */
4840                                       );
4841                         }
4842                         else if (flags & SCF_DO_STCLASS_AND) {
4843                             ssc_intersection(data->start_class,
4844                                              REG_ANY_invlist,
4845                                              TRUE  /* TRUE => invert */
4846                                              );
4847                             ssc_clear_locale(data->start_class);
4848                         }
4849                         SvREFCNT_dec_NN(REG_ANY_invlist);
4850                     }
4851                     break;
4852
4853                 case ANYOF:
4854                     if (flags & SCF_DO_STCLASS_AND)
4855                         ssc_and(pRExC_state, data->start_class,
4856                                 (regnode_charclass *) scan);
4857                     else
4858                         ssc_or(pRExC_state, data->start_class,
4859                                                           (regnode_charclass *) scan);
4860                     break;
4861
4862                 case NPOSIXL:
4863                     invert = 1;
4864                     /* FALL THROUGH */
4865
4866                 case POSIXL:
4867                     namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4868                     if (flags & SCF_DO_STCLASS_AND) {
4869                         bool was_there = cBOOL(
4870                                           ANYOF_POSIXL_TEST(data->start_class,
4871                                                                  namedclass));
4872                         ANYOF_POSIXL_ZERO(data->start_class);
4873                         if (was_there) {    /* Do an AND */
4874                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4875                         }
4876                         /* No individual code points can now match */
4877                         data->start_class->invlist
4878                                                 = sv_2mortal(_new_invlist(0));
4879                     }
4880                     else {
4881                         int complement = namedclass + ((invert) ? -1 : 1);
4882
4883                         assert(flags & SCF_DO_STCLASS_OR);
4884
4885                         /* If the complement of this class was already there,
4886                          * the result is that they match all code points,
4887                          * (\d + \D == everything).  Remove the classes from
4888                          * future consideration.  Locale is not relevant in
4889                          * this case */
4890                         if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4891                             ssc_match_all_cp(data->start_class);
4892                             ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4893                             ANYOF_POSIXL_CLEAR(data->start_class, complement);
4894                         }
4895                         else {  /* The usual case; just add this class to the
4896                                    existing set */
4897                             ANYOF_POSIXL_SET(data->start_class, namedclass);
4898                         }
4899                     }
4900                     break;
4901
4902                 case NPOSIXA:   /* For these, we always know the exact set of
4903                                    what's matched */
4904                     invert = 1;
4905                     /* FALL THROUGH */
4906                 case POSIXA:
4907                     if (FLAGS(scan) == _CC_ASCII) {
4908                         my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
4909                     }
4910                     else {
4911                         _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4912                                               PL_XPosix_ptrs[_CC_ASCII],
4913                                               &my_invlist);
4914                     }
4915                     goto join_posix;
4916
4917                 case NPOSIXD:
4918                 case NPOSIXU:
4919                     invert = 1;
4920                     /* FALL THROUGH */
4921                 case POSIXD:
4922                 case POSIXU:
4923                     my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
4924
4925                     /* NPOSIXD matches all upper Latin1 code points unless the
4926                      * target string being matched is UTF-8, which is
4927                      * unknowable until match time.  Since we are going to
4928                      * invert, we want to get rid of all of them so that the
4929                      * inversion will match all */
4930                     if (OP(scan) == NPOSIXD) {
4931                         _invlist_subtract(my_invlist, PL_UpperLatin1,
4932                                           &my_invlist);
4933                     }
4934
4935                   join_posix:
4936
4937                     if (flags & SCF_DO_STCLASS_AND) {
4938                         ssc_intersection(data->start_class, my_invlist, invert);
4939                         ssc_clear_locale(data->start_class);
4940                     }
4941                     else {
4942                         assert(flags & SCF_DO_STCLASS_OR);
4943                         ssc_union(data->start_class, my_invlist, invert);
4944                     }
4945                     SvREFCNT_dec(my_invlist);
4946                 }
4947                 if (flags & SCF_DO_STCLASS_OR)
4948                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4949                 flags &= ~SCF_DO_STCLASS;
4950             }
4951         }
4952         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4953             data->flags |= (OP(scan) == MEOL
4954                             ? SF_BEFORE_MEOL
4955                             : SF_BEFORE_SEOL);
4956             scan_commit(pRExC_state, data, minlenp, is_inf);
4957
4958         }
4959         else if (  PL_regkind[OP(scan)] == BRANCHJ
4960                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4961                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4962                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4963             if ( OP(scan) == UNLESSM &&
4964                  scan->flags == 0 &&
4965                  OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4966                  OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4967             ) {
4968                 regnode *opt;
4969                 regnode *upto= regnext(scan);
4970                 DEBUG_PARSE_r({
4971                     SV * const mysv_val=sv_newmortal();
4972                     DEBUG_STUDYDATA("OPFAIL",data,depth);
4973
4974                     /*DEBUG_PARSE_MSG("opfail");*/
4975                     regprop(RExC_rx, mysv_val, upto, NULL);
4976                     PerlIO_printf(Perl_debug_log,
4977                         "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4978                         SvPV_nolen_const(mysv_val),
4979                         (IV)REG_NODE_NUM(upto),
4980                         (IV)(upto - scan)
4981                     );
4982                 });
4983                 OP(scan) = OPFAIL;
4984                 NEXT_OFF(scan) = upto - scan;
4985                 for (opt= scan + 1; opt < upto ; opt++)
4986                     OP(opt) = OPTIMIZED;
4987                 scan= upto;
4988                 continue;
4989             }
4990             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4991                 || OP(scan) == UNLESSM )
4992             {
4993                 /* Negative Lookahead/lookbehind
4994                    In this case we can't do fixed string optimisation.
4995                 */
4996
4997                 SSize_t deltanext, minnext, fake = 0;
4998                 regnode *nscan;
4999                 regnode_ssc intrnl;
5000                 int f = 0;
5001
5002                 data_fake.flags = 0;
5003                 if (data) {
5004                     data_fake.whilem_c = data->whilem_c;
5005                     data_fake.last_closep = data->last_closep;
5006                 }
5007                 else
5008                     data_fake.last_closep = &fake;
5009                 data_fake.pos_delta = delta;
5010                 if ( flags & SCF_DO_STCLASS && !scan->flags
5011                      && OP(scan) == IFMATCH ) { /* Lookahead */
5012                     ssc_init(pRExC_state, &intrnl);
5013                     data_fake.start_class = &intrnl;
5014                     f |= SCF_DO_STCLASS_AND;
5015                 }
5016                 if (flags & SCF_WHILEM_VISITED_POS)
5017                     f |= SCF_WHILEM_VISITED_POS;
5018                 next = regnext(scan);
5019                 nscan = NEXTOPER(NEXTOPER(scan));
5020                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5021                                       last, &data_fake, stopparen,
5022                                       recursed_depth, NULL, f, depth+1);
5023                 if (scan->flags) {
5024                     if (deltanext) {
5025                         FAIL("Variable length lookbehind not implemented");
5026                     }
5027                     else if (minnext > (I32)U8_MAX) {
5028                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5029                               (UV)U8_MAX);
5030                     }
5031                     scan->flags = (U8)minnext;
5032                 }
5033                 if (data) {
5034                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5035                         pars++;
5036                     if (data_fake.flags & SF_HAS_EVAL)
5037                         data->flags |= SF_HAS_EVAL;
5038                     data->whilem_c = data_fake.whilem_c;
5039                 }
5040                 if (f & SCF_DO_STCLASS_AND) {
5041                     if (flags & SCF_DO_STCLASS_OR) {
5042                         /* OR before, AND after: ideally we would recurse with
5043                          * data_fake to get the AND applied by study of the
5044                          * remainder of the pattern, and then derecurse;
5045                          * *** HACK *** for now just treat as "no information".
5046                          * See [perl #56690].
5047                          */
5048                         ssc_init(pRExC_state, data->start_class);
5049                     }  else {
5050                         /* AND before and after: combine and continue.  These
5051                          * assertions are zero-length, so can match an EMPTY
5052                          * string */
5053                         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5054                         ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5055                     }
5056                 }
5057             }
5058 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5059             else {
5060                 /* Positive Lookahead/lookbehind
5061                    In this case we can do fixed string optimisation,
5062                    but we must be careful about it. Note in the case of
5063                    lookbehind the positions will be offset by the minimum
5064                    length of the pattern, something we won't know about
5065                    until after the recurse.
5066                 */
5067                 SSize_t deltanext, fake = 0;
5068                 regnode *nscan;
5069                 regnode_ssc intrnl;
5070                 int f = 0;
5071                 /* We use SAVEFREEPV so that when the full compile
5072                     is finished perl will clean up the allocated
5073                     minlens when it's all done. This way we don't
5074                     have to worry about freeing them when we know
5075                     they wont be used, which would be a pain.
5076                  */
5077                 SSize_t *minnextp;
5078                 Newx( minnextp, 1, SSize_t );
5079                 SAVEFREEPV(minnextp);
5080
5081                 if (data) {
5082                     StructCopy(data, &data_fake, scan_data_t);
5083                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5084                         f |= SCF_DO_SUBSTR;
5085                         if (scan->flags)
5086                             scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5087                         data_fake.last_found=newSVsv(data->last_found);
5088                     }
5089                 }
5090                 else
5091                     data_fake.last_closep = &fake;
5092                 data_fake.flags = 0;
5093                 data_fake.pos_delta = delta;
5094                 if (is_inf)
5095                     data_fake.flags |= SF_IS_INF;
5096                 if ( flags & SCF_DO_STCLASS && !scan->flags
5097                      && OP(scan) == IFMATCH ) { /* Lookahead */
5098                     ssc_init(pRExC_state, &intrnl);
5099                     data_fake.start_class = &intrnl;
5100                     f |= SCF_DO_STCLASS_AND;
5101                 }
5102                 if (flags & SCF_WHILEM_VISITED_POS)
5103                     f |= SCF_WHILEM_VISITED_POS;
5104                 next = regnext(scan);
5105                 nscan = NEXTOPER(NEXTOPER(scan));
5106
5107                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5108                                         &deltanext, last, &data_fake,
5109                                         stopparen, recursed_depth, NULL,
5110                                         f,depth+1);
5111                 if (scan->flags) {
5112                     if (deltanext) {
5113                         FAIL("Variable length lookbehind not implemented");
5114                     }
5115                     else if (*minnextp > (I32)U8_MAX) {
5116                         FAIL2("Lookbehind longer than %"UVuf" not implemented",
5117                               (UV)U8_MAX);
5118                     }
5119                     scan->flags = (U8)*minnextp;
5120                 }
5121
5122                 *minnextp += min;
5123
5124                 if (f & SCF_DO_STCLASS_AND) {
5125                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5126                     ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5127                 }
5128                 if (data) {
5129                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5130                         pars++;
5131                     if (data_fake.flags & SF_HAS_EVAL)
5132                         data->flags |= SF_HAS_EVAL;
5133                     data->whilem_c = data_fake.whilem_c;
5134                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5135                         if (RExC_rx->minlen<*minnextp)
5136                             RExC_rx->minlen=*minnextp;
5137                         scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5138                         SvREFCNT_dec_NN(data_fake.last_found);
5139
5140                         if ( data_fake.minlen_fixed != minlenp )
5141                         {
5142                             data->offset_fixed= data_fake.offset_fixed;
5143                             data->minlen_fixed= data_fake.minlen_fixed;
5144                             data->lookbehind_fixed+= scan->flags;
5145                         }
5146                         if ( data_fake.minlen_float != minlenp )
5147                         {
5148                             data->minlen_float= data_fake.minlen_float;
5149                             data->offset_float_min=data_fake.offset_float_min;
5150                             data->offset_float_max=data_fake.offset_float_max;
5151                             data->lookbehind_float+= scan->flags;
5152                         }
5153                     }
5154                 }
5155             }
5156 #endif
5157         }
5158         else if (OP(scan) == OPEN) {
5159             if (stopparen != (I32)ARG(scan))
5160                 pars++;
5161         }
5162         else if (OP(scan) == CLOSE) {
5163             if (stopparen == (I32)ARG(scan)) {
5164                 break;
5165             }
5166             if ((I32)ARG(scan) == is_par) {
5167                 next = regnext(scan);
5168
5169                 if ( next && (OP(next) != WHILEM) && next < last)
5170                     is_par = 0;         /* Disable optimization */
5171             }
5172             if (data)
5173                 *(data->last_closep) = ARG(scan);
5174         }
5175         else if (OP(scan) == EVAL) {
5176                 if (data)
5177                     data->flags |= SF_HAS_EVAL;
5178         }
5179         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5180             if (flags & SCF_DO_SUBSTR) {
5181                 scan_commit(pRExC_state, data, minlenp, is_inf);
5182                 flags &= ~SCF_DO_SUBSTR;
5183             }
5184             if (data && OP(scan)==ACCEPT) {
5185                 data->flags |= SCF_SEEN_ACCEPT;
5186                 if (stopmin > min)
5187                     stopmin = min;
5188             }
5189         }
5190         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5191         {
5192                 if (flags & SCF_DO_SUBSTR) {
5193                     scan_commit(pRExC_state, data, minlenp, is_inf);
5194                     data->longest = &(data->longest_float);
5195                 }
5196                 is_inf = is_inf_internal = 1;
5197                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5198                     ssc_anything(data->start_class);
5199                 flags &= ~SCF_DO_STCLASS;
5200         }
5201         else if (OP(scan) == GPOS) {
5202             if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5203                 !(delta || is_inf || (data && data->pos_delta)))
5204             {
5205                 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5206                     RExC_rx->intflags |= PREGf_ANCH_GPOS;
5207                 if (RExC_rx->gofs < (STRLEN)min)
5208                     RExC_rx->gofs = min;
5209             } else {
5210                 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5211                 RExC_rx->gofs = 0;
5212             }
5213         }
5214 #ifdef TRIE_STUDY_OPT
5215 #ifdef FULL_TRIE_STUDY
5216         else if (PL_regkind[OP(scan)] == TRIE) {
5217             /* NOTE - There is similar code to this block above for handling
5218                BRANCH nodes on the initial study.  If you change stuff here
5219                check there too. */
5220             regnode *trie_node= scan;
5221             regnode *tail= regnext(scan);
5222             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5223             SSize_t max1 = 0, min1 = SSize_t_MAX;
5224             regnode_ssc accum;
5225
5226             if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5227                 /* Cannot merge strings after this. */
5228                 scan_commit(pRExC_state, data, minlenp, is_inf);
5229             }
5230             if (flags & SCF_DO_STCLASS)
5231                 ssc_init_zero(pRExC_state, &accum);
5232
5233             if (!trie->jump) {
5234                 min1= trie->minlen;
5235                 max1= trie->maxlen;
5236             } else {
5237                 const regnode *nextbranch= NULL;
5238                 U32 word;
5239
5240                 for ( word=1 ; word <= trie->wordcount ; word++)
5241                 {
5242                     SSize_t deltanext=0, minnext=0, f = 0, fake;
5243                     regnode_ssc this_class;
5244
5245                     data_fake.flags = 0;
5246                     if (data) {
5247                         data_fake.whilem_c = data->whilem_c;
5248                         data_fake.last_closep = data->last_closep;
5249                     }
5250                     else
5251                         data_fake.last_closep = &fake;
5252                     data_fake.pos_delta = delta;
5253                     if (flags & SCF_DO_STCLASS) {
5254                         ssc_init(pRExC_state, &this_class);
5255                         data_fake.start_class = &this_class;
5256                         f = SCF_DO_STCLASS_AND;
5257                     }
5258                     if (flags & SCF_WHILEM_VISITED_POS)
5259                         f |= SCF_WHILEM_VISITED_POS;
5260
5261                     if (trie->jump[word]) {
5262                         if (!nextbranch)
5263                             nextbranch = trie_node + trie->jump[0];
5264                         scan= trie_node + trie->jump[word];
5265                         /* We go from the jump point to the branch that follows
5266                            it. Note this means we need the vestigal unused
5267                            branches even though they arent otherwise used. */
5268                         minnext = study_chunk(pRExC_state, &scan, minlenp,
5269                             &deltanext, (regnode *)nextbranch, &data_fake,
5270                             stopparen, recursed_depth, NULL, f,depth+1);
5271                     }
5272                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5273                         nextbranch= regnext((regnode*)nextbranch);
5274
5275                     if (min1 > (SSize_t)(minnext + trie->minlen))
5276                         min1 = minnext + trie->minlen;
5277                     if (deltanext == SSize_t_MAX) {
5278                         is_inf = is_inf_internal = 1;
5279                         max1 = SSize_t_MAX;
5280                     } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5281                         max1 = minnext + deltanext + trie->maxlen;
5282
5283                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5284                         pars++;
5285                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
5286                         if ( stopmin > min + min1)
5287                             stopmin = min + min1;
5288                         flags &= ~SCF_DO_SUBSTR;
5289                         if (data)
5290                             data->flags |= SCF_SEEN_ACCEPT;
5291                     }
5292                     if (data) {
5293                         if (data_fake.flags & SF_HAS_EVAL)
5294                             data->flags |= SF_HAS_EVAL;
5295                         data->whilem_c = data_fake.whilem_c;
5296                     }
5297                     if (flags & SCF_DO_STCLASS)
5298                         ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5299                 }
5300             }
5301             if (flags & SCF_DO_SUBSTR) {
5302                 data->pos_min += min1;
5303                 data->pos_delta += max1 - min1;
5304                 if (max1 != min1 || is_inf)
5305                     data->longest = &(data->longest_float);
5306             }
5307             min += min1;
5308             delta += max1 - min1;
5309             if (flags & SCF_DO_STCLASS_OR) {
5310                 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5311                 if (min1) {
5312                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5313                     flags &= ~SCF_DO_STCLASS;
5314                 }
5315             }
5316             else if (flags & SCF_DO_STCLASS_AND) {
5317                 if (min1) {
5318                     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5319                     flags &= ~SCF_DO_STCLASS;
5320                 }
5321                 else {
5322                     /* Switch to OR mode: cache the old value of
5323                      * data->start_class */
5324                     INIT_AND_WITHP;
5325                     StructCopy(data->start_class, and_withp, regnode_ssc);
5326                     flags &= ~SCF_DO_STCLASS_AND;
5327                     StructCopy(&accum, data->start_class, regnode_ssc);
5328                     flags |= SCF_DO_STCLASS_OR;
5329                 }
5330             }
5331             scan= tail;
5332             continue;
5333         }
5334 #else
5335         else if (PL_regkind[OP(scan)] == TRIE) {
5336             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5337             U8*bang=NULL;
5338
5339             min += trie->minlen;
5340             delta += (trie->maxlen - trie->minlen);
5341             flags &= ~SCF_DO_STCLASS; /* xxx */
5342             if (flags & SCF_DO_SUBSTR) {
5343                 /* Cannot expect anything... */
5344                 scan_commit(pRExC_state, data, minlenp, is_inf);
5345                 data->pos_min += trie->minlen;
5346                 data->pos_delta += (trie->maxlen - trie->minlen);
5347                 if (trie->maxlen != trie->minlen)
5348                     data->longest = &(data->longest_float);
5349             }
5350             if (trie->jump) /* no more substrings -- for now /grr*/
5351                flags &= ~SCF_DO_SUBSTR;
5352         }
5353 #endif /* old or new */
5354 #endif /* TRIE_STUDY_OPT */
5355
5356         /* Else: zero-length, ignore. */
5357         scan = regnext(scan);
5358     }
5359     /* If we are exiting a recursion we can unset its recursed bit
5360      * and allow ourselves to enter it again - no danger of an
5361      * infinite loop there.
5362     if (stopparen > -1 && recursed) {
5363         DEBUG_STUDYDATA("unset:", data,depth);
5364         PAREN_UNSET( recursed, stopparen);
5365     }
5366     */
5367     if (frame) {
5368         DEBUG_STUDYDATA("frame-end:",data,depth);
5369         DEBUG_PEEP("fend", scan, depth);
5370         /* restore previous context */
5371         last = frame->last;
5372         scan = frame->next;
5373         stopparen = frame->stop;
5374         recursed_depth = frame->prev_recursed_depth;
5375         depth = depth - 1;
5376
5377         frame = frame->prev;
5378         goto fake_study_recurse;
5379     }
5380
5381   finish:
5382     assert(!frame);
5383     DEBUG_STUDYDATA("pre-fin:",data,depth);
5384
5385     *scanp = scan;
5386     *deltap = is_inf_internal ? SSize_t_MAX : delta;
5387
5388     if (flags & SCF_DO_SUBSTR && is_inf)
5389         data->pos_delta = SSize_t_MAX - data->pos_min;
5390     if (is_par > (I32)U8_MAX)
5391         is_par = 0;
5392     if (is_par && pars==1 && data) {
5393         data->flags |= SF_IN_PAR;
5394         data->flags &= ~SF_HAS_PAR;
5395     }
5396     else if (pars && data) {
5397         data->flags |= SF_HAS_PAR;
5398         data->flags &= ~SF_IN_PAR;
5399     }
5400     if (flags & SCF_DO_STCLASS_OR)
5401         ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5402     if (flags & SCF_TRIE_RESTUDY)
5403         data->flags |=  SCF_TRIE_RESTUDY;
5404
5405     DEBUG_STUDYDATA("post-fin:",data,depth);
5406
5407     {
5408         SSize_t final_minlen= min < stopmin ? min : stopmin;
5409
5410         if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5411             RExC_maxlen = final_minlen + delta;
5412         }
5413         return final_minlen;
5414     }
5415     /* not-reached */
5416 }
5417
5418 STATIC U32
5419 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5420 {
5421     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5422
5423     PERL_ARGS_ASSERT_ADD_DATA;
5424
5425     Renewc(RExC_rxi->data,
5426            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5427            char, struct reg_data);
5428     if(count)
5429         Renew(RExC_rxi->data->what, count + n, U8);
5430     else
5431         Newx(RExC_rxi->data->what, n, U8);
5432     RExC_rxi->data->count = count + n;
5433     Copy(s, RExC_rxi->data->what + count, n, U8);
5434     return count;
5435 }
5436
5437 /*XXX: todo make this not included in a non debugging perl */
5438 #ifndef PERL_IN_XSUB_RE
5439 void
5440 Perl_reginitcolors(pTHX)
5441 {
5442     dVAR;
5443     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5444     if (s) {
5445         char *t = savepv(s);
5446         int i = 0;
5447         PL_colors[0] = t;
5448         while (++i < 6) {
5449             t = strchr(t, '\t');
5450             if (t) {
5451                 *t = '\0';
5452                 PL_colors[i] = ++t;
5453             }
5454             else
5455                 PL_colors[i] = t = (char *)"";
5456         }
5457     } else {
5458         int i = 0;
5459         while (i < 6)
5460             PL_colors[i++] = (char *)"";
5461     }
5462     PL_colorset = 1;
5463 }
5464 #endif
5465
5466
5467 #ifdef TRIE_STUDY_OPT
5468 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5469     STMT_START {                                            \
5470         if (                                                \
5471               (data.flags & SCF_TRIE_RESTUDY)               \
5472               && ! restudied++                              \
5473         ) {                                                 \
5474             dOsomething;                                    \
5475             goto reStudy;                                   \
5476         }                                                   \
5477     } STMT_END
5478 #else
5479 #define CHECK_RESTUDY_GOTO_butfirst
5480 #endif
5481
5482 /*
5483  * pregcomp - compile a regular expression into internal code
5484  *
5485  * Decides which engine's compiler to call based on the hint currently in
5486  * scope
5487  */
5488
5489 #ifndef PERL_IN_XSUB_RE
5490
5491 /* return the currently in-scope regex engine (or the default if none)  */
5492
5493 regexp_engine const *
5494 Perl_current_re_engine(pTHX)
5495 {
5496     dVAR;
5497
5498     if (IN_PERL_COMPILETIME) {
5499         HV * const table = GvHV(PL_hintgv);
5500         SV **ptr;
5501
5502         if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5503             return &PL_core_reg_engine;
5504         ptr = hv_fetchs(table, "regcomp", FALSE);
5505         if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5506             return &PL_core_reg_engine;
5507         return INT2PTR(regexp_engine*,SvIV(*ptr));
5508     }
5509     else {
5510         SV *ptr;
5511         if (!PL_curcop->cop_hints_hash)
5512             return &PL_core_reg_engine;
5513         ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5514         if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5515             return &PL_core_reg_engine;
5516         return INT2PTR(regexp_engine*,SvIV(ptr));
5517     }
5518 }
5519
5520
5521 REGEXP *
5522 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5523 {
5524     dVAR;
5525     regexp_engine const *eng = current_re_engine();
5526     GET_RE_DEBUG_FLAGS_DECL;
5527
5528     PERL_ARGS_ASSERT_PREGCOMP;
5529
5530     /* Dispatch a request to compile a regexp to correct regexp engine. */
5531     DEBUG_COMPILE_r({
5532         PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5533                         PTR2UV(eng));
5534     });
5535     return CALLREGCOMP_ENG(eng, pattern, flags);
5536 }
5537 #endif
5538
5539 /* public(ish) entry point for the perl core's own regex compiling code.
5540  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5541  * pattern rather than a list of OPs, and uses the internal engine rather
5542  * than the current one */
5543
5544 REGEXP *
5545 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5546 {
5547     SV *pat = pattern; /* defeat constness! */
5548     PERL_ARGS_ASSERT_RE_COMPILE;
5549     return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5550 #ifdef PERL_IN_XSUB_RE
5551                                 &my_reg_engine,
5552 #else
5553                                 &PL_core_reg_engine,
5554 #endif
5555                                 NULL, NULL, rx_flags, 0);
5556 }
5557
5558
5559 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5560  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5561  * point to the realloced string and length.
5562  *
5563  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5564  * stuff added */
5565
5566 static void
5567 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5568                     char **pat_p, STRLEN *plen_p, int num_code_blocks)
5569 {
5570     U8 *const src = (U8*)*pat_p;
5571     U8 *dst;
5572     int n=0;
5573     STRLEN s = 0, d = 0;
5574     bool do_end = 0;
5575     GET_RE_DEBUG_FLAGS_DECL;
5576
5577     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5578         "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5579
5580     Newx(dst, *plen_p * 2 + 1, U8);
5581
5582     while (s < *plen_p) {
5583         if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5584             dst[d]   = src[s];
5585         else {
5586             dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5587             dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5588         }
5589         if (n < num_code_blocks) {
5590             if (!do_end && pRExC_state->code_blocks[n].start == s) {
5591                 pRExC_state->code_blocks[n].start = d;
5592                 assert(dst[d] == '(');
5593                 do_end = 1;
5594             }
5595             else if (do_end && pRExC_state->code_blocks[n].end == s) {
5596                 pRExC_state->code_blocks[n].end = d;
5597                 assert(dst[d] == ')');
5598                 do_end = 0;
5599                 n++;
5600             }
5601         }
5602         s++;
5603         d++;
5604     }
5605     dst[d] = '\0';
5606     *plen_p = d;
5607     *pat_p = (char*) dst;
5608     SAVEFREEPV(*pat_p);
5609     RExC_orig_utf8 = RExC_utf8 = 1;
5610 }
5611
5612
5613
5614 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5615  * while recording any code block indices, and handling overloading,
5616  * nested qr// objects etc.  If pat is null, it will allocate a new
5617  * string, or just return the first arg, if there's only one.
5618  *
5619  * Returns the malloced/updated pat.
5620  * patternp and pat_count is the array of SVs to be concatted;
5621  * oplist is the optional list of ops that generated the SVs;
5622  * recompile_p is a pointer to a boolean that will be set if
5623  *   the regex will need to be recompiled.
5624  * delim, if non-null is an SV that will be inserted between each element
5625  */
5626
5627 static SV*
5628 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5629                 SV *pat, SV ** const patternp, int pat_count,
5630                 OP *oplist, bool *recompile_p, SV *delim)
5631 {
5632     SV **svp;
5633     int n = 0;
5634     bool use_delim = FALSE;
5635     bool alloced = FALSE;
5636
5637     /* if we know we have at least two args, create an empty string,
5638      * then concatenate args to that. For no args, return an empty string */
5639     if (!pat && pat_count != 1) {
5640         pat = newSVpvn("", 0);
5641         SAVEFREESV(pat);
5642         alloced = TRUE;
5643     }
5644
5645     for (svp = patternp; svp < patternp + pat_count; svp++) {
5646         SV *sv;
5647         SV *rx  = NULL;
5648         STRLEN orig_patlen = 0;
5649         bool code = 0;
5650         SV *msv = use_delim ? delim : *svp;
5651         if (!msv) msv = &PL_sv_undef;
5652
5653         /* if we've got a delimiter, we go round the loop twice for each
5654          * svp slot (except the last), using the delimiter the second
5655          * time round */
5656         if (use_delim) {
5657             svp--;
5658             use_delim = FALSE;
5659         }
5660         else if (delim)
5661             use_delim = TRUE;
5662
5663         if (SvTYPE(msv) == SVt_PVAV) {
5664             /* we've encountered an interpolated array within
5665              * the pattern, e.g. /...@a..../. Expand the list of elements,
5666              * then recursively append elements.
5667              * The code in this block is based on S_pushav() */
5668
5669             AV *const av = (AV*)msv;
5670             const SSize_t maxarg = AvFILL(av) + 1;
5671             SV **array;
5672
5673             if (oplist) {
5674                 assert(oplist->op_type == OP_PADAV
5675                     || oplist->op_type == OP_RV2AV);
5676                 oplist = oplist->op_sibling;;
5677             }
5678
5679             if (SvRMAGICAL(av)) {
5680                 SSize_t i;
5681
5682                 Newx(array, maxarg, SV*);
5683                 SAVEFREEPV(array);
5684                 for (i=0; i < maxarg; i++) {
5685                     SV ** const svp = av_fetch(av, i, FALSE);
5686                     array[i] = svp ? *svp : &PL_sv_undef;
5687                 }
5688             }
5689             else
5690                 array = AvARRAY(av);
5691
5692             pat = S_concat_pat(aTHX_ pRExC_state, pat,
5693                                 array, maxarg, NULL, recompile_p,
5694                                 /* $" */
5695                                 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5696
5697             continue;
5698         }
5699
5700
5701         /* we make the assumption here that each op in the list of
5702          * op_siblings maps to one SV pushed onto the stack,
5703          * except for code blocks, with have both an OP_NULL and
5704          * and OP_CONST.
5705          * This allows us to match up the list of SVs against the
5706          * list of OPs to find the next code block.
5707          *
5708          * Note that       PUSHMARK PADSV PADSV ..
5709          * is optimised to
5710          *                 PADRANGE PADSV  PADSV  ..
5711          * so the alignment still works. */
5712
5713         if (oplist) {
5714             if (oplist->op_type == OP_NULL
5715                 && (oplist->op_flags & OPf_SPECIAL))
5716             {
5717                 assert(n < pRExC_state->num_code_blocks);
5718                 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5719                 pRExC_state->code_blocks[n].block = oplist;
5720                 pRExC_state->code_blocks[n].src_regex = NULL;
5721                 n++;
5722                 code = 1;
5723                 oplist = oplist->op_sibling; /* skip CONST */
5724                 assert(oplist);
5725             }
5726             oplist = oplist->op_sibling;;
5727         }
5728
5729         /* apply magic and QR overloading to arg */
5730
5731         SvGETMAGIC(msv);
5732         if (SvROK(msv) && SvAMAGIC(msv)) {
5733             SV *sv = AMG_CALLunary(msv, regexp_amg);
5734             if (sv) {
5735                 if (SvROK(sv))
5736                     sv = SvRV(sv);
5737                 if (SvTYPE(sv) != SVt_REGEXP)
5738                     Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5739                 msv = sv;
5740             }
5741         }
5742
5743         /* try concatenation overload ... */
5744         if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5745                 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5746         {
5747             sv_setsv(pat, sv);
5748             /* overloading involved: all bets are off over literal
5749              * code. Pretend we haven't seen it */
5750             pRExC_state->num_code_blocks -= n;
5751             n = 0;
5752         }
5753         else  {
5754             /* ... or failing that, try "" overload */
5755             while (SvAMAGIC(msv)
5756                     && (sv = AMG_CALLunary(msv, string_amg))
5757                     && sv != msv
5758                     &&  !(   SvROK(msv)
5759                           && SvROK(sv)
5760                           && SvRV(msv) == SvRV(sv))
5761             ) {
5762                 msv = sv;
5763                 SvGETMAGIC(msv);
5764             }
5765             if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5766                 msv = SvRV(msv);
5767
5768             if (pat) {
5769                 /* this is a partially unrolled
5770                  *     sv_catsv_nomg(pat, msv);
5771                  * that allows us to adjust code block indices if
5772                  * needed */
5773                 STRLEN dlen;
5774                 char *dst = SvPV_force_nomg(pat, dlen);
5775                 orig_patlen = dlen;
5776                 if (SvUTF8(msv) && !SvUTF8(pat)) {
5777                     S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5778                     sv_setpvn(pat, dst, dlen);
5779                     SvUTF8_on(pat);
5780                 }
5781                 sv_catsv_nomg(pat, msv);
5782                 rx = msv;
5783             }
5784             else
5785                 pat = msv;
5786
5787             if (code)
5788                 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5789         }
5790
5791         /* extract any code blocks within any embedded qr//'s */
5792         if (rx && SvTYPE(rx) == SVt_REGEXP
5793             && RX_ENGINE((REGEXP*)rx)->op_comp)
5794         {
5795
5796             RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5797             if (ri->num_code_blocks) {
5798                 int i;
5799                 /* the presence of an embedded qr// with code means
5800                  * we should always recompile: the text of the
5801                  * qr// may not have changed, but it may be a
5802                  * different closure than last time */
5803                 *recompile_p = 1;
5804                 Renew(pRExC_state->code_blocks,
5805                     pRExC_state->num_code_blocks + ri->num_code_blocks,
5806                     struct reg_code_block);
5807                 pRExC_state->num_code_blocks += ri->num_code_blocks;
5808
5809                 for (i=0; i < ri->num_code_blocks; i++) {
5810                     struct reg_code_block *src, *dst;
5811                     STRLEN offset =  orig_patlen
5812                         + ReANY((REGEXP *)rx)->pre_prefix;
5813                     assert(n < pRExC_state->num_code_blocks);
5814                     src = &ri->code_blocks[i];
5815                     dst = &pRExC_state->code_blocks[n];
5816                     dst->start      = src->start + offset;
5817                     dst->end        = src->end   + offset;
5818                     dst->block      = src->block;
5819                     dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5820                                             src->src_regex
5821                                                 ? src->src_regex
5822                                                 : (REGEXP*)rx);
5823                     n++;
5824                 }
5825             }
5826         }
5827     }
5828     /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5829     if (alloced)
5830         SvSETMAGIC(pat);
5831
5832     return pat;
5833 }
5834
5835
5836
5837 /* see if there are any run-time code blocks in the pattern.
5838  * False positives are allowed */
5839
5840 static bool
5841 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5842                     char *pat, STRLEN plen)
5843 {
5844     int n = 0;
5845     STRLEN s;
5846
5847     for (s = 0; s < plen; s++) {
5848         if (n < pRExC_state->num_code_blocks
5849             && s == pRExC_state->code_blocks[n].start)
5850         {
5851             s = pRExC_state->code_blocks[n].end;
5852             n++;
5853             continue;
5854         }
5855         /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5856          * positives here */
5857         if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5858             (pat[s+2] == '{'
5859                 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5860         )
5861             return 1;
5862     }
5863     return 0;
5864 }
5865
5866 /* Handle run-time code blocks. We will already have compiled any direct
5867  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5868  * copy of it, but with any literal code blocks blanked out and
5869  * appropriate chars escaped; then feed it into
5870  *
5871  *    eval "qr'modified_pattern'"
5872  *
5873  * For example,
5874  *
5875  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5876  *
5877  * becomes
5878  *
5879  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5880  *
5881  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5882  * and merge them with any code blocks of the original regexp.
5883  *
5884  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5885  * instead, just save the qr and return FALSE; this tells our caller that
5886  * the original pattern needs upgrading to utf8.
5887  */
5888
5889 static bool
5890 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5891     char *pat, STRLEN plen)
5892 {
5893     SV *qr;
5894
5895     GET_RE_DEBUG_FLAGS_DECL;
5896
5897     if (pRExC_state->runtime_code_qr) {
5898         /* this is the second time we've been called; this should
5899          * only happen if the main pattern got upgraded to utf8
5900          * during compilation; re-use the qr we compiled first time
5901          * round (which should be utf8 too)
5902          */
5903         qr = pRExC_state->runtime_code_qr;
5904         pRExC_state->runtime_code_qr = NULL;
5905         assert(RExC_utf8 && SvUTF8(qr));
5906     }
5907     else {
5908         int n = 0;
5909         STRLEN s;
5910         char *p, *newpat;
5911         int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5912         SV *sv, *qr_ref;
5913         dSP;
5914
5915         /* determine how many extra chars we need for ' and \ escaping */
5916         for (s = 0; s < plen; s++) {
5917             if (pat[s] == '\'' || pat[s] == '\\')
5918                 newlen++;
5919         }
5920
5921         Newx(newpat, newlen, char);
5922         p = newpat;
5923         *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5924
5925         for (s = 0; s < plen; s++) {
5926             if (n < pRExC_state->num_code_blocks
5927                 && s == pRExC_state->code_blocks[n].start)
5928             {
5929                 /* blank out literal code block */
5930                 assert(pat[s] == '(');
5931                 while (s <= pRExC_state->code_blocks[n].end) {
5932                     *p++ = '_';
5933                     s++;
5934                 }
5935                 s--;
5936                 n++;
5937                 continue;
5938             }
5939             if (pat[s] == '\'' || pat[s] == '\\')
5940                 *p++ = '\\';
5941             *p++ = pat[s];
5942         }
5943         *p++ = '\'';
5944         if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5945             *p++ = 'x';
5946         *p++ = '\0';
5947         DEBUG_COMPILE_r({
5948             PerlIO_printf(Perl_debug_log,
5949                 "%sre-parsing pattern for runtime code:%s %s\n",
5950                 PL_colors[4],PL_colors[5],newpat);
5951         });
5952
5953         sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5954         Safefree(newpat);
5955
5956         ENTER;
5957         SAVETMPS;
5958         save_re_context();
5959         PUSHSTACKi(PERLSI_REQUIRE);
5960         /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5961          * parsing qr''; normally only q'' does this. It also alters
5962          * hints handling */
5963         eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5964         SvREFCNT_dec_NN(sv);
5965         SPAGAIN;
5966         qr_ref = POPs;
5967         PUTBACK;
5968         {
5969             SV * const errsv = ERRSV;
5970             if (SvTRUE_NN(errsv))
5971             {
5972                 Safefree(pRExC_state->code_blocks);
5973                 /* use croak_sv ? */
5974                 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5975             }
5976         }
5977         assert(SvROK(qr_ref));
5978         qr = SvRV(qr_ref);
5979         assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5980         /* the leaving below frees the tmp qr_ref.
5981          * Give qr a life of its own */
5982         SvREFCNT_inc(qr);
5983         POPSTACK;
5984         FREETMPS;
5985         LEAVE;
5986
5987     }
5988
5989     if (!RExC_utf8 && SvUTF8(qr)) {
5990         /* first time through; the pattern got upgraded; save the
5991          * qr for the next time through */
5992         assert(!pRExC_state->runtime_code_qr);
5993         pRExC_state->runtime_code_qr = qr;
5994         return 0;
5995     }
5996
5997
5998     /* extract any code blocks within the returned qr//  */
5999
6000
6001     /* merge the main (r1) and run-time (r2) code blocks into one */
6002     {
6003         RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6004         struct reg_code_block *new_block, *dst;
6005         RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6006         int i1 = 0, i2 = 0;
6007
6008         if (!r2->num_code_blocks) /* we guessed wrong */
6009         {
6010             SvREFCNT_dec_NN(qr);
6011             return 1;
6012         }
6013
6014         Newx(new_block,
6015             r1->num_code_blocks + r2->num_code_blocks,
6016             struct reg_code_block);
6017         dst = new_block;
6018
6019         while (    i1 < r1->num_code_blocks
6020                 || i2 < r2->num_code_blocks)
6021         {
6022             struct reg_code_block *src;
6023             bool is_qr = 0;
6024
6025             if (i1 == r1->num_code_blocks) {
6026                 src = &r2->code_blocks[i2++];
6027                 is_qr = 1;
6028             }
6029             else if (i2 == r2->num_code_blocks)
6030                 src = &r1->code_blocks[i1++];
6031             else if (  r1->code_blocks[i1].start
6032                      < r2->code_blocks[i2].start)
6033             {
6034                 src = &r1->code_blocks[i1++];
6035                 assert(src->end < r2->code_blocks[i2].start);
6036             }
6037             else {
6038                 assert(  r1->code_blocks[i1].start
6039                        > r2->code_blocks[i2].start);
6040                 src = &r2->code_blocks[i2++];
6041                 is_qr = 1;
6042                 assert(src->end < r1->code_blocks[i1].start);
6043             }
6044
6045             assert(pat[src->start] == '(');
6046             assert(pat[src->end]   == ')');
6047             dst->start      = src->start;
6048             dst->end        = src->end;
6049             dst->block      = src->block;
6050             dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6051                                     : src->src_regex;
6052             dst++;
6053         }
6054         r1->num_code_blocks += r2->num_code_blocks;
6055         Safefree(r1->code_blocks);
6056         r1->code_blocks = new_block;
6057     }
6058
6059     SvREFCNT_dec_NN(qr);
6060     return 1;
6061 }
6062
6063
6064 STATIC bool
6065 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6066                       SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6067                       SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6068                       STRLEN longest_length, bool eol, bool meol)
6069 {
6070     /* This is the common code for setting up the floating and fixed length
6071      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6072      * as to whether succeeded or not */
6073
6074     I32 t;
6075     SSize_t ml;
6076
6077     if (! (longest_length
6078            || (eol /* Can't have SEOL and MULTI */
6079                && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6080           )
6081             /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6082         || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6083     {
6084         return FALSE;
6085     }
6086
6087     /* copy the information about the longest from the reg_scan_data
6088         over to the program. */
6089     if (SvUTF8(sv_longest)) {
6090         *rx_utf8 = sv_longest;
6091         *rx_substr = NULL;
6092     } else {
6093         *rx_substr = sv_longest;
6094         *rx_utf8 = NULL;
6095     }
6096     /* end_shift is how many chars that must be matched that
6097         follow this item. We calculate it ahead of time as once the
6098         lookbehind offset is added in we lose the ability to correctly
6099         calculate it.*/
6100     ml = minlen ? *(minlen) : (SSize_t)longest_length;
6101     *rx_end_shift = ml - offset
6102         - longest_length + (SvTAIL(sv_longest) != 0)
6103         + lookbehind;
6104
6105     t = (eol/* Can't have SEOL and MULTI */
6106          && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6107     fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6108
6109     return TRUE;
6110 }
6111
6112 /*
6113  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6114  * regular expression into internal code.
6115  * The pattern may be passed either as:
6116  *    a list of SVs (patternp plus pat_count)
6117  *    a list of OPs (expr)
6118  * If both are passed, the SV list is used, but the OP list indicates
6119  * which SVs are actually pre-compiled code blocks
6120  *
6121  * The SVs in the list have magic and qr overloading applied to them (and
6122  * the list may be modified in-place with replacement SVs in the latter
6123  * case).
6124  *
6125  * If the pattern hasn't changed from old_re, then old_re will be
6126  * returned.
6127  *
6128  * eng is the current engine. If that engine has an op_comp method, then
6129  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6130  * do the initial concatenation of arguments and pass on to the external
6131  * engine.
6132  *
6133  * If is_bare_re is not null, set it to a boolean indicating whether the
6134  * arg list reduced (after overloading) to a single bare regex which has
6135  * been returned (i.e. /$qr/).
6136  *
6137  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6138  *
6139  * pm_flags contains the PMf_* flags, typically based on those from the
6140  * pm_flags field of the related PMOP. Currently we're only interested in
6141  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6142  *
6143  * We can't allocate space until we know how big the compiled form will be,
6144  * but we can't compile it (and thus know how big it is) until we've got a
6145  * place to put the code.  So we cheat:  we compile it twice, once with code
6146  * generation turned off and size counting turned on, and once "for real".
6147  * This also means that we don't allocate space until we are sure that the
6148  * thing really will compile successfully, and we never have to move the
6149  * code and thus invalidate pointers into it.  (Note that it has to be in
6150  * one piece because free() must be able to free it all.) [NB: not true in perl]
6151  *
6152  * Beware that the optimization-preparation code in here knows about some
6153  * of the structure of the compiled regexp.  [I'll say.]
6154  */
6155
6156 REGEXP *
6157 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6158                     OP *expr, const regexp_engine* eng, REGEXP *old_re,
6159                      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6160 {
6161     dVAR;
6162     REGEXP *rx;
6163     struct regexp *r;
6164     regexp_internal *ri;
6165     STRLEN plen;
6166     char *exp;
6167     regnode *scan;
6168     I32 flags;
6169     SSize_t minlen = 0;
6170     U32 rx_flags;
6171     SV *pat;
6172     SV *code_blocksv = NULL;
6173     SV** new_patternp = patternp;
6174
6175     /* these are all flags - maybe they should be turned
6176      * into a single int with different bit masks */
6177     I32 sawlookahead = 0;
6178     I32 sawplus = 0;
6179     I32 sawopen = 0;
6180     I32 sawminmod = 0;
6181
6182     regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6183     bool recompile = 0;
6184     bool runtime_code = 0;
6185     scan_data_t data;
6186     RExC_state_t RExC_state;
6187     RExC_state_t * const pRExC_state = &RExC_state;
6188 #ifdef TRIE_STUDY_OPT
6189     int restudied = 0;
6190     RExC_state_t copyRExC_state;
6191 #endif
6192     GET_RE_DEBUG_FLAGS_DECL;
6193
6194     PERL_ARGS_ASSERT_RE_OP_COMPILE;
6195
6196     DEBUG_r(if (!PL_colorset) reginitcolors());
6197
6198 #ifndef PERL_IN_XSUB_RE
6199     /* Initialize these here instead of as-needed, as is quick and avoids
6200      * having to test them each time otherwise */
6201     if (! PL_AboveLatin1) {
6202         PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6203         PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6204         PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6205         PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6206         PL_HasMultiCharFold =
6207                        _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6208     }
6209 #endif
6210
6211     pRExC_state->code_blocks = NULL;
6212     pRExC_state->num_code_blocks = 0;
6213
6214     if (is_bare_re)
6215         *is_bare_re = FALSE;
6216
6217     if (expr && (expr->op_type == OP_LIST ||
6218                 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6219         /* allocate code_blocks if needed */
6220         OP *o;
6221         int ncode = 0;
6222
6223         for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6224             if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6225                 ncode++; /* count of DO blocks */
6226         if (ncode) {
6227             pRExC_state->num_code_blocks = ncode;
6228             Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6229         }
6230     }
6231
6232     if (!pat_count) {
6233         /* compile-time pattern with just OP_CONSTs and DO blocks */
6234
6235         int n;
6236         OP *o;
6237
6238         /* find how many CONSTs there are */
6239         assert(expr);
6240         n = 0;
6241         if (expr->op_type == OP_CONST)
6242             n = 1;
6243         else
6244             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6245                 if (o->op_type == OP_CONST)
6246                     n++;
6247             }
6248
6249         /* fake up an SV array */
6250
6251         assert(!new_patternp);
6252         Newx(new_patternp, n, SV*);
6253         SAVEFREEPV(new_patternp);
6254         pat_count = n;
6255
6256         n = 0;
6257         if (expr->op_type == OP_CONST)
6258             new_patternp[n] = cSVOPx_sv(expr);
6259         else
6260             for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6261                 if (o->op_type == OP_CONST)
6262                     new_patternp[n++] = cSVOPo_sv;
6263             }
6264
6265     }
6266
6267     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6268         "Assembling pattern from %d elements%s\n", pat_count,
6269             orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6270
6271     /* set expr to the first arg op */
6272
6273     if (pRExC_state->num_code_blocks
6274          && expr->op_type != OP_CONST)
6275     {
6276             expr = cLISTOPx(expr)->op_first;
6277             assert(   expr->op_type == OP_PUSHMARK
6278                    || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6279                    || expr->op_type == OP_PADRANGE);
6280             expr = expr->op_sibling;
6281     }
6282
6283     pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6284                         expr, &recompile, NULL);
6285
6286     /* handle bare (possibly after overloading) regex: foo =~ $re */
6287     {
6288         SV *re = pat;
6289         if (SvROK(re))
6290             re = SvRV(re);
6291         if (SvTYPE(re) == SVt_REGEXP) {
6292             if (is_bare_re)
6293                 *is_bare_re = TRUE;
6294             SvREFCNT_inc(re);
6295             Safefree(pRExC_state->code_blocks);
6296             DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6297                 "Precompiled pattern%s\n",
6298                     orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6299
6300             return (REGEXP*)re;
6301         }
6302     }
6303
6304     exp = SvPV_nomg(pat, plen);
6305
6306     if (!eng->op_comp) {
6307         if ((SvUTF8(pat) && IN_BYTES)
6308                 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6309         {
6310             /* make a temporary copy; either to convert to bytes,
6311              * or to avoid repeating get-magic / overloaded stringify */
6312             pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6313                                         (IN_BYTES ? 0 : SvUTF8(pat)));
6314         }
6315         Safefree(pRExC_state->code_blocks);
6316         return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6317     }
6318
6319     /* ignore the utf8ness if the pattern is 0 length */
6320     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6321     RExC_uni_semantics = 0;
6322     RExC_contains_locale = 0;
6323     RExC_contains_i = 0;
6324     pRExC_state->runtime_code_qr = NULL;
6325
6326     DEBUG_COMPILE_r({
6327             SV *dsv= sv_newmortal();
6328             RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6329             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6330                           PL_colors[4],PL_colors[5],s);
6331         });
6332
6333   redo_first_pass:
6334     /* we jump here if we upgrade the pattern to utf8 and have to
6335      * recompile */
6336
6337     if ((pm_flags & PMf_USE_RE_EVAL)
6338                 /* this second condition covers the non-regex literal case,
6339                  * i.e.  $foo =~ '(?{})'. */
6340                 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6341     )
6342         runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6343
6344     /* return old regex if pattern hasn't changed */
6345     /* XXX: note in the below we have to check the flags as well as the
6346      * pattern.
6347      *
6348      * Things get a touch tricky as we have to compare the utf8 flag
6349      * independently from the compile flags.  */
6350
6351     if (   old_re
6352         && !recompile
6353         && !!RX_UTF8(old_re) == !!RExC_utf8
6354         && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6355         && RX_PRECOMP(old_re)
6356         && RX_PRELEN(old_re) == plen
6357         && memEQ(RX_PRECOMP(old_re), exp, plen)
6358         && !runtime_code /* with runtime code, always recompile */ )
6359     {
6360         Safefree(pRExC_state->code_blocks);
6361         return old_re;
6362     }
6363
6364     rx_flags = orig_rx_flags;
6365
6366     if (rx_flags & PMf_FOLD) {
6367         RExC_contains_i = 1;
6368     }
6369     if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6370
6371         /* Set to use unicode semantics if the pattern is in utf8 and has the
6372          * 'depends' charset specified, as it means unicode when utf8  */
6373         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6374     }
6375
6376     RExC_precomp = exp;
6377     RExC_flags = rx_flags;
6378     RExC_pm_flags = pm_flags;
6379
6380     if (runtime_code) {
6381         if (TAINTING_get && TAINT_get)
6382             Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6383
6384         if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6385             /* whoops, we have a non-utf8 pattern, whilst run-time code
6386              * got compiled as utf8. Try again with a utf8 pattern */
6387             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6388                                     pRExC_state->num_code_blocks);
6389             goto redo_first_pass;
6390         }
6391     }
6392     assert(!pRExC_state->runtime_code_qr);
6393
6394     RExC_sawback = 0;
6395
6396     RExC_seen = 0;
6397     RExC_maxlen = 0;
6398     RExC_in_lookbehind = 0;
6399     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6400     RExC_extralen = 0;
6401     RExC_override_recoding = 0;
6402     RExC_in_multi_char_class = 0;
6403
6404     /* First pass: determine size, legality. */
6405     RExC_parse = exp;
6406     RExC_start = exp;
6407     RExC_end = exp + plen;
6408     RExC_naughty = 0;
6409     RExC_npar = 1;
6410     RExC_nestroot = 0;
6411     RExC_size = 0L;
6412     RExC_emit = (regnode *) &RExC_emit_dummy;
6413     RExC_whilem_seen = 0;
6414     RExC_open_parens = NULL;
6415     RExC_close_parens = NULL;
6416     RExC_opend = NULL;
6417     RExC_paren_names = NULL;
6418 #ifdef DEBUGGING
6419     RExC_paren_name_list = NULL;
6420 #endif
6421     RExC_recurse = NULL;
6422     RExC_study_chunk_recursed = NULL;
6423     RExC_study_chunk_recursed_bytes= 0;
6424     RExC_recurse_count = 0;
6425     pRExC_state->code_index = 0;
6426
6427 #if 0 /* REGC() is (currently) a NOP at the first pass.
6428        * Clever compilers notice this and complain. --jhi */
6429     REGC((U8)REG_MAGIC, (char*)RExC_emit);
6430 #endif
6431     DEBUG_PARSE_r(
6432         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6433         RExC_lastnum=0;
6434         RExC_lastparse=NULL;
6435     );
6436     /* reg may croak on us, not giving us a chance to free
6437        pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6438        need it to survive as long as the regexp (qr/(?{})/).
6439        We must check that code_blocksv is not already set, because we may
6440        have jumped back to restart the sizing pass. */
6441     if (pRExC_state->code_blocks && !code_blocksv) {
6442         code_blocksv = newSV_type(SVt_PV);
6443         SAVEFREESV(code_blocksv);
6444         SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6445         SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6446     }
6447     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6448         /* It's possible to write a regexp in ascii that represents Unicode
6449         codepoints outside of the byte range, such as via \x{100}. If we
6450         detect such a sequence we have to convert the entire pattern to utf8
6451         and then recompile, as our sizing calculation will have been based
6452         on 1 byte == 1 character, but we will need to use utf8 to encode
6453         at least some part of the pattern, and therefore must convert the whole
6454         thing.
6455         -- dmq */
6456         if (flags & RESTART_UTF8) {
6457             S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6458                                     pRExC_state->num_code_blocks);
6459             goto redo_first_pass;
6460         }
6461         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6462     }
6463     if (code_blocksv)
6464         SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6465
6466     DEBUG_PARSE_r({
6467         PerlIO_printf(Perl_debug_log,
6468             "Required size %"IVdf" nodes\n"
6469             "Starting second pass (creation)\n",
6470             (IV)RExC_size);
6471         RExC_lastnum=0;
6472         RExC_lastparse=NULL;
6473     });
6474
6475     /* The first pass could have found things that force Unicode semantics */
6476     if ((RExC_utf8 || RExC_uni_semantics)
6477          && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6478     {
6479         set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6480     }
6481
6482     /* Small enough for pointer-storage convention?
6483        If extralen==0, this means that we will not need long jumps. */
6484     if (RExC_size >= 0x10000L && RExC_extralen)
6485         RExC_size += RExC_extralen;
6486     else
6487         RExC_extralen = 0;
6488     if (RExC_whilem_seen > 15)
6489         RExC_whilem_seen = 15;
6490
6491     /* Allocate space and zero-initialize. Note, the two step process
6492        of zeroing when in debug mode, thus anything assigned has to
6493        happen after that */
6494     rx = (REGEXP*) newSV_type(SVt_REGEXP);
6495     r = ReANY(rx);
6496     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6497          char, regexp_internal);
6498     if ( r == NULL || ri == NULL )
6499         FAIL("Regexp out of space");
6500 #ifdef DEBUGGING
6501     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6502     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6503          char);
6504 #else
6505     /* bulk initialize base fields with 0. */
6506     Zero(ri, sizeof(regexp_internal), char);
6507 #endif
6508
6509     /* non-zero initialization begins here */
6510     RXi_SET( r, ri );
6511     r->engine= eng;
6512     r->extflags = rx_flags;
6513     RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6514
6515     if (pm_flags & PMf_IS_QR) {
6516         ri->code_blocks = pRExC_state->code_blocks;
6517         ri->num_code_blocks = pRExC_state->num_code_blocks;
6518     }
6519     else
6520     {
6521         int n;
6522         for (n = 0; n < pRExC_state->num_code_blocks; n++)
6523             if (pRExC_state->code_blocks[n].src_regex)
6524                 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6525         SAVEFREEPV(pRExC_state->code_blocks);
6526     }
6527
6528     {
6529         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6530         bool has_charset = (get_regex_charset(r->extflags)
6531                                                     != REGEX_DEPENDS_CHARSET);
6532
6533         /* The caret is output if there are any defaults: if not all the STD
6534          * flags are set, or if no character set specifier is needed */
6535         bool has_default =
6536                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6537                     || ! has_charset);
6538         bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6539                                                    == REG_RUN_ON_COMMENT_SEEN);
6540         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6541                             >> RXf_PMf_STD_PMMOD_SHIFT);
6542         const char *fptr = STD_PAT_MODS;        /*"msix"*/
6543         char *p;
6544         /* Allocate for the worst case, which is all the std flags are turned
6545          * on.  If more precision is desired, we could do a population count of
6546          * the flags set.  This could be done with a small lookup table, or by
6547          * shifting, masking and adding, or even, when available, assembly
6548          * language for a machine-language population count.
6549          * We never output a minus, as all those are defaults, so are
6550          * covered by the caret */
6551         const STRLEN wraplen = plen + has_p + has_runon
6552             + has_default       /* If needs a caret */
6553
6554                 /* If needs a character set specifier */
6555             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6556             + (sizeof(STD_PAT_MODS) - 1)
6557             + (sizeof("(?:)") - 1);
6558
6559         Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6560         r->xpv_len_u.xpvlenu_pv = p;
6561         if (RExC_utf8)
6562             SvFLAGS(rx) |= SVf_UTF8;
6563         *p++='('; *p++='?';
6564
6565         /* If a default, cover it using the caret */
6566         if (has_default) {
6567             *p++= DEFAULT_PAT_MOD;
6568         }
6569         if (has_charset) {
6570             STRLEN len;
6571             const char* const name = get_regex_charset_name(r->extflags, &len);
6572             Copy(name, p, len, char);
6573             p += len;
6574         }
6575         if (has_p)
6576             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6577         {
6578             char ch;
6579             while((ch = *fptr++)) {
6580                 if(reganch & 1)
6581                     *p++ = ch;
6582                 reganch >>= 1;
6583             }
6584         }
6585
6586         *p++ = ':';
6587         Copy(RExC_precomp, p, plen, char);
6588         assert ((RX_WRAPPED(rx) - p) < 16);
6589         r->pre_prefix = p - RX_WRAPPED(rx);
6590         p += plen;
6591         if (has_runon)
6592             *p++ = '\n';
6593         *p++ = ')';
6594         *p = 0;
6595         SvCUR_set(rx, p - RX_WRAPPED(rx));
6596     }
6597
6598     r->intflags = 0;
6599     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6600
6601     /* setup various meta data about recursion, this all requires
6602      * RExC_npar to be correctly set, and a bit later on we clear it */
6603     if (RExC_seen & REG_RECURSE_SEEN) {
6604         Newxz(RExC_open_parens, RExC_npar,regnode *);
6605         SAVEFREEPV(RExC_open_parens);
6606         Newxz(RExC_close_parens,RExC_npar,regnode *);
6607         SAVEFREEPV(RExC_close_parens);
6608     }
6609     if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6610         /* Note, RExC_npar is 1 + the number of parens in a pattern.
6611          * So its 1 if there are no parens. */
6612         RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6613                                          ((RExC_npar & 0x07) != 0);
6614         Newx(RExC_study_chunk_recursed,
6615              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6616         SAVEFREEPV(RExC_study_chunk_recursed);
6617     }
6618
6619     /* Useful during FAIL. */
6620 #ifdef RE_TRACK_PATTERN_OFFSETS
6621     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6622     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6623                           "%s %"UVuf" bytes for offset annotations.\n",
6624                           ri->u.offsets ? "Got" : "Couldn't get",
6625                           (UV)((2*RExC_size+1) * sizeof(U32))));
6626 #endif
6627     SetProgLen(ri,RExC_size);
6628     RExC_rx_sv = rx;
6629     RExC_rx = r;
6630     RExC_rxi = ri;
6631
6632     /* Second pass: emit code. */
6633     RExC_flags = rx_flags;      /* don't let top level (?i) bleed */
6634     RExC_pm_flags = pm_flags;
6635     RExC_parse = exp;
6636     RExC_end = exp + plen;
6637     RExC_naughty = 0;
6638     RExC_npar = 1;
6639     RExC_emit_start = ri->program;
6640     RExC_emit = ri->program;
6641     RExC_emit_bound = ri->program + RExC_size + 1;
6642     pRExC_state->code_index = 0;
6643
6644     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6645     if (reg(pRExC_state, 0, &flags,1) == NULL) {
6646         ReREFCNT_dec(rx);
6647         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6648     }
6649     /* XXXX To minimize changes to RE engine we always allocate
6650        3-units-long substrs field. */
6651     Newx(r->substrs, 1, struct reg_substr_data);
6652     if (RExC_recurse_count) {
6653         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6654         SAVEFREEPV(RExC_recurse);
6655     }
6656
6657 reStudy:
6658     r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6659     Zero(r->substrs, 1, struct reg_substr_data);
6660     if (RExC_study_chunk_recursed)
6661         Zero(RExC_study_chunk_recursed,
6662              RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6663
6664 #ifdef TRIE_STUDY_OPT
6665     if (!restudied) {
6666         StructCopy(&zero_scan_data, &data, scan_data_t);
6667         copyRExC_state = RExC_state;
6668     } else {
6669         U32 seen=RExC_seen;
6670         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6671
6672         RExC_state = copyRExC_state;
6673         if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6674             RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6675         else
6676             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6677         StructCopy(&zero_scan_data, &data, scan_data_t);
6678     }
6679 #else
6680     StructCopy(&zero_scan_data, &data, scan_data_t);
6681 #endif
6682
6683     /* Dig out information for optimizations. */
6684     r->extflags = RExC_flags; /* was pm_op */
6685     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6686
6687     if (UTF)
6688         SvUTF8_on(rx);  /* Unicode in it? */
6689     ri->regstclass = NULL;
6690     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
6691         r->intflags |= PREGf_NAUGHTY;
6692     scan = ri->program + 1;             /* First BRANCH. */
6693
6694     /* testing for BRANCH here tells us whether there is "must appear"
6695        data in the pattern. If there is then we can use it for optimisations */
6696     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6697                                                   */
6698         SSize_t fake;
6699         STRLEN longest_float_length, longest_fixed_length;
6700         regnode_ssc ch_class; /* pointed to by data */
6701         int stclass_flag;
6702         SSize_t last_close = 0; /* pointed to by data */
6703         regnode *first= scan;
6704         regnode *first_next= regnext(first);
6705         /*
6706          * Skip introductions and multiplicators >= 1
6707          * so that we can extract the 'meat' of the pattern that must
6708          * match in the large if() sequence following.
6709          * NOTE that EXACT is NOT covered here, as it is normally
6710          * picked up by the optimiser separately.
6711          *
6712          * This is unfortunate as the optimiser isnt handling lookahead
6713          * properly currently.
6714          *
6715          */
6716         while ((OP(first) == OPEN && (sawopen = 1)) ||
6717                /* An OR of *one* alternative - should not happen now. */
6718             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6719             /* for now we can't handle lookbehind IFMATCH*/
6720             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6721             (OP(first) == PLUS) ||
6722             (OP(first) == MINMOD) ||
6723                /* An {n,m} with n>0 */
6724             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6725             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6726         {
6727                 /*
6728                  * the only op that could be a regnode is PLUS, all the rest
6729                  * will be regnode_1 or regnode_2.
6730                  *
6731                  * (yves doesn't think this is true)
6732                  */
6733                 if (OP(first) == PLUS)
6734                     sawplus = 1;
6735                 else {
6736                     if (OP(first) == MINMOD)
6737                         sawminmod = 1;
6738                     first += regarglen[OP(first)];
6739                 }
6740                 first = NEXTOPER(first);
6741                 first_next= regnext(first);
6742         }
6743
6744         /* Starting-point info. */
6745       again:
6746         DEBUG_PEEP("first:",first,0);
6747         /* Ignore EXACT as we deal with it later. */
6748         if (PL_regkind[OP(first)] == EXACT) {
6749             if (OP(first) == EXACT)
6750                 NOOP;   /* Empty, get anchored substr later. */
6751             else
6752                 ri->regstclass = first;
6753         }
6754 #ifdef TRIE_STCLASS
6755         else if (PL_regkind[OP(first)] == TRIE &&
6756                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6757         {
6758             regnode *trie_op;
6759             /* this can happen only on restudy */
6760             if ( OP(first) == TRIE ) {
6761                 struct regnode_1 *trieop = (struct regnode_1 *)
6762                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
6763                 StructCopy(first,trieop,struct regnode_1);
6764                 trie_op=(regnode *)trieop;
6765             } else {
6766                 struct regnode_charclass *trieop = (struct regnode_charclass *)
6767                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6768                 StructCopy(first,trieop,struct regnode_charclass);
6769                 trie_op=(regnode *)trieop;
6770             }
6771             OP(trie_op)+=2;
6772             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6773             ri->regstclass = trie_op;
6774         }
6775 #endif
6776         else if (REGNODE_SIMPLE(OP(first)))
6777             ri->regstclass = first;
6778         else if (PL_regkind[OP(first)] == BOUND ||
6779                  PL_regkind[OP(first)] == NBOUND)
6780             ri->regstclass = first;
6781         else if (PL_regkind[OP(first)] == BOL) {
6782             r->intflags |= (OP(first) == MBOL
6783                            ? PREGf_ANCH_MBOL
6784                            : (OP(first) == SBOL
6785                               ? PREGf_ANCH_SBOL
6786                               : PREGf_ANCH_BOL));
6787             first = NEXTOPER(first);
6788             goto again;
6789         }
6790         else if (OP(first) == GPOS) {
6791             r->intflags |= PREGf_ANCH_GPOS;
6792             first = NEXTOPER(first);
6793             goto again;
6794         }
6795         else if ((!sawopen || !RExC_sawback) &&
6796             (OP(first) == STAR &&
6797             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6798             !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6799         {
6800             /* turn .* into ^.* with an implied $*=1 */
6801             const int type =
6802                 (OP(NEXTOPER(first)) == REG_ANY)
6803                     ? PREGf_ANCH_MBOL
6804                     : PREGf_ANCH_SBOL;
6805             r->intflags |= (type | PREGf_IMPLICIT);
6806             first = NEXTOPER(first);
6807             goto again;
6808         }
6809         if (sawplus && !sawminmod && !sawlookahead
6810             && (!sawopen || !RExC_sawback)
6811             && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6812             /* x+ must match at the 1st pos of run of x's */
6813             r->intflags |= PREGf_SKIP;
6814
6815         /* Scan is after the zeroth branch, first is atomic matcher. */
6816 #ifdef TRIE_STUDY_OPT
6817         DEBUG_PARSE_r(
6818             if (!restudied)
6819                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6820                               (IV)(first - scan + 1))
6821         );
6822 #else
6823         DEBUG_PARSE_r(
6824             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6825                 (IV)(first - scan + 1))
6826         );
6827 #endif
6828
6829
6830         /*
6831         * If there's something expensive in the r.e., find the
6832         * longest literal string that must appear and make it the
6833         * regmust.  Resolve ties in favor of later strings, since
6834         * the regstart check works with the beginning of the r.e.
6835         * and avoiding duplication strengthens checking.  Not a
6836         * strong reason, but sufficient in the absence of others.
6837         * [Now we resolve ties in favor of the earlier string if
6838         * it happens that c_offset_min has been invalidated, since the
6839         * earlier string may buy us something the later one won't.]
6840         */
6841
6842         data.longest_fixed = newSVpvs("");
6843         data.longest_float = newSVpvs("");
6844         data.last_found = newSVpvs("");
6845         data.longest = &(data.longest_fixed);
6846         ENTER_with_name("study_chunk");
6847         SAVEFREESV(data.longest_fixed);
6848         SAVEFREESV(data.longest_float);
6849         SAVEFREESV(data.last_found);
6850         first = scan;
6851         if (!ri->regstclass) {
6852             ssc_init(pRExC_state, &ch_class);
6853             data.start_class = &ch_class;
6854             stclass_flag = SCF_DO_STCLASS_AND;
6855         } else                          /* XXXX Check for BOUND? */
6856             stclass_flag = 0;
6857         data.last_closep = &last_close;
6858
6859         DEBUG_RExC_seen();
6860         minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6861                              scan + RExC_size, /* Up to end */
6862             &data, -1, 0, NULL,
6863             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6864                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6865             0);
6866
6867
6868         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6869
6870
6871         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6872              && data.last_start_min == 0 && data.last_end > 0
6873              && !RExC_seen_zerolen
6874              && !(RExC_seen & REG_VERBARG_SEEN)
6875              && !(RExC_seen & REG_GPOS_SEEN)
6876         ){
6877             r->extflags |= RXf_CHECK_ALL;
6878         }
6879         scan_commit(pRExC_state, &data,&minlen,0);
6880
6881         longest_float_length = CHR_SVLEN(data.longest_float);
6882
6883         if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6884                    && data.offset_fixed == data.offset_float_min
6885                    && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6886             && S_setup_longest (aTHX_ pRExC_state,
6887                                     data.longest_float,
6888                                     &(r->float_utf8),
6889                                     &(r->float_substr),
6890                                     &(r->float_end_shift),
6891                                     data.lookbehind_float,
6892                                     data.offset_float_min,
6893                                     data.minlen_float,
6894                                     longest_float_length,
6895                                     cBOOL(data.flags & SF_FL_BEFORE_EOL),
6896                                     cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6897         {
6898             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6899             r->float_max_offset = data.offset_float_max;
6900             if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6901                 r->float_max_offset -= data.lookbehind_float;
6902             SvREFCNT_inc_simple_void_NN(data.longest_float);
6903         }
6904         else {
6905             r->float_substr = r->float_utf8 = NULL;
6906             longest_float_length = 0;
6907         }
6908
6909         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6910
6911         if (S_setup_longest (aTHX_ pRExC_state,
6912                                 data.longest_fixed,
6913                                 &(r->anchored_utf8),
6914                                 &(r->anchored_substr),
6915                                 &(r->anchored_end_shift),
6916                                 data.lookbehind_fixed,
6917                                 data.offset_fixed,
6918                                 data.minlen_fixed,
6919                                 longest_fixed_length,
6920                                 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6921                                 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6922         {
6923             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6924             SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6925         }
6926         else {
6927             r->anchored_substr = r->anchored_utf8 = NULL;
6928             longest_fixed_length = 0;
6929         }
6930         LEAVE_with_name("study_chunk");
6931
6932         if (ri->regstclass
6933             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6934             ri->regstclass = NULL;
6935
6936         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6937             && stclass_flag
6938             && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6939             && !ssc_is_anything(data.start_class))
6940         {
6941             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6942
6943             ssc_finalize(pRExC_state, data.start_class);
6944
6945             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6946             StructCopy(data.start_class,
6947                        (regnode_ssc*)RExC_rxi->data->data[n],
6948                        regnode_ssc);
6949             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6950             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6951             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6952                       regprop(r, sv, (regnode*)data.start_class, NULL);
6953                       PerlIO_printf(Perl_debug_log,
6954                                     "synthetic stclass \"%s\".\n",
6955                                     SvPVX_const(sv));});
6956             data.start_class = NULL;
6957         }
6958
6959         /* A temporary algorithm prefers floated substr to fixed one to dig
6960          * more info. */
6961         if (longest_fixed_length > longest_float_length) {
6962             r->substrs->check_ix = 0;
6963             r->check_end_shift = r->anchored_end_shift;
6964             r->check_substr = r->anchored_substr;
6965             r->check_utf8 = r->anchored_utf8;
6966             r->check_offset_min = r->check_offset_max = r->anchored_offset;
6967             if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
6968                 r->intflags |= PREGf_NOSCAN;
6969         }
6970         else {
6971             r->substrs->check_ix = 1;
6972             r->check_end_shift = r->float_end_shift;
6973             r->check_substr = r->float_substr;
6974             r->check_utf8 = r->float_utf8;
6975             r->check_offset_min = r->float_min_offset;
6976             r->check_offset_max = r->float_max_offset;
6977         }
6978         if ((r->check_substr || r->check_utf8) ) {
6979             r->extflags |= RXf_USE_INTUIT;
6980             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6981                 r->extflags |= RXf_INTUIT_TAIL;
6982         }
6983         r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
6984
6985         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6986         if ( (STRLEN)minlen < longest_float_length )
6987             minlen= longest_float_length;
6988         if ( (STRLEN)minlen < longest_fixed_length )
6989             minlen= longest_fixed_length;
6990         */
6991     }
6992     else {
6993         /* Several toplevels. Best we can is to set minlen. */
6994         SSize_t fake;
6995         regnode_ssc ch_class;
6996         SSize_t last_close = 0;
6997
6998         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6999
7000         scan = ri->program + 1;
7001         ssc_init(pRExC_state, &ch_class);
7002         data.start_class = &ch_class;
7003         data.last_closep = &last_close;
7004
7005         DEBUG_RExC_seen();
7006         minlen = study_chunk(pRExC_state,
7007             &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7008             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7009                                                       ? SCF_TRIE_DOING_RESTUDY
7010                                                       : 0),
7011             0);
7012
7013         CHECK_RESTUDY_GOTO_butfirst(NOOP);
7014
7015         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7016                 = r->float_substr = r->float_utf8 = NULL;
7017
7018         if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7019             && ! ssc_is_anything(data.start_class))
7020         {
7021             const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7022
7023             ssc_finalize(pRExC_state, data.start_class);
7024
7025             Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7026             StructCopy(data.start_class,
7027                        (regnode_ssc*)RExC_rxi->data->data[n],
7028                        regnode_ssc);
7029             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7030             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7031             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7032                       regprop(r, sv, (regnode*)data.start_class, NULL);
7033                       PerlIO_printf(Perl_debug_log,
7034                                     "synthetic stclass \"%s\".\n",
7035                                     SvPVX_const(sv));});
7036             data.start_class = NULL;
7037         }
7038     }
7039
7040     if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7041         r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7042         r->maxlen = REG_INFTY;
7043     }
7044     else {
7045         r->maxlen = RExC_maxlen;
7046     }
7047
7048     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7049        the "real" pattern. */
7050     DEBUG_OPTIMISE_r({
7051         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7052                       (IV)minlen, (IV)r->minlen, RExC_maxlen);
7053     });
7054     r->minlenret = minlen;
7055     if (r->minlen < minlen)
7056         r->minlen = minlen;
7057
7058     if (RExC_seen & REG_GPOS_SEEN)
7059         r->intflags |= PREGf_GPOS_SEEN;
7060     if (RExC_seen & REG_LOOKBEHIND_SEEN)
7061         r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7062                                                 lookbehind */
7063     if (pRExC_state->num_code_blocks)
7064         r->extflags |= RXf_EVAL_SEEN;
7065     if (RExC_seen & REG_CANY_SEEN)
7066         r->intflags |= PREGf_CANY_SEEN;
7067     if (RExC_seen & REG_VERBARG_SEEN)
7068     {
7069         r->intflags |= PREGf_VERBARG_SEEN;
7070         r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7071     }
7072     if (RExC_seen & REG_CUTGROUP_SEEN)
7073         r->intflags |= PREGf_CUTGROUP_SEEN;
7074     if (pm_flags & PMf_USE_RE_EVAL)
7075         r->intflags |= PREGf_USE_RE_EVAL;
7076     if (RExC_paren_names)
7077         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7078     else
7079         RXp_PAREN_NAMES(r) = NULL;
7080
7081     /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7082      * so it can be used in pp.c */
7083     if (r->intflags & PREGf_ANCH)
7084         r->extflags |= RXf_IS_ANCHORED;
7085
7086
7087     {
7088         /* this is used to identify "special" patterns that might result
7089          * in Perl NOT calling the regex engine and instead doing the match "itself",
7090          * particularly special cases in split//. By having the regex compiler
7091          * do this pattern matching at a regop level (instead of by inspecting the pattern)
7092          * we avoid weird issues with equivalent patterns resulting in different behavior,
7093          * AND we allow non Perl engines to get the same optimizations by the setting the
7094          * flags appropriately - Yves */
7095         regnode *first = ri->program + 1;
7096         U8 fop = OP(first);
7097         regnode *next = NEXTOPER(first);
7098         U8 nop = OP(next);
7099
7100         if (PL_regkind[fop] == NOTHING && nop == END)
7101             r->extflags |= RXf_NULL;
7102         else if (PL_regkind[fop] == BOL && nop == END)
7103             r->extflags |= RXf_START_ONLY;
7104         else if (fop == PLUS
7105                  && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7106                  && OP(regnext(first)) == END)
7107             r->extflags |= RXf_WHITE;
7108         else if ( r->extflags & RXf_SPLIT
7109                   && fop == EXACT
7110                   && STR_LEN(first) == 1
7111                   && *(STRING(first)) == ' '
7112                   && OP(regnext(first)) == END )
7113             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7114
7115     }
7116
7117     if (RExC_contains_locale) {
7118         RXp_EXTFLAGS(r) |= RXf_TAINTED;
7119     }
7120
7121 #ifdef DEBUGGING
7122     if (RExC_paren_names) {
7123         ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7124         ri->data->data[ri->name_list_idx]
7125                                    = (void*)SvREFCNT_inc(RExC_paren_name_list);
7126     } else
7127 #endif
7128         ri->name_list_idx = 0;
7129
7130     if (RExC_recurse_count) {
7131         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7132             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7133             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7134         }
7135     }
7136     Newxz(r->offs, RExC_npar, regexp_paren_pair);
7137     /* assume we don't need to swap parens around before we match */
7138
7139     DEBUG_DUMP_r({
7140         DEBUG_RExC_seen();
7141         PerlIO_printf(Perl_debug_log,"Final program:\n");
7142         regdump(r);
7143     });
7144 #ifdef RE_TRACK_PATTERN_OFFSETS
7145     DEBUG_OFFSETS_r(if (ri->u.offsets) {
7146         const STRLEN len = ri->u.offsets[0];
7147         STRLEN i;
7148         GET_RE_DEBUG_FLAGS_DECL;
7149         PerlIO_printf(Perl_debug_log,
7150                       "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7151         for (i = 1; i <= len; i++) {
7152             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7153                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7154                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7155             }
7156         PerlIO_printf(Perl_debug_log, "\n");
7157     });
7158 #endif
7159
7160 #ifdef USE_ITHREADS
7161     /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7162      * by setting the regexp SV to readonly-only instead. If the
7163      * pattern's been recompiled, the USEDness should remain. */
7164     if (old_re && SvREADONLY(old_re))
7165         SvREADONLY_on(rx);
7166 #endif
7167     return rx;
7168 }
7169
7170
7171 SV*
7172 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7173                     const U32 flags)
7174 {
7175     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7176
7177     PERL_UNUSED_ARG(value);
7178
7179     if (flags & RXapif_FETCH) {
7180         return reg_named_buff_fetch(rx, key, flags);
7181     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7182         Perl_croak_no_modify();
7183         return NULL;
7184     } else if (flags & RXapif_EXISTS) {
7185         return reg_named_buff_exists(rx, key, flags)
7186             ? &PL_sv_yes
7187             : &PL_sv_no;
7188     } else if (flags & RXapif_REGNAMES) {
7189         return reg_named_buff_all(rx, flags);
7190     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7191         return reg_named_buff_scalar(rx, flags);
7192     } else {
7193         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7194         return NULL;
7195     }
7196 }
7197
7198 SV*
7199 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7200                          const U32 flags)
7201 {
7202     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7203     PERL_UNUSED_ARG(lastkey);
7204
7205     if (flags & RXapif_FIRSTKEY)
7206         return reg_named_buff_firstkey(rx, flags);
7207     else if (flags & RXapif_NEXTKEY)
7208         return reg_named_buff_nextkey(rx, flags);
7209     else {
7210         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7211                                             (int)flags);
7212         return NULL;
7213     }
7214 }
7215
7216 SV*
7217 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7218                           const U32 flags)
7219 {
7220     AV *retarray = NULL;
7221     SV *ret;
7222     struct regexp *const rx = ReANY(r);
7223
7224     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7225
7226     if (flags & RXapif_ALL)
7227         retarray=newAV();
7228
7229     if (rx && RXp_PAREN_NAMES(rx)) {
7230         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7231         if (he_str) {
7232             IV i;
7233             SV* sv_dat=HeVAL(he_str);
7234             I32 *nums=(I32*)SvPVX(sv_dat);
7235             for ( i=0; i<SvIVX(sv_dat); i++ ) {
7236                 if ((I32)(rx->nparens) >= nums[i]
7237                     && rx->offs[nums[i]].start != -1
7238                     && rx->offs[nums[i]].end != -1)
7239                 {
7240                     ret = newSVpvs("");
7241                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7242                     if (!retarray)
7243                         return ret;
7244                 } else {
7245                     if (retarray)
7246                         ret = newSVsv(&PL_sv_undef);
7247                 }
7248                 if (retarray)
7249                     av_push(retarray, ret);
7250             }
7251             if (retarray)
7252                 return newRV_noinc(MUTABLE_SV(retarray));
7253         }
7254     }
7255     return NULL;
7256 }
7257
7258 bool
7259 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7260                            const U32 flags)
7261 {
7262     struct regexp *const rx = ReANY(r);
7263
7264     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7265
7266     if (rx && RXp_PAREN_NAMES(rx)) {
7267         if (flags & RXapif_ALL) {
7268             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7269         } else {
7270             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7271             if (sv) {
7272                 SvREFCNT_dec_NN(sv);
7273                 return TRUE;
7274             } else {
7275                 return FALSE;
7276             }
7277         }
7278     } else {
7279         return FALSE;
7280     }
7281 }
7282
7283 SV*
7284 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7285 {
7286     struct regexp *const rx = ReANY(r);
7287
7288     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7289
7290     if ( rx && RXp_PAREN_NAMES(rx) ) {
7291         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7292
7293         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7294     } else {
7295         return FALSE;
7296     }
7297 }
7298
7299 SV*
7300 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7301 {
7302     struct regexp *const rx = ReANY(r);
7303     GET_RE_DEBUG_FLAGS_DECL;
7304
7305     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7306
7307     if (rx && RXp_PAREN_NAMES(rx)) {
7308         HV *hv = RXp_PAREN_NAMES(rx);
7309         HE *temphe;
7310         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7311             IV i;
7312             IV parno = 0;
7313             SV* sv_dat = HeVAL(temphe);
7314             I32 *nums = (I32*)SvPVX(sv_dat);
7315             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7316                 if ((I32)(rx->lastparen) >= nums[i] &&
7317                     rx->offs[nums[i]].start != -1 &&
7318                     rx->offs[nums[i]].end != -1)
7319                 {
7320                     parno = nums[i];
7321                     break;
7322                 }
7323             }
7324             if (parno || flags & RXapif_ALL) {
7325                 return newSVhek(HeKEY_hek(temphe));
7326             }
7327         }
7328     }
7329     return NULL;
7330 }
7331
7332 SV*
7333 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7334 {
7335     SV *ret;
7336     AV *av;
7337     SSize_t length;
7338     struct regexp *const rx = ReANY(r);
7339
7340     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7341
7342     if (rx && RXp_PAREN_NAMES(rx)) {
7343         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7344             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7345         } else if (flags & RXapif_ONE) {
7346             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7347             av = MUTABLE_AV(SvRV(ret));
7348             length = av_tindex(av);
7349             SvREFCNT_dec_NN(ret);
7350             return newSViv(length + 1);
7351         } else {
7352             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7353                                                 (int)flags);
7354             return NULL;
7355         }
7356     }
7357     return &PL_sv_undef;
7358 }
7359
7360 SV*
7361 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7362 {
7363     struct regexp *const rx = ReANY(r);
7364     AV *av = newAV();
7365
7366     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7367
7368     if (rx && RXp_PAREN_NAMES(rx)) {
7369         HV *hv= RXp_PAREN_NAMES(rx);
7370         HE *temphe;
7371         (void)hv_iterinit(hv);
7372         while ( (temphe = hv_iternext_flags(hv,0)) ) {
7373             IV i;
7374             IV parno = 0;
7375             SV* sv_dat = HeVAL(temphe);
7376             I32 *nums = (I32*)SvPVX(sv_dat);
7377             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7378                 if ((I32)(rx->lastparen) >= nums[i] &&
7379                     rx->offs[nums[i]].start != -1 &&
7380                     rx->offs[nums[i]].end != -1)
7381                 {
7382                     parno = nums[i];
7383                     break;
7384                 }
7385             }
7386             if (parno || flags & RXapif_ALL) {
7387                 av_push(av, newSVhek(HeKEY_hek(temphe)));
7388             }
7389         }
7390     }
7391
7392     return newRV_noinc(MUTABLE_SV(av));
7393 }
7394
7395 void
7396 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7397                              SV * const sv)
7398 {
7399     struct regexp *const rx = ReANY(r);
7400     char *s = NULL;
7401     SSize_t i = 0;
7402     SSize_t s1, t1;
7403     I32 n = paren;
7404
7405     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7406
7407     if (      n == RX_BUFF_IDX_CARET_PREMATCH
7408            || n == RX_BUFF_IDX_CARET_FULLMATCH
7409            || n == RX_BUFF_IDX_CARET_POSTMATCH
7410        )
7411     {
7412         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7413         if (!keepcopy) {
7414             /* on something like
7415              *    $r = qr/.../;
7416              *    /$qr/p;
7417              * the KEEPCOPY is set on the PMOP rather than the regex */
7418             if (PL_curpm && r == PM_GETRE(PL_curpm))
7419                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7420         }
7421         if (!keepcopy)
7422             goto ret_undef;
7423     }
7424
7425     if (!rx->subbeg)
7426         goto ret_undef;
7427
7428     if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7429         /* no need to distinguish between them any more */
7430         n = RX_BUFF_IDX_FULLMATCH;
7431
7432     if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7433         && rx->offs[0].start != -1)
7434     {
7435         /* $`, ${^PREMATCH} */
7436         i = rx->offs[0].start;
7437         s = rx->subbeg;
7438     }
7439     else
7440     if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7441         && rx->offs[0].end != -1)
7442     {
7443         /* $', ${^POSTMATCH} */
7444         s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7445         i = rx->sublen + rx->suboffset - rx->offs[0].end;
7446     }
7447     else
7448     if ( 0 <= n && n <= (I32)rx->nparens &&
7449         (s1 = rx->offs[n].start) != -1 &&
7450         (t1 = rx->offs[n].end) != -1)
7451     {
7452         /* $&, ${^MATCH},  $1 ... */
7453         i = t1 - s1;
7454         s = rx->subbeg + s1 - rx->suboffset;
7455     } else {
7456         goto ret_undef;
7457     }
7458
7459     assert(s >= rx->subbeg);
7460     assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7461     if (i >= 0) {
7462 #ifdef NO_TAINT_SUPPORT
7463         sv_setpvn(sv, s, i);
7464 #else
7465         const int oldtainted = TAINT_get;
7466         TAINT_NOT;
7467         sv_setpvn(sv, s, i);
7468         TAINT_set(oldtainted);
7469 #endif
7470         if ( (rx->intflags & PREGf_CANY_SEEN)
7471             ? (RXp_MATCH_UTF8(rx)
7472                         && (!i || is_utf8_string((U8*)s, i)))
7473             : (RXp_MATCH_UTF8(rx)) )
7474         {
7475             SvUTF8_on(sv);
7476         }
7477         else
7478             SvUTF8_off(sv);
7479         if (TAINTING_get) {
7480             if (RXp_MATCH_TAINTED(rx)) {
7481                 if (SvTYPE(sv) >= SVt_PVMG) {
7482                     MAGIC* const mg = SvMAGIC(sv);
7483                     MAGIC* mgt;
7484                     TAINT;
7485                     SvMAGIC_set(sv, mg->mg_moremagic);
7486                     SvTAINT(sv);
7487                     if ((mgt = SvMAGIC(sv))) {
7488                         mg->mg_moremagic = mgt;
7489                         SvMAGIC_set(sv, mg);
7490                     }
7491                 } else {
7492                     TAINT;
7493                     SvTAINT(sv);
7494                 }
7495             } else
7496                 SvTAINTED_off(sv);
7497         }
7498     } else {
7499       ret_undef:
7500         sv_setsv(sv,&PL_sv_undef);
7501         return;
7502     }
7503 }
7504
7505 void
7506 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7507                                                          SV const * const value)
7508 {
7509     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7510
7511     PERL_UNUSED_ARG(rx);
7512     PERL_UNUSED_ARG(paren);
7513     PERL_UNUSED_ARG(value);
7514
7515     if (!PL_localizing)
7516         Perl_croak_no_modify();
7517 }
7518
7519 I32
7520 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7521                               const I32 paren)
7522 {
7523     struct regexp *const rx = ReANY(r);
7524     I32 i;
7525     I32 s1, t1;
7526
7527     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7528
7529     if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7530         || paren == RX_BUFF_IDX_CARET_FULLMATCH
7531         || paren == RX_BUFF_IDX_CARET_POSTMATCH
7532     )
7533     {
7534         bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7535         if (!keepcopy) {
7536             /* on something like
7537              *    $r = qr/.../;
7538              *    /$qr/p;
7539              * the KEEPCOPY is set on the PMOP rather than the regex */
7540             if (PL_curpm && r == PM_GETRE(PL_curpm))
7541                  keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7542         }
7543         if (!keepcopy)
7544             goto warn_undef;
7545     }
7546
7547     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7548     switch (paren) {
7549       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7550       case RX_BUFF_IDX_PREMATCH:       /* $` */
7551         if (rx->offs[0].start != -1) {
7552                         i = rx->offs[0].start;
7553                         if (i > 0) {
7554                                 s1 = 0;
7555                                 t1 = i;
7556                                 goto getlen;
7557                         }
7558             }
7559         return 0;
7560
7561       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7562       case RX_BUFF_IDX_POSTMATCH:       /* $' */
7563             if (rx->offs[0].end != -1) {
7564                         i = rx->sublen - rx->offs[0].end;
7565                         if (i > 0) {
7566                                 s1 = rx->offs[0].end;
7567                                 t1 = rx->sublen;
7568                                 goto getlen;
7569                         }
7570             }
7571         return 0;
7572
7573       default: /* $& / ${^MATCH}, $1, $2, ... */
7574             if (paren <= (I32)rx->nparens &&
7575             (s1 = rx->offs[paren].start) != -1 &&
7576             (t1 = rx->offs[paren].end) != -1)
7577             {
7578             i = t1 - s1;
7579             goto getlen;
7580         } else {
7581           warn_undef:
7582             if (ckWARN(WARN_UNINITIALIZED))
7583                 report_uninit((const SV *)sv);
7584             return 0;
7585         }
7586     }
7587   getlen:
7588     if (i > 0 && RXp_MATCH_UTF8(rx)) {
7589         const char * const s = rx->subbeg - rx->suboffset + s1;
7590         const U8 *ep;
7591         STRLEN el;
7592
7593         i = t1 - s1;
7594         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7595                         i = el;
7596     }
7597     return i;
7598 }
7599
7600 SV*
7601 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7602 {
7603     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7604         PERL_UNUSED_ARG(rx);
7605         if (0)
7606             return NULL;
7607         else
7608             return newSVpvs("Regexp");
7609 }
7610
7611 /* Scans the name of a named buffer from the pattern.
7612  * If flags is REG_RSN_RETURN_NULL returns null.
7613  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7614  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7615  * to the parsed name as looked up in the RExC_paren_names hash.
7616  * If there is an error throws a vFAIL().. type exception.
7617  */
7618
7619 #define REG_RSN_RETURN_NULL    0
7620 #define REG_RSN_RETURN_NAME    1
7621 #define REG_RSN_RETURN_DATA    2
7622
7623 STATIC SV*
7624 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7625 {
7626     char *name_start = RExC_parse;
7627
7628     PERL_ARGS_ASSERT_REG_SCAN_NAME;
7629
7630     assert (RExC_parse <= RExC_end);
7631     if (RExC_parse == RExC_end) NOOP;
7632     else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7633          /* skip IDFIRST by using do...while */
7634         if (UTF)
7635             do {
7636                 RExC_parse += UTF8SKIP(RExC_parse);
7637             } while (isWORDCHAR_utf8((U8*)RExC_parse));
7638         else
7639             do {
7640                 RExC_parse++;
7641             } while (isWORDCHAR(*RExC_parse));
7642     } else {
7643         RExC_parse++; /* so the <- from the vFAIL is after the offending
7644                          character */
7645         vFAIL("Group name must start with a non-digit word character");
7646     }
7647     if ( flags ) {
7648         SV* sv_name
7649             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7650                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7651         if ( flags == REG_RSN_RETURN_NAME)
7652             return sv_name;
7653         else if (flags==REG_RSN_RETURN_DATA) {
7654             HE *he_str = NULL;
7655             SV *sv_dat = NULL;
7656             if ( ! sv_name )      /* should not happen*/
7657                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7658             if (RExC_paren_names)
7659                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7660             if ( he_str )
7661                 sv_dat = HeVAL(he_str);
7662             if ( ! sv_dat )
7663                 vFAIL("Reference to nonexistent named group");
7664             return sv_dat;
7665         }
7666         else {
7667             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7668                        (unsigned long) flags);
7669         }
7670         assert(0); /* NOT REACHED */
7671     }
7672     return NULL;
7673 }
7674
7675 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7676     int rem=(int)(RExC_end - RExC_parse);                       \
7677     int cut;                                                    \
7678     int num;                                                    \
7679     int iscut=0;                                                \
7680     if (rem>10) {                                               \
7681         rem=10;                                                 \
7682         iscut=1;                                                \
7683     }                                                           \
7684     cut=10-rem;                                                 \
7685     if (RExC_lastparse!=RExC_parse)                             \
7686         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7687             rem, RExC_parse,                                    \
7688             cut + 4,                                            \
7689             iscut ? "..." : "<"                                 \
7690         );                                                      \
7691     else                                                        \
7692         PerlIO_printf(Perl_debug_log,"%16s","");                \
7693                                                                 \
7694     if (SIZE_ONLY)                                              \
7695        num = RExC_size + 1;                                     \
7696     else                                                        \
7697        num=REG_NODE_NUM(RExC_emit);                             \
7698     if (RExC_lastnum!=num)                                      \
7699        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7700     else                                                        \
7701        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7702     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7703         (int)((depth*2)), "",                                   \
7704         (funcname)                                              \
7705     );                                                          \
7706     RExC_lastnum=num;                                           \
7707     RExC_lastparse=RExC_parse;                                  \
7708 })
7709
7710
7711
7712 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7713     DEBUG_PARSE_MSG((funcname));                            \
7714     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7715 })
7716 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7717     DEBUG_PARSE_MSG((funcname));                            \
7718     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7719 })
7720
7721 /* This section of code defines the inversion list object and its methods.  The
7722  * interfaces are highly subject to change, so as much as possible is static to
7723  * this file.  An inversion list is here implemented as a malloc'd C UV array
7724  * as an SVt_INVLIST scalar.
7725  *
7726  * An inversion list for Unicode is an array of code points, sorted by ordinal
7727  * number.  The zeroth element is the first code point in the list.  The 1th
7728  * element is the first element beyond that not in the list.  In other words,
7729  * the first range is
7730  *  invlist[0]..(invlist[1]-1)
7731  * The other ranges follow.  Thus every element whose index is divisible by two
7732  * marks the beginning of a range that is in the list, and every element not
7733  * divisible by two marks the beginning of a range not in the list.  A single
7734  * element inversion list that contains the single code point N generally
7735  * consists of two elements
7736  *  invlist[0] == N
7737  *  invlist[1] == N+1
7738  * (The exception is when N is the highest representable value on the
7739  * machine, in which case the list containing just it would be a single
7740  * element, itself.  By extension, if the last range in the list extends to
7741  * infinity, then the first element of that range will be in the inversion list
7742  * at a position that is divisible by two, and is the final element in the
7743  * list.)
7744  * Taking the complement (inverting) an inversion list is quite simple, if the
7745  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7746  * This implementation reserves an element at the beginning of each inversion
7747  * list to always contain 0; there is an additional flag in the header which
7748  * indicates if the list begins at the 0, or is offset to begin at the next
7749  * element.
7750  *
7751  * More about inversion lists can be found in "Unicode Demystified"
7752  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7753  * More will be coming when functionality is added later.
7754  *
7755  * The inversion list data structure is currently implemented as an SV pointing
7756  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7757  * array of UV whose memory management is automatically handled by the existing
7758  * facilities for SV's.
7759  *
7760  * Some of the methods should always be private to the implementation, and some
7761  * should eventually be made public */
7762
7763 /* The header definitions are in F<inline_invlist.c> */
7764
7765 PERL_STATIC_INLINE UV*
7766 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7767 {
7768     /* Returns a pointer to the first element in the inversion list's array.
7769      * This is called upon initialization of an inversion list.  Where the
7770      * array begins depends on whether the list has the code point U+0000 in it
7771      * or not.  The other parameter tells it whether the code that follows this
7772      * call is about to put a 0 in the inversion list or not.  The first
7773      * element is either the element reserved for 0, if TRUE, or the element
7774      * after it, if FALSE */
7775
7776     bool* offset = get_invlist_offset_addr(invlist);
7777     UV* zero_addr = (UV *) SvPVX(invlist);
7778
7779     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7780
7781     /* Must be empty */
7782     assert(! _invlist_len(invlist));
7783
7784     *zero_addr = 0;
7785
7786     /* 1^1 = 0; 1^0 = 1 */
7787     *offset = 1 ^ will_have_0;
7788     return zero_addr + *offset;
7789 }
7790
7791 PERL_STATIC_INLINE UV*
7792 S_invlist_array(pTHX_ SV* const invlist)
7793 {
7794     /* Returns the pointer to the inversion list's array.  Every time the
7795      * length changes, this needs to be called in case malloc or realloc moved
7796      * it */
7797
7798     PERL_ARGS_ASSERT_INVLIST_ARRAY;
7799
7800     /* Must not be empty.  If these fail, you probably didn't check for <len>
7801      * being non-zero before trying to get the array */
7802     assert(_invlist_len(invlist));
7803
7804     /* The very first element always contains zero, The array begins either
7805      * there, or if the inversion list is offset, at the element after it.
7806      * The offset header field determines which; it contains 0 or 1 to indicate
7807      * how much additionally to add */
7808     assert(0 == *(SvPVX(invlist)));
7809     return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7810 }
7811
7812 PERL_STATIC_INLINE void
7813 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7814 {
7815     /* Sets the current number of elements stored in the inversion list.
7816      * Updates SvCUR correspondingly */
7817
7818     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7819
7820     assert(SvTYPE(invlist) == SVt_INVLIST);
7821
7822     SvCUR_set(invlist,
7823               (len == 0)
7824                ? 0
7825                : TO_INTERNAL_SIZE(len + offset));
7826     assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7827 }
7828
7829 PERL_STATIC_INLINE IV*
7830 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7831 {
7832     /* Return the address of the IV that is reserved to hold the cached index
7833      * */
7834
7835     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7836
7837     assert(SvTYPE(invlist) == SVt_INVLIST);
7838
7839     return &(((XINVLIST*) SvANY(invlist))->prev_index);
7840 }
7841
7842 PERL_STATIC_INLINE IV
7843 S_invlist_previous_index(pTHX_ SV* const invlist)
7844 {
7845     /* Returns cached index of previous search */
7846
7847     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7848
7849     return *get_invlist_previous_index_addr(invlist);
7850 }
7851
7852 PERL_STATIC_INLINE void
7853 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7854 {
7855     /* Caches <index> for later retrieval */
7856
7857     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7858
7859     assert(index == 0 || index < (int) _invlist_len(invlist));
7860
7861     *get_invlist_previous_index_addr(invlist) = index;
7862 }
7863
7864 PERL_STATIC_INLINE UV
7865 S_invlist_max(pTHX_ SV* const invlist)
7866 {
7867     /* Returns the maximum number of elements storable in the inversion list's
7868      * array, without having to realloc() */
7869
7870     PERL_ARGS_ASSERT_INVLIST_MAX;
7871
7872     assert(SvTYPE(invlist) == SVt_INVLIST);
7873
7874     /* Assumes worst case, in which the 0 element is not counted in the
7875      * inversion list, so subtracts 1 for that */
7876     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7877            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7878            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7879 }
7880
7881 #ifndef PERL_IN_XSUB_RE
7882 SV*
7883 Perl__new_invlist(pTHX_ IV initial_size)
7884 {
7885
7886     /* Return a pointer to a newly constructed inversion list, with enough
7887      * space to store 'initial_size' elements.  If that number is negative, a
7888      * system default is used instead */
7889
7890     SV* new_list;
7891
7892     if (initial_size < 0) {
7893         initial_size = 10;
7894     }
7895
7896     /* Allocate the initial space */
7897     new_list = newSV_type(SVt_INVLIST);
7898
7899     /* First 1 is in case the zero element isn't in the list; second 1 is for
7900      * trailing NUL */
7901     SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7902     invlist_set_len(new_list, 0, 0);
7903
7904     /* Force iterinit() to be used to get iteration to work */
7905     *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7906
7907     *get_invlist_previous_index_addr(new_list) = 0;
7908
7909     return new_list;
7910 }
7911
7912 SV*
7913 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7914 {
7915     /* Return a pointer to a newly constructed inversion list, initialized to
7916      * point to <list>, which has to be in the exact correct inversion list
7917      * form, including internal fields.  Thus this is a dangerous routine that
7918      * should not be used in the wrong hands.  The passed in 'list' contains
7919      * several header fields at the beginning that are not part of the
7920      * inversion list body proper */
7921
7922     const STRLEN length = (STRLEN) list[0];
7923     const UV version_id =          list[1];
7924     const bool offset   =    cBOOL(list[2]);
7925 #define HEADER_LENGTH 3
7926     /* If any of the above changes in any way, you must change HEADER_LENGTH
7927      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7928      *      perl -E 'say int(rand 2**31-1)'
7929      */
7930 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7931                                         data structure type, so that one being
7932                                         passed in can be validated to be an
7933                                         inversion list of the correct vintage.
7934                                        */
7935
7936     SV* invlist = newSV_type(SVt_INVLIST);
7937
7938     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7939
7940     if (version_id != INVLIST_VERSION_ID) {
7941         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7942     }
7943
7944     /* The generated array passed in includes header elements that aren't part
7945      * of the list proper, so start it just after them */
7946     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7947
7948     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7949                                shouldn't touch it */
7950
7951     *(get_invlist_offset_addr(invlist)) = offset;
7952
7953     /* The 'length' passed to us is the physical number of elements in the
7954      * inversion list.  But if there is an offset the logical number is one
7955      * less than that */
7956     invlist_set_len(invlist, length  - offset, offset);
7957
7958     invlist_set_previous_index(invlist, 0);
7959
7960     /* Initialize the iteration pointer. */
7961     invlist_iterfinish(invlist);
7962
7963     SvREADONLY_on(invlist);
7964
7965     return invlist;
7966 }
7967 #endif /* ifndef PERL_IN_XSUB_RE */
7968
7969 STATIC void
7970 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7971 {
7972     /* Grow the maximum size of an inversion list */
7973
7974     PERL_ARGS_ASSERT_INVLIST_EXTEND;
7975
7976     assert(SvTYPE(invlist) == SVt_INVLIST);
7977
7978     /* Add one to account for the zero element at the beginning which may not
7979      * be counted by the calling parameters */
7980     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7981 }
7982
7983 PERL_STATIC_INLINE void
7984 S_invlist_trim(pTHX_ SV* const invlist)
7985 {
7986     PERL_ARGS_ASSERT_INVLIST_TRIM;
7987
7988     assert(SvTYPE(invlist) == SVt_INVLIST);
7989
7990     /* Change the length of the inversion list to how many entries it currently
7991      * has */
7992     SvPV_shrink_to_cur((SV *) invlist);
7993 }
7994
7995 STATIC void
7996 S__append_range_to_invlist(pTHX_ SV* const invlist,
7997                                  const UV start, const UV end)
7998 {
7999    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8000     * the end of the inversion list.  The range must be above any existing
8001     * ones. */
8002
8003     UV* array;
8004     UV max = invlist_max(invlist);
8005     UV len = _invlist_len(invlist);
8006     bool offset;
8007
8008     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8009
8010     if (len == 0) { /* Empty lists must be initialized */
8011         offset = start != 0;
8012         array = _invlist_array_init(invlist, ! offset);
8013     }
8014     else {
8015         /* Here, the existing list is non-empty. The current max entry in the
8016          * list is generally the first value not in the set, except when the
8017          * set extends to the end of permissible values, in which case it is
8018          * the first entry in that final set, and so this call is an attempt to
8019          * append out-of-order */
8020
8021         UV final_element = len - 1;
8022         array = invlist_array(invlist);
8023         if (array[final_element] > start
8024             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8025         {
8026             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8027                      array[final_element], start,
8028                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8029         }
8030
8031         /* Here, it is a legal append.  If the new range begins with the first
8032          * value not in the set, it is extending the set, so the new first
8033          * value not in the set is one greater than the newly extended range.
8034          * */
8035         offset = *get_invlist_offset_addr(invlist);
8036         if (array[final_element] == start) {
8037             if (end != UV_MAX) {
8038                 array[final_element] = end + 1;
8039             }
8040             else {
8041                 /* But if the end is the maximum representable on the machine,
8042                  * just let the range that this would extend to have no end */
8043                 invlist_set_len(invlist, len - 1, offset);
8044             }
8045             return;
8046         }
8047     }
8048
8049     /* Here the new range doesn't extend any existing set.  Add it */
8050
8051     len += 2;   /* Includes an element each for the start and end of range */
8052
8053     /* If wll overflow the existing space, extend, which may cause the array to
8054      * be moved */
8055     if (max < len) {
8056         invlist_extend(invlist, len);
8057
8058         /* Have to set len here to avoid assert failure in invlist_array() */
8059         invlist_set_len(invlist, len, offset);
8060
8061         array = invlist_array(invlist);
8062     }
8063     else {
8064         invlist_set_len(invlist, len, offset);
8065     }
8066
8067     /* The next item on the list starts the range, the one after that is
8068      * one past the new range.  */
8069     array[len - 2] = start;
8070     if (end != UV_MAX) {
8071         array[len - 1] = end + 1;
8072     }
8073     else {
8074         /* But if the end is the maximum representable on the machine, just let
8075          * the range have no end */
8076         invlist_set_len(invlist, len - 1, offset);
8077     }
8078 }
8079
8080 #ifndef PERL_IN_XSUB_RE
8081
8082 IV
8083 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
8084 {
8085     /* Searches the inversion list for the entry that contains the input code
8086      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8087      * return value is the index into the list's array of the range that
8088      * contains <cp> */
8089
8090     IV low = 0;
8091     IV mid;
8092     IV high = _invlist_len(invlist);
8093     const IV highest_element = high - 1;
8094     const UV* array;
8095
8096     PERL_ARGS_ASSERT__INVLIST_SEARCH;
8097
8098     /* If list is empty, return failure. */
8099     if (high == 0) {
8100         return -1;
8101     }
8102
8103     /* (We can't get the array unless we know the list is non-empty) */
8104     array = invlist_array(invlist);
8105
8106     mid = invlist_previous_index(invlist);
8107     assert(mid >=0 && mid <= highest_element);
8108
8109     /* <mid> contains the cache of the result of the previous call to this
8110      * function (0 the first time).  See if this call is for the same result,
8111      * or if it is for mid-1.  This is under the theory that calls to this
8112      * function will often be for related code points that are near each other.
8113      * And benchmarks show that caching gives better results.  We also test
8114      * here if the code point is within the bounds of the list.  These tests
8115      * replace others that would have had to be made anyway to make sure that
8116      * the array bounds were not exceeded, and these give us extra information
8117      * at the same time */
8118     if (cp >= array[mid]) {
8119         if (cp >= array[highest_element]) {
8120             return highest_element;
8121         }
8122
8123         /* Here, array[mid] <= cp < array[highest_element].  This means that
8124          * the final element is not the answer, so can exclude it; it also
8125          * means that <mid> is not the final element, so can refer to 'mid + 1'
8126          * safely */
8127         if (cp < array[mid + 1]) {
8128             return mid;
8129         }
8130         high--;
8131         low = mid + 1;
8132     }
8133     else { /* cp < aray[mid] */
8134         if (cp < array[0]) { /* Fail if outside the array */
8135             return -1;
8136         }
8137         high = mid;
8138         if (cp >= array[mid - 1]) {
8139             goto found_entry;
8140         }
8141     }
8142
8143     /* Binary search.  What we are looking for is <i> such that
8144      *  array[i] <= cp < array[i+1]
8145      * The loop below converges on the i+1.  Note that there may not be an
8146      * (i+1)th element in the array, and things work nonetheless */
8147     while (low < high) {
8148         mid = (low + high) / 2;
8149         assert(mid <= highest_element);
8150         if (array[mid] <= cp) { /* cp >= array[mid] */
8151             low = mid + 1;
8152
8153             /* We could do this extra test to exit the loop early.
8154             if (cp < array[low]) {
8155                 return mid;
8156             }
8157             */
8158         }
8159         else { /* cp < array[mid] */
8160             high = mid;
8161         }
8162     }
8163
8164   found_entry:
8165     high--;
8166     invlist_set_previous_index(invlist, high);
8167     return high;
8168 }
8169
8170 void
8171 Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
8172                                     const UV start, const UV end, U8* swatch)
8173 {
8174     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8175      * but is used when the swash has an inversion list.  This makes this much
8176      * faster, as it uses a binary search instead of a linear one.  This is
8177      * intimately tied to that function, and perhaps should be in utf8.c,
8178      * except it is intimately tied to inversion lists as well.  It assumes
8179      * that <swatch> is all 0's on input */
8180
8181     UV current = start;
8182     const IV len = _invlist_len(invlist);
8183     IV i;
8184     const UV * array;
8185
8186     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8187
8188     if (len == 0) { /* Empty inversion list */
8189         return;
8190     }
8191
8192     array = invlist_array(invlist);
8193
8194     /* Find which element it is */
8195     i = _invlist_search(invlist, start);
8196
8197     /* We populate from <start> to <end> */
8198     while (current < end) {
8199         UV upper;
8200
8201         /* The inversion list gives the results for every possible code point
8202          * after the first one in the list.  Only those ranges whose index is
8203          * even are ones that the inversion list matches.  For the odd ones,
8204          * and if the initial code point is not in the list, we have to skip
8205          * forward to the next element */
8206         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8207             i++;
8208             if (i >= len) { /* Finished if beyond the end of the array */
8209                 return;
8210             }
8211             current = array[i];
8212             if (current >= end) {   /* Finished if beyond the end of what we
8213                                        are populating */
8214                 if (LIKELY(end < UV_MAX)) {
8215                     return;
8216                 }
8217
8218                 /* We get here when the upper bound is the maximum
8219                  * representable on the machine, and we are looking for just
8220                  * that code point.  Have to special case it */
8221                 i = len;
8222                 goto join_end_of_list;
8223             }
8224         }
8225         assert(current >= start);
8226
8227         /* The current range ends one below the next one, except don't go past
8228          * <end> */
8229         i++;
8230         upper = (i < len && array[i] < end) ? array[i] : end;
8231
8232         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8233          * for each code point in it */
8234         for (; current < upper; current++) {
8235             const STRLEN offset = (STRLEN)(current - start);
8236             swatch[offset >> 3] |= 1 << (offset & 7);
8237         }
8238
8239     join_end_of_list:
8240
8241         /* Quit if at the end of the list */
8242         if (i >= len) {
8243
8244             /* But first, have to deal with the highest possible code point on
8245              * the platform.  The previous code assumes that <end> is one
8246              * beyond where we want to populate, but that is impossible at the
8247              * platform's infinity, so have to handle it specially */
8248             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8249             {
8250                 const STRLEN offset = (STRLEN)(end - start);
8251                 swatch[offset >> 3] |= 1 << (offset & 7);
8252             }
8253             return;
8254         }
8255
8256         /* Advance to the next range, which will be for code points not in the
8257          * inversion list */
8258         current = array[i];
8259     }
8260
8261     return;
8262 }
8263
8264 void
8265 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8266                                          const bool complement_b, SV** output)
8267 {
8268     /* Take the union of two inversion lists and point <output> to it.  *output
8269      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8270      * the reference count to that list will be decremented if not already a
8271      * temporary (mortal); otherwise *output will be made correspondingly
8272      * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8273      * second list is returned.  If <complement_b> is TRUE, the union is taken
8274      * of the complement (inversion) of <b> instead of b itself.
8275      *
8276      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8277      * Richard Gillam, published by Addison-Wesley, and explained at some
8278      * length there.  The preface says to incorporate its examples into your
8279      * code at your own risk.
8280      *
8281      * The algorithm is like a merge sort.
8282      *
8283      * XXX A potential performance improvement is to keep track as we go along
8284      * if only one of the inputs contributes to the result, meaning the other
8285      * is a subset of that one.  In that case, we can skip the final copy and
8286      * return the larger of the input lists, but then outside code might need
8287      * to keep track of whether to free the input list or not */
8288
8289     const UV* array_a;    /* a's array */
8290     const UV* array_b;
8291     UV len_a;       /* length of a's array */
8292     UV len_b;
8293
8294     SV* u;                      /* the resulting union */
8295     UV* array_u;
8296     UV len_u;
8297
8298     UV i_a = 0;             /* current index into a's array */
8299     UV i_b = 0;
8300     UV i_u = 0;
8301
8302     /* running count, as explained in the algorithm source book; items are
8303      * stopped accumulating and are output when the count changes to/from 0.
8304      * The count is incremented when we start a range that's in the set, and
8305      * decremented when we start a range that's not in the set.  So its range
8306      * is 0 to 2.  Only when the count is zero is something not in the set.
8307      */
8308     UV count = 0;
8309
8310     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8311     assert(a != b);
8312
8313     /* If either one is empty, the union is the other one */
8314     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8315         bool make_temp = FALSE; /* Should we mortalize the result? */
8316
8317         if (*output == a) {
8318             if (a != NULL) {
8319                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8320                     SvREFCNT_dec_NN(a);
8321                 }
8322             }
8323         }
8324         if (*output != b) {
8325             *output = invlist_clone(b);
8326             if (complement_b) {
8327                 _invlist_invert(*output);
8328             }
8329         } /* else *output already = b; */
8330
8331         if (make_temp) {
8332             sv_2mortal(*output);
8333         }
8334         return;
8335     }
8336     else if ((len_b = _invlist_len(b)) == 0) {
8337         bool make_temp = FALSE;
8338         if (*output == b) {
8339             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8340                 SvREFCNT_dec_NN(b);
8341             }
8342         }
8343
8344         /* The complement of an empty list is a list that has everything in it,
8345          * so the union with <a> includes everything too */
8346         if (complement_b) {
8347             if (a == *output) {
8348                 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8349                     SvREFCNT_dec_NN(a);
8350                 }
8351             }
8352             *output = _new_invlist(1);
8353             _append_range_to_invlist(*output, 0, UV_MAX);
8354         }
8355         else if (*output != a) {
8356             *output = invlist_clone(a);
8357         }
8358         /* else *output already = a; */
8359
8360         if (make_temp) {
8361             sv_2mortal(*output);
8362         }
8363         return;
8364     }
8365
8366     /* Here both lists exist and are non-empty */
8367     array_a = invlist_array(a);
8368     array_b = invlist_array(b);
8369
8370     /* If are to take the union of 'a' with the complement of b, set it
8371      * up so are looking at b's complement. */
8372     if (complement_b) {
8373
8374         /* To complement, we invert: if the first element is 0, remove it.  To
8375          * do this, we just pretend the array starts one later */
8376         if (array_b[0] == 0) {
8377             array_b++;
8378             len_b--;
8379         }
8380         else {
8381
8382             /* But if the first element is not zero, we pretend the list starts
8383              * at the 0 that is always stored immediately before the array. */
8384             array_b--;
8385             len_b++;
8386         }
8387     }
8388
8389     /* Size the union for the worst case: that the sets are completely
8390      * disjoint */
8391     u = _new_invlist(len_a + len_b);
8392
8393     /* Will contain U+0000 if either component does */
8394     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8395                                       || (len_b > 0 && array_b[0] == 0));
8396
8397     /* Go through each list item by item, stopping when exhausted one of
8398      * them */
8399     while (i_a < len_a && i_b < len_b) {
8400         UV cp;      /* The element to potentially add to the union's array */
8401         bool cp_in_set;   /* is it in the the input list's set or not */
8402
8403         /* We need to take one or the other of the two inputs for the union.
8404          * Since we are merging two sorted lists, we take the smaller of the
8405          * next items.  In case of a tie, we take the one that is in its set
8406          * first.  If we took one not in the set first, it would decrement the
8407          * count, possibly to 0 which would cause it to be output as ending the
8408          * range, and the next time through we would take the same number, and
8409          * output it again as beginning the next range.  By doing it the
8410          * opposite way, there is no possibility that the count will be
8411          * momentarily decremented to 0, and thus the two adjoining ranges will
8412          * be seamlessly merged.  (In a tie and both are in the set or both not
8413          * in the set, it doesn't matter which we take first.) */
8414         if (array_a[i_a] < array_b[i_b]
8415             || (array_a[i_a] == array_b[i_b]
8416                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8417         {
8418             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8419             cp= array_a[i_a++];
8420         }
8421         else {
8422             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8423             cp = array_b[i_b++];
8424         }
8425
8426         /* Here, have chosen which of the two inputs to look at.  Only output
8427          * if the running count changes to/from 0, which marks the
8428          * beginning/end of a range in that's in the set */
8429         if (cp_in_set) {
8430             if (count == 0) {
8431                 array_u[i_u++] = cp;
8432             }
8433             count++;
8434         }
8435         else {
8436             count--;
8437             if (count == 0) {
8438                 array_u[i_u++] = cp;
8439             }
8440         }
8441     }
8442
8443     /* Here, we are finished going through at least one of the lists, which
8444      * means there is something remaining in at most one.  We check if the list
8445      * that hasn't been exhausted is positioned such that we are in the middle
8446      * of a range in its set or not.  (i_a and i_b point to the element beyond
8447      * the one we care about.) If in the set, we decrement 'count'; if 0, there
8448      * is potentially more to output.
8449      * There are four cases:
8450      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
8451      *     in the union is entirely from the non-exhausted set.
8452      *  2) Both were in their sets, count is 2.  Nothing further should
8453      *     be output, as everything that remains will be in the exhausted
8454      *     list's set, hence in the union; decrementing to 1 but not 0 insures
8455      *     that
8456      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
8457      *     Nothing further should be output because the union includes
8458      *     everything from the exhausted set.  Not decrementing ensures that.
8459      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8460      *     decrementing to 0 insures that we look at the remainder of the
8461      *     non-exhausted set */
8462     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8463         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8464     {
8465         count--;
8466     }
8467
8468     /* The final length is what we've output so far, plus what else is about to
8469      * be output.  (If 'count' is non-zero, then the input list we exhausted
8470      * has everything remaining up to the machine's limit in its set, and hence
8471      * in the union, so there will be no further output. */
8472     len_u = i_u;
8473     if (count == 0) {
8474         /* At most one of the subexpressions will be non-zero */
8475         len_u += (len_a - i_a) + (len_b - i_b);
8476     }
8477
8478     /* Set result to final length, which can change the pointer to array_u, so
8479      * re-find it */
8480     if (len_u != _invlist_len(u)) {
8481         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8482         invlist_trim(u);
8483         array_u = invlist_array(u);
8484     }
8485
8486     /* When 'count' is 0, the list that was exhausted (if one was shorter than
8487      * the other) ended with everything above it not in its set.  That means
8488      * that the remaining part of the union is precisely the same as the
8489      * non-exhausted list, so can just copy it unchanged.  (If both list were
8490      * exhausted at the same time, then the operations below will be both 0.)
8491      */
8492     if (count == 0) {
8493         IV copy_count; /* At most one will have a non-zero copy count */
8494         if ((copy_count = len_a - i_a) > 0) {
8495             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8496         }
8497         else if ((copy_count = len_b - i_b) > 0) {
8498             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8499         }
8500     }
8501
8502     /*  We may be removing a reference to one of the inputs.  If so, the output
8503      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8504      *  count decremented) */
8505     if (a == *output || b == *output) {
8506         assert(! invlist_is_iterating(*output));
8507         if ((SvTEMP(*output))) {
8508             sv_2mortal(u);
8509         }
8510         else {
8511             SvREFCNT_dec_NN(*output);
8512         }
8513     }
8514
8515     *output = u;
8516
8517     return;
8518 }
8519
8520 void
8521 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8522                                                const bool complement_b, SV** i)
8523 {
8524     /* Take the intersection of two inversion lists and point <i> to it.  *i
8525      * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8526      * the reference count to that list will be decremented if not already a
8527      * temporary (mortal); otherwise *i will be made correspondingly mortal.
8528      * The first list, <a>, may be NULL, in which case an empty list is
8529      * returned.  If <complement_b> is TRUE, the result will be the
8530      * intersection of <a> and the complement (or inversion) of <b> instead of
8531      * <b> directly.
8532      *
8533      * The basis for this comes from "Unicode Demystified" Chapter 13 by
8534      * Richard Gillam, published by Addison-Wesley, and explained at some
8535      * length there.  The preface says to incorporate its examples into your
8536      * code at your own risk.  In fact, it had bugs
8537      *
8538      * The algorithm is like a merge sort, and is essentially the same as the
8539      * union above
8540      */
8541
8542     const UV* array_a;          /* a's array */
8543     const UV* array_b;
8544     UV len_a;   /* length of a's array */
8545     UV len_b;
8546
8547     SV* r;                   /* the resulting intersection */
8548     UV* array_r;
8549     UV len_r;
8550
8551     UV i_a = 0;             /* current index into a's array */
8552     UV i_b = 0;
8553     UV i_r = 0;
8554
8555     /* running count, as explained in the algorithm source book; items are
8556      * stopped accumulating and are output when the count changes to/from 2.
8557      * The count is incremented when we start a range that's in the set, and
8558      * decremented when we start a range that's not in the set.  So its range
8559      * is 0 to 2.  Only when the count is 2 is something in the intersection.
8560      */
8561     UV count = 0;
8562
8563     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8564     assert(a != b);
8565
8566     /* Special case if either one is empty */
8567     len_a = (a == NULL) ? 0 : _invlist_len(a);
8568     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8569         bool make_temp = FALSE;
8570
8571         if (len_a != 0 && complement_b) {
8572
8573             /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8574              * be empty.  Here, also we are using 'b's complement, which hence
8575              * must be every possible code point.  Thus the intersection is
8576              * simply 'a'. */
8577             if (*i != a) {
8578                 if (*i == b) {
8579                     if (! (make_temp = cBOOL(SvTEMP(b)))) {
8580                         SvREFCNT_dec_NN(b);
8581                     }
8582                 }
8583
8584                 *i = invlist_clone(a);
8585             }
8586             /* else *i is already 'a' */
8587
8588             if (make_temp) {
8589                 sv_2mortal(*i);
8590             }
8591             return;
8592         }
8593
8594         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8595          * intersection must be empty */
8596         if (*i == a) {
8597             if (! (make_temp = cBOOL(SvTEMP(a)))) {
8598                 SvREFCNT_dec_NN(a);
8599             }
8600         }
8601         else if (*i == b) {
8602             if (! (make_temp = cBOOL(SvTEMP(b)))) {
8603                 SvREFCNT_dec_NN(b);
8604             }
8605         }
8606         *i = _new_invlist(0);
8607         if (make_temp) {
8608             sv_2mortal(*i);
8609         }
8610
8611         return;
8612     }
8613
8614     /* Here both lists exist and are non-empty */
8615     array_a = invlist_array(a);
8616     array_b = invlist_array(b);
8617
8618     /* If are to take the intersection of 'a' with the complement of b, set it
8619      * up so are looking at b's complement. */
8620     if (complement_b) {
8621
8622         /* To complement, we invert: if the first element is 0, remove it.  To
8623          * do this, we just pretend the array starts one later */
8624         if (array_b[0] == 0) {
8625             array_b++;
8626             len_b--;
8627         }
8628         else {
8629
8630             /* But if the first element is not zero, we pretend the list starts
8631              * at the 0 that is always stored immediately before the array. */
8632             array_b--;
8633             len_b++;
8634         }
8635     }
8636
8637     /* Size the intersection for the worst case: that the intersection ends up
8638      * fragmenting everything to be completely disjoint */
8639     r= _new_invlist(len_a + len_b);
8640
8641     /* Will contain U+0000 iff both components do */
8642     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8643                                      && len_b > 0 && array_b[0] == 0);
8644
8645     /* Go through each list item by item, stopping when exhausted one of
8646      * them */
8647     while (i_a < len_a && i_b < len_b) {
8648         UV cp;      /* The element to potentially add to the intersection's
8649                        array */
8650         bool cp_in_set; /* Is it in the input list's set or not */
8651
8652         /* We need to take one or the other of the two inputs for the
8653          * intersection.  Since we are merging two sorted lists, we take the
8654          * smaller of the next items.  In case of a tie, we take the one that
8655          * is not in its set first (a difference from the union algorithm).  If
8656          * we took one in the set first, it would increment the count, possibly
8657          * to 2 which would cause it to be output as starting a range in the
8658          * intersection, and the next time through we would take that same
8659          * number, and output it again as ending the set.  By doing it the
8660          * opposite of this, there is no possibility that the count will be
8661          * momentarily incremented to 2.  (In a tie and both are in the set or
8662          * both not in the set, it doesn't matter which we take first.) */
8663         if (array_a[i_a] < array_b[i_b]
8664             || (array_a[i_a] == array_b[i_b]
8665                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8666         {
8667             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8668             cp= array_a[i_a++];
8669         }
8670         else {
8671             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8672             cp= array_b[i_b++];
8673         }
8674
8675         /* Here, have chosen which of the two inputs to look at.  Only output
8676          * if the running count changes to/from 2, which marks the
8677          * beginning/end of a range that's in the intersection */
8678         if (cp_in_set) {
8679             count++;
8680             if (count == 2) {
8681                 array_r[i_r++] = cp;
8682             }
8683         }
8684         else {
8685             if (count == 2) {
8686                 array_r[i_r++] = cp;
8687             }
8688             count--;
8689         }
8690     }
8691
8692     /* Here, we are finished going through at least one of the lists, which
8693      * means there is something remaining in at most one.  We check if the list
8694      * that has been exhausted is positioned such that we are in the middle
8695      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8696      * the ones we care about.)  There are four cases:
8697      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
8698      *     nothing left in the intersection.
8699      *  2) Both were in their sets, count is 2 and perhaps is incremented to
8700      *     above 2.  What should be output is exactly that which is in the
8701      *     non-exhausted set, as everything it has is also in the intersection
8702      *     set, and everything it doesn't have can't be in the intersection
8703      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8704      *     gets incremented to 2.  Like the previous case, the intersection is
8705      *     everything that remains in the non-exhausted set.
8706      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8707      *     remains 1.  And the intersection has nothing more. */
8708     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8709         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8710     {
8711         count++;
8712     }
8713
8714     /* The final length is what we've output so far plus what else is in the
8715      * intersection.  At most one of the subexpressions below will be non-zero
8716      * */
8717     len_r = i_r;
8718     if (count >= 2) {
8719         len_r += (len_a - i_a) + (len_b - i_b);
8720     }
8721
8722     /* Set result to final length, which can change the pointer to array_r, so
8723      * re-find it */
8724     if (len_r != _invlist_len(r)) {
8725         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8726         invlist_trim(r);
8727         array_r = invlist_array(r);
8728     }
8729
8730     /* Finish outputting any remaining */
8731     if (count >= 2) { /* At most one will have a non-zero copy count */
8732         IV copy_count;
8733         if ((copy_count = len_a - i_a) > 0) {
8734             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8735         }
8736         else if ((copy_count = len_b - i_b) > 0) {
8737             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8738         }
8739     }
8740
8741     /*  We may be removing a reference to one of the inputs.  If so, the output
8742      *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8743      *  count decremented) */
8744     if (a == *i || b == *i) {
8745         assert(! invlist_is_iterating(*i));
8746         if (SvTEMP(*i)) {
8747             sv_2mortal(r);
8748         }
8749         else {
8750             SvREFCNT_dec_NN(*i);
8751         }
8752     }
8753
8754     *i = r;
8755
8756     return;
8757 }
8758
8759 SV*
8760 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8761 {
8762     /* Add the range from 'start' to 'end' inclusive to the inversion list's
8763      * set.  A pointer to the inversion list is returned.  This may actually be
8764      * a new list, in which case the passed in one has been destroyed.  The
8765      * passed in inversion list can be NULL, in which case a new one is created
8766      * with just the one range in it */
8767
8768     SV* range_invlist;
8769     UV len;
8770
8771     if (invlist == NULL) {
8772         invlist = _new_invlist(2);
8773         len = 0;
8774     }
8775     else {
8776         len = _invlist_len(invlist);
8777     }
8778
8779     /* If comes after the final entry actually in the list, can just append it
8780      * to the end, */
8781     if (len == 0
8782         || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8783             && start >= invlist_array(invlist)[len - 1]))
8784     {
8785         _append_range_to_invlist(invlist, start, end);
8786         return invlist;
8787     }
8788
8789     /* Here, can't just append things, create and return a new inversion list
8790      * which is the union of this range and the existing inversion list */
8791     range_invlist = _new_invlist(2);
8792     _append_range_to_invlist(range_invlist, start, end);
8793
8794     _invlist_union(invlist, range_invlist, &invlist);
8795
8796     /* The temporary can be freed */
8797     SvREFCNT_dec_NN(range_invlist);
8798
8799     return invlist;
8800 }
8801
8802 SV*
8803 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8804                                  UV** other_elements_ptr)
8805 {
8806     /* Create and return an inversion list whose contents are to be populated
8807      * by the caller.  The caller gives the number of elements (in 'size') and
8808      * the very first element ('element0').  This function will set
8809      * '*other_elements_ptr' to an array of UVs, where the remaining elements
8810      * are to be placed.
8811      *
8812      * Obviously there is some trust involved that the caller will properly
8813      * fill in the other elements of the array.
8814      *
8815      * (The first element needs to be passed in, as the underlying code does
8816      * things differently depending on whether it is zero or non-zero) */
8817
8818     SV* invlist = _new_invlist(size);
8819     bool offset;
8820
8821     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8822
8823     _append_range_to_invlist(invlist, element0, element0);
8824     offset = *get_invlist_offset_addr(invlist);
8825
8826     invlist_set_len(invlist, size, offset);
8827     *other_elements_ptr = invlist_array(invlist) + 1;
8828     return invlist;
8829 }
8830
8831 #endif
8832
8833 PERL_STATIC_INLINE SV*
8834 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8835     return _add_range_to_invlist(invlist, cp, cp);
8836 }
8837
8838 #ifndef PERL_IN_XSUB_RE
8839 void
8840 Perl__invlist_invert(pTHX_ SV* const invlist)
8841 {
8842     /* Complement the input inversion list.  This adds a 0 if the list didn't
8843      * have a zero; removes it otherwise.  As described above, the data
8844      * structure is set up so that this is very efficient */
8845
8846     PERL_ARGS_ASSERT__INVLIST_INVERT;
8847
8848     assert(! invlist_is_iterating(invlist));
8849
8850     /* The inverse of matching nothing is matching everything */
8851     if (_invlist_len(invlist) == 0) {
8852         _append_range_to_invlist(invlist, 0, UV_MAX);
8853         return;
8854     }
8855
8856     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8857 }
8858
8859 #endif
8860
8861 PERL_STATIC_INLINE SV*
8862 S_invlist_clone(pTHX_ SV* const invlist)
8863 {
8864
8865     /* Return a new inversion list that is a copy of the input one, which is
8866      * unchanged.  The new list will not be mortal even if the old one was. */
8867
8868     /* Need to allocate extra space to accommodate Perl's addition of a
8869      * trailing NUL to SvPV's, since it thinks they are always strings */
8870     SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8871     STRLEN physical_length = SvCUR(invlist);
8872     bool offset = *(get_invlist_offset_addr(invlist));
8873
8874     PERL_ARGS_ASSERT_INVLIST_CLONE;
8875
8876     *(get_invlist_offset_addr(new_invlist)) = offset;
8877     invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8878     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8879
8880     return new_invlist;
8881 }
8882
8883 PERL_STATIC_INLINE STRLEN*
8884 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8885 {
8886     /* Return the address of the UV that contains the current iteration
8887      * position */
8888
8889     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8890
8891     assert(SvTYPE(invlist) == SVt_INVLIST);
8892
8893     return &(((XINVLIST*) SvANY(invlist))->iterator);
8894 }
8895
8896 PERL_STATIC_INLINE void
8897 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
8898 {
8899     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8900
8901     *get_invlist_iter_addr(invlist) = 0;
8902 }
8903
8904 PERL_STATIC_INLINE void
8905 S_invlist_iterfinish(pTHX_ SV* invlist)
8906 {
8907     /* Terminate iterator for invlist.  This is to catch development errors.
8908      * Any iteration that is interrupted before completed should call this
8909      * function.  Functions that add code points anywhere else but to the end
8910      * of an inversion list assert that they are not in the middle of an
8911      * iteration.  If they were, the addition would make the iteration
8912      * problematical: if the iteration hadn't reached the place where things
8913      * were being added, it would be ok */
8914
8915     PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8916
8917     *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8918 }
8919
8920 STATIC bool
8921 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8922 {
8923     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8924      * This call sets in <*start> and <*end>, the next range in <invlist>.
8925      * Returns <TRUE> if successful and the next call will return the next
8926      * range; <FALSE> if was already at the end of the list.  If the latter,
8927      * <*start> and <*end> are unchanged, and the next call to this function
8928      * will start over at the beginning of the list */
8929
8930     STRLEN* pos = get_invlist_iter_addr(invlist);
8931     UV len = _invlist_len(invlist);
8932     UV *array;
8933
8934     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8935
8936     if (*pos >= len) {
8937         *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8938         return FALSE;
8939     }
8940
8941     array = invlist_array(invlist);
8942
8943     *start = array[(*pos)++];
8944
8945     if (*pos >= len) {
8946         *end = UV_MAX;
8947     }
8948     else {
8949         *end = array[(*pos)++] - 1;
8950     }
8951
8952     return TRUE;
8953 }
8954
8955 PERL_STATIC_INLINE bool
8956 S_invlist_is_iterating(pTHX_ SV* const invlist)
8957 {
8958     PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8959
8960     return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8961 }
8962
8963 PERL_STATIC_INLINE UV
8964 S_invlist_highest(pTHX_ SV* const invlist)
8965 {
8966     /* Returns the highest code point that matches an inversion list.  This API
8967      * has an ambiguity, as it returns 0 under either the highest is actually
8968      * 0, or if the list is empty.  If this distinction matters to you, check
8969      * for emptiness before calling this function */
8970
8971     UV len = _invlist_len(invlist);
8972     UV *array;
8973
8974     PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8975
8976     if (len == 0) {
8977         return 0;
8978     }
8979
8980     array = invlist_array(invlist);
8981
8982     /* The last element in the array in the inversion list always starts a
8983      * range that goes to infinity.  That range may be for code points that are
8984      * matched in the inversion list, or it may be for ones that aren't
8985      * matched.  In the latter case, the highest code point in the set is one
8986      * less than the beginning of this range; otherwise it is the final element
8987      * of this range: infinity */
8988     return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8989            ? UV_MAX
8990            : array[len - 1] - 1;
8991 }
8992
8993 #ifndef PERL_IN_XSUB_RE
8994 SV *
8995 Perl__invlist_contents(pTHX_ SV* const invlist)
8996 {
8997     /* Get the contents of an inversion list into a string SV so that they can
8998      * be printed out.  It uses the format traditionally done for debug tracing
8999      */
9000
9001     UV start, end;
9002     SV* output = newSVpvs("\n");
9003
9004     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9005
9006     assert(! invlist_is_iterating(invlist));
9007
9008     invlist_iterinit(invlist);
9009     while (invlist_iternext(invlist, &start, &end)) {
9010         if (end == UV_MAX) {
9011             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9012         }
9013         else if (end != start) {
9014             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9015                     start,       end);
9016         }
9017         else {
9018             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9019         }
9020     }
9021
9022     return output;
9023 }
9024 #endif
9025
9026 #ifndef PERL_IN_XSUB_RE
9027 void
9028 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9029                          const char * const indent, SV* const invlist)
9030 {
9031     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9032      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9033      * the string 'indent'.  The output looks like this:
9034          [0] 0x000A .. 0x000D
9035          [2] 0x0085
9036          [4] 0x2028 .. 0x2029
9037          [6] 0x3104 .. INFINITY
9038      * This means that the first range of code points matched by the list are
9039      * 0xA through 0xD; the second range contains only the single code point
9040      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9041      * are used to define each range (except if the final range extends to
9042      * infinity, only a single element is needed).  The array index of the
9043      * first element for the corresponding range is given in brackets. */
9044
9045     UV start, end;
9046     STRLEN count = 0;
9047
9048     PERL_ARGS_ASSERT__INVLIST_DUMP;
9049
9050     if (invlist_is_iterating(invlist)) {
9051         Perl_dump_indent(aTHX_ level, file,
9052              "%sCan't dump inversion list because is in middle of iterating\n",
9053              indent);
9054         return;
9055     }
9056
9057     invlist_iterinit(invlist);
9058     while (invlist_iternext(invlist, &start, &end)) {
9059         if (end == UV_MAX) {
9060             Perl_dump_indent(aTHX_ level, file,
9061                                        "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9062                                    indent, (UV)count, start);
9063         }
9064         else if (end != start) {
9065             Perl_dump_indent(aTHX_ level, file,
9066                                     "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9067                                 indent, (UV)count, start,         end);
9068         }
9069         else {
9070             Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9071                                             indent, (UV)count, start);
9072         }
9073         count += 2;
9074     }
9075 }
9076 #endif
9077
9078 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9079 bool
9080 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9081 {
9082     /* Return a boolean as to if the two passed in inversion lists are
9083      * identical.  The final argument, if TRUE, says to take the complement of
9084      * the second inversion list before doing the comparison */
9085
9086     const UV* array_a = invlist_array(a);
9087     const UV* array_b = invlist_array(b);
9088     UV len_a = _invlist_len(a);
9089     UV len_b = _invlist_len(b);
9090
9091     UV i = 0;               /* current index into the arrays */
9092     bool retval = TRUE;     /* Assume are identical until proven otherwise */
9093
9094     PERL_ARGS_ASSERT__INVLISTEQ;
9095
9096     /* If are to compare 'a' with the complement of b, set it
9097      * up so are looking at b's complement. */
9098     if (complement_b) {
9099
9100         /* The complement of nothing is everything, so <a> would have to have
9101          * just one element, starting at zero (ending at infinity) */
9102         if (len_b == 0) {
9103             return (len_a == 1 && array_a[0] == 0);
9104         }
9105         else if (array_b[0] == 0) {
9106
9107             /* Otherwise, to complement, we invert.  Here, the first element is
9108              * 0, just remove it.  To do this, we just pretend the array starts
9109              * one later */
9110
9111             array_b++;
9112             len_b--;
9113         }
9114         else {
9115
9116             /* But if the first element is not zero, we pretend the list starts
9117              * at the 0 that is always stored immediately before the array. */
9118             array_b--;
9119             len_b++;
9120         }
9121     }
9122
9123     /* Make sure that the lengths are the same, as well as the final element
9124      * before looping through the remainder.  (Thus we test the length, final,
9125      * and first elements right off the bat) */
9126     if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9127         retval = FALSE;
9128     }
9129     else for (i = 0; i < len_a - 1; i++) {
9130         if (array_a[i] != array_b[i]) {
9131             retval = FALSE;
9132             break;
9133         }
9134     }
9135
9136     return retval;
9137 }
9138 #endif
9139
9140 #undef HEADER_LENGTH
9141 #undef TO_INTERNAL_SIZE
9142 #undef FROM_INTERNAL_SIZE
9143 #undef INVLIST_VERSION_ID
9144
9145 /* End of inversion list object */
9146
9147 STATIC void
9148 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9149 {
9150     /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9151      * constructs, and updates RExC_flags with them.  On input, RExC_parse
9152      * should point to the first flag; it is updated on output to point to the
9153      * final ')' or ':'.  There needs to be at least one flag, or this will
9154      * abort */
9155
9156     /* for (?g), (?gc), and (?o) warnings; warning
9157        about (?c) will warn about (?g) -- japhy    */
9158
9159 #define WASTED_O  0x01
9160 #define WASTED_G  0x02
9161 #define WASTED_C  0x04
9162 #define WASTED_GC (WASTED_G|WASTED_C)
9163     I32 wastedflags = 0x00;
9164     U32 posflags = 0, negflags = 0;
9165     U32 *flagsp = &posflags;
9166     char has_charset_modifier = '\0';
9167     regex_charset cs;
9168     bool has_use_defaults = FALSE;
9169     const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9170
9171     PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9172
9173     /* '^' as an initial flag sets certain defaults */
9174     if (UCHARAT(RExC_parse) == '^') {
9175         RExC_parse++;
9176         has_use_defaults = TRUE;
9177         STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9178         set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9179                                         ? REGEX_UNICODE_CHARSET
9180                                         : REGEX_DEPENDS_CHARSET);
9181     }
9182
9183     cs = get_regex_charset(RExC_flags);
9184     if (cs == REGEX_DEPENDS_CHARSET
9185         && (RExC_utf8 || RExC_uni_semantics))
9186     {
9187         cs = REGEX_UNICODE_CHARSET;
9188     }
9189
9190     while (*RExC_parse) {
9191         /* && strchr("iogcmsx", *RExC_parse) */
9192         /* (?g), (?gc) and (?o) are useless here
9193            and must be globally applied -- japhy */
9194         switch (*RExC_parse) {
9195
9196             /* Code for the imsx flags */
9197             CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9198
9199             case LOCALE_PAT_MOD:
9200                 if (has_charset_modifier) {
9201                     goto excess_modifier;
9202                 }
9203                 else if (flagsp == &negflags) {
9204                     goto neg_modifier;
9205                 }
9206                 cs = REGEX_LOCALE_CHARSET;
9207                 has_charset_modifier = LOCALE_PAT_MOD;
9208                 break;
9209             case UNICODE_PAT_MOD:
9210                 if (has_charset_modifier) {
9211                     goto excess_modifier;
9212                 }
9213                 else if (flagsp == &negflags) {
9214                     goto neg_modifier;
9215                 }
9216                 cs = REGEX_UNICODE_CHARSET;
9217                 has_charset_modifier = UNICODE_PAT_MOD;
9218                 break;
9219             case ASCII_RESTRICT_PAT_MOD:
9220                 if (flagsp == &negflags) {
9221                     goto neg_modifier;
9222                 }
9223                 if (has_charset_modifier) {
9224                     if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9225                         goto excess_modifier;
9226                     }
9227                     /* Doubled modifier implies more restricted */
9228                     cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9229                 }
9230                 else {
9231                     cs = REGEX_ASCII_RESTRICTED_CHARSET;
9232                 }
9233                 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9234                 break;
9235             case DEPENDS_PAT_MOD:
9236                 if (has_use_defaults) {
9237                     goto fail_modifiers;
9238                 }
9239                 else if (flagsp == &negflags) {
9240                     goto neg_modifier;
9241                 }
9242                 else if (has_charset_modifier) {
9243                     goto excess_modifier;
9244                 }
9245
9246                 /* The dual charset means unicode semantics if the
9247                  * pattern (or target, not known until runtime) are
9248                  * utf8, or something in the pattern indicates unicode
9249                  * semantics */
9250                 cs = (RExC_utf8 || RExC_uni_semantics)
9251                      ? REGEX_UNICODE_CHARSET
9252                      : REGEX_DEPENDS_CHARSET;
9253                 has_charset_modifier = DEPENDS_PAT_MOD;
9254                 break;
9255             excess_modifier:
9256                 RExC_parse++;
9257                 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9258                     vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9259                 }
9260                 else if (has_charset_modifier == *(RExC_parse - 1)) {
9261                     vFAIL2("Regexp modifier \"%c\" may not appear twice",
9262                                         *(RExC_parse - 1));
9263                 }
9264                 else {
9265                     vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9266                 }
9267                 /*NOTREACHED*/
9268             neg_modifier:
9269                 RExC_parse++;
9270                 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9271                                     *(RExC_parse - 1));
9272                 /*NOTREACHED*/
9273             case ONCE_PAT_MOD: /* 'o' */
9274             case GLOBAL_PAT_MOD: /* 'g' */
9275                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9276                     const I32 wflagbit = *RExC_parse == 'o'
9277                                          ? WASTED_O
9278                                          : WASTED_G;
9279                     if (! (wastedflags & wflagbit) ) {
9280                         wastedflags |= wflagbit;
9281                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9282                         vWARN5(
9283                             RExC_parse + 1,
9284                             "Useless (%s%c) - %suse /%c modifier",
9285                             flagsp == &negflags ? "?-" : "?",
9286                             *RExC_parse,
9287                             flagsp == &negflags ? "don't " : "",
9288                             *RExC_parse
9289                         );
9290                     }
9291                 }
9292                 break;
9293
9294             case CONTINUE_PAT_MOD: /* 'c' */
9295                 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9296                     if (! (wastedflags & WASTED_C) ) {
9297                         wastedflags |= WASTED_GC;
9298                         /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9299                         vWARN3(
9300                             RExC_parse + 1,
9301                             "Useless (%sc) - %suse /gc modifier",
9302                             flagsp == &negflags ? "?-" : "?",
9303                             flagsp == &negflags ? "don't " : ""
9304                         );
9305                     }
9306                 }
9307                 break;
9308             case KEEPCOPY_PAT_MOD: /* 'p' */
9309                 if (flagsp == &negflags) {
9310                     if (SIZE_ONLY)
9311                         ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9312                 } else {
9313                     *flagsp |= RXf_PMf_KEEPCOPY;
9314                 }
9315                 break;
9316             case '-':
9317                 /* A flag is a default iff it is following a minus, so
9318                  * if there is a minus, it means will be trying to
9319                  * re-specify a default which is an error */
9320                 if (has_use_defaults || flagsp == &negflags) {
9321                     goto fail_modifiers;
9322                 }
9323                 flagsp = &negflags;
9324                 wastedflags = 0;  /* reset so (?g-c) warns twice */
9325                 break;
9326             case ':':
9327             case ')':
9328                 RExC_flags |= posflags;
9329                 RExC_flags &= ~negflags;
9330                 set_regex_charset(&RExC_flags, cs);
9331                 if (RExC_flags & RXf_PMf_FOLD) {
9332                     RExC_contains_i = 1;
9333                 }
9334                 return;
9335                 /*NOTREACHED*/
9336             default:
9337             fail_modifiers:
9338                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9339                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9340                 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9341                       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9342                 /*NOTREACHED*/
9343         }
9344
9345         ++RExC_parse;
9346     }
9347 }
9348
9349 /*
9350  - reg - regular expression, i.e. main body or parenthesized thing
9351  *
9352  * Caller must absorb opening parenthesis.
9353  *
9354  * Combining parenthesis handling with the base level of regular expression
9355  * is a trifle forced, but the need to tie the tails of the branches to what
9356  * follows makes it hard to avoid.
9357  */
9358 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9359 #ifdef DEBUGGING
9360 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9361 #else
9362 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9363 #endif
9364
9365 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9366    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9367    needs to be restarted.
9368    Otherwise would only return NULL if regbranch() returns NULL, which
9369    cannot happen.  */
9370 STATIC regnode *
9371 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9372     /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9373      * 2 is like 1, but indicates that nextchar() has been called to advance
9374      * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9375      * this flag alerts us to the need to check for that */
9376 {
9377     dVAR;
9378     regnode *ret;               /* Will be the head of the group. */
9379     regnode *br;
9380     regnode *lastbr;
9381     regnode *ender = NULL;
9382     I32 parno = 0;
9383     I32 flags;
9384     U32 oregflags = RExC_flags;
9385     bool have_branch = 0;
9386     bool is_open = 0;
9387     I32 freeze_paren = 0;
9388     I32 after_freeze = 0;
9389
9390     char * parse_start = RExC_parse; /* MJD */
9391     char * const oregcomp_parse = RExC_parse;
9392
9393     GET_RE_DEBUG_FLAGS_DECL;
9394
9395     PERL_ARGS_ASSERT_REG;
9396     DEBUG_PARSE("reg ");
9397
9398     *flagp = 0;                         /* Tentatively. */
9399
9400
9401     /* Make an OPEN node, if parenthesized. */
9402     if (paren) {
9403
9404         /* Under /x, space and comments can be gobbled up between the '(' and
9405          * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9406          * intervening space, as the sequence is a token, and a token should be
9407          * indivisible */
9408         bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9409
9410         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9411             char *start_verb = RExC_parse;
9412             STRLEN verb_len = 0;
9413             char *start_arg = NULL;
9414             unsigned char op = 0;
9415             int argok = 1;
9416             int internal_argval = 0; /* internal_argval is only useful if
9417                                         !argok */
9418
9419             if (has_intervening_patws && SIZE_ONLY) {
9420                 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9421             }
9422             while ( *RExC_parse && *RExC_parse != ')' ) {
9423                 if ( *RExC_parse == ':' ) {
9424                     start_arg = RExC_parse + 1;
9425                     break;
9426                 }
9427                 RExC_parse++;
9428             }
9429             ++start_verb;
9430             verb_len = RExC_parse - start_verb;
9431             if ( start_arg ) {
9432                 RExC_parse++;
9433                 while ( *RExC_parse && *RExC_parse != ')' )
9434                     RExC_parse++;
9435                 if ( *RExC_parse != ')' )
9436                     vFAIL("Unterminated verb pattern argument");
9437                 if ( RExC_parse == start_arg )
9438                     start_arg = NULL;
9439             } else {
9440                 if ( *RExC_parse != ')' )
9441                     vFAIL("Unterminated verb pattern");
9442             }
9443
9444             switch ( *start_verb ) {
9445             case 'A':  /* (*ACCEPT) */
9446                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9447                     op = ACCEPT;
9448                     internal_argval = RExC_nestroot;
9449                 }
9450                 break;
9451             case 'C':  /* (*COMMIT) */
9452                 if ( memEQs(start_verb,verb_len,"COMMIT") )
9453                     op = COMMIT;
9454                 break;
9455             case 'F':  /* (*FAIL) */
9456                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9457                     op = OPFAIL;
9458                     argok = 0;
9459                 }
9460                 break;
9461             case ':':  /* (*:NAME) */
9462             case 'M':  /* (*MARK:NAME) */
9463                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9464                     op = MARKPOINT;
9465                     argok = -1;
9466                 }
9467                 break;
9468             case 'P':  /* (*PRUNE) */
9469                 if ( memEQs(start_verb,verb_len,"PRUNE") )
9470                     op = PRUNE;
9471                 break;
9472             case 'S':   /* (*SKIP) */
9473                 if ( memEQs(start_verb,verb_len,"SKIP") )
9474                     op = SKIP;
9475                 break;
9476             case 'T':  /* (*THEN) */
9477                 /* [19:06] <TimToady> :: is then */
9478                 if ( memEQs(start_verb,verb_len,"THEN") ) {
9479                     op = CUTGROUP;
9480                     RExC_seen |= REG_CUTGROUP_SEEN;
9481                 }
9482                 break;
9483             }
9484             if ( ! op ) {
9485                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9486                 vFAIL2utf8f(
9487                     "Unknown verb pattern '%"UTF8f"'",
9488                     UTF8fARG(UTF, verb_len, start_verb));
9489             }
9490             if ( argok ) {
9491                 if ( start_arg && internal_argval ) {
9492                     vFAIL3("Verb pattern '%.*s' may not have an argument",
9493                         verb_len, start_verb);
9494                 } else if ( argok < 0 && !start_arg ) {
9495                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9496                         verb_len, start_verb);
9497                 } else {
9498                     ret = reganode(pRExC_state, op, internal_argval);
9499                     if ( ! internal_argval && ! SIZE_ONLY ) {
9500                         if (start_arg) {
9501                             SV *sv = newSVpvn( start_arg,
9502                                                RExC_parse - start_arg);
9503                             ARG(ret) = add_data( pRExC_state,
9504                                                  STR_WITH_LEN("S"));
9505                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9506                             ret->flags = 0;
9507                         } else {
9508                             ret->flags = 1;
9509                         }
9510                     }
9511                 }
9512                 if (!internal_argval)
9513                     RExC_seen |= REG_VERBARG_SEEN;
9514             } else if ( start_arg ) {
9515                 vFAIL3("Verb pattern '%.*s' may not have an argument",
9516                         verb_len, start_verb);
9517             } else {
9518                 ret = reg_node(pRExC_state, op);
9519             }
9520             nextchar(pRExC_state);
9521             return ret;
9522         }
9523         else if (*RExC_parse == '?') { /* (?...) */
9524             bool is_logical = 0;
9525             const char * const seqstart = RExC_parse;
9526             if (has_intervening_patws && SIZE_ONLY) {
9527                 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9528             }
9529
9530             RExC_parse++;
9531             paren = *RExC_parse++;
9532             ret = NULL;                 /* For look-ahead/behind. */
9533             switch (paren) {
9534
9535             case 'P':   /* (?P...) variants for those used to PCRE/Python */
9536                 paren = *RExC_parse++;
9537                 if ( paren == '<')         /* (?P<...>) named capture */
9538                     goto named_capture;
9539                 else if (paren == '>') {   /* (?P>name) named recursion */
9540                     goto named_recursion;
9541                 }
9542                 else if (paren == '=') {   /* (?P=...)  named backref */
9543                     /* this pretty much dupes the code for \k<NAME> in
9544                      * regatom(), if you change this make sure you change that
9545                      * */
9546                     char* name_start = RExC_parse;
9547                     U32 num = 0;
9548                     SV *sv_dat = reg_scan_name(pRExC_state,
9549                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9550                     if (RExC_parse == name_start || *RExC_parse != ')')
9551                         /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9552                         vFAIL2("Sequence %.3s... not terminated",parse_start);
9553
9554                     if (!SIZE_ONLY) {
9555                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9556                         RExC_rxi->data->data[num]=(void*)sv_dat;
9557                         SvREFCNT_inc_simple_void(sv_dat);
9558                     }
9559                     RExC_sawback = 1;
9560                     ret = reganode(pRExC_state,
9561                                    ((! FOLD)
9562                                      ? NREF
9563                                      : (ASCII_FOLD_RESTRICTED)
9564                                        ? NREFFA
9565                                        : (AT_LEAST_UNI_SEMANTICS)
9566                                          ? NREFFU
9567                                          : (LOC)
9568                                            ? NREFFL
9569                                            : NREFF),
9570                                     num);
9571                     *flagp |= HASWIDTH;
9572
9573                     Set_Node_Offset(ret, parse_start+1);
9574                     Set_Node_Cur_Length(ret, parse_start);
9575
9576                     nextchar(pRExC_state);
9577                     return ret;
9578                 }
9579                 RExC_parse++;
9580                 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9581                 vFAIL3("Sequence (%.*s...) not recognized",
9582                                 RExC_parse-seqstart, seqstart);
9583                 /*NOTREACHED*/
9584             case '<':           /* (?<...) */
9585                 if (*RExC_parse == '!')
9586                     paren = ',';
9587                 else if (*RExC_parse != '=')
9588               named_capture:
9589                 {               /* (?<...>) */
9590                     char *name_start;
9591                     SV *svname;
9592                     paren= '>';
9593             case '\'':          /* (?'...') */
9594                     name_start= RExC_parse;
9595                     svname = reg_scan_name(pRExC_state,
9596                         SIZE_ONLY    /* reverse test from the others */
9597                         ? REG_RSN_RETURN_NAME
9598                         : REG_RSN_RETURN_NULL);
9599                     if (RExC_parse == name_start || *RExC_parse != paren)
9600                         vFAIL2("Sequence (?%c... not terminated",
9601                             paren=='>' ? '<' : paren);
9602                     if (SIZE_ONLY) {
9603                         HE *he_str;
9604                         SV *sv_dat = NULL;
9605                         if (!svname) /* shouldn't happen */
9606                             Perl_croak(aTHX_
9607                                 "panic: reg_scan_name returned NULL");
9608                         if (!RExC_paren_names) {
9609                             RExC_paren_names= newHV();
9610                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
9611 #ifdef DEBUGGING
9612                             RExC_paren_name_list= newAV();
9613                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9614 #endif
9615                         }
9616                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9617                         if ( he_str )
9618                             sv_dat = HeVAL(he_str);
9619                         if ( ! sv_dat ) {
9620                             /* croak baby croak */
9621                             Perl_croak(aTHX_
9622                                 "panic: paren_name hash element allocation failed");
9623                         } else if ( SvPOK(sv_dat) ) {
9624                             /* (?|...) can mean we have dupes so scan to check
9625                                its already been stored. Maybe a flag indicating
9626                                we are inside such a construct would be useful,
9627                                but the arrays are likely to be quite small, so
9628                                for now we punt -- dmq */
9629                             IV count = SvIV(sv_dat);
9630                             I32 *pv = (I32*)SvPVX(sv_dat);
9631                             IV i;
9632                             for ( i = 0 ; i < count ; i++ ) {
9633                                 if ( pv[i] == RExC_npar ) {
9634                                     count = 0;
9635                                     break;
9636                                 }
9637                             }
9638                             if ( count ) {
9639                                 pv = (I32*)SvGROW(sv_dat,
9640                                                 SvCUR(sv_dat) + sizeof(I32)+1);
9641                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9642                                 pv[count] = RExC_npar;
9643                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9644                             }
9645                         } else {
9646                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
9647                             sv_setpvn(sv_dat, (char *)&(RExC_npar),
9648                                                                 sizeof(I32));
9649                             SvIOK_on(sv_dat);
9650                             SvIV_set(sv_dat, 1);
9651                         }
9652 #ifdef DEBUGGING
9653                         /* Yes this does cause a memory leak in debugging Perls
9654                          * */
9655                         if (!av_store(RExC_paren_name_list,
9656                                       RExC_npar, SvREFCNT_inc(svname)))
9657                             SvREFCNT_dec_NN(svname);
9658 #endif
9659
9660                         /*sv_dump(sv_dat);*/
9661                     }
9662                     nextchar(pRExC_state);
9663                     paren = 1;
9664                     goto capturing_parens;
9665                 }
9666                 RExC_seen |= REG_LOOKBEHIND_SEEN;
9667                 RExC_in_lookbehind++;
9668                 RExC_parse++;
9669             case '=':           /* (?=...) */
9670                 RExC_seen_zerolen++;
9671                 break;
9672             case '!':           /* (?!...) */
9673                 RExC_seen_zerolen++;
9674                 if (*RExC_parse == ')') {
9675                     ret=reg_node(pRExC_state, OPFAIL);
9676                     nextchar(pRExC_state);
9677                     return ret;
9678                 }
9679                 break;
9680             case '|':           /* (?|...) */
9681                 /* branch reset, behave like a (?:...) except that
9682                    buffers in alternations share the same numbers */
9683                 paren = ':';
9684                 after_freeze = freeze_paren = RExC_npar;
9685                 break;
9686             case ':':           /* (?:...) */
9687             case '>':           /* (?>...) */
9688                 break;
9689             case '$':           /* (?$...) */
9690             case '@':           /* (?@...) */
9691                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9692                 break;
9693             case '#':           /* (?#...) */
9694                 /* XXX As soon as we disallow separating the '?' and '*' (by
9695                  * spaces or (?#...) comment), it is believed that this case
9696                  * will be unreachable and can be removed.  See
9697                  * [perl #117327] */
9698                 while (*RExC_parse && *RExC_parse != ')')
9699                     RExC_parse++;
9700                 if (*RExC_parse != ')')
9701                     FAIL("Sequence (?#... not terminated");
9702                 nextchar(pRExC_state);
9703                 *flagp = TRYAGAIN;
9704                 return NULL;
9705             case '0' :           /* (?0) */
9706             case 'R' :           /* (?R) */
9707                 if (*RExC_parse != ')')
9708                     FAIL("Sequence (?R) not terminated");
9709                 ret = reg_node(pRExC_state, GOSTART);
9710                     RExC_seen |= REG_GOSTART_SEEN;
9711                 *flagp |= POSTPONED;
9712                 nextchar(pRExC_state);
9713                 return ret;
9714                 /*notreached*/
9715             { /* named and numeric backreferences */
9716                 I32 num;
9717             case '&':            /* (?&NAME) */
9718                 parse_start = RExC_parse - 1;
9719               named_recursion:
9720                 {
9721                     SV *sv_dat = reg_scan_name(pRExC_state,
9722                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9723                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9724                 }
9725                 if (RExC_parse == RExC_end || *RExC_parse != ')')
9726                     vFAIL("Sequence (?&... not terminated");
9727                 goto gen_recurse_regop;
9728                 assert(0); /* NOT REACHED */
9729             case '+':
9730                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9731                     RExC_parse++;
9732                     vFAIL("Illegal pattern");
9733                 }
9734                 goto parse_recursion;
9735                 /* NOT REACHED*/
9736             case '-': /* (?-1) */
9737                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9738                     RExC_parse--; /* rewind to let it be handled later */
9739                     goto parse_flags;
9740                 }
9741                 /*FALLTHROUGH */
9742             case '1': case '2': case '3': case '4': /* (?1) */
9743             case '5': case '6': case '7': case '8': case '9':
9744                 RExC_parse--;
9745               parse_recursion:
9746                 num = atoi(RExC_parse);
9747                 parse_start = RExC_parse - 1; /* MJD */
9748                 if (*RExC_parse == '-')
9749                     RExC_parse++;
9750                 while (isDIGIT(*RExC_parse))
9751                         RExC_parse++;
9752                 if (*RExC_parse!=')')
9753                     vFAIL("Expecting close bracket");
9754
9755               gen_recurse_regop:
9756                 if ( paren == '-' ) {
9757                     /*
9758                     Diagram of capture buffer numbering.
9759                     Top line is the normal capture buffer numbers
9760                     Bottom line is the negative indexing as from
9761                     the X (the (?-2))
9762
9763                     +   1 2    3 4 5 X          6 7
9764                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9765                     -   5 4    3 2 1 X          x x
9766
9767                     */
9768                     num = RExC_npar + num;
9769                     if (num < 1)  {
9770                         RExC_parse++;
9771                         vFAIL("Reference to nonexistent group");
9772                     }
9773                 } else if ( paren == '+' ) {
9774                     num = RExC_npar + num - 1;
9775                 }
9776
9777                 ret = reganode(pRExC_state, GOSUB, num);
9778                 if (!SIZE_ONLY) {
9779                     if (num > (I32)RExC_rx->nparens) {
9780                         RExC_parse++;
9781                         vFAIL("Reference to nonexistent group");
9782                     }
9783                     ARG2L_SET( ret, RExC_recurse_count++);
9784                     RExC_emit++;
9785                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9786                         "Recurse #%"UVuf" to %"IVdf"\n",
9787                               (UV)ARG(ret), (IV)ARG2L(ret)));
9788                 } else {
9789                     RExC_size++;
9790                 }
9791                     RExC_seen |= REG_RECURSE_SEEN;
9792                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9793                 Set_Node_Offset(ret, parse_start); /* MJD */
9794
9795                 *flagp |= POSTPONED;
9796                 nextchar(pRExC_state);
9797                 return ret;
9798             } /* named and numeric backreferences */
9799             assert(0); /* NOT REACHED */
9800
9801             case '?':           /* (??...) */
9802                 is_logical = 1;
9803                 if (*RExC_parse != '{') {
9804                     RExC_parse++;
9805                     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9806                     vFAIL2utf8f(
9807                         "Sequence (%"UTF8f"...) not recognized",
9808                         UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9809                     /*NOTREACHED*/
9810                 }
9811                 *flagp |= POSTPONED;
9812                 paren = *RExC_parse++;
9813                 /* FALL THROUGH */
9814             case '{':           /* (?{...}) */
9815             {
9816                 U32 n = 0;
9817                 struct reg_code_block *cb;
9818
9819                 RExC_seen_zerolen++;
9820
9821                 if (   !pRExC_state->num_code_blocks
9822                     || pRExC_state->code_index >= pRExC_state->num_code_blocks
9823                     || pRExC_state->code_blocks[pRExC_state->code_index].start
9824                         != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9825                             - RExC_start)
9826                 ) {
9827                     if (RExC_pm_flags & PMf_USE_RE_EVAL)
9828                         FAIL("panic: Sequence (?{...}): no code block found\n");
9829                     FAIL("Eval-group not allowed at runtime, use re 'eval'");
9830                 }
9831                 /* this is a pre-compiled code block (?{...}) */
9832                 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9833                 RExC_parse = RExC_start + cb->end;
9834                 if (!SIZE_ONLY) {
9835                     OP *o = cb->block;
9836                     if (cb->src_regex) {
9837                         n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9838                         RExC_rxi->data->data[n] =
9839                             (void*)SvREFCNT_inc((SV*)cb->src_regex);
9840                         RExC_rxi->data->data[n+1] = (void*)o;
9841                     }
9842                     else {
9843                         n = add_data(pRExC_state,
9844                                (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9845                         RExC_rxi->data->data[n] = (void*)o;
9846                     }
9847                 }
9848                 pRExC_state->code_index++;
9849                 nextchar(pRExC_state);
9850
9851                 if (is_logical) {
9852                     regnode *eval;
9853                     ret = reg_node(pRExC_state, LOGICAL);
9854                     eval = reganode(pRExC_state, EVAL, n);
9855                     if (!SIZE_ONLY) {
9856                         ret->flags = 2;
9857                         /* for later propagation into (??{}) return value */
9858                         eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9859                     }
9860                     REGTAIL(pRExC_state, ret, eval);
9861                     /* deal with the length of this later - MJD */
9862                     return ret;
9863                 }
9864                 ret = reganode(pRExC_state, EVAL, n);
9865                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9866                 Set_Node_Offset(ret, parse_start);
9867                 return ret;
9868             }
9869             case '(':           /* (?(?{...})...) and (?(?=...)...) */
9870             {
9871                 int is_define= 0;
9872                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
9873                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9874                         || RExC_parse[1] == '<'
9875                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
9876                         I32 flag;
9877                         regnode *tail;
9878
9879                         ret = reg_node(pRExC_state, LOGICAL);
9880                         if (!SIZE_ONLY)
9881                             ret->flags = 1;
9882
9883                         tail = reg(pRExC_state, 1, &flag, depth+1);
9884                         if (flag & RESTART_UTF8) {
9885                             *flagp = RESTART_UTF8;
9886                             return NULL;
9887                         }
9888                         REGTAIL(pRExC_state, ret, tail);
9889                         goto insert_if;
9890                     }
9891                 }
9892                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9893                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9894                 {
9895                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
9896                     char *name_start= RExC_parse++;
9897                     U32 num = 0;
9898                     SV *sv_dat=reg_scan_name(pRExC_state,
9899                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9900                     if (RExC_parse == name_start || *RExC_parse != ch)
9901                         vFAIL2("Sequence (?(%c... not terminated",
9902                             (ch == '>' ? '<' : ch));
9903                     RExC_parse++;
9904                     if (!SIZE_ONLY) {
9905                         num = add_data( pRExC_state, STR_WITH_LEN("S"));
9906                         RExC_rxi->data->data[num]=(void*)sv_dat;
9907                         SvREFCNT_inc_simple_void(sv_dat);
9908                     }
9909                     ret = reganode(pRExC_state,NGROUPP,num);
9910                     goto insert_if_check_paren;
9911                 }
9912                 else if (RExC_parse[0] == 'D' &&
9913                          RExC_parse[1] == 'E' &&
9914                          RExC_parse[2] == 'F' &&
9915                          RExC_parse[3] == 'I' &&
9916                          RExC_parse[4] == 'N' &&
9917                          RExC_parse[5] == 'E')
9918                 {
9919                     ret = reganode(pRExC_state,DEFINEP,0);
9920                     RExC_parse +=6 ;
9921                     is_define = 1;
9922                     goto insert_if_check_paren;
9923                 }
9924                 else if (RExC_parse[0] == 'R') {
9925                     RExC_parse++;
9926                     parno = 0;
9927                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9928                         parno = atoi(RExC_parse++);
9929                         while (isDIGIT(*RExC_parse))
9930                             RExC_parse++;
9931                     } else if (RExC_parse[0] == '&') {
9932                         SV *sv_dat;
9933                         RExC_parse++;
9934                         sv_dat = reg_scan_name(pRExC_state,
9935                             SIZE_ONLY
9936                             ? REG_RSN_RETURN_NULL
9937                             : REG_RSN_RETURN_DATA);
9938                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9939                     }
9940                     ret = reganode(pRExC_state,INSUBP,parno);
9941                     goto insert_if_check_paren;
9942                 }
9943                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9944                     /* (?(1)...) */
9945                     char c;
9946                     char *tmp;
9947                     parno = atoi(RExC_parse++);
9948
9949                     while (isDIGIT(*RExC_parse))
9950                         RExC_parse++;
9951                     ret = reganode(pRExC_state, GROUPP, parno);
9952
9953                  insert_if_check_paren:
9954                     if (*(tmp = nextchar(pRExC_state)) != ')') {
9955                         /* nextchar also skips comments, so undo its work
9956                          * and skip over the the next character.
9957                          */
9958                         RExC_parse = tmp;
9959                         RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9960                         vFAIL("Switch condition not recognized");
9961                     }
9962                   insert_if:
9963                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9964                     br = regbranch(pRExC_state, &flags, 1,depth+1);
9965                     if (br == NULL) {
9966                         if (flags & RESTART_UTF8) {
9967                             *flagp = RESTART_UTF8;
9968                             return NULL;
9969                         }
9970                         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9971                               (UV) flags);
9972                     } else
9973                         REGTAIL(pRExC_state, br, reganode(pRExC_state,
9974                                                           LONGJMP, 0));
9975                     c = *nextchar(pRExC_state);
9976                     if (flags&HASWIDTH)
9977                         *flagp |= HASWIDTH;
9978                     if (c == '|') {
9979                         if (is_define)
9980                             vFAIL("(?(DEFINE)....) does not allow branches");
9981
9982                         /* Fake one for optimizer.  */
9983                         lastbr = reganode(pRExC_state, IFTHEN, 0);
9984
9985                         if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9986                             if (flags & RESTART_UTF8) {
9987                                 *flagp = RESTART_UTF8;
9988                                 return NULL;
9989                             }
9990                             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9991                                   (UV) flags);
9992                         }
9993                         REGTAIL(pRExC_state, ret, lastbr);
9994                         if (flags&HASWIDTH)
9995                             *flagp |= HASWIDTH;
9996                         c = *nextchar(pRExC_state);
9997                     }
9998                     else
9999                         lastbr = NULL;
10000                     if (c != ')')
10001                         vFAIL("Switch (?(condition)... contains too many branches");
10002                     ender = reg_node(pRExC_state, TAIL);
10003                     REGTAIL(pRExC_state, br, ender);
10004                     if (lastbr) {
10005                         REGTAIL(pRExC_state, lastbr, ender);
10006                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10007                     }
10008                     else
10009                         REGTAIL(pRExC_state, ret, ender);
10010                     RExC_size++; /* XXX WHY do we need this?!!
10011                                     For large programs it seems to be required
10012                                     but I can't figure out why. -- dmq*/
10013                     return ret;
10014                 }
10015                 else {
10016                     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10017                     vFAIL("Unknown switch condition (?(...))");
10018                 }
10019             }
10020             case '[':           /* (?[ ... ]) */
10021                 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10022                                          oregcomp_parse);
10023             case 0:
10024                 RExC_parse--; /* for vFAIL to print correctly */
10025                 vFAIL("Sequence (? incomplete");
10026                 break;
10027             default: /* e.g., (?i) */
10028                 --RExC_parse;
10029               parse_flags:
10030                 parse_lparen_question_flags(pRExC_state);
10031                 if (UCHARAT(RExC_parse) != ':') {
10032                     nextchar(pRExC_state);
10033                     *flagp = TRYAGAIN;
10034                     return NULL;
10035                 }
10036                 paren = ':';
10037                 nextchar(pRExC_state);
10038                 ret = NULL;
10039                 goto parse_rest;
10040             } /* end switch */
10041         }
10042         else {                  /* (...) */
10043           capturing_parens:
10044             parno = RExC_npar;
10045             RExC_npar++;
10046
10047             ret = reganode(pRExC_state, OPEN, parno);
10048             if (!SIZE_ONLY ){
10049                 if (!RExC_nestroot)
10050                     RExC_nestroot = parno;
10051                 if (RExC_seen & REG_RECURSE_SEEN
10052                     && !RExC_open_parens[parno-1])
10053                 {
10054                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10055                         "Setting open paren #%"IVdf" to %d\n",
10056                         (IV)parno, REG_NODE_NUM(ret)));
10057                     RExC_open_parens[parno-1]= ret;
10058                 }
10059             }
10060             Set_Node_Length(ret, 1); /* MJD */
10061             Set_Node_Offset(ret, RExC_parse); /* MJD */
10062             is_open = 1;
10063         }
10064     }
10065     else                        /* ! paren */
10066         ret = NULL;
10067
10068    parse_rest:
10069     /* Pick up the branches, linking them together. */
10070     parse_start = RExC_parse;   /* MJD */
10071     br = regbranch(pRExC_state, &flags, 1,depth+1);
10072
10073     /*     branch_len = (paren != 0); */
10074
10075     if (br == NULL) {
10076         if (flags & RESTART_UTF8) {
10077             *flagp = RESTART_UTF8;
10078             return NULL;
10079         }
10080         FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10081     }
10082     if (*RExC_parse == '|') {
10083         if (!SIZE_ONLY && RExC_extralen) {
10084             reginsert(pRExC_state, BRANCHJ, br, depth+1);
10085         }
10086         else {                  /* MJD */
10087             reginsert(pRExC_state, BRANCH, br, depth+1);
10088             Set_Node_Length(br, paren != 0);
10089             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10090         }
10091         have_branch = 1;
10092         if (SIZE_ONLY)
10093             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
10094     }
10095     else if (paren == ':') {
10096         *flagp |= flags&SIMPLE;
10097     }
10098     if (is_open) {                              /* Starts with OPEN. */
10099         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10100     }
10101     else if (paren != '?')              /* Not Conditional */
10102         ret = br;
10103     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10104     lastbr = br;
10105     while (*RExC_parse == '|') {
10106         if (!SIZE_ONLY && RExC_extralen) {
10107             ender = reganode(pRExC_state, LONGJMP,0);
10108
10109             /* Append to the previous. */
10110             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10111         }
10112         if (SIZE_ONLY)
10113             RExC_extralen += 2;         /* Account for LONGJMP. */
10114         nextchar(pRExC_state);
10115         if (freeze_paren) {
10116             if (RExC_npar > after_freeze)
10117                 after_freeze = RExC_npar;
10118             RExC_npar = freeze_paren;
10119         }
10120         br = regbranch(pRExC_state, &flags, 0, depth+1);
10121
10122         if (br == NULL) {
10123             if (flags & RESTART_UTF8) {
10124                 *flagp = RESTART_UTF8;
10125                 return NULL;
10126             }
10127             FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10128         }
10129         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10130         lastbr = br;
10131         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10132     }
10133
10134     if (have_branch || paren != ':') {
10135         /* Make a closing node, and hook it on the end. */
10136         switch (paren) {
10137         case ':':
10138             ender = reg_node(pRExC_state, TAIL);
10139             break;
10140         case 1: case 2:
10141             ender = reganode(pRExC_state, CLOSE, parno);
10142             if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10143                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10144                         "Setting close paren #%"IVdf" to %d\n",
10145                         (IV)parno, REG_NODE_NUM(ender)));
10146                 RExC_close_parens[parno-1]= ender;
10147                 if (RExC_nestroot == parno)
10148                     RExC_nestroot = 0;
10149             }
10150             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10151             Set_Node_Length(ender,1); /* MJD */
10152             break;
10153         case '<':
10154         case ',':
10155         case '=':
10156         case '!':
10157             *flagp &= ~HASWIDTH;
10158             /* FALL THROUGH */
10159         case '>':
10160             ender = reg_node(pRExC_state, SUCCEED);
10161             break;
10162         case 0:
10163             ender = reg_node(pRExC_state, END);
10164             if (!SIZE_ONLY) {
10165                 assert(!RExC_opend); /* there can only be one! */
10166                 RExC_opend = ender;
10167             }
10168             break;
10169         }
10170         DEBUG_PARSE_r(if (!SIZE_ONLY) {
10171             SV * const mysv_val1=sv_newmortal();
10172             SV * const mysv_val2=sv_newmortal();
10173             DEBUG_PARSE_MSG("lsbr");
10174             regprop(RExC_rx, mysv_val1, lastbr, NULL);
10175             regprop(RExC_rx, mysv_val2, ender, NULL);
10176             PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10177                           SvPV_nolen_const(mysv_val1),
10178                           (IV)REG_NODE_NUM(lastbr),
10179                           SvPV_nolen_const(mysv_val2),
10180                           (IV)REG_NODE_NUM(ender),
10181                           (IV)(ender - lastbr)
10182             );
10183         });
10184         REGTAIL(pRExC_state, lastbr, ender);
10185
10186         if (have_branch && !SIZE_ONLY) {
10187             char is_nothing= 1;
10188             if (depth==1)
10189                 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10190
10191             /* Hook the tails of the branches to the closing node. */
10192             for (br = ret; br; br = regnext(br)) {
10193                 const U8 op = PL_regkind[OP(br)];
10194                 if (op == BRANCH) {
10195                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10196                     if ( OP(NEXTOPER(br)) != NOTHING
10197                          || regnext(NEXTOPER(br)) != ender)
10198                         is_nothing= 0;
10199                 }
10200                 else if (op == BRANCHJ) {
10201                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10202                     /* for now we always disable this optimisation * /
10203                     if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10204                          || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10205                     */
10206                         is_nothing= 0;
10207                 }
10208             }
10209             if (is_nothing) {
10210                 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10211                 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10212                     SV * const mysv_val1=sv_newmortal();
10213                     SV * const mysv_val2=sv_newmortal();
10214                     DEBUG_PARSE_MSG("NADA");
10215                     regprop(RExC_rx, mysv_val1, ret, NULL);
10216                     regprop(RExC_rx, mysv_val2, ender, NULL);
10217                     PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10218                                   SvPV_nolen_const(mysv_val1),
10219                                   (IV)REG_NODE_NUM(ret),
10220                                   SvPV_nolen_const(mysv_val2),
10221                                   (IV)REG_NODE_NUM(ender),
10222                                   (IV)(ender - ret)
10223                     );
10224                 });
10225                 OP(br)= NOTHING;
10226                 if (OP(ender) == TAIL) {
10227                     NEXT_OFF(br)= 0;
10228                     RExC_emit= br + 1;
10229                 } else {
10230                     regnode *opt;
10231                     for ( opt= br + 1; opt < ender ; opt++ )
10232                         OP(opt)= OPTIMIZED;
10233                     NEXT_OFF(br)= ender - br;
10234                 }
10235             }
10236         }
10237     }
10238
10239     {
10240         const char *p;
10241         static const char parens[] = "=!<,>";
10242
10243         if (paren && (p = strchr(parens, paren))) {
10244             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10245             int flag = (p - parens) > 1;
10246
10247             if (paren == '>')
10248                 node = SUSPEND, flag = 0;
10249             reginsert(pRExC_state, node,ret, depth+1);
10250             Set_Node_Cur_Length(ret, parse_start);
10251             Set_Node_Offset(ret, parse_start + 1);
10252             ret->flags = flag;
10253             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10254         }
10255     }
10256
10257     /* Check for proper termination. */
10258     if (paren) {
10259         /* restore original flags, but keep (?p) */
10260         RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10261         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10262             RExC_parse = oregcomp_parse;
10263             vFAIL("Unmatched (");
10264         }
10265     }
10266     else if (!paren && RExC_parse < RExC_end) {
10267         if (*RExC_parse == ')') {
10268             RExC_parse++;
10269             vFAIL("Unmatched )");
10270         }
10271         else
10272             FAIL("Junk on end of regexp");      /* "Can't happen". */
10273         assert(0); /* NOTREACHED */
10274     }
10275
10276     if (RExC_in_lookbehind) {
10277         RExC_in_lookbehind--;
10278     }
10279     if (after_freeze > RExC_npar)
10280         RExC_npar = after_freeze;
10281     return(ret);
10282 }
10283
10284 /*
10285  - regbranch - one alternative of an | operator
10286  *
10287  * Implements the concatenation operator.
10288  *
10289  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10290  * restarted.
10291  */
10292 STATIC regnode *
10293 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10294 {
10295     dVAR;
10296     regnode *ret;
10297     regnode *chain = NULL;
10298     regnode *latest;
10299     I32 flags = 0, c = 0;
10300     GET_RE_DEBUG_FLAGS_DECL;
10301
10302     PERL_ARGS_ASSERT_REGBRANCH;
10303
10304     DEBUG_PARSE("brnc");
10305
10306     if (first)
10307         ret = NULL;
10308     else {
10309         if (!SIZE_ONLY && RExC_extralen)
10310             ret = reganode(pRExC_state, BRANCHJ,0);
10311         else {
10312             ret = reg_node(pRExC_state, BRANCH);
10313             Set_Node_Length(ret, 1);
10314         }
10315     }
10316
10317     if (!first && SIZE_ONLY)
10318         RExC_extralen += 1;                     /* BRANCHJ */
10319
10320     *flagp = WORST;                     /* Tentatively. */
10321
10322     RExC_parse--;
10323     nextchar(pRExC_state);
10324     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10325         flags &= ~TRYAGAIN;
10326         latest = regpiece(pRExC_state, &flags,depth+1);
10327         if (latest == NULL) {
10328             if (flags & TRYAGAIN)
10329                 continue;
10330             if (flags & RESTART_UTF8) {
10331                 *flagp = RESTART_UTF8;
10332                 return NULL;
10333             }
10334             FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10335         }
10336         else if (ret == NULL)
10337             ret = latest;
10338         *flagp |= flags&(HASWIDTH|POSTPONED);
10339         if (chain == NULL)      /* First piece. */
10340             *flagp |= flags&SPSTART;
10341         else {
10342             RExC_naughty++;
10343             REGTAIL(pRExC_state, chain, latest);
10344         }
10345         chain = latest;
10346         c++;
10347     }
10348     if (chain == NULL) {        /* Loop ran zero times. */
10349         chain = reg_node(pRExC_state, NOTHING);
10350         if (ret == NULL)
10351             ret = chain;
10352     }
10353     if (c == 1) {
10354         *flagp |= flags&SIMPLE;
10355     }
10356
10357     return ret;
10358 }
10359
10360 /*
10361  - regpiece - something followed by possible [*+?]
10362  *
10363  * Note that the branching code sequences used for ? and the general cases
10364  * of * and + are somewhat optimized:  they use the same NOTHING node as
10365  * both the endmarker for their branch list and the body of the last branch.
10366  * It might seem that this node could be dispensed with entirely, but the
10367  * endmarker role is not redundant.
10368  *
10369  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10370  * TRYAGAIN.
10371  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10372  * restarted.
10373  */
10374 STATIC regnode *
10375 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10376 {
10377     dVAR;
10378     regnode *ret;
10379     char op;
10380     char *next;
10381     I32 flags;
10382     const char * const origparse = RExC_parse;
10383     I32 min;
10384     I32 max = REG_INFTY;
10385 #ifdef RE_TRACK_PATTERN_OFFSETS
10386     char *parse_start;
10387 #endif
10388     const char *maxpos = NULL;
10389
10390     /* Save the original in case we change the emitted regop to a FAIL. */
10391     regnode * const orig_emit = RExC_emit;
10392
10393     GET_RE_DEBUG_FLAGS_DECL;
10394
10395     PERL_ARGS_ASSERT_REGPIECE;
10396
10397     DEBUG_PARSE("piec");
10398
10399     ret = regatom(pRExC_state, &flags,depth+1);
10400     if (ret == NULL) {
10401         if (flags & (TRYAGAIN|RESTART_UTF8))
10402             *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10403         else
10404             FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10405         return(NULL);
10406     }
10407
10408     op = *RExC_parse;
10409
10410     if (op == '{' && regcurly(RExC_parse, FALSE)) {
10411         maxpos = NULL;
10412 #ifdef RE_TRACK_PATTERN_OFFSETS
10413         parse_start = RExC_parse; /* MJD */
10414 #endif
10415         next = RExC_parse + 1;
10416         while (isDIGIT(*next) || *next == ',') {
10417             if (*next == ',') {
10418                 if (maxpos)
10419                     break;
10420                 else
10421                     maxpos = next;
10422             }
10423             next++;
10424         }
10425         if (*next == '}') {             /* got one */
10426             if (!maxpos)
10427                 maxpos = next;
10428             RExC_parse++;
10429             min = atoi(RExC_parse);
10430             if (*maxpos == ',')
10431                 maxpos++;
10432             else
10433                 maxpos = RExC_parse;
10434             max = atoi(maxpos);
10435             if (!max && *maxpos != '0')
10436                 max = REG_INFTY;                /* meaning "infinity" */
10437             else if (max >= REG_INFTY)
10438                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10439             RExC_parse = next;
10440             nextchar(pRExC_state);
10441             if (max < min) {    /* If can't match, warn and optimize to fail
10442                                    unconditionally */
10443                 if (SIZE_ONLY) {
10444                     ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10445
10446                     /* We can't back off the size because we have to reserve
10447                      * enough space for all the things we are about to throw
10448                      * away, but we can shrink it by the ammount we are about
10449                      * to re-use here */
10450                     RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10451                 }
10452                 else {
10453                     RExC_emit = orig_emit;
10454                 }
10455                 ret = reg_node(pRExC_state, OPFAIL);
10456                 return ret;
10457             }
10458             else if (min == max
10459                      && RExC_parse < RExC_end
10460                      && (*RExC_parse == '?' || *RExC_parse == '+'))
10461             {
10462                 if (SIZE_ONLY) {
10463                     ckWARN2reg(RExC_parse + 1,
10464                                "Useless use of greediness modifier '%c'",
10465                                *RExC_parse);
10466                 }
10467                 /* Absorb the modifier, so later code doesn't see nor use
10468                     * it */
10469                 nextchar(pRExC_state);
10470             }
10471
10472         do_curly:
10473             if ((flags&SIMPLE)) {
10474                 RExC_naughty += 2 + RExC_naughty / 2;
10475                 reginsert(pRExC_state, CURLY, ret, depth+1);
10476                 Set_Node_Offset(ret, parse_start+1); /* MJD */
10477                 Set_Node_Cur_Length(ret, parse_start);
10478             }
10479             else {
10480                 regnode * const w = reg_node(pRExC_state, WHILEM);
10481
10482                 w->flags = 0;
10483                 REGTAIL(pRExC_state, ret, w);
10484                 if (!SIZE_ONLY && RExC_extralen) {
10485                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
10486                     reginsert(pRExC_state, NOTHING,ret, depth+1);
10487                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
10488                 }
10489                 reginsert(pRExC_state, CURLYX,ret, depth+1);
10490                                 /* MJD hk */
10491                 Set_Node_Offset(ret, parse_start+1);
10492                 Set_Node_Length(ret,
10493                                 op == '{' ? (RExC_parse - parse_start) : 1);
10494
10495                 if (!SIZE_ONLY && RExC_extralen)
10496                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
10497                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10498                 if (SIZE_ONLY)
10499                     RExC_whilem_seen++, RExC_extralen += 3;
10500                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
10501             }
10502             ret->flags = 0;
10503
10504             if (min > 0)
10505                 *flagp = WORST;
10506             if (max > 0)
10507                 *flagp |= HASWIDTH;
10508             if (!SIZE_ONLY) {
10509                 ARG1_SET(ret, (U16)min);
10510                 ARG2_SET(ret, (U16)max);
10511             }
10512             if (max == REG_INFTY)
10513                 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10514
10515             goto nest_check;
10516         }
10517     }
10518
10519     if (!ISMULT1(op)) {
10520         *flagp = flags;
10521         return(ret);
10522     }
10523
10524 #if 0                           /* Now runtime fix should be reliable. */
10525
10526     /* if this is reinstated, don't forget to put this back into perldiag:
10527
10528             =item Regexp *+ operand could be empty at {#} in regex m/%s/
10529
10530            (F) The part of the regexp subject to either the * or + quantifier
10531            could match an empty string. The {#} shows in the regular
10532            expression about where the problem was discovered.
10533
10534     */
10535
10536     if (!(flags&HASWIDTH) && op != '?')
10537       vFAIL("Regexp *+ operand could be empty");
10538 #endif
10539
10540 #ifdef RE_TRACK_PATTERN_OFFSETS
10541     parse_start = RExC_parse;
10542 #endif
10543     nextchar(pRExC_state);
10544
10545     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10546
10547     if (op == '*' && (flags&SIMPLE)) {
10548         reginsert(pRExC_state, STAR, ret, depth+1);
10549         ret->flags = 0;
10550         RExC_naughty += 4;
10551         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10552     }
10553     else if (op == '*') {
10554         min = 0;
10555         goto do_curly;
10556     }
10557     else if (op == '+' && (flags&SIMPLE)) {
10558         reginsert(pRExC_state, PLUS, ret, depth+1);
10559         ret->flags = 0;
10560         RExC_naughty += 3;
10561         RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10562     }
10563     else if (op == '+') {
10564         min = 1;
10565         goto do_curly;
10566     }
10567     else if (op == '?') {
10568         min = 0; max = 1;
10569         goto do_curly;
10570     }
10571   nest_check:
10572     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10573         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10574         ckWARN2reg(RExC_parse,
10575                    "%"UTF8f" matches null string many times",
10576                    UTF8fARG(UTF, (RExC_parse >= origparse
10577                                  ? RExC_parse - origparse
10578                                  : 0),
10579                    origparse));
10580         (void)ReREFCNT_inc(RExC_rx_sv);
10581     }
10582
10583     if (RExC_parse < RExC_end && *RExC_parse == '?') {
10584         nextchar(pRExC_state);
10585         reginsert(pRExC_state, MINMOD, ret, depth+1);
10586         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10587     }
10588     else
10589     if (RExC_parse < RExC_end && *RExC_parse == '+') {
10590         regnode *ender;
10591         nextchar(pRExC_state);
10592         ender = reg_node(pRExC_state, SUCCEED);
10593         REGTAIL(pRExC_state, ret, ender);
10594         reginsert(pRExC_state, SUSPEND, ret, depth+1);
10595         ret->flags = 0;
10596         ender = reg_node(pRExC_state, TAIL);
10597         REGTAIL(pRExC_state, ret, ender);
10598     }
10599
10600     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10601         RExC_parse++;
10602         vFAIL("Nested quantifiers");
10603     }
10604
10605     return(ret);
10606 }
10607
10608 STATIC bool
10609 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10610                       UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10611                       const bool strict   /* Apply stricter parsing rules? */
10612     )
10613 {
10614
10615  /* This is expected to be called by a parser routine that has recognized '\N'
10616    and needs to handle the rest. RExC_parse is expected to point at the first
10617    char following the N at the time of the call.  On successful return,
10618    RExC_parse has been updated to point to just after the sequence identified
10619    by this routine, and <*flagp> has been updated.
10620
10621    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10622    character class.
10623
10624    \N may begin either a named sequence, or if outside a character class, mean
10625    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10626    attempted to decide which, and in the case of a named sequence, converted it
10627    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10628    where c1... are the characters in the sequence.  For single-quoted regexes,
10629    the tokenizer passes the \N sequence through unchanged; this code will not
10630    attempt to determine this nor expand those, instead raising a syntax error.
10631    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10632    or there is no '}', it signals that this \N occurrence means to match a
10633    non-newline.
10634
10635    Only the \N{U+...} form should occur in a character class, for the same
10636    reason that '.' inside a character class means to just match a period: it
10637    just doesn't make sense.
10638
10639    The function raises an error (via vFAIL), and doesn't return for various
10640    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10641    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10642    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10643    only possible if node_p is non-NULL.
10644
10645
10646    If <valuep> is non-null, it means the caller can accept an input sequence
10647    consisting of a just a single code point; <*valuep> is set to that value
10648    if the input is such.
10649
10650    If <node_p> is non-null it signifies that the caller can accept any other
10651    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10652    is set as follows:
10653     1) \N means not-a-NL: points to a newly created REG_ANY node;
10654     2) \N{}:              points to a new NOTHING node;
10655     3) otherwise:         points to a new EXACT node containing the resolved
10656                           string.
10657    Note that FALSE is returned for single code point sequences if <valuep> is
10658    null.
10659  */
10660
10661     char * endbrace;    /* '}' following the name */
10662     char* p;
10663     char *endchar;      /* Points to '.' or '}' ending cur char in the input
10664                            stream */
10665     bool has_multiple_chars; /* true if the input stream contains a sequence of
10666                                 more than one character */
10667
10668     GET_RE_DEBUG_FLAGS_DECL;
10669
10670     PERL_ARGS_ASSERT_GROK_BSLASH_N;
10671
10672     GET_RE_DEBUG_FLAGS;
10673
10674     assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10675
10676     /* The [^\n] meaning of \N ignores spaces and comments under the /x
10677      * modifier.  The other meaning does not, so use a temporary until we find
10678      * out which we are being called with */
10679     p = (RExC_flags & RXf_PMf_EXTENDED)
10680         ? regwhite( pRExC_state, RExC_parse )
10681         : RExC_parse;
10682
10683     /* Disambiguate between \N meaning a named character versus \N meaning
10684      * [^\n].  The former is assumed when it can't be the latter. */
10685     if (*p != '{' || regcurly(p, FALSE)) {
10686         RExC_parse = p;
10687         if (! node_p) {
10688             /* no bare \N allowed in a charclass */
10689             if (in_char_class) {
10690                 vFAIL("\\N in a character class must be a named character: \\N{...}");
10691             }
10692             return FALSE;
10693         }
10694         RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10695                            current char */
10696         nextchar(pRExC_state);
10697         *node_p = reg_node(pRExC_state, REG_ANY);
10698         *flagp |= HASWIDTH|SIMPLE;
10699         RExC_naughty++;
10700         Set_Node_Length(*node_p, 1); /* MJD */
10701         return TRUE;
10702     }
10703
10704     /* Here, we have decided it should be a named character or sequence */
10705
10706     /* The test above made sure that the next real character is a '{', but
10707      * under the /x modifier, it could be separated by space (or a comment and
10708      * \n) and this is not allowed (for consistency with \x{...} and the
10709      * tokenizer handling of \N{NAME}). */
10710     if (*RExC_parse != '{') {
10711         vFAIL("Missing braces on \\N{}");
10712     }
10713
10714     RExC_parse++;       /* Skip past the '{' */
10715
10716     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10717         || ! (endbrace == RExC_parse            /* nothing between the {} */
10718               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below
10719                                                  */
10720                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10721                                                      */
10722     {
10723         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
10724         vFAIL("\\N{NAME} must be resolved by the lexer");
10725     }
10726
10727     if (endbrace == RExC_parse) {   /* empty: \N{} */
10728         bool ret = TRUE;
10729         if (node_p) {
10730             *node_p = reg_node(pRExC_state,NOTHING);
10731         }
10732         else if (in_char_class) {
10733             if (SIZE_ONLY && in_char_class) {
10734                 if (strict) {
10735                     RExC_parse++;   /* Position after the "}" */
10736                     vFAIL("Zero length \\N{}");
10737                 }
10738                 else {
10739                     ckWARNreg(RExC_parse,
10740                               "Ignoring zero length \\N{} in character class");
10741                 }
10742             }
10743             ret = FALSE;
10744         }
10745         else {
10746             return FALSE;
10747         }
10748         nextchar(pRExC_state);
10749         return ret;
10750     }
10751
10752     RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10753     RExC_parse += 2;    /* Skip past the 'U+' */
10754
10755     endchar = RExC_parse + strcspn(RExC_parse, ".}");
10756
10757     /* Code points are separated by dots.  If none, there is only one code
10758      * point, and is terminated by the brace */
10759     has_multiple_chars = (endchar < endbrace);
10760
10761     if (valuep && (! has_multiple_chars || in_char_class)) {
10762         /* We only pay attention to the first char of
10763         multichar strings being returned in char classes. I kinda wonder
10764         if this makes sense as it does change the behaviour
10765         from earlier versions, OTOH that behaviour was broken
10766         as well. XXX Solution is to recharacterize as
10767         [rest-of-class]|multi1|multi2... */
10768
10769         STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10770         I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10771             | PERL_SCAN_DISALLOW_PREFIX
10772             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10773
10774         *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10775
10776         /* The tokenizer should have guaranteed validity, but it's possible to
10777          * bypass it by using single quoting, so check */
10778         if (length_of_hex == 0
10779             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10780         {
10781             RExC_parse += length_of_hex;        /* Includes all the valid */
10782             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
10783                             ? UTF8SKIP(RExC_parse)
10784                             : 1;
10785             /* Guard against malformed utf8 */
10786             if (RExC_parse >= endchar) {
10787                 RExC_parse = endchar;
10788             }
10789             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10790         }
10791
10792         if (in_char_class && has_multiple_chars) {
10793             if (strict) {
10794                 RExC_parse = endbrace;
10795                 vFAIL("\\N{} in character class restricted to one character");
10796             }
10797             else {
10798                 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10799             }
10800         }
10801
10802         RExC_parse = endbrace + 1;
10803     }
10804     else if (! node_p || ! has_multiple_chars) {
10805
10806         /* Here, the input is legal, but not according to the caller's
10807          * options.  We fail without advancing the parse, so that the
10808          * caller can try again */
10809         RExC_parse = p;
10810         return FALSE;
10811     }
10812     else {
10813
10814         /* What is done here is to convert this to a sub-pattern of the form
10815          * (?:\x{char1}\x{char2}...)
10816          * and then call reg recursively.  That way, it retains its atomicness,
10817          * while not having to worry about special handling that some code
10818          * points may have.  toke.c has converted the original Unicode values
10819          * to native, so that we can just pass on the hex values unchanged.  We
10820          * do have to set a flag to keep recoding from happening in the
10821          * recursion */
10822
10823         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10824         STRLEN len;
10825         char *orig_end = RExC_end;
10826         I32 flags;
10827
10828         while (RExC_parse < endbrace) {
10829
10830             /* Convert to notation the rest of the code understands */
10831             sv_catpv(substitute_parse, "\\x{");
10832             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10833             sv_catpv(substitute_parse, "}");
10834
10835             /* Point to the beginning of the next character in the sequence. */
10836             RExC_parse = endchar + 1;
10837             endchar = RExC_parse + strcspn(RExC_parse, ".}");
10838         }
10839         sv_catpv(substitute_parse, ")");
10840
10841         RExC_parse = SvPV(substitute_parse, len);
10842
10843         /* Don't allow empty number */
10844         if (len < 8) {
10845             vFAIL("Invalid hexadecimal number in \\N{U+...}");
10846         }
10847         RExC_end = RExC_parse + len;
10848
10849         /* The values are Unicode, and therefore not subject to recoding */
10850         RExC_override_recoding = 1;
10851
10852         if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10853             if (flags & RESTART_UTF8) {
10854                 *flagp = RESTART_UTF8;
10855                 return FALSE;
10856             }
10857             FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10858                   (UV) flags);
10859         }
10860         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10861
10862         RExC_parse = endbrace;
10863         RExC_end = orig_end;
10864         RExC_override_recoding = 0;
10865
10866         nextchar(pRExC_state);
10867     }
10868
10869     return TRUE;
10870 }
10871
10872
10873 /*
10874  * reg_recode
10875  *
10876  * It returns the code point in utf8 for the value in *encp.
10877  *    value: a code value in the source encoding
10878  *    encp:  a pointer to an Encode object
10879  *
10880  * If the result from Encode is not a single character,
10881  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10882  */
10883 STATIC UV
10884 S_reg_recode(pTHX_ const char value, SV **encp)
10885 {
10886     STRLEN numlen = 1;
10887     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10888     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10889     const STRLEN newlen = SvCUR(sv);
10890     UV uv = UNICODE_REPLACEMENT;
10891
10892     PERL_ARGS_ASSERT_REG_RECODE;
10893
10894     if (newlen)
10895         uv = SvUTF8(sv)
10896              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10897              : *(U8*)s;
10898
10899     if (!newlen || numlen != newlen) {
10900         uv = UNICODE_REPLACEMENT;
10901         *encp = NULL;
10902     }
10903     return uv;
10904 }
10905
10906 PERL_STATIC_INLINE U8
10907 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10908 {
10909     U8 op;
10910
10911     PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10912
10913     if (! FOLD) {
10914         return EXACT;
10915     }
10916
10917     op = get_regex_charset(RExC_flags);
10918     if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10919         op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10920                  been, so there is no hole */
10921     }
10922
10923     return op + EXACTF;
10924 }
10925
10926 PERL_STATIC_INLINE void
10927 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
10928                          regnode *node, I32* flagp, STRLEN len, UV code_point,
10929                          bool downgradable)
10930 {
10931     /* This knows the details about sizing an EXACTish node, setting flags for
10932      * it (by setting <*flagp>, and potentially populating it with a single
10933      * character.
10934      *
10935      * If <len> (the length in bytes) is non-zero, this function assumes that
10936      * the node has already been populated, and just does the sizing.  In this
10937      * case <code_point> should be the final code point that has already been
10938      * placed into the node.  This value will be ignored except that under some
10939      * circumstances <*flagp> is set based on it.
10940      *
10941      * If <len> is zero, the function assumes that the node is to contain only
10942      * the single character given by <code_point> and calculates what <len>
10943      * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10944      * additionally will populate the node's STRING with <code_point> or its
10945      * fold if folding.
10946      *
10947      * In both cases <*flagp> is appropriately set
10948      *
10949      * It knows that under FOLD, the Latin Sharp S and UTF characters above
10950      * 255, must be folded (the former only when the rules indicate it can
10951      * match 'ss')
10952      *
10953      * When it does the populating, it looks at the flag 'downgradable'.  If
10954      * true with a node that folds, it checks if the single code point
10955      * participates in a fold, and if not downgrades the node to an EXACT.
10956      * This helps the optimizer */
10957
10958     bool len_passed_in = cBOOL(len != 0);
10959     U8 character[UTF8_MAXBYTES_CASE+1];
10960
10961     PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10962
10963     /* Don't bother to check for downgrading in PASS1, as it doesn't make any
10964      * sizing difference, and is extra work that is thrown away */
10965     if (downgradable && ! PASS2) {
10966         downgradable = FALSE;
10967     }
10968
10969     if (! len_passed_in) {
10970         if (UTF) {
10971             if (UNI_IS_INVARIANT(code_point)) {
10972                 if (LOC || ! FOLD) {    /* /l defers folding until runtime */
10973                     *character = (U8) code_point;
10974                 }
10975                 else { /* Here is /i and not /l (toFOLD() is defined on just
10976                           ASCII, which isn't the same thing as INVARIANT on
10977                           EBCDIC, but it works there, as the extra invariants
10978                           fold to themselves) */
10979                     *character = toFOLD((U8) code_point);
10980
10981                     /* We can downgrade to an EXACT node if this character
10982                      * isn't a folding one.  Note that this assumes that
10983                      * nothing above Latin1 folds to some other invariant than
10984                      * one of these alphabetics; otherwise we would also have
10985                      * to check:
10986                      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
10987                      *      || ASCII_FOLD_RESTRICTED))
10988                      */
10989                     if (downgradable && PL_fold[code_point] == code_point) {
10990                         OP(node) = EXACT;
10991                     }
10992                 }
10993                 len = 1;
10994             }
10995             else if (FOLD && (! LOC
10996                               || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
10997             {   /* Folding, and ok to do so now */
10998                 UV folded = _to_uni_fold_flags(
10999                                    code_point,
11000                                    character,
11001                                    &len,
11002                                    FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11003                                                       ? FOLD_FLAGS_NOMIX_ASCII
11004                                                       : 0));
11005                 if (downgradable
11006                     && folded == code_point
11007                     && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11008                 {
11009                     OP(node) = EXACT;
11010                 }
11011             }
11012             else if (code_point <= MAX_UTF8_TWO_BYTE) {
11013
11014                 /* Not folding this cp, and can output it directly */
11015                 *character = UTF8_TWO_BYTE_HI(code_point);
11016                 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11017                 len = 2;
11018             }
11019             else {
11020                 uvchr_to_utf8( character, code_point);
11021                 len = UTF8SKIP(character);
11022             }
11023         } /* Else pattern isn't UTF8.  */
11024         else if (! FOLD) {
11025             *character = (U8) code_point;
11026             len = 1;
11027         } /* Else is folded non-UTF8 */
11028         else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11029
11030             /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11031              * comments at join_exact()); */
11032             *character = (U8) code_point;
11033             len = 1;
11034
11035             /* Can turn into an EXACT node if we know the fold at compile time,
11036              * and it folds to itself and doesn't particpate in other folds */
11037             if (downgradable
11038                 && ! LOC
11039                 && PL_fold_latin1[code_point] == code_point
11040                 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11041                     || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11042             {
11043                 OP(node) = EXACT;
11044             }
11045         } /* else is Sharp s.  May need to fold it */
11046         else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11047             *character = 's';
11048             *(character + 1) = 's';
11049             len = 2;
11050         }
11051         else {
11052             *character = LATIN_SMALL_LETTER_SHARP_S;
11053             len = 1;
11054         }
11055     }
11056
11057     if (SIZE_ONLY) {
11058         RExC_size += STR_SZ(len);
11059     }
11060     else {
11061         RExC_emit += STR_SZ(len);
11062         STR_LEN(node) = len;
11063         if (! len_passed_in) {
11064             Copy((char *) character, STRING(node), len, char);
11065         }
11066     }
11067
11068     *flagp |= HASWIDTH;
11069
11070     /* A single character node is SIMPLE, except for the special-cased SHARP S
11071      * under /di. */
11072     if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11073         && (code_point != LATIN_SMALL_LETTER_SHARP_S
11074             || ! FOLD || ! DEPENDS_SEMANTICS))
11075     {
11076         *flagp |= SIMPLE;
11077     }
11078
11079     /* The OP may not be well defined in PASS1 */
11080     if (PASS2 && OP(node) == EXACTFL) {
11081         RExC_contains_locale = 1;
11082     }
11083 }
11084
11085
11086 /* return atoi(p), unless it's too big to sensibly be a backref,
11087  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11088
11089 static I32
11090 S_backref_value(char *p)
11091 {
11092     char *q = p;
11093
11094     for (;isDIGIT(*q); q++); /* calculate length of num */
11095     if (q - p == 0 || q - p > 9)
11096         return I32_MAX;
11097     return atoi(p);
11098 }
11099
11100
11101 /*
11102  - regatom - the lowest level
11103
11104    Try to identify anything special at the start of the pattern. If there
11105    is, then handle it as required. This may involve generating a single regop,
11106    such as for an assertion; or it may involve recursing, such as to
11107    handle a () structure.
11108
11109    If the string doesn't start with something special then we gobble up
11110    as much literal text as we can.
11111
11112    Once we have been able to handle whatever type of thing started the
11113    sequence, we return.
11114
11115    Note: we have to be careful with escapes, as they can be both literal
11116    and special, and in the case of \10 and friends, context determines which.
11117
11118    A summary of the code structure is:
11119
11120    switch (first_byte) {
11121         cases for each special:
11122             handle this special;
11123             break;
11124         case '\\':
11125             switch (2nd byte) {
11126                 cases for each unambiguous special:
11127                     handle this special;
11128                     break;
11129                 cases for each ambigous special/literal:
11130                     disambiguate;
11131                     if (special)  handle here
11132                     else goto defchar;
11133                 default: // unambiguously literal:
11134                     goto defchar;
11135             }
11136         default:  // is a literal char
11137             // FALL THROUGH
11138         defchar:
11139             create EXACTish node for literal;
11140             while (more input and node isn't full) {
11141                 switch (input_byte) {
11142                    cases for each special;
11143                        make sure parse pointer is set so that the next call to
11144                            regatom will see this special first
11145                        goto loopdone; // EXACTish node terminated by prev. char
11146                    default:
11147                        append char to EXACTISH node;
11148                 }
11149                 get next input byte;
11150             }
11151         loopdone:
11152    }
11153    return the generated node;
11154
11155    Specifically there are two separate switches for handling
11156    escape sequences, with the one for handling literal escapes requiring
11157    a dummy entry for all of the special escapes that are actually handled
11158    by the other.
11159
11160    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11161    TRYAGAIN.
11162    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11163    restarted.
11164    Otherwise does not return NULL.
11165 */
11166
11167 STATIC regnode *
11168 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11169 {
11170     dVAR;
11171     regnode *ret = NULL;
11172     I32 flags = 0;
11173     char *parse_start = RExC_parse;
11174     U8 op;
11175     int invert = 0;
11176
11177     GET_RE_DEBUG_FLAGS_DECL;
11178
11179     *flagp = WORST;             /* Tentatively. */
11180
11181     DEBUG_PARSE("atom");
11182
11183     PERL_ARGS_ASSERT_REGATOM;
11184
11185 tryagain:
11186     switch ((U8)*RExC_parse) {
11187     case '^':
11188         RExC_seen_zerolen++;
11189         nextchar(pRExC_state);
11190         if (RExC_flags & RXf_PMf_MULTILINE)
11191             ret = reg_node(pRExC_state, MBOL);
11192         else if (RExC_flags & RXf_PMf_SINGLELINE)
11193             ret = reg_node(pRExC_state, SBOL);
11194         else
11195             ret = reg_node(pRExC_state, BOL);
11196         Set_Node_Length(ret, 1); /* MJD */
11197         break;
11198     case '$':
11199         nextchar(pRExC_state);
11200         if (*RExC_parse)
11201             RExC_seen_zerolen++;
11202         if (RExC_flags & RXf_PMf_MULTILINE)
11203             ret = reg_node(pRExC_state, MEOL);
11204         else if (RExC_flags & RXf_PMf_SINGLELINE)
11205             ret = reg_node(pRExC_state, SEOL);
11206         else
11207             ret = reg_node(pRExC_state, EOL);
11208         Set_Node_Length(ret, 1); /* MJD */
11209         break;
11210     case '.':
11211         nextchar(pRExC_state);
11212         if (RExC_flags & RXf_PMf_SINGLELINE)
11213             ret = reg_node(pRExC_state, SANY);
11214         else
11215             ret = reg_node(pRExC_state, REG_ANY);
11216         *flagp |= HASWIDTH|SIMPLE;
11217         RExC_naughty++;
11218         Set_Node_Length(ret, 1); /* MJD */
11219         break;
11220     case '[':
11221     {
11222         char * const oregcomp_parse = ++RExC_parse;
11223         ret = regclass(pRExC_state, flagp,depth+1,
11224                        FALSE, /* means parse the whole char class */
11225                        TRUE, /* allow multi-char folds */
11226                        FALSE, /* don't silence non-portable warnings. */
11227                        NULL);
11228         if (*RExC_parse != ']') {
11229             RExC_parse = oregcomp_parse;
11230             vFAIL("Unmatched [");
11231         }
11232         if (ret == NULL) {
11233             if (*flagp & RESTART_UTF8)
11234                 return NULL;
11235             FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11236                   (UV) *flagp);
11237         }
11238         nextchar(pRExC_state);
11239         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11240         break;
11241     }
11242     case '(':
11243         nextchar(pRExC_state);
11244         ret = reg(pRExC_state, 2, &flags,depth+1);
11245         if (ret == NULL) {
11246                 if (flags & TRYAGAIN) {
11247                     if (RExC_parse == RExC_end) {
11248                          /* Make parent create an empty node if needed. */
11249                         *flagp |= TRYAGAIN;
11250                         return(NULL);
11251                     }
11252                     goto tryagain;
11253                 }
11254                 if (flags & RESTART_UTF8) {
11255                     *flagp = RESTART_UTF8;
11256                     return NULL;
11257                 }
11258                 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11259                                                                  (UV) flags);
11260         }
11261         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11262         break;
11263     case '|':
11264     case ')':
11265         if (flags & TRYAGAIN) {
11266             *flagp |= TRYAGAIN;
11267             return NULL;
11268         }
11269         vFAIL("Internal urp");
11270                                 /* Supposed to be caught earlier. */
11271         break;
11272     case '{':
11273         if (!regcurly(RExC_parse, FALSE)) {
11274             RExC_parse++;
11275             goto defchar;
11276         }
11277         /* FALL THROUGH */
11278     case '?':
11279     case '+':
11280     case '*':
11281         RExC_parse++;
11282         vFAIL("Quantifier follows nothing");
11283         break;
11284     case '\\':
11285         /* Special Escapes
11286
11287            This switch handles escape sequences that resolve to some kind
11288            of special regop and not to literal text. Escape sequnces that
11289            resolve to literal text are handled below in the switch marked
11290            "Literal Escapes".
11291
11292            Every entry in this switch *must* have a corresponding entry
11293            in the literal escape switch. However, the opposite is not
11294            required, as the default for this switch is to jump to the
11295            literal text handling code.
11296         */
11297         switch ((U8)*++RExC_parse) {
11298             U8 arg;
11299         /* Special Escapes */
11300         case 'A':
11301             RExC_seen_zerolen++;
11302             ret = reg_node(pRExC_state, SBOL);
11303             *flagp |= SIMPLE;
11304             goto finish_meta_pat;
11305         case 'G':
11306             ret = reg_node(pRExC_state, GPOS);
11307             RExC_seen |= REG_GPOS_SEEN;
11308             *flagp |= SIMPLE;
11309             goto finish_meta_pat;
11310         case 'K':
11311             RExC_seen_zerolen++;
11312             ret = reg_node(pRExC_state, KEEPS);
11313             *flagp |= SIMPLE;
11314             /* XXX:dmq : disabling in-place substitution seems to
11315              * be necessary here to avoid cases of memory corruption, as
11316              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11317              */
11318             RExC_seen |= REG_LOOKBEHIND_SEEN;
11319             goto finish_meta_pat;
11320         case 'Z':
11321             ret = reg_node(pRExC_state, SEOL);
11322             *flagp |= SIMPLE;
11323             RExC_seen_zerolen++;                /* Do not optimize RE away */
11324             goto finish_meta_pat;
11325         case 'z':
11326             ret = reg_node(pRExC_state, EOS);
11327             *flagp |= SIMPLE;
11328             RExC_seen_zerolen++;                /* Do not optimize RE away */
11329             goto finish_meta_pat;
11330         case 'C':
11331             ret = reg_node(pRExC_state, CANY);
11332             RExC_seen |= REG_CANY_SEEN;
11333             *flagp |= HASWIDTH|SIMPLE;
11334             goto finish_meta_pat;
11335         case 'X':
11336             ret = reg_node(pRExC_state, CLUMP);
11337             *flagp |= HASWIDTH;
11338             goto finish_meta_pat;
11339
11340         case 'W':
11341             invert = 1;
11342             /* FALLTHROUGH */
11343         case 'w':
11344             arg = ANYOF_WORDCHAR;
11345             goto join_posix;
11346
11347         case 'b':
11348             RExC_seen_zerolen++;
11349             RExC_seen |= REG_LOOKBEHIND_SEEN;
11350             op = BOUND + get_regex_charset(RExC_flags);
11351             if (op > BOUNDA) {  /* /aa is same as /a */
11352                 op = BOUNDA;
11353             }
11354             else if (op == BOUNDL) {
11355                 RExC_contains_locale = 1;
11356             }
11357             ret = reg_node(pRExC_state, op);
11358             FLAGS(ret) = get_regex_charset(RExC_flags);
11359             *flagp |= SIMPLE;
11360             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11361                 /* diag_listed_as: Use "%s" instead of "%s" */
11362                 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11363             }
11364             goto finish_meta_pat;
11365         case 'B':
11366             RExC_seen_zerolen++;
11367             RExC_seen |= REG_LOOKBEHIND_SEEN;
11368             op = NBOUND + get_regex_charset(RExC_flags);
11369             if (op > NBOUNDA) { /* /aa is same as /a */
11370                 op = NBOUNDA;
11371             }
11372             else if (op == NBOUNDL) {
11373                 RExC_contains_locale = 1;
11374             }
11375             ret = reg_node(pRExC_state, op);
11376             FLAGS(ret) = get_regex_charset(RExC_flags);
11377             *flagp |= SIMPLE;
11378             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11379                 /* diag_listed_as: Use "%s" instead of "%s" */
11380                 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11381             }
11382             goto finish_meta_pat;
11383
11384         case 'D':
11385             invert = 1;
11386             /* FALLTHROUGH */
11387         case 'd':
11388             arg = ANYOF_DIGIT;
11389             goto join_posix;
11390
11391         case 'R':
11392             ret = reg_node(pRExC_state, LNBREAK);
11393             *flagp |= HASWIDTH|SIMPLE;
11394             goto finish_meta_pat;
11395
11396         case 'H':
11397             invert = 1;
11398             /* FALLTHROUGH */
11399         case 'h':
11400             arg = ANYOF_BLANK;
11401             op = POSIXU;
11402             goto join_posix_op_known;
11403
11404         case 'V':
11405             invert = 1;
11406             /* FALLTHROUGH */
11407         case 'v':
11408             arg = ANYOF_VERTWS;
11409             op = POSIXU;
11410             goto join_posix_op_known;
11411
11412         case 'S':
11413             invert = 1;
11414             /* FALLTHROUGH */
11415         case 's':
11416             arg = ANYOF_SPACE;
11417
11418         join_posix:
11419
11420             op = POSIXD + get_regex_charset(RExC_flags);
11421             if (op > POSIXA) {  /* /aa is same as /a */
11422                 op = POSIXA;
11423             }
11424             else if (op == POSIXL) {
11425                 RExC_contains_locale = 1;
11426             }
11427
11428         join_posix_op_known:
11429
11430             if (invert) {
11431                 op += NPOSIXD - POSIXD;
11432             }
11433
11434             ret = reg_node(pRExC_state, op);
11435             if (! SIZE_ONLY) {
11436                 FLAGS(ret) = namedclass_to_classnum(arg);
11437             }
11438
11439             *flagp |= HASWIDTH|SIMPLE;
11440             /* FALL THROUGH */
11441
11442          finish_meta_pat:
11443             nextchar(pRExC_state);
11444             Set_Node_Length(ret, 2); /* MJD */
11445             break;
11446         case 'p':
11447         case 'P':
11448             {
11449 #ifdef DEBUGGING
11450                 char* parse_start = RExC_parse - 2;
11451 #endif
11452
11453                 RExC_parse--;
11454
11455                 ret = regclass(pRExC_state, flagp,depth+1,
11456                                TRUE, /* means just parse this element */
11457                                FALSE, /* don't allow multi-char folds */
11458                                FALSE, /* don't silence non-portable warnings.
11459                                          It would be a bug if these returned
11460                                          non-portables */
11461                                NULL);
11462                 /* regclass() can only return RESTART_UTF8 if multi-char folds
11463                    are allowed.  */
11464                 if (!ret)
11465                     FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11466                           (UV) *flagp);
11467
11468                 RExC_parse--;
11469
11470                 Set_Node_Offset(ret, parse_start + 2);
11471                 Set_Node_Cur_Length(ret, parse_start);
11472                 nextchar(pRExC_state);
11473             }
11474             break;
11475         case 'N':
11476             /* Handle \N and \N{NAME} with multiple code points here and not
11477              * below because it can be multicharacter. join_exact() will join
11478              * them up later on.  Also this makes sure that things like
11479              * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11480              * The options to the grok function call causes it to fail if the
11481              * sequence is just a single code point.  We then go treat it as
11482              * just another character in the current EXACT node, and hence it
11483              * gets uniform treatment with all the other characters.  The
11484              * special treatment for quantifiers is not needed for such single
11485              * character sequences */
11486             ++RExC_parse;
11487             if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11488                                 FALSE /* not strict */ )) {
11489                 if (*flagp & RESTART_UTF8)
11490                     return NULL;
11491                 RExC_parse--;
11492                 goto defchar;
11493             }
11494             break;
11495         case 'k':    /* Handle \k<NAME> and \k'NAME' */
11496         parse_named_seq:
11497         {
11498             char ch= RExC_parse[1];
11499             if (ch != '<' && ch != '\'' && ch != '{') {
11500                 RExC_parse++;
11501                 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11502                 vFAIL2("Sequence %.2s... not terminated",parse_start);
11503             } else {
11504                 /* this pretty much dupes the code for (?P=...) in reg(), if
11505                    you change this make sure you change that */
11506                 char* name_start = (RExC_parse += 2);
11507                 U32 num = 0;
11508                 SV *sv_dat = reg_scan_name(pRExC_state,
11509                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11510                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11511                 if (RExC_parse == name_start || *RExC_parse != ch)
11512                     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11513                     vFAIL2("Sequence %.3s... not terminated",parse_start);
11514
11515                 if (!SIZE_ONLY) {
11516                     num = add_data( pRExC_state, STR_WITH_LEN("S"));
11517                     RExC_rxi->data->data[num]=(void*)sv_dat;
11518                     SvREFCNT_inc_simple_void(sv_dat);
11519                 }
11520
11521                 RExC_sawback = 1;
11522                 ret = reganode(pRExC_state,
11523                                ((! FOLD)
11524                                  ? NREF
11525                                  : (ASCII_FOLD_RESTRICTED)
11526                                    ? NREFFA
11527                                    : (AT_LEAST_UNI_SEMANTICS)
11528                                      ? NREFFU
11529                                      : (LOC)
11530                                        ? NREFFL
11531                                        : NREFF),
11532                                 num);
11533                 *flagp |= HASWIDTH;
11534
11535                 /* override incorrect value set in reganode MJD */
11536                 Set_Node_Offset(ret, parse_start+1);
11537                 Set_Node_Cur_Length(ret, parse_start);
11538                 nextchar(pRExC_state);
11539
11540             }
11541             break;
11542         }
11543         case 'g':
11544         case '1': case '2': case '3': case '4':
11545         case '5': case '6': case '7': case '8': case '9':
11546             {
11547                 I32 num;
11548                 bool hasbrace = 0;
11549
11550                 if (*RExC_parse == 'g') {
11551                     bool isrel = 0;
11552
11553                     RExC_parse++;
11554                     if (*RExC_parse == '{') {
11555                         RExC_parse++;
11556                         hasbrace = 1;
11557                     }
11558                     if (*RExC_parse == '-') {
11559                         RExC_parse++;
11560                         isrel = 1;
11561                     }
11562                     if (hasbrace && !isDIGIT(*RExC_parse)) {
11563                         if (isrel) RExC_parse--;
11564                         RExC_parse -= 2;
11565                         goto parse_named_seq;
11566                     }
11567
11568                     num = S_backref_value(RExC_parse);
11569                     if (num == 0)
11570                         vFAIL("Reference to invalid group 0");
11571                     else if (num == I32_MAX) {
11572                          if (isDIGIT(*RExC_parse))
11573                             vFAIL("Reference to nonexistent group");
11574                         else
11575                             vFAIL("Unterminated \\g... pattern");
11576                     }
11577
11578                     if (isrel) {
11579                         num = RExC_npar - num;
11580                         if (num < 1)
11581                             vFAIL("Reference to nonexistent or unclosed group");
11582                     }
11583                 }
11584                 else {
11585                     num = S_backref_value(RExC_parse);
11586                     /* bare \NNN might be backref or octal - if it is larger than or equal
11587                      * RExC_npar then it is assumed to be and octal escape.
11588                      * Note RExC_npar is +1 from the actual number of parens*/
11589                     if (num == I32_MAX || (num > 9 && num >= RExC_npar
11590                             && *RExC_parse != '8' && *RExC_parse != '9'))
11591                     {
11592                         /* Probably a character specified in octal, e.g. \35 */
11593                         goto defchar;
11594                     }
11595                 }
11596
11597                 /* at this point RExC_parse definitely points to a backref
11598                  * number */
11599                 {
11600 #ifdef RE_TRACK_PATTERN_OFFSETS
11601                     char * const parse_start = RExC_parse - 1; /* MJD */
11602 #endif
11603                     while (isDIGIT(*RExC_parse))
11604                         RExC_parse++;
11605                     if (hasbrace) {
11606                         if (*RExC_parse != '}')
11607                             vFAIL("Unterminated \\g{...} pattern");
11608                         RExC_parse++;
11609                     }
11610                     if (!SIZE_ONLY) {
11611                         if (num > (I32)RExC_rx->nparens)
11612                             vFAIL("Reference to nonexistent group");
11613                     }
11614                     RExC_sawback = 1;
11615                     ret = reganode(pRExC_state,
11616                                    ((! FOLD)
11617                                      ? REF
11618                                      : (ASCII_FOLD_RESTRICTED)
11619                                        ? REFFA
11620                                        : (AT_LEAST_UNI_SEMANTICS)
11621                                          ? REFFU
11622                                          : (LOC)
11623                                            ? REFFL
11624                                            : REFF),
11625                                     num);
11626                     *flagp |= HASWIDTH;
11627
11628                     /* override incorrect value set in reganode MJD */
11629                     Set_Node_Offset(ret, parse_start+1);
11630                     Set_Node_Cur_Length(ret, parse_start);
11631                     RExC_parse--;
11632                     nextchar(pRExC_state);
11633                 }
11634             }
11635             break;
11636         case '\0':
11637             if (RExC_parse >= RExC_end)
11638                 FAIL("Trailing \\");
11639             /* FALL THROUGH */
11640         default:
11641             /* Do not generate "unrecognized" warnings here, we fall
11642                back into the quick-grab loop below */
11643             parse_start--;
11644             goto defchar;
11645         }
11646         break;
11647
11648     case '#':
11649         if (RExC_flags & RXf_PMf_EXTENDED) {
11650             if ( reg_skipcomment( pRExC_state ) )
11651                 goto tryagain;
11652         }
11653         /* FALL THROUGH */
11654
11655     default:
11656
11657             parse_start = RExC_parse - 1;
11658
11659             RExC_parse++;
11660
11661         defchar: {
11662             STRLEN len = 0;
11663             UV ender = 0;
11664             char *p;
11665             char *s;
11666 #define MAX_NODE_STRING_SIZE 127
11667             char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11668             char *s0;
11669             U8 upper_parse = MAX_NODE_STRING_SIZE;
11670             U8 node_type = compute_EXACTish(pRExC_state);
11671             bool next_is_quantifier;
11672             char * oldp = NULL;
11673
11674             /* We can convert EXACTF nodes to EXACTFU if they contain only
11675              * characters that match identically regardless of the target
11676              * string's UTF8ness.  The reason to do this is that EXACTF is not
11677              * trie-able, EXACTFU is.
11678              *
11679              * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11680              * contain only above-Latin1 characters (hence must be in UTF8),
11681              * which don't participate in folds with Latin1-range characters,
11682              * as the latter's folds aren't known until runtime.  (We don't
11683              * need to figure this out until pass 2) */
11684             bool maybe_exactfu = PASS2
11685                                && (node_type == EXACTF || node_type == EXACTFL);
11686
11687             /* If a folding node contains only code points that don't
11688              * participate in folds, it can be changed into an EXACT node,
11689              * which allows the optimizer more things to look for */
11690             bool maybe_exact;
11691
11692             ret = reg_node(pRExC_state, node_type);
11693
11694             /* In pass1, folded, we use a temporary buffer instead of the
11695              * actual node, as the node doesn't exist yet */
11696             s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11697
11698             s0 = s;
11699
11700         reparse:
11701
11702             /* We do the EXACTFish to EXACT node only if folding.  (And we
11703              * don't need to figure this out until pass 2) */
11704             maybe_exact = FOLD && PASS2;
11705
11706             /* XXX The node can hold up to 255 bytes, yet this only goes to
11707              * 127.  I (khw) do not know why.  Keeping it somewhat less than
11708              * 255 allows us to not have to worry about overflow due to
11709              * converting to utf8 and fold expansion, but that value is
11710              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11711              * split up by this limit into a single one using the real max of
11712              * 255.  Even at 127, this breaks under rare circumstances.  If
11713              * folding, we do not want to split a node at a character that is a
11714              * non-final in a multi-char fold, as an input string could just
11715              * happen to want to match across the node boundary.  The join
11716              * would solve that problem if the join actually happens.  But a
11717              * series of more than two nodes in a row each of 127 would cause
11718              * the first join to succeed to get to 254, but then there wouldn't
11719              * be room for the next one, which could at be one of those split
11720              * multi-char folds.  I don't know of any fool-proof solution.  One
11721              * could back off to end with only a code point that isn't such a
11722              * non-final, but it is possible for there not to be any in the
11723              * entire node. */
11724             for (p = RExC_parse - 1;
11725                  len < upper_parse && p < RExC_end;
11726                  len++)
11727             {
11728                 oldp = p;
11729
11730                 if (RExC_flags & RXf_PMf_EXTENDED)
11731                     p = regwhite( pRExC_state, p );
11732                 switch ((U8)*p) {
11733                 case '^':
11734                 case '$':
11735                 case '.':
11736                 case '[':
11737                 case '(':
11738                 case ')':
11739                 case '|':
11740                     goto loopdone;
11741                 case '\\':
11742                     /* Literal Escapes Switch
11743
11744                        This switch is meant to handle escape sequences that
11745                        resolve to a literal character.
11746
11747                        Every escape sequence that represents something
11748                        else, like an assertion or a char class, is handled
11749                        in the switch marked 'Special Escapes' above in this
11750                        routine, but also has an entry here as anything that
11751                        isn't explicitly mentioned here will be treated as
11752                        an unescaped equivalent literal.
11753                     */
11754
11755                     switch ((U8)*++p) {
11756                     /* These are all the special escapes. */
11757                     case 'A':             /* Start assertion */
11758                     case 'b': case 'B':   /* Word-boundary assertion*/
11759                     case 'C':             /* Single char !DANGEROUS! */
11760                     case 'd': case 'D':   /* digit class */
11761                     case 'g': case 'G':   /* generic-backref, pos assertion */
11762                     case 'h': case 'H':   /* HORIZWS */
11763                     case 'k': case 'K':   /* named backref, keep marker */
11764                     case 'p': case 'P':   /* Unicode property */
11765                               case 'R':   /* LNBREAK */
11766                     case 's': case 'S':   /* space class */
11767                     case 'v': case 'V':   /* VERTWS */
11768                     case 'w': case 'W':   /* word class */
11769                     case 'X':             /* eXtended Unicode "combining
11770                                              character sequence" */
11771                     case 'z': case 'Z':   /* End of line/string assertion */
11772                         --p;
11773                         goto loopdone;
11774
11775                     /* Anything after here is an escape that resolves to a
11776                        literal. (Except digits, which may or may not)
11777                      */
11778                     case 'n':
11779                         ender = '\n';
11780                         p++;
11781                         break;
11782                     case 'N': /* Handle a single-code point named character. */
11783                         /* The options cause it to fail if a multiple code
11784                          * point sequence.  Handle those in the switch() above
11785                          * */
11786                         RExC_parse = p + 1;
11787                         if (! grok_bslash_N(pRExC_state, NULL, &ender,
11788                                             flagp, depth, FALSE,
11789                                             FALSE /* not strict */ ))
11790                         {
11791                             if (*flagp & RESTART_UTF8)
11792                                 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11793                             RExC_parse = p = oldp;
11794                             goto loopdone;
11795                         }
11796                         p = RExC_parse;
11797                         if (ender > 0xff) {
11798                             REQUIRE_UTF8;
11799                         }
11800                         break;
11801                     case 'r':
11802                         ender = '\r';
11803                         p++;
11804                         break;
11805                     case 't':
11806                         ender = '\t';
11807                         p++;
11808                         break;
11809                     case 'f':
11810                         ender = '\f';
11811                         p++;
11812                         break;
11813                     case 'e':
11814                           ender = ASCII_TO_NATIVE('\033');
11815                         p++;
11816                         break;
11817                     case 'a':
11818                           ender = '\a';
11819                         p++;
11820                         break;
11821                     case 'o':
11822                         {
11823                             UV result;
11824                             const char* error_msg;
11825
11826                             bool valid = grok_bslash_o(&p,
11827                                                        &result,
11828                                                        &error_msg,
11829                                                        TRUE, /* out warnings */
11830                                                        FALSE, /* not strict */
11831                                                        TRUE, /* Output warnings
11832                                                                 for non-
11833                                                                 portables */
11834                                                        UTF);
11835                             if (! valid) {
11836                                 RExC_parse = p; /* going to die anyway; point
11837                                                    to exact spot of failure */
11838                                 vFAIL(error_msg);
11839                             }
11840                             ender = result;
11841                             if (PL_encoding && ender < 0x100) {
11842                                 goto recode_encoding;
11843                             }
11844                             if (ender > 0xff) {
11845                                 REQUIRE_UTF8;
11846                             }
11847                             break;
11848                         }
11849                     case 'x':
11850                         {
11851                             UV result = UV_MAX; /* initialize to erroneous
11852                                                    value */
11853                             const char* error_msg;
11854
11855                             bool valid = grok_bslash_x(&p,
11856                                                        &result,
11857                                                        &error_msg,
11858                                                        TRUE, /* out warnings */
11859                                                        FALSE, /* not strict */
11860                                                        TRUE, /* Output warnings
11861                                                                 for non-
11862                                                                 portables */
11863                                                        UTF);
11864                             if (! valid) {
11865                                 RExC_parse = p; /* going to die anyway; point
11866                                                    to exact spot of failure */
11867                                 vFAIL(error_msg);
11868                             }
11869                             ender = result;
11870
11871                             if (PL_encoding && ender < 0x100) {
11872                                 goto recode_encoding;
11873                             }
11874                             if (ender > 0xff) {
11875                                 REQUIRE_UTF8;
11876                             }
11877                             break;
11878                         }
11879                     case 'c':
11880                         p++;
11881                         ender = grok_bslash_c(*p++, SIZE_ONLY);
11882                         break;
11883                     case '8': case '9': /* must be a backreference */
11884                         --p;
11885                         goto loopdone;
11886                     case '1': case '2': case '3':case '4':
11887                     case '5': case '6': case '7':
11888                         /* When we parse backslash escapes there is ambiguity
11889                          * between backreferences and octal escapes. Any escape
11890                          * from \1 - \9 is a backreference, any multi-digit
11891                          * escape which does not start with 0 and which when
11892                          * evaluated as decimal could refer to an already
11893                          * parsed capture buffer is a backslash. Anything else
11894                          * is octal.
11895                          *
11896                          * Note this implies that \118 could be interpreted as
11897                          * 118 OR as "\11" . "8" depending on whether there
11898                          * were 118 capture buffers defined already in the
11899                          * pattern.  */
11900
11901                         /* NOTE, RExC_npar is 1 more than the actual number of
11902                          * parens we have seen so far, hence the < RExC_npar below. */
11903
11904                         if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11905                         {  /* Not to be treated as an octal constant, go
11906                                    find backref */
11907                             --p;
11908                             goto loopdone;
11909                         }
11910                     case '0':
11911                         {
11912                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11913                             STRLEN numlen = 3;
11914                             ender = grok_oct(p, &numlen, &flags, NULL);
11915                             if (ender > 0xff) {
11916                                 REQUIRE_UTF8;
11917                             }
11918                             p += numlen;
11919                             if (SIZE_ONLY   /* like \08, \178 */
11920                                 && numlen < 3
11921                                 && p < RExC_end
11922                                 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11923                             {
11924                                 reg_warn_non_literal_string(
11925                                          p + 1,
11926                                          form_short_octal_warning(p, numlen));
11927                             }
11928                         }
11929                         if (PL_encoding && ender < 0x100)
11930                             goto recode_encoding;
11931                         break;
11932                     recode_encoding:
11933                         if (! RExC_override_recoding) {
11934                             SV* enc = PL_encoding;
11935                             ender = reg_recode((const char)(U8)ender, &enc);
11936                             if (!enc && SIZE_ONLY)
11937                                 ckWARNreg(p, "Invalid escape in the specified encoding");
11938                             REQUIRE_UTF8;
11939                         }
11940                         break;
11941                     case '\0':
11942                         if (p >= RExC_end)
11943                             FAIL("Trailing \\");
11944                         /* FALL THROUGH */
11945                     default:
11946                         if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11947                             /* Include any { following the alpha to emphasize
11948                              * that it could be part of an escape at some point
11949                              * in the future */
11950                             int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11951                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11952                         }
11953                         goto normal_default;
11954                     } /* End of switch on '\' */
11955                     break;
11956                 default:    /* A literal character */
11957
11958                     if (! SIZE_ONLY
11959                         && RExC_flags & RXf_PMf_EXTENDED
11960                         && ckWARN_d(WARN_DEPRECATED)
11961                         && is_PATWS_non_low_safe(p, RExC_end, UTF))
11962                     {
11963                         vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11964                                 "Escape literal pattern white space under /x");
11965                     }
11966
11967                   normal_default:
11968                     if (UTF8_IS_START(*p) && UTF) {
11969                         STRLEN numlen;
11970                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11971                                                &numlen, UTF8_ALLOW_DEFAULT);
11972                         p += numlen;
11973                     }
11974                     else
11975                         ender = (U8) *p++;
11976                     break;
11977                 } /* End of switch on the literal */
11978
11979                 /* Here, have looked at the literal character and <ender>
11980                  * contains its ordinal, <p> points to the character after it
11981                  */
11982
11983                 if ( RExC_flags & RXf_PMf_EXTENDED)
11984                     p = regwhite( pRExC_state, p );
11985
11986                 /* If the next thing is a quantifier, it applies to this
11987                  * character only, which means that this character has to be in
11988                  * its own node and can't just be appended to the string in an
11989                  * existing node, so if there are already other characters in
11990                  * the node, close the node with just them, and set up to do
11991                  * this character again next time through, when it will be the
11992                  * only thing in its new node */
11993                 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11994                 {
11995                     p = oldp;
11996                     goto loopdone;
11997                 }
11998
11999                 if (! FOLD   /* The simple case, just append the literal */
12000                     || (LOC  /* Also don't fold for tricky chars under /l */
12001                         && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12002                 {
12003                     if (UTF) {
12004
12005                         /* Normally, we don't need the representation of the
12006                          * character in the sizing pass--just its size, but if
12007                          * folding, we have to actually put the character out
12008                          * even in the sizing pass, because the size could
12009                          * change as we juggle things at the end of this loop
12010                          * to avoid splitting a too-full node in the middle of
12011                          * a potential multi-char fold [perl #123539] */
12012                         const STRLEN unilen = (SIZE_ONLY && ! FOLD)
12013                                                ? UNISKIP(ender)
12014                                                : (uvchr_to_utf8((U8*)s, ender) - (U8*)s);
12015                         if (unilen > 0) {
12016                            s   += unilen;
12017                            len += unilen;
12018                         }
12019
12020                         /* The loop increments <len> each time, as all but this
12021                          * path (and one other) through it add a single byte to
12022                          * the EXACTish node.  But this one has changed len to
12023                          * be the correct final value, so subtract one to
12024                          * cancel out the increment that follows */
12025                         len--;
12026                     }
12027                     else if (FOLD) {
12028                         /* See comment above for [perl #123539] */
12029                         *(s++) = (char) ender;
12030                     }
12031                     else {
12032                         REGC((char)ender, s++);
12033                     }
12034
12035                     /* Can get here if folding only if is one of the /l
12036                      * characters whose fold depends on the locale.  The
12037                      * occurrence of any of these indicate that we can't
12038                      * simplify things */
12039                     if (FOLD) {
12040                         maybe_exact = FALSE;
12041                         maybe_exactfu = FALSE;
12042                     }
12043                 }
12044                 else             /* FOLD */
12045                      if (! ( UTF
12046                         /* See comments for join_exact() as to why we fold this
12047                          * non-UTF at compile time */
12048                         || (node_type == EXACTFU
12049                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
12050                 {
12051                     /* Here, are folding and are not UTF-8 encoded; therefore
12052                      * the character must be in the range 0-255, and is not /l
12053                      * (Not /l because we already handled these under /l in
12054                      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12055                     if (IS_IN_SOME_FOLD_L1(ender)) {
12056                         maybe_exact = FALSE;
12057
12058                         /* See if the character's fold differs between /d and
12059                          * /u.  This includes the multi-char fold SHARP S to
12060                          * 'ss' */
12061                         if (maybe_exactfu
12062                             && (PL_fold[ender] != PL_fold_latin1[ender]
12063                                 || ender == LATIN_SMALL_LETTER_SHARP_S
12064                                 || (len > 0
12065                                    && isARG2_lower_or_UPPER_ARG1('s', ender)
12066                                    && isARG2_lower_or_UPPER_ARG1('s',
12067                                                                  *(s-1)))))
12068                         {
12069                             maybe_exactfu = FALSE;
12070                         }
12071                     }
12072
12073                     /* Even when folding, we store just the input character, as
12074                      * we have an array that finds its fold quickly */
12075                     *(s++) = (char) ender;
12076                 }
12077                 else {  /* FOLD and UTF */
12078                     /* Unlike the non-fold case, we do actually have to
12079                      * calculate the results here in pass 1.  This is for two
12080                      * reasons, the folded length may be longer than the
12081                      * unfolded, and we have to calculate how many EXACTish
12082                      * nodes it will take; and we may run out of room in a node
12083                      * in the middle of a potential multi-char fold, and have
12084                      * to back off accordingly.  (Hence we can't use REGC for
12085                      * the simple case just below.) */
12086
12087                     UV folded;
12088                     if (isASCII(ender)) {
12089                         folded = toFOLD(ender);
12090                         *(s)++ = (U8) folded;
12091                     }
12092                     else {
12093                         STRLEN foldlen;
12094
12095                         folded = _to_uni_fold_flags(
12096                                      ender,
12097                                      (U8 *) s,
12098                                      &foldlen,
12099                                      FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12100                                                         ? FOLD_FLAGS_NOMIX_ASCII
12101                                                         : 0));
12102                         s += foldlen;
12103
12104                         /* The loop increments <len> each time, as all but this
12105                          * path (and one other) through it add a single byte to
12106                          * the EXACTish node.  But this one has changed len to
12107                          * be the correct final value, so subtract one to
12108                          * cancel out the increment that follows */
12109                         len += foldlen - 1;
12110                     }
12111                     /* If this node only contains non-folding code points so
12112                      * far, see if this new one is also non-folding */
12113                     if (maybe_exact) {
12114                         if (folded != ender) {
12115                             maybe_exact = FALSE;
12116                         }
12117                         else {
12118                             /* Here the fold is the original; we have to check
12119                              * further to see if anything folds to it */
12120                             if (_invlist_contains_cp(PL_utf8_foldable,
12121                                                         ender))
12122                             {
12123                                 maybe_exact = FALSE;
12124                             }
12125                         }
12126                     }
12127                     ender = folded;
12128                 }
12129
12130                 if (next_is_quantifier) {
12131
12132                     /* Here, the next input is a quantifier, and to get here,
12133                      * the current character is the only one in the node.
12134                      * Also, here <len> doesn't include the final byte for this
12135                      * character */
12136                     len++;
12137                     goto loopdone;
12138                 }
12139
12140             } /* End of loop through literal characters */
12141
12142             /* Here we have either exhausted the input or ran out of room in
12143              * the node.  (If we encountered a character that can't be in the
12144              * node, transfer is made directly to <loopdone>, and so we
12145              * wouldn't have fallen off the end of the loop.)  In the latter
12146              * case, we artificially have to split the node into two, because
12147              * we just don't have enough space to hold everything.  This
12148              * creates a problem if the final character participates in a
12149              * multi-character fold in the non-final position, as a match that
12150              * should have occurred won't, due to the way nodes are matched,
12151              * and our artificial boundary.  So back off until we find a non-
12152              * problematic character -- one that isn't at the beginning or
12153              * middle of such a fold.  (Either it doesn't participate in any
12154              * folds, or appears only in the final position of all the folds it
12155              * does participate in.)  A better solution with far fewer false
12156              * positives, and that would fill the nodes more completely, would
12157              * be to actually have available all the multi-character folds to
12158              * test against, and to back-off only far enough to be sure that
12159              * this node isn't ending with a partial one.  <upper_parse> is set
12160              * further below (if we need to reparse the node) to include just
12161              * up through that final non-problematic character that this code
12162              * identifies, so when it is set to less than the full node, we can
12163              * skip the rest of this */
12164             if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12165
12166                 const STRLEN full_len = len;
12167
12168                 assert(len >= MAX_NODE_STRING_SIZE);
12169
12170                 /* Here, <s> points to the final byte of the final character.
12171                  * Look backwards through the string until find a non-
12172                  * problematic character */
12173
12174                 if (! UTF) {
12175
12176                     /* This has no multi-char folds to non-UTF characters */
12177                     if (ASCII_FOLD_RESTRICTED) {
12178                         goto loopdone;
12179                     }
12180
12181                     while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12182                     len = s - s0 + 1;
12183                 }
12184                 else {
12185                     if (!  PL_NonL1NonFinalFold) {
12186                         PL_NonL1NonFinalFold = _new_invlist_C_array(
12187                                         NonL1_Perl_Non_Final_Folds_invlist);
12188                     }
12189
12190                     /* Point to the first byte of the final character */
12191                     s = (char *) utf8_hop((U8 *) s, -1);
12192
12193                     while (s >= s0) {   /* Search backwards until find
12194                                            non-problematic char */
12195                         if (UTF8_IS_INVARIANT(*s)) {
12196
12197                             /* There are no ascii characters that participate
12198                              * in multi-char folds under /aa.  In EBCDIC, the
12199                              * non-ascii invariants are all control characters,
12200                              * so don't ever participate in any folds. */
12201                             if (ASCII_FOLD_RESTRICTED
12202                                 || ! IS_NON_FINAL_FOLD(*s))
12203                             {
12204                                 break;
12205                             }
12206                         }
12207                         else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12208                             if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12209                                                                   *s, *(s+1))))
12210                             {
12211                                 break;
12212                             }
12213                         }
12214                         else if (! _invlist_contains_cp(
12215                                         PL_NonL1NonFinalFold,
12216                                         valid_utf8_to_uvchr((U8 *) s, NULL)))
12217                         {
12218                             break;
12219                         }
12220
12221                         /* Here, the current character is problematic in that
12222                          * it does occur in the non-final position of some
12223                          * fold, so try the character before it, but have to
12224                          * special case the very first byte in the string, so
12225                          * we don't read outside the string */
12226                         s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12227                     } /* End of loop backwards through the string */
12228
12229                     /* If there were only problematic characters in the string,
12230                      * <s> will point to before s0, in which case the length
12231                      * should be 0, otherwise include the length of the
12232                      * non-problematic character just found */
12233                     len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12234                 }
12235
12236                 /* Here, have found the final character, if any, that is
12237                  * non-problematic as far as ending the node without splitting
12238                  * it across a potential multi-char fold.  <len> contains the
12239                  * number of bytes in the node up-to and including that
12240                  * character, or is 0 if there is no such character, meaning
12241                  * the whole node contains only problematic characters.  In
12242                  * this case, give up and just take the node as-is.  We can't
12243                  * do any better */
12244                 if (len == 0) {
12245                     len = full_len;
12246
12247                     /* If the node ends in an 's' we make sure it stays EXACTF,
12248                      * as if it turns into an EXACTFU, it could later get
12249                      * joined with another 's' that would then wrongly match
12250                      * the sharp s */
12251                     if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12252                     {
12253                         maybe_exactfu = FALSE;
12254                     }
12255                 } else {
12256
12257                     /* Here, the node does contain some characters that aren't
12258                      * problematic.  If one such is the final character in the
12259                      * node, we are done */
12260                     if (len == full_len) {
12261                         goto loopdone;
12262                     }
12263                     else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12264
12265                         /* If the final character is problematic, but the
12266                          * penultimate is not, back-off that last character to
12267                          * later start a new node with it */
12268                         p = oldp;
12269                         goto loopdone;
12270                     }
12271
12272                     /* Here, the final non-problematic character is earlier
12273                      * in the input than the penultimate character.  What we do
12274                      * is reparse from the beginning, going up only as far as
12275                      * this final ok one, thus guaranteeing that the node ends
12276                      * in an acceptable character.  The reason we reparse is
12277                      * that we know how far in the character is, but we don't
12278                      * know how to correlate its position with the input parse.
12279                      * An alternate implementation would be to build that
12280                      * correlation as we go along during the original parse,
12281                      * but that would entail extra work for every node, whereas
12282                      * this code gets executed only when the string is too
12283                      * large for the node, and the final two characters are
12284                      * problematic, an infrequent occurrence.  Yet another
12285                      * possible strategy would be to save the tail of the
12286                      * string, and the next time regatom is called, initialize
12287                      * with that.  The problem with this is that unless you
12288                      * back off one more character, you won't be guaranteed
12289                      * regatom will get called again, unless regbranch,
12290                      * regpiece ... are also changed.  If you do back off that
12291                      * extra character, so that there is input guaranteed to
12292                      * force calling regatom, you can't handle the case where
12293                      * just the first character in the node is acceptable.  I
12294                      * (khw) decided to try this method which doesn't have that
12295                      * pitfall; if performance issues are found, we can do a
12296                      * combination of the current approach plus that one */
12297                     upper_parse = len;
12298                     len = 0;
12299                     s = s0;
12300                     goto reparse;
12301                 }
12302             }   /* End of verifying node ends with an appropriate char */
12303
12304         loopdone:   /* Jumped to when encounters something that shouldn't be in
12305                        the node */
12306
12307             /* I (khw) don't know if you can get here with zero length, but the
12308              * old code handled this situation by creating a zero-length EXACT
12309              * node.  Might as well be NOTHING instead */
12310             if (len == 0) {
12311                 OP(ret) = NOTHING;
12312             }
12313             else {
12314                 if (FOLD) {
12315                     /* If 'maybe_exact' is still set here, means there are no
12316                      * code points in the node that participate in folds;
12317                      * similarly for 'maybe_exactfu' and code points that match
12318                      * differently depending on UTF8ness of the target string
12319                      * (for /u), or depending on locale for /l */
12320                     if (maybe_exact) {
12321                         OP(ret) = EXACT;
12322                     }
12323                     else if (maybe_exactfu) {
12324                         OP(ret) = EXACTFU;
12325                     }
12326                 }
12327                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12328                                            FALSE /* Don't look to see if could
12329                                                     be turned into an EXACT
12330                                                     node, as we have already
12331                                                     computed that */
12332                                           );
12333             }
12334
12335             RExC_parse = p - 1;
12336             Set_Node_Cur_Length(ret, parse_start);
12337             nextchar(pRExC_state);
12338             {
12339                 /* len is STRLEN which is unsigned, need to copy to signed */
12340                 IV iv = len;
12341                 if (iv < 0)
12342                     vFAIL("Internal disaster");
12343             }
12344
12345         } /* End of label 'defchar:' */
12346         break;
12347     } /* End of giant switch on input character */
12348
12349     return(ret);
12350 }
12351
12352 STATIC char *
12353 S_regwhite( RExC_state_t *pRExC_state, char *p )
12354 {
12355     const char *e = RExC_end;
12356
12357     PERL_ARGS_ASSERT_REGWHITE;
12358
12359     while (p < e) {
12360         if (isSPACE(*p))
12361             ++p;
12362         else if (*p == '#') {
12363             bool ended = 0;
12364             do {
12365                 if (*p++ == '\n') {
12366                     ended = 1;
12367                     break;
12368                 }
12369             } while (p < e);
12370             if (!ended)
12371                 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12372         }
12373         else
12374             break;
12375     }
12376     return p;
12377 }
12378
12379 STATIC char *
12380 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12381 {
12382     /* Returns the next non-pattern-white space, non-comment character (the
12383      * latter only if 'recognize_comment is true) in the string p, which is
12384      * ended by RExC_end.  If there is no line break ending a comment,
12385      * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
12386     const char *e = RExC_end;
12387
12388     PERL_ARGS_ASSERT_REGPATWS;
12389
12390     while (p < e) {
12391         STRLEN len;
12392         if ((len = is_PATWS_safe(p, e, UTF))) {
12393             p += len;
12394         }
12395         else if (recognize_comment && *p == '#') {
12396             bool ended = 0;
12397             do {
12398                 p++;
12399                 if (is_LNBREAK_safe(p, e, UTF)) {
12400                     ended = 1;
12401                     break;
12402                 }
12403             } while (p < e);
12404             if (!ended)
12405                 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12406         }
12407         else
12408             break;
12409     }
12410     return p;
12411 }
12412
12413 STATIC void
12414 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12415 {
12416     /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12417      * sets up the bitmap and any flags, removing those code points from the
12418      * inversion list, setting it to NULL should it become completely empty */
12419
12420     PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12421     assert(PL_regkind[OP(node)] == ANYOF);
12422
12423     ANYOF_BITMAP_ZERO(node);
12424     if (*invlist_ptr) {
12425
12426         /* This gets set if we actually need to modify things */
12427         bool change_invlist = FALSE;
12428
12429         UV start, end;
12430
12431         /* Start looking through *invlist_ptr */
12432         invlist_iterinit(*invlist_ptr);
12433         while (invlist_iternext(*invlist_ptr, &start, &end)) {
12434             UV high;
12435             int i;
12436
12437             if (end == UV_MAX && start <= 256) {
12438                 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12439             }
12440             else if (end >= 256) {
12441                 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12442             }
12443
12444             /* Quit if are above what we should change */
12445             if (start > 255) {
12446                 break;
12447             }
12448
12449             change_invlist = TRUE;
12450
12451             /* Set all the bits in the range, up to the max that we are doing */
12452             high = (end < 255) ? end : 255;
12453             for (i = start; i <= (int) high; i++) {
12454                 if (! ANYOF_BITMAP_TEST(node, i)) {
12455                     ANYOF_BITMAP_SET(node, i);
12456                 }
12457             }
12458         }
12459         invlist_iterfinish(*invlist_ptr);
12460
12461         /* Done with loop; remove any code points that are in the bitmap from
12462          * *invlist_ptr; similarly for code points above latin1 if we have a
12463          * flag to match all of them anyways */
12464         if (change_invlist) {
12465             _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12466         }
12467         if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12468             _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12469         }
12470
12471         /* If have completely emptied it, remove it completely */
12472         if (_invlist_len(*invlist_ptr) == 0) {
12473             SvREFCNT_dec_NN(*invlist_ptr);
12474             *invlist_ptr = NULL;
12475         }
12476     }
12477 }
12478
12479 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12480    Character classes ([:foo:]) can also be negated ([:^foo:]).
12481    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12482    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12483    but trigger failures because they are currently unimplemented. */
12484
12485 #define POSIXCC_DONE(c)   ((c) == ':')
12486 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12487 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12488
12489 PERL_STATIC_INLINE I32
12490 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12491 {
12492     dVAR;
12493     I32 namedclass = OOB_NAMEDCLASS;
12494
12495     PERL_ARGS_ASSERT_REGPPOSIXCC;
12496
12497     if (value == '[' && RExC_parse + 1 < RExC_end &&
12498         /* I smell either [: or [= or [. -- POSIX has been here, right? */
12499         POSIXCC(UCHARAT(RExC_parse)))
12500     {
12501         const char c = UCHARAT(RExC_parse);
12502         char* const s = RExC_parse++;
12503
12504         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12505             RExC_parse++;
12506         if (RExC_parse == RExC_end) {
12507             if (strict) {
12508
12509                 /* Try to give a better location for the error (than the end of
12510                  * the string) by looking for the matching ']' */
12511                 RExC_parse = s;
12512                 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12513                     RExC_parse++;
12514                 }
12515                 vFAIL2("Unmatched '%c' in POSIX class", c);
12516             }
12517             /* Grandfather lone [:, [=, [. */
12518             RExC_parse = s;
12519         }
12520         else {
12521             const char* const t = RExC_parse++; /* skip over the c */
12522             assert(*t == c);
12523
12524             if (UCHARAT(RExC_parse) == ']') {
12525                 const char *posixcc = s + 1;
12526                 RExC_parse++; /* skip over the ending ] */
12527
12528                 if (*s == ':') {
12529                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12530                     const I32 skip = t - posixcc;
12531
12532                     /* Initially switch on the length of the name.  */
12533                     switch (skip) {
12534                     case 4:
12535                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12536                                                           this is the Perl \w
12537                                                         */
12538                             namedclass = ANYOF_WORDCHAR;
12539                         break;
12540                     case 5:
12541                         /* Names all of length 5.  */
12542                         /* alnum alpha ascii blank cntrl digit graph lower
12543                            print punct space upper  */
12544                         /* Offset 4 gives the best switch position.  */
12545                         switch (posixcc[4]) {
12546                         case 'a':
12547                             if (memEQ(posixcc, "alph", 4)) /* alpha */
12548                                 namedclass = ANYOF_ALPHA;
12549                             break;
12550                         case 'e':
12551                             if (memEQ(posixcc, "spac", 4)) /* space */
12552                                 namedclass = ANYOF_PSXSPC;
12553                             break;
12554                         case 'h':
12555                             if (memEQ(posixcc, "grap", 4)) /* graph */
12556                                 namedclass = ANYOF_GRAPH;
12557                             break;
12558                         case 'i':
12559                             if (memEQ(posixcc, "asci", 4)) /* ascii */
12560                                 namedclass = ANYOF_ASCII;
12561                             break;
12562                         case 'k':
12563                             if (memEQ(posixcc, "blan", 4)) /* blank */
12564                                 namedclass = ANYOF_BLANK;
12565                             break;
12566                         case 'l':
12567                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12568                                 namedclass = ANYOF_CNTRL;
12569                             break;
12570                         case 'm':
12571                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
12572                                 namedclass = ANYOF_ALPHANUMERIC;
12573                             break;
12574                         case 'r':
12575                             if (memEQ(posixcc, "lowe", 4)) /* lower */
12576                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12577                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
12578                                 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12579                             break;
12580                         case 't':
12581                             if (memEQ(posixcc, "digi", 4)) /* digit */
12582                                 namedclass = ANYOF_DIGIT;
12583                             else if (memEQ(posixcc, "prin", 4)) /* print */
12584                                 namedclass = ANYOF_PRINT;
12585                             else if (memEQ(posixcc, "punc", 4)) /* punct */
12586                                 namedclass = ANYOF_PUNCT;
12587                             break;
12588                         }
12589                         break;
12590                     case 6:
12591                         if (memEQ(posixcc, "xdigit", 6))
12592                             namedclass = ANYOF_XDIGIT;
12593                         break;
12594                     }
12595
12596                     if (namedclass == OOB_NAMEDCLASS)
12597                         vFAIL2utf8f(
12598                             "POSIX class [:%"UTF8f":] unknown",
12599                             UTF8fARG(UTF, t - s - 1, s + 1));
12600
12601                     /* The #defines are structured so each complement is +1 to
12602                      * the normal one */
12603                     if (complement) {
12604                         namedclass++;
12605                     }
12606                     assert (posixcc[skip] == ':');
12607                     assert (posixcc[skip+1] == ']');
12608                 } else if (!SIZE_ONLY) {
12609                     /* [[=foo=]] and [[.foo.]] are still future. */
12610
12611                     /* adjust RExC_parse so the warning shows after
12612                        the class closes */
12613                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12614                         RExC_parse++;
12615                     vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12616                 }
12617             } else {
12618                 /* Maternal grandfather:
12619                  * "[:" ending in ":" but not in ":]" */
12620                 if (strict) {
12621                     vFAIL("Unmatched '[' in POSIX class");
12622                 }
12623
12624                 /* Grandfather lone [:, [=, [. */
12625                 RExC_parse = s;
12626             }
12627         }
12628     }
12629
12630     return namedclass;
12631 }
12632
12633 STATIC bool
12634 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12635 {
12636     /* This applies some heuristics at the current parse position (which should
12637      * be at a '[') to see if what follows might be intended to be a [:posix:]
12638      * class.  It returns true if it really is a posix class, of course, but it
12639      * also can return true if it thinks that what was intended was a posix
12640      * class that didn't quite make it.
12641      *
12642      * It will return true for
12643      *      [:alphanumerics:
12644      *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12645      *                         ')' indicating the end of the (?[
12646      *      [:any garbage including %^&$ punctuation:]
12647      *
12648      * This is designed to be called only from S_handle_regex_sets; it could be
12649      * easily adapted to be called from the spot at the beginning of regclass()
12650      * that checks to see in a normal bracketed class if the surrounding []
12651      * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12652      * change long-standing behavior, so I (khw) didn't do that */
12653     char* p = RExC_parse + 1;
12654     char first_char = *p;
12655
12656     PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12657
12658     assert(*(p - 1) == '[');
12659
12660     if (! POSIXCC(first_char)) {
12661         return FALSE;
12662     }
12663
12664     p++;
12665     while (p < RExC_end && isWORDCHAR(*p)) p++;
12666
12667     if (p >= RExC_end) {
12668         return FALSE;
12669     }
12670
12671     if (p - RExC_parse > 2    /* Got at least 1 word character */
12672         && (*p == first_char
12673             || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12674     {
12675         return TRUE;
12676     }
12677
12678     p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12679
12680     return (p
12681             && p - RExC_parse > 2 /* [:] evaluates to colon;
12682                                       [::] is a bad posix class. */
12683             && first_char == *(p - 1));
12684 }
12685
12686 STATIC regnode *
12687 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12688                     I32 *flagp, U32 depth,
12689                     char * const oregcomp_parse)
12690 {
12691     /* Handle the (?[...]) construct to do set operations */
12692
12693     U8 curchar;
12694     UV start, end;      /* End points of code point ranges */
12695     SV* result_string;
12696     char *save_end, *save_parse;
12697     SV* final;
12698     STRLEN len;
12699     regnode* node;
12700     AV* stack;
12701     const bool save_fold = FOLD;
12702
12703     GET_RE_DEBUG_FLAGS_DECL;
12704
12705     PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12706
12707     if (LOC) {
12708         vFAIL("(?[...]) not valid in locale");
12709     }
12710     RExC_uni_semantics = 1;
12711
12712     /* This will return only an ANYOF regnode, or (unlikely) something smaller
12713      * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12714      * call regclass to handle '[]' so as to not have to reinvent its parsing
12715      * rules here (throwing away the size it computes each time).  And, we exit
12716      * upon an unescaped ']' that isn't one ending a regclass.  To do both
12717      * these things, we need to realize that something preceded by a backslash
12718      * is escaped, so we have to keep track of backslashes */
12719     if (SIZE_ONLY) {
12720         UV depth = 0; /* how many nested (?[...]) constructs */
12721
12722         Perl_ck_warner_d(aTHX_
12723             packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12724             "The regex_sets feature is experimental" REPORT_LOCATION,
12725                 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12726                 UTF8fARG(UTF,
12727                          RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12728                          RExC_precomp + (RExC_parse - RExC_precomp)));
12729
12730         while (RExC_parse < RExC_end) {
12731             SV* current = NULL;
12732             RExC_parse = regpatws(pRExC_state, RExC_parse,
12733                                 TRUE); /* means recognize comments */
12734             switch (*RExC_parse) {
12735                 case '?':
12736                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
12737                     /* FALL THROUGH */
12738                 default:
12739                     break;
12740                 case '\\':
12741                     /* Skip the next byte (which could cause us to end up in
12742                      * the middle of a UTF-8 character, but since none of those
12743                      * are confusable with anything we currently handle in this
12744                      * switch (invariants all), it's safe.  We'll just hit the
12745                      * default: case next time and keep on incrementing until
12746                      * we find one of the invariants we do handle. */
12747                     RExC_parse++;
12748                     break;
12749                 case '[':
12750                 {
12751                     /* If this looks like it is a [:posix:] class, leave the
12752                      * parse pointer at the '[' to fool regclass() into
12753                      * thinking it is part of a '[[:posix:]]'.  That function
12754                      * will use strict checking to force a syntax error if it
12755                      * doesn't work out to a legitimate class */
12756                     bool is_posix_class
12757                                     = could_it_be_a_POSIX_class(pRExC_state);
12758                     if (! is_posix_class) {
12759                         RExC_parse++;
12760                     }
12761
12762                     /* regclass() can only return RESTART_UTF8 if multi-char
12763                        folds are allowed.  */
12764                     if (!regclass(pRExC_state, flagp,depth+1,
12765                                   is_posix_class, /* parse the whole char
12766                                                      class only if not a
12767                                                      posix class */
12768                                   FALSE, /* don't allow multi-char folds */
12769                                   TRUE, /* silence non-portable warnings. */
12770                                   &current))
12771                         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12772                               (UV) *flagp);
12773
12774                     /* function call leaves parse pointing to the ']', except
12775                      * if we faked it */
12776                     if (is_posix_class) {
12777                         RExC_parse--;
12778                     }
12779
12780                     SvREFCNT_dec(current);   /* In case it returned something */
12781                     break;
12782                 }
12783
12784                 case ']':
12785                     if (depth--) break;
12786                     RExC_parse++;
12787                     if (RExC_parse < RExC_end
12788                         && *RExC_parse == ')')
12789                     {
12790                         node = reganode(pRExC_state, ANYOF, 0);
12791                         RExC_size += ANYOF_SKIP;
12792                         nextchar(pRExC_state);
12793                         Set_Node_Length(node,
12794                                 RExC_parse - oregcomp_parse + 1); /* MJD */
12795                         return node;
12796                     }
12797                     goto no_close;
12798             }
12799             RExC_parse++;
12800         }
12801
12802         no_close:
12803         FAIL("Syntax error in (?[...])");
12804     }
12805
12806     /* Pass 2 only after this.  Everything in this construct is a
12807      * metacharacter.  Operands begin with either a '\' (for an escape
12808      * sequence), or a '[' for a bracketed character class.  Any other
12809      * character should be an operator, or parenthesis for grouping.  Both
12810      * types of operands are handled by calling regclass() to parse them.  It
12811      * is called with a parameter to indicate to return the computed inversion
12812      * list.  The parsing here is implemented via a stack.  Each entry on the
12813      * stack is a single character representing one of the operators, or the
12814      * '('; or else a pointer to an operand inversion list. */
12815
12816 #define IS_OPERAND(a)  (! SvIOK(a))
12817
12818     /* The stack starts empty.  It is a syntax error if the first thing parsed
12819      * is a binary operator; everything else is pushed on the stack.  When an
12820      * operand is parsed, the top of the stack is examined.  If it is a binary
12821      * operator, the item before it should be an operand, and both are replaced
12822      * by the result of doing that operation on the new operand and the one on
12823      * the stack.   Thus a sequence of binary operands is reduced to a single
12824      * one before the next one is parsed.
12825      *
12826      * A unary operator may immediately follow a binary in the input, for
12827      * example
12828      *      [a] + ! [b]
12829      * When an operand is parsed and the top of the stack is a unary operator,
12830      * the operation is performed, and then the stack is rechecked to see if
12831      * this new operand is part of a binary operation; if so, it is handled as
12832      * above.
12833      *
12834      * A '(' is simply pushed on the stack; it is valid only if the stack is
12835      * empty, or the top element of the stack is an operator or another '('
12836      * (for which the parenthesized expression will become an operand).  By the
12837      * time the corresponding ')' is parsed everything in between should have
12838      * been parsed and evaluated to a single operand (or else is a syntax
12839      * error), and is handled as a regular operand */
12840
12841     sv_2mortal((SV *)(stack = newAV()));
12842
12843     while (RExC_parse < RExC_end) {
12844         I32 top_index = av_tindex(stack);
12845         SV** top_ptr;
12846         SV* current = NULL;
12847
12848         /* Skip white space */
12849         RExC_parse = regpatws(pRExC_state, RExC_parse,
12850                                 TRUE); /* means recognize comments */
12851         if (RExC_parse >= RExC_end) {
12852             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12853         }
12854         if ((curchar = UCHARAT(RExC_parse)) == ']') {
12855             break;
12856         }
12857
12858         switch (curchar) {
12859
12860             case '?':
12861                 if (av_tindex(stack) >= 0   /* This makes sure that we can
12862                                                safely subtract 1 from
12863                                                RExC_parse in the next clause.
12864                                                If we have something on the
12865                                                stack, we have parsed something
12866                                              */
12867                     && UCHARAT(RExC_parse - 1) == '('
12868                     && RExC_parse < RExC_end)
12869                 {
12870                     /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12871                      * This happens when we have some thing like
12872                      *
12873                      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12874                      *   ...
12875                      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12876                      *
12877                      * Here we would be handling the interpolated
12878                      * '$thai_or_lao'.  We handle this by a recursive call to
12879                      * ourselves which returns the inversion list the
12880                      * interpolated expression evaluates to.  We use the flags
12881                      * from the interpolated pattern. */
12882                     U32 save_flags = RExC_flags;
12883                     const char * const save_parse = ++RExC_parse;
12884
12885                     parse_lparen_question_flags(pRExC_state);
12886
12887                     if (RExC_parse == save_parse  /* Makes sure there was at
12888                                                      least one flag (or this
12889                                                      embedding wasn't compiled)
12890                                                    */
12891                         || RExC_parse >= RExC_end - 4
12892                         || UCHARAT(RExC_parse) != ':'
12893                         || UCHARAT(++RExC_parse) != '('
12894                         || UCHARAT(++RExC_parse) != '?'
12895                         || UCHARAT(++RExC_parse) != '[')
12896                     {
12897
12898                         /* In combination with the above, this moves the
12899                          * pointer to the point just after the first erroneous
12900                          * character (or if there are no flags, to where they
12901                          * should have been) */
12902                         if (RExC_parse >= RExC_end - 4) {
12903                             RExC_parse = RExC_end;
12904                         }
12905                         else if (RExC_parse != save_parse) {
12906                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12907                         }
12908                         vFAIL("Expecting '(?flags:(?[...'");
12909                     }
12910                     RExC_parse++;
12911                     (void) handle_regex_sets(pRExC_state, &current, flagp,
12912                                                     depth+1, oregcomp_parse);
12913
12914                     /* Here, 'current' contains the embedded expression's
12915                      * inversion list, and RExC_parse points to the trailing
12916                      * ']'; the next character should be the ')' which will be
12917                      * paired with the '(' that has been put on the stack, so
12918                      * the whole embedded expression reduces to '(operand)' */
12919                     RExC_parse++;
12920
12921                     RExC_flags = save_flags;
12922                     goto handle_operand;
12923                 }
12924                 /* FALL THROUGH */
12925
12926             default:
12927                 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12928                 vFAIL("Unexpected character");
12929
12930             case '\\':
12931                 /* regclass() can only return RESTART_UTF8 if multi-char
12932                    folds are allowed.  */
12933                 if (!regclass(pRExC_state, flagp,depth+1,
12934                               TRUE, /* means parse just the next thing */
12935                               FALSE, /* don't allow multi-char folds */
12936                               FALSE, /* don't silence non-portable warnings.  */
12937                               &current))
12938                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12939                           (UV) *flagp);
12940                 /* regclass() will return with parsing just the \ sequence,
12941                  * leaving the parse pointer at the next thing to parse */
12942                 RExC_parse--;
12943                 goto handle_operand;
12944
12945             case '[':   /* Is a bracketed character class */
12946             {
12947                 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12948
12949                 if (! is_posix_class) {
12950                     RExC_parse++;
12951                 }
12952
12953                 /* regclass() can only return RESTART_UTF8 if multi-char
12954                    folds are allowed.  */
12955                 if(!regclass(pRExC_state, flagp,depth+1,
12956                              is_posix_class, /* parse the whole char class
12957                                                 only if not a posix class */
12958                              FALSE, /* don't allow multi-char folds */
12959                              FALSE, /* don't silence non-portable warnings.  */
12960                              &current))
12961                     FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12962                           (UV) *flagp);
12963                 /* function call leaves parse pointing to the ']', except if we
12964                  * faked it */
12965                 if (is_posix_class) {
12966                     RExC_parse--;
12967                 }
12968
12969                 goto handle_operand;
12970             }
12971
12972             case '&':
12973             case '|':
12974             case '+':
12975             case '-':
12976             case '^':
12977                 if (top_index < 0
12978                     || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12979                     || ! IS_OPERAND(*top_ptr))
12980                 {
12981                     RExC_parse++;
12982                     vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12983                 }
12984                 av_push(stack, newSVuv(curchar));
12985                 break;
12986
12987             case '!':
12988                 av_push(stack, newSVuv(curchar));
12989                 break;
12990
12991             case '(':
12992                 if (top_index >= 0) {
12993                     top_ptr = av_fetch(stack, top_index, FALSE);
12994                     assert(top_ptr);
12995                     if (IS_OPERAND(*top_ptr)) {
12996                         RExC_parse++;
12997                         vFAIL("Unexpected '(' with no preceding operator");
12998                     }
12999                 }
13000                 av_push(stack, newSVuv(curchar));
13001                 break;
13002
13003             case ')':
13004             {
13005                 SV* lparen;
13006                 if (top_index < 1
13007                     || ! (current = av_pop(stack))
13008                     || ! IS_OPERAND(current)
13009                     || ! (lparen = av_pop(stack))
13010                     || IS_OPERAND(lparen)
13011                     || SvUV(lparen) != '(')
13012                 {
13013                     SvREFCNT_dec(current);
13014                     RExC_parse++;
13015                     vFAIL("Unexpected ')'");
13016                 }
13017                 top_index -= 2;
13018                 SvREFCNT_dec_NN(lparen);
13019
13020                 /* FALL THROUGH */
13021             }
13022
13023               handle_operand:
13024
13025                 /* Here, we have an operand to process, in 'current' */
13026
13027                 if (top_index < 0) {    /* Just push if stack is empty */
13028                     av_push(stack, current);
13029                 }
13030                 else {
13031                     SV* top = av_pop(stack);
13032                     SV *prev = NULL;
13033                     char current_operator;
13034
13035                     if (IS_OPERAND(top)) {
13036                         SvREFCNT_dec_NN(top);
13037                         SvREFCNT_dec_NN(current);
13038                         vFAIL("Operand with no preceding operator");
13039                     }
13040                     current_operator = (char) SvUV(top);
13041                     switch (current_operator) {
13042                         case '(':   /* Push the '(' back on followed by the new
13043                                        operand */
13044                             av_push(stack, top);
13045                             av_push(stack, current);
13046                             SvREFCNT_inc(top);  /* Counters the '_dec' done
13047                                                    just after the 'break', so
13048                                                    it doesn't get wrongly freed
13049                                                  */
13050                             break;
13051
13052                         case '!':
13053                             _invlist_invert(current);
13054
13055                             /* Unlike binary operators, the top of the stack,
13056                              * now that this unary one has been popped off, may
13057                              * legally be an operator, and we now have operand
13058                              * for it. */
13059                             top_index--;
13060                             SvREFCNT_dec_NN(top);
13061                             goto handle_operand;
13062
13063                         case '&':
13064                             prev = av_pop(stack);
13065                             _invlist_intersection(prev,
13066                                                    current,
13067                                                    &current);
13068                             av_push(stack, current);
13069                             break;
13070
13071                         case '|':
13072                         case '+':
13073                             prev = av_pop(stack);
13074                             _invlist_union(prev, current, &current);
13075                             av_push(stack, current);
13076                             break;
13077
13078                         case '-':
13079                             prev = av_pop(stack);;
13080                             _invlist_subtract(prev, current, &current);
13081                             av_push(stack, current);
13082                             break;
13083
13084                         case '^':   /* The union minus the intersection */
13085                         {
13086                             SV* i = NULL;
13087                             SV* u = NULL;
13088                             SV* element;
13089
13090                             prev = av_pop(stack);
13091                             _invlist_union(prev, current, &u);
13092                             _invlist_intersection(prev, current, &i);
13093                             /* _invlist_subtract will overwrite current
13094                                 without freeing what it already contains */
13095                             element = current;
13096                             _invlist_subtract(u, i, &current);
13097                             av_push(stack, current);
13098                             SvREFCNT_dec_NN(i);
13099                             SvREFCNT_dec_NN(u);
13100                             SvREFCNT_dec_NN(element);
13101                             break;
13102                         }
13103
13104                         default:
13105                             Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13106                 }
13107                 SvREFCNT_dec_NN(top);
13108                 SvREFCNT_dec(prev);
13109             }
13110         }
13111
13112         RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13113     }
13114
13115     if (av_tindex(stack) < 0   /* Was empty */
13116         || ((final = av_pop(stack)) == NULL)
13117         || ! IS_OPERAND(final)
13118         || av_tindex(stack) >= 0)  /* More left on stack */
13119     {
13120         vFAIL("Incomplete expression within '(?[ ])'");
13121     }
13122
13123     /* Here, 'final' is the resultant inversion list from evaluating the
13124      * expression.  Return it if so requested */
13125     if (return_invlist) {
13126         *return_invlist = final;
13127         return END;
13128     }
13129
13130     /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13131      * expecting a string of ranges and individual code points */
13132     invlist_iterinit(final);
13133     result_string = newSVpvs("");
13134     while (invlist_iternext(final, &start, &end)) {
13135         if (start == end) {
13136             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13137         }
13138         else {
13139             Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13140                                                      start,          end);
13141         }
13142     }
13143
13144     save_parse = RExC_parse;
13145     RExC_parse = SvPV(result_string, len);
13146     save_end = RExC_end;
13147     RExC_end = RExC_parse + len;
13148
13149     /* We turn off folding around the call, as the class we have constructed
13150      * already has all folding taken into consideration, and we don't want
13151      * regclass() to add to that */
13152     RExC_flags &= ~RXf_PMf_FOLD;
13153     /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13154      */
13155     node = regclass(pRExC_state, flagp,depth+1,
13156                     FALSE, /* means parse the whole char class */
13157                     FALSE, /* don't allow multi-char folds */
13158                     TRUE, /* silence non-portable warnings.  The above may very
13159                              well have generated non-portable code points, but
13160                              they're valid on this machine */
13161                     NULL);
13162     if (!node)
13163         FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13164                     PTR2UV(flagp));
13165     if (save_fold) {
13166         RExC_flags |= RXf_PMf_FOLD;
13167     }
13168     RExC_parse = save_parse + 1;
13169     RExC_end = save_end;
13170     SvREFCNT_dec_NN(final);
13171     SvREFCNT_dec_NN(result_string);
13172
13173     nextchar(pRExC_state);
13174     Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13175     return node;
13176 }
13177 #undef IS_OPERAND
13178
13179 /* The names of properties whose definitions are not known at compile time are
13180  * stored in this SV, after a constant heading.  So if the length has been
13181  * changed since initialization, then there is a run-time definition. */
13182 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13183                                         (SvCUR(listsv) != initial_listsv_len)
13184
13185 STATIC regnode *
13186 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13187                  const bool stop_at_1,  /* Just parse the next thing, don't
13188                                            look for a full character class */
13189                  bool allow_multi_folds,
13190                  const bool silence_non_portable,   /* Don't output warnings
13191                                                        about too large
13192                                                        characters */
13193                  SV** ret_invlist)  /* Return an inversion list, not a node */
13194 {
13195     /* parse a bracketed class specification.  Most of these will produce an
13196      * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13197      * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13198      * under /i with multi-character folds: it will be rewritten following the
13199      * paradigm of this example, where the <multi-fold>s are characters which
13200      * fold to multiple character sequences:
13201      *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13202      * gets effectively rewritten as:
13203      *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13204      * reg() gets called (recursively) on the rewritten version, and this
13205      * function will return what it constructs.  (Actually the <multi-fold>s
13206      * aren't physically removed from the [abcdefghi], it's just that they are
13207      * ignored in the recursion by means of a flag:
13208      * <RExC_in_multi_char_class>.)
13209      *
13210      * ANYOF nodes contain a bit map for the first 256 characters, with the
13211      * corresponding bit set if that character is in the list.  For characters
13212      * above 255, a range list or swash is used.  There are extra bits for \w,
13213      * etc. in locale ANYOFs, as what these match is not determinable at
13214      * compile time
13215      *
13216      * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13217      * to be restarted.  This can only happen if ret_invlist is non-NULL.
13218      */
13219
13220     dVAR;
13221     UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13222     IV range = 0;
13223     UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13224     regnode *ret;
13225     STRLEN numlen;
13226     IV namedclass = OOB_NAMEDCLASS;
13227     char *rangebegin = NULL;
13228     bool need_class = 0;
13229     SV *listsv = NULL;
13230     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13231                                       than just initialized.  */
13232     SV* properties = NULL;    /* Code points that match \p{} \P{} */
13233     SV* posixes = NULL;     /* Code points that match classes like [:word:],
13234                                extended beyond the Latin1 range.  These have to
13235                                be kept separate from other code points for much
13236                                of this function because their handling  is
13237                                different under /i, and for most classes under
13238                                /d as well */
13239     SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13240                                separate for a while from the non-complemented
13241                                versions because of complications with /d
13242                                matching */
13243     UV element_count = 0;   /* Number of distinct elements in the class.
13244                                Optimizations may be possible if this is tiny */
13245     AV * multi_char_matches = NULL; /* Code points that fold to more than one
13246                                        character; used under /i */
13247     UV n;
13248     char * stop_ptr = RExC_end;    /* where to stop parsing */
13249     const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13250                                                    space? */
13251     const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13252
13253     /* Unicode properties are stored in a swash; this holds the current one
13254      * being parsed.  If this swash is the only above-latin1 component of the
13255      * character class, an optimization is to pass it directly on to the
13256      * execution engine.  Otherwise, it is set to NULL to indicate that there
13257      * are other things in the class that have to be dealt with at execution
13258      * time */
13259     SV* swash = NULL;           /* Code points that match \p{} \P{} */
13260
13261     /* Set if a component of this character class is user-defined; just passed
13262      * on to the engine */
13263     bool has_user_defined_property = FALSE;
13264
13265     /* inversion list of code points this node matches only when the target
13266      * string is in UTF-8.  (Because is under /d) */
13267     SV* depends_list = NULL;
13268
13269     /* Inversion list of code points this node matches regardless of things
13270      * like locale, folding, utf8ness of the target string */
13271     SV* cp_list = NULL;
13272
13273     /* Like cp_list, but code points on this list need to be checked for things
13274      * that fold to/from them under /i */
13275     SV* cp_foldable_list = NULL;
13276
13277     /* Like cp_list, but code points on this list are valid only when the
13278      * runtime locale is UTF-8 */
13279     SV* only_utf8_locale_list = NULL;
13280
13281 #ifdef EBCDIC
13282     /* In a range, counts how many 0-2 of the ends of it came from literals,
13283      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13284     UV literal_endpoint = 0;
13285 #endif
13286     bool invert = FALSE;    /* Is this class to be complemented */
13287
13288     bool warn_super = ALWAYS_WARN_SUPER;
13289
13290     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13291         case we need to change the emitted regop to an EXACT. */
13292     const char * orig_parse = RExC_parse;
13293     const SSize_t orig_size = RExC_size;
13294     bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13295     GET_RE_DEBUG_FLAGS_DECL;
13296
13297     PERL_ARGS_ASSERT_REGCLASS;
13298 #ifndef DEBUGGING
13299     PERL_UNUSED_ARG(depth);
13300 #endif
13301
13302     DEBUG_PARSE("clas");
13303
13304     /* Assume we are going to generate an ANYOF node. */
13305     ret = reganode(pRExC_state, ANYOF, 0);
13306
13307     if (SIZE_ONLY) {
13308         RExC_size += ANYOF_SKIP;
13309         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13310     }
13311     else {
13312         ANYOF_FLAGS(ret) = 0;
13313
13314         RExC_emit += ANYOF_SKIP;
13315         listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13316         initial_listsv_len = SvCUR(listsv);
13317         SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13318     }
13319
13320     if (skip_white) {
13321         RExC_parse = regpatws(pRExC_state, RExC_parse,
13322                               FALSE /* means don't recognize comments */);
13323     }
13324
13325     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
13326         RExC_parse++;
13327         invert = TRUE;
13328         allow_multi_folds = FALSE;
13329         RExC_naughty++;
13330         if (skip_white) {
13331             RExC_parse = regpatws(pRExC_state, RExC_parse,
13332                                   FALSE /* means don't recognize comments */);
13333         }
13334     }
13335
13336     /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13337     if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13338         const char *s = RExC_parse;
13339         const char  c = *s++;
13340
13341         while (isWORDCHAR(*s))
13342             s++;
13343         if (*s && c == *s && s[1] == ']') {
13344             SAVEFREESV(RExC_rx_sv);
13345             ckWARN3reg(s+2,
13346                        "POSIX syntax [%c %c] belongs inside character classes",
13347                        c, c);
13348             (void)ReREFCNT_inc(RExC_rx_sv);
13349         }
13350     }
13351
13352     /* If the caller wants us to just parse a single element, accomplish this
13353      * by faking the loop ending condition */
13354     if (stop_at_1 && RExC_end > RExC_parse) {
13355         stop_ptr = RExC_parse + 1;
13356     }
13357
13358     /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13359     if (UCHARAT(RExC_parse) == ']')
13360         goto charclassloop;
13361
13362 parseit:
13363     while (1) {
13364         if  (RExC_parse >= stop_ptr) {
13365             break;
13366         }
13367
13368         if (skip_white) {
13369             RExC_parse = regpatws(pRExC_state, RExC_parse,
13370                                   FALSE /* means don't recognize comments */);
13371         }
13372
13373         if  (UCHARAT(RExC_parse) == ']') {
13374             break;
13375         }
13376
13377     charclassloop:
13378
13379         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13380         save_value = value;
13381         save_prevvalue = prevvalue;
13382
13383         if (!range) {
13384             rangebegin = RExC_parse;
13385             element_count++;
13386         }
13387         if (UTF) {
13388             value = utf8n_to_uvchr((U8*)RExC_parse,
13389                                    RExC_end - RExC_parse,
13390                                    &numlen, UTF8_ALLOW_DEFAULT);
13391             RExC_parse += numlen;
13392         }
13393         else
13394             value = UCHARAT(RExC_parse++);
13395
13396         if (value == '['
13397             && RExC_parse < RExC_end
13398             && POSIXCC(UCHARAT(RExC_parse)))
13399         {
13400             namedclass = regpposixcc(pRExC_state, value, strict);
13401         }
13402         else if (value == '\\') {
13403             if (UTF) {
13404                 value = utf8n_to_uvchr((U8*)RExC_parse,
13405                                    RExC_end - RExC_parse,
13406                                    &numlen, UTF8_ALLOW_DEFAULT);
13407                 RExC_parse += numlen;
13408             }
13409             else
13410                 value = UCHARAT(RExC_parse++);
13411
13412             /* Some compilers cannot handle switching on 64-bit integer
13413              * values, therefore value cannot be an UV.  Yes, this will
13414              * be a problem later if we want switch on Unicode.
13415              * A similar issue a little bit later when switching on
13416              * namedclass. --jhi */
13417
13418             /* If the \ is escaping white space when white space is being
13419              * skipped, it means that that white space is wanted literally, and
13420              * is already in 'value'.  Otherwise, need to translate the escape
13421              * into what it signifies. */
13422             if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13423
13424             case 'w':   namedclass = ANYOF_WORDCHAR;    break;
13425             case 'W':   namedclass = ANYOF_NWORDCHAR;   break;
13426             case 's':   namedclass = ANYOF_SPACE;       break;
13427             case 'S':   namedclass = ANYOF_NSPACE;      break;
13428             case 'd':   namedclass = ANYOF_DIGIT;       break;
13429             case 'D':   namedclass = ANYOF_NDIGIT;      break;
13430             case 'v':   namedclass = ANYOF_VERTWS;      break;
13431             case 'V':   namedclass = ANYOF_NVERTWS;     break;
13432             case 'h':   namedclass = ANYOF_HORIZWS;     break;
13433             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
13434             case 'N':  /* Handle \N{NAME} in class */
13435                 {
13436                     /* We only pay attention to the first char of
13437                     multichar strings being returned. I kinda wonder
13438                     if this makes sense as it does change the behaviour
13439                     from earlier versions, OTOH that behaviour was broken
13440                     as well. */
13441                     if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13442                                       TRUE, /* => charclass */
13443                                       strict))
13444                     {
13445                         if (*flagp & RESTART_UTF8)
13446                             FAIL("panic: grok_bslash_N set RESTART_UTF8");
13447                         goto parseit;
13448                     }
13449                 }
13450                 break;
13451             case 'p':
13452             case 'P':
13453                 {
13454                 char *e;
13455
13456                 /* We will handle any undefined properties ourselves */
13457                 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13458                                        /* And we actually would prefer to get
13459                                         * the straight inversion list of the
13460                                         * swash, since we will be accessing it
13461                                         * anyway, to save a little time */
13462                                       |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13463
13464                 if (RExC_parse >= RExC_end)
13465                     vFAIL2("Empty \\%c{}", (U8)value);
13466                 if (*RExC_parse == '{') {
13467                     const U8 c = (U8)value;
13468                     e = strchr(RExC_parse++, '}');
13469                     if (!e)
13470                         vFAIL2("Missing right brace on \\%c{}", c);
13471                     while (isSPACE(UCHARAT(RExC_parse)))
13472                         RExC_parse++;
13473                     if (e == RExC_parse)
13474                         vFAIL2("Empty \\%c{}", c);
13475                     n = e - RExC_parse;
13476                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13477                         n--;
13478                 }
13479                 else {
13480                     e = RExC_parse;
13481                     n = 1;
13482                 }
13483                 if (!SIZE_ONLY) {
13484                     SV* invlist;
13485                     char* formatted;
13486                     char* name;
13487
13488                     if (UCHARAT(RExC_parse) == '^') {
13489                          RExC_parse++;
13490                          n--;
13491                          /* toggle.  (The rhs xor gets the single bit that
13492                           * differs between P and p; the other xor inverts just
13493                           * that bit) */
13494                          value ^= 'P' ^ 'p';
13495
13496                          while (isSPACE(UCHARAT(RExC_parse))) {
13497                               RExC_parse++;
13498                               n--;
13499                          }
13500                     }
13501                     /* Try to get the definition of the property into
13502                      * <invlist>.  If /i is in effect, the effective property
13503                      * will have its name be <__NAME_i>.  The design is
13504                      * discussed in commit
13505                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13506                     formatted = Perl_form(aTHX_
13507                                           "%s%.*s%s\n",
13508                                           (FOLD) ? "__" : "",
13509                                           (int)n,
13510                                           RExC_parse,
13511                                           (FOLD) ? "_i" : ""
13512                                 );
13513                     name = savepvn(formatted, strlen(formatted));
13514
13515                     /* Look up the property name, and get its swash and
13516                      * inversion list, if the property is found  */
13517                     if (swash) {
13518                         SvREFCNT_dec_NN(swash);
13519                     }
13520                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
13521                                              1, /* binary */
13522                                              0, /* not tr/// */
13523                                              NULL, /* No inversion list */
13524                                              &swash_init_flags
13525                                             );
13526                     if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13527                         if (swash) {
13528                             SvREFCNT_dec_NN(swash);
13529                             swash = NULL;
13530                         }
13531
13532                         /* Here didn't find it.  It could be a user-defined
13533                          * property that will be available at run-time.  If we
13534                          * accept only compile-time properties, is an error;
13535                          * otherwise add it to the list for run-time look up */
13536                         if (ret_invlist) {
13537                             RExC_parse = e + 1;
13538                             vFAIL2utf8f(
13539                                 "Property '%"UTF8f"' is unknown",
13540                                 UTF8fARG(UTF, n, name));
13541                         }
13542                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13543                                         (value == 'p' ? '+' : '!'),
13544                                         UTF8fARG(UTF, n, name));
13545                         has_user_defined_property = TRUE;
13546
13547                         /* We don't know yet, so have to assume that the
13548                          * property could match something in the Latin1 range,
13549                          * hence something that isn't utf8.  Note that this
13550                          * would cause things in <depends_list> to match
13551                          * inappropriately, except that any \p{}, including
13552                          * this one forces Unicode semantics, which means there
13553                          * is no <depends_list> */
13554                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13555                     }
13556                     else {
13557
13558                         /* Here, did get the swash and its inversion list.  If
13559                          * the swash is from a user-defined property, then this
13560                          * whole character class should be regarded as such */
13561                         if (swash_init_flags
13562                             & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13563                         {
13564                             has_user_defined_property = TRUE;
13565                         }
13566                         else if
13567                             /* We warn on matching an above-Unicode code point
13568                              * if the match would return true, except don't
13569                              * warn for \p{All}, which has exactly one element
13570                              * = 0 */
13571                             (_invlist_contains_cp(invlist, 0x110000)
13572                                 && (! (_invlist_len(invlist) == 1
13573                                        && *invlist_array(invlist) == 0)))
13574                         {
13575                             warn_super = TRUE;
13576                         }
13577
13578
13579                         /* Invert if asking for the complement */
13580                         if (value == 'P') {
13581                             _invlist_union_complement_2nd(properties,
13582                                                           invlist,
13583                                                           &properties);
13584
13585                             /* The swash can't be used as-is, because we've
13586                              * inverted things; delay removing it to here after
13587                              * have copied its invlist above */
13588                             SvREFCNT_dec_NN(swash);
13589                             swash = NULL;
13590                         }
13591                         else {
13592                             _invlist_union(properties, invlist, &properties);
13593                         }
13594                     }
13595                     Safefree(name);
13596                 }
13597                 RExC_parse = e + 1;
13598                 namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13599                                                 named */
13600
13601                 /* \p means they want Unicode semantics */
13602                 RExC_uni_semantics = 1;
13603                 }
13604                 break;
13605             case 'n':   value = '\n';                   break;
13606             case 'r':   value = '\r';                   break;
13607             case 't':   value = '\t';                   break;
13608             case 'f':   value = '\f';                   break;
13609             case 'b':   value = '\b';                   break;
13610             case 'e':   value = ASCII_TO_NATIVE('\033');break;
13611             case 'a':   value = '\a';                   break;
13612             case 'o':
13613                 RExC_parse--;   /* function expects to be pointed at the 'o' */
13614                 {
13615                     const char* error_msg;
13616                     bool valid = grok_bslash_o(&RExC_parse,
13617                                                &value,
13618                                                &error_msg,
13619                                                SIZE_ONLY,   /* warnings in pass
13620                                                                1 only */
13621                                                strict,
13622                                                silence_non_portable,
13623                                                UTF);
13624                     if (! valid) {
13625                         vFAIL(error_msg);
13626                     }
13627                 }
13628                 if (PL_encoding && value < 0x100) {
13629                     goto recode_encoding;
13630                 }
13631                 break;
13632             case 'x':
13633                 RExC_parse--;   /* function expects to be pointed at the 'x' */
13634                 {
13635                     const char* error_msg;
13636                     bool valid = grok_bslash_x(&RExC_parse,
13637                                                &value,
13638                                                &error_msg,
13639                                                TRUE, /* Output warnings */
13640                                                strict,
13641                                                silence_non_portable,
13642                                                UTF);
13643                     if (! valid) {
13644                         vFAIL(error_msg);
13645                     }
13646                 }
13647                 if (PL_encoding && value < 0x100)
13648                     goto recode_encoding;
13649                 break;
13650             case 'c':
13651                 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13652                 break;
13653             case '0': case '1': case '2': case '3': case '4':
13654             case '5': case '6': case '7':
13655                 {
13656                     /* Take 1-3 octal digits */
13657                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13658                     numlen = (strict) ? 4 : 3;
13659                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13660                     RExC_parse += numlen;
13661                     if (numlen != 3) {
13662                         if (strict) {
13663                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13664                             vFAIL("Need exactly 3 octal digits");
13665                         }
13666                         else if (! SIZE_ONLY /* like \08, \178 */
13667                                  && numlen < 3
13668                                  && RExC_parse < RExC_end
13669                                  && isDIGIT(*RExC_parse)
13670                                  && ckWARN(WARN_REGEXP))
13671                         {
13672                             SAVEFREESV(RExC_rx_sv);
13673                             reg_warn_non_literal_string(
13674                                  RExC_parse + 1,
13675                                  form_short_octal_warning(RExC_parse, numlen));
13676                             (void)ReREFCNT_inc(RExC_rx_sv);
13677                         }
13678                     }
13679                     if (PL_encoding && value < 0x100)
13680                         goto recode_encoding;
13681                     break;
13682                 }
13683             recode_encoding:
13684                 if (! RExC_override_recoding) {
13685                     SV* enc = PL_encoding;
13686                     value = reg_recode((const char)(U8)value, &enc);
13687                     if (!enc) {
13688                         if (strict) {
13689                             vFAIL("Invalid escape in the specified encoding");
13690                         }
13691                         else if (SIZE_ONLY) {
13692                             ckWARNreg(RExC_parse,
13693                                   "Invalid escape in the specified encoding");
13694                         }
13695                     }
13696                     break;
13697                 }
13698             default:
13699                 /* Allow \_ to not give an error */
13700                 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13701                     if (strict) {
13702                         vFAIL2("Unrecognized escape \\%c in character class",
13703                                (int)value);
13704                     }
13705                     else {
13706                         SAVEFREESV(RExC_rx_sv);
13707                         ckWARN2reg(RExC_parse,
13708                             "Unrecognized escape \\%c in character class passed through",
13709                             (int)value);
13710                         (void)ReREFCNT_inc(RExC_rx_sv);
13711                     }
13712                 }
13713                 break;
13714             }   /* End of switch on char following backslash */
13715         } /* end of handling backslash escape sequences */
13716 #ifdef EBCDIC
13717         else
13718             literal_endpoint++;
13719 #endif
13720
13721         /* Here, we have the current token in 'value' */
13722
13723         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13724             U8 classnum;
13725
13726             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13727              * literal, as is the character that began the false range, i.e.
13728              * the 'a' in the examples */
13729             if (range) {
13730                 if (!SIZE_ONLY) {
13731                     const int w = (RExC_parse >= rangebegin)
13732                                   ? RExC_parse - rangebegin
13733                                   : 0;
13734                     if (strict) {
13735                         vFAIL2utf8f(
13736                             "False [] range \"%"UTF8f"\"",
13737                             UTF8fARG(UTF, w, rangebegin));
13738                     }
13739                     else {
13740                         SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13741                         ckWARN2reg(RExC_parse,
13742                             "False [] range \"%"UTF8f"\"",
13743                             UTF8fARG(UTF, w, rangebegin));
13744                         (void)ReREFCNT_inc(RExC_rx_sv);
13745                         cp_list = add_cp_to_invlist(cp_list, '-');
13746                         cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13747                                                              prevvalue);
13748                     }
13749                 }
13750
13751                 range = 0; /* this was not a true range */
13752                 element_count += 2; /* So counts for three values */
13753             }
13754
13755             classnum = namedclass_to_classnum(namedclass);
13756
13757             if (LOC && namedclass < ANYOF_POSIXL_MAX
13758 #ifndef HAS_ISASCII
13759                 && classnum != _CC_ASCII
13760 #endif
13761             ) {
13762                 /* What the Posix classes (like \w, [:space:]) match in locale
13763                  * isn't knowable under locale until actual match time.  Room
13764                  * must be reserved (one time per outer bracketed class) to
13765                  * store such classes.  The space will contain a bit for each
13766                  * named class that is to be matched against.  This isn't
13767                  * needed for \p{} and pseudo-classes, as they are not affected
13768                  * by locale, and hence are dealt with separately */
13769                 if (! need_class) {
13770                     need_class = 1;
13771                     if (SIZE_ONLY) {
13772                         RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13773                     }
13774                     else {
13775                         RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13776                     }
13777                     ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13778                     ANYOF_POSIXL_ZERO(ret);
13779                 }
13780
13781                 /* See if it already matches the complement of this POSIX
13782                  * class */
13783                 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13784                     && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13785                                                             ? -1
13786                                                             : 1)))
13787                 {
13788                     posixl_matches_all = TRUE;
13789                     break;  /* No need to continue.  Since it matches both
13790                                e.g., \w and \W, it matches everything, and the
13791                                bracketed class can be optimized into qr/./s */
13792                 }
13793
13794                 /* Add this class to those that should be checked at runtime */
13795                 ANYOF_POSIXL_SET(ret, namedclass);
13796
13797                 /* The above-Latin1 characters are not subject to locale rules.
13798                  * Just add them, in the second pass, to the
13799                  * unconditionally-matched list */
13800                 if (! SIZE_ONLY) {
13801                     SV* scratch_list = NULL;
13802
13803                     /* Get the list of the above-Latin1 code points this
13804                      * matches */
13805                     _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13806                                           PL_XPosix_ptrs[classnum],
13807
13808                                           /* Odd numbers are complements, like
13809                                            * NDIGIT, NASCII, ... */
13810                                           namedclass % 2 != 0,
13811                                           &scratch_list);
13812                     /* Checking if 'cp_list' is NULL first saves an extra
13813                      * clone.  Its reference count will be decremented at the
13814                      * next union, etc, or if this is the only instance, at the
13815                      * end of the routine */
13816                     if (! cp_list) {
13817                         cp_list = scratch_list;
13818                     }
13819                     else {
13820                         _invlist_union(cp_list, scratch_list, &cp_list);
13821                         SvREFCNT_dec_NN(scratch_list);
13822                     }
13823                     continue;   /* Go get next character */
13824                 }
13825             }
13826             else if (! SIZE_ONLY) {
13827
13828                 /* Here, not in pass1 (in that pass we skip calculating the
13829                  * contents of this class), and is /l, or is a POSIX class for
13830                  * which /l doesn't matter (or is a Unicode property, which is
13831                  * skipped here). */
13832                 if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13833                     if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13834
13835                         /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13836                          * nor /l make a difference in what these match,
13837                          * therefore we just add what they match to cp_list. */
13838                         if (classnum != _CC_VERTSPACE) {
13839                             assert(   namedclass == ANYOF_HORIZWS
13840                                    || namedclass == ANYOF_NHORIZWS);
13841
13842                             /* It turns out that \h is just a synonym for
13843                              * XPosixBlank */
13844                             classnum = _CC_BLANK;
13845                         }
13846
13847                         _invlist_union_maybe_complement_2nd(
13848                                 cp_list,
13849                                 PL_XPosix_ptrs[classnum],
13850                                 namedclass % 2 != 0,    /* Complement if odd
13851                                                           (NHORIZWS, NVERTWS)
13852                                                         */
13853                                 &cp_list);
13854                     }
13855                 }
13856                 else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13857                            complement and use nposixes */
13858                     SV** posixes_ptr = namedclass % 2 == 0
13859                                        ? &posixes
13860                                        : &nposixes;
13861                     SV** source_ptr = &PL_XPosix_ptrs[classnum];
13862                     _invlist_union_maybe_complement_2nd(
13863                                                      *posixes_ptr,
13864                                                      *source_ptr,
13865                                                      namedclass % 2 != 0,
13866                                                      posixes_ptr);
13867                 }
13868                 continue;   /* Go get next character */
13869             }
13870         } /* end of namedclass \blah */
13871
13872         /* Here, we have a single value.  If 'range' is set, it is the ending
13873          * of a range--check its validity.  Later, we will handle each
13874          * individual code point in the range.  If 'range' isn't set, this
13875          * could be the beginning of a range, so check for that by looking
13876          * ahead to see if the next real character to be processed is the range
13877          * indicator--the minus sign */
13878
13879         if (skip_white) {
13880             RExC_parse = regpatws(pRExC_state, RExC_parse,
13881                                 FALSE /* means don't recognize comments */);
13882         }
13883
13884         if (range) {
13885             if (prevvalue > value) /* b-a */ {
13886                 const int w = RExC_parse - rangebegin;
13887                 vFAIL2utf8f(
13888                     "Invalid [] range \"%"UTF8f"\"",
13889                     UTF8fARG(UTF, w, rangebegin));
13890                 range = 0; /* not a valid range */
13891             }
13892         }
13893         else {
13894             prevvalue = value; /* save the beginning of the potential range */
13895             if (! stop_at_1     /* Can't be a range if parsing just one thing */
13896                 && *RExC_parse == '-')
13897             {
13898                 char* next_char_ptr = RExC_parse + 1;
13899                 if (skip_white) {   /* Get the next real char after the '-' */
13900                     next_char_ptr = regpatws(pRExC_state,
13901                                              RExC_parse + 1,
13902                                              FALSE); /* means don't recognize
13903                                                         comments */
13904                 }
13905
13906                 /* If the '-' is at the end of the class (just before the ']',
13907                  * it is a literal minus; otherwise it is a range */
13908                 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13909                     RExC_parse = next_char_ptr;
13910
13911                     /* a bad range like \w-, [:word:]- ? */
13912                     if (namedclass > OOB_NAMEDCLASS) {
13913                         if (strict || ckWARN(WARN_REGEXP)) {
13914                             const int w =
13915                                 RExC_parse >= rangebegin ?
13916                                 RExC_parse - rangebegin : 0;
13917                             if (strict) {
13918                                 vFAIL4("False [] range \"%*.*s\"",
13919                                     w, w, rangebegin);
13920                             }
13921                             else {
13922                                 vWARN4(RExC_parse,
13923                                     "False [] range \"%*.*s\"",
13924                                     w, w, rangebegin);
13925                             }
13926                         }
13927                         if (!SIZE_ONLY) {
13928                             cp_list = add_cp_to_invlist(cp_list, '-');
13929                         }
13930                         element_count++;
13931                     } else
13932                         range = 1;      /* yeah, it's a range! */
13933                     continue;   /* but do it the next time */
13934                 }
13935             }
13936         }
13937
13938         /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13939          * if not */
13940
13941         /* non-Latin1 code point implies unicode semantics.  Must be set in
13942          * pass1 so is there for the whole of pass 2 */
13943         if (value > 255) {
13944             RExC_uni_semantics = 1;
13945         }
13946
13947         /* Ready to process either the single value, or the completed range.
13948          * For single-valued non-inverted ranges, we consider the possibility
13949          * of multi-char folds.  (We made a conscious decision to not do this
13950          * for the other cases because it can often lead to non-intuitive
13951          * results.  For example, you have the peculiar case that:
13952          *  "s s" =~ /^[^\xDF]+$/i => Y
13953          *  "ss"  =~ /^[^\xDF]+$/i => N
13954          *
13955          * See [perl #89750] */
13956         if (FOLD && allow_multi_folds && value == prevvalue) {
13957             if (value == LATIN_SMALL_LETTER_SHARP_S
13958                 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13959                                                         value)))
13960             {
13961                 /* Here <value> is indeed a multi-char fold.  Get what it is */
13962
13963                 U8 foldbuf[UTF8_MAXBYTES_CASE];
13964                 STRLEN foldlen;
13965
13966                 UV folded = _to_uni_fold_flags(
13967                                 value,
13968                                 foldbuf,
13969                                 &foldlen,
13970                                 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
13971                                                    ? FOLD_FLAGS_NOMIX_ASCII
13972                                                    : 0)
13973                                 );
13974
13975                 /* Here, <folded> should be the first character of the
13976                  * multi-char fold of <value>, with <foldbuf> containing the
13977                  * whole thing.  But, if this fold is not allowed (because of
13978                  * the flags), <fold> will be the same as <value>, and should
13979                  * be processed like any other character, so skip the special
13980                  * handling */
13981                 if (folded != value) {
13982
13983                     /* Skip if we are recursed, currently parsing the class
13984                      * again.  Otherwise add this character to the list of
13985                      * multi-char folds. */
13986                     if (! RExC_in_multi_char_class) {
13987                         AV** this_array_ptr;
13988                         AV* this_array;
13989                         STRLEN cp_count = utf8_length(foldbuf,
13990                                                       foldbuf + foldlen);
13991                         SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13992
13993                         Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13994
13995
13996                         if (! multi_char_matches) {
13997                             multi_char_matches = newAV();
13998                         }
13999
14000                         /* <multi_char_matches> is actually an array of arrays.
14001                          * There will be one or two top-level elements: [2],
14002                          * and/or [3].  The [2] element is an array, each
14003                          * element thereof is a character which folds to TWO
14004                          * characters; [3] is for folds to THREE characters.
14005                          * (Unicode guarantees a maximum of 3 characters in any
14006                          * fold.)  When we rewrite the character class below,
14007                          * we will do so such that the longest folds are
14008                          * written first, so that it prefers the longest
14009                          * matching strings first.  This is done even if it
14010                          * turns out that any quantifier is non-greedy, out of
14011                          * programmer laziness.  Tom Christiansen has agreed
14012                          * that this is ok.  This makes the test for the
14013                          * ligature 'ffi' come before the test for 'ff' */
14014                         if (av_exists(multi_char_matches, cp_count)) {
14015                             this_array_ptr = (AV**) av_fetch(multi_char_matches,
14016                                                              cp_count, FALSE);
14017                             this_array = *this_array_ptr;
14018                         }
14019                         else {
14020                             this_array = newAV();
14021                             av_store(multi_char_matches, cp_count,
14022                                      (SV*) this_array);
14023                         }
14024                         av_push(this_array, multi_fold);
14025                     }
14026
14027                     /* This element should not be processed further in this
14028                      * class */
14029                     element_count--;
14030                     value = save_value;
14031                     prevvalue = save_prevvalue;
14032                     continue;
14033                 }
14034             }
14035         }
14036
14037         /* Deal with this element of the class */
14038         if (! SIZE_ONLY) {
14039 #ifndef EBCDIC
14040             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14041                                                      prevvalue, value);
14042 #else
14043             SV* this_range = _new_invlist(1);
14044             _append_range_to_invlist(this_range, prevvalue, value);
14045
14046             /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14047              * If this range was specified using something like 'i-j', we want
14048              * to include only the 'i' and the 'j', and not anything in
14049              * between, so exclude non-ASCII, non-alphabetics from it.
14050              * However, if the range was specified with something like
14051              * [\x89-\x91] or [\x89-j], all code points within it should be
14052              * included.  literal_endpoint==2 means both ends of the range used
14053              * a literal character, not \x{foo} */
14054             if (literal_endpoint == 2
14055                 && ((prevvalue >= 'a' && value <= 'z')
14056                     || (prevvalue >= 'A' && value <= 'Z')))
14057             {
14058                 _invlist_intersection(this_range, PL_ASCII,
14059                                       &this_range);
14060
14061                 /* Since this above only contains ascii, the intersection of it
14062                  * with anything will still yield only ascii */
14063                 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14064                                       &this_range);
14065             }
14066             _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14067             literal_endpoint = 0;
14068 #endif
14069         }
14070
14071         range = 0; /* this range (if it was one) is done now */
14072     } /* End of loop through all the text within the brackets */
14073
14074     /* If anything in the class expands to more than one character, we have to
14075      * deal with them by building up a substitute parse string, and recursively
14076      * calling reg() on it, instead of proceeding */
14077     if (multi_char_matches) {
14078         SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14079         I32 cp_count;
14080         STRLEN len;
14081         char *save_end = RExC_end;
14082         char *save_parse = RExC_parse;
14083         bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14084                                        a "|" */
14085         I32 reg_flags;
14086
14087         assert(! invert);
14088 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14089            because too confusing */
14090         if (invert) {
14091             sv_catpv(substitute_parse, "(?:");
14092         }
14093 #endif
14094
14095         /* Look at the longest folds first */
14096         for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14097
14098             if (av_exists(multi_char_matches, cp_count)) {
14099                 AV** this_array_ptr;
14100                 SV* this_sequence;
14101
14102                 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14103                                                  cp_count, FALSE);
14104                 while ((this_sequence = av_pop(*this_array_ptr)) !=
14105                                                                 &PL_sv_undef)
14106                 {
14107                     if (! first_time) {
14108                         sv_catpv(substitute_parse, "|");
14109                     }
14110                     first_time = FALSE;
14111
14112                     sv_catpv(substitute_parse, SvPVX(this_sequence));
14113                 }
14114             }
14115         }
14116
14117         /* If the character class contains anything else besides these
14118          * multi-character folds, have to include it in recursive parsing */
14119         if (element_count) {
14120             sv_catpv(substitute_parse, "|[");
14121             sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14122             sv_catpv(substitute_parse, "]");
14123         }
14124
14125         sv_catpv(substitute_parse, ")");
14126 #if 0
14127         if (invert) {
14128             /* This is a way to get the parse to skip forward a whole named
14129              * sequence instead of matching the 2nd character when it fails the
14130              * first */
14131             sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14132         }
14133 #endif
14134
14135         RExC_parse = SvPV(substitute_parse, len);
14136         RExC_end = RExC_parse + len;
14137         RExC_in_multi_char_class = 1;
14138         RExC_emit = (regnode *)orig_emit;
14139
14140         ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14141
14142         *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14143
14144         RExC_parse = save_parse;
14145         RExC_end = save_end;
14146         RExC_in_multi_char_class = 0;
14147         SvREFCNT_dec_NN(multi_char_matches);
14148         return ret;
14149     }
14150
14151     /* Here, we've gone through the entire class and dealt with multi-char
14152      * folds.  We are now in a position that we can do some checks to see if we
14153      * can optimize this ANYOF node into a simpler one, even in Pass 1.
14154      * Currently we only do two checks:
14155      * 1) is in the unlikely event that the user has specified both, eg. \w and
14156      *    \W under /l, then the class matches everything.  (This optimization
14157      *    is done only to make the optimizer code run later work.)
14158      * 2) if the character class contains only a single element (including a
14159      *    single range), we see if there is an equivalent node for it.
14160      * Other checks are possible */
14161     if (! ret_invlist   /* Can't optimize if returning the constructed
14162                            inversion list */
14163         && (UNLIKELY(posixl_matches_all) || element_count == 1))
14164     {
14165         U8 op = END;
14166         U8 arg = 0;
14167
14168         if (UNLIKELY(posixl_matches_all)) {
14169             op = SANY;
14170         }
14171         else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14172                                                    \w or [:digit:] or \p{foo}
14173                                                  */
14174
14175             /* All named classes are mapped into POSIXish nodes, with its FLAG
14176              * argument giving which class it is */
14177             switch ((I32)namedclass) {
14178                 case ANYOF_UNIPROP:
14179                     break;
14180
14181                 /* These don't depend on the charset modifiers.  They always
14182                  * match under /u rules */
14183                 case ANYOF_NHORIZWS:
14184                 case ANYOF_HORIZWS:
14185                     namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14186                     /* FALLTHROUGH */
14187
14188                 case ANYOF_NVERTWS:
14189                 case ANYOF_VERTWS:
14190                     op = POSIXU;
14191                     goto join_posix;
14192
14193                 /* The actual POSIXish node for all the rest depends on the
14194                  * charset modifier.  The ones in the first set depend only on
14195                  * ASCII or, if available on this platform, locale */
14196                 case ANYOF_ASCII:
14197                 case ANYOF_NASCII:
14198 #ifdef HAS_ISASCII
14199                     op = (LOC) ? POSIXL : POSIXA;
14200 #else
14201                     op = POSIXA;
14202 #endif
14203                     goto join_posix;
14204
14205                 case ANYOF_NCASED:
14206                 case ANYOF_LOWER:
14207                 case ANYOF_NLOWER:
14208                 case ANYOF_UPPER:
14209                 case ANYOF_NUPPER:
14210                     /* under /a could be alpha */
14211                     if (FOLD) {
14212                         if (ASCII_RESTRICTED) {
14213                             namedclass = ANYOF_ALPHA + (namedclass % 2);
14214                         }
14215                         else if (! LOC) {
14216                             break;
14217                         }
14218                     }
14219                     /* FALLTHROUGH */
14220
14221                 /* The rest have more possibilities depending on the charset.
14222                  * We take advantage of the enum ordering of the charset
14223                  * modifiers to get the exact node type, */
14224                 default:
14225                     op = POSIXD + get_regex_charset(RExC_flags);
14226                     if (op > POSIXA) { /* /aa is same as /a */
14227                         op = POSIXA;
14228                     }
14229
14230                 join_posix:
14231                     /* The odd numbered ones are the complements of the
14232                      * next-lower even number one */
14233                     if (namedclass % 2 == 1) {
14234                         invert = ! invert;
14235                         namedclass--;
14236                     }
14237                     arg = namedclass_to_classnum(namedclass);
14238                     break;
14239             }
14240         }
14241         else if (value == prevvalue) {
14242
14243             /* Here, the class consists of just a single code point */
14244
14245             if (invert) {
14246                 if (! LOC && value == '\n') {
14247                     op = REG_ANY; /* Optimize [^\n] */
14248                     *flagp |= HASWIDTH|SIMPLE;
14249                     RExC_naughty++;
14250                 }
14251             }
14252             else if (value < 256 || UTF) {
14253
14254                 /* Optimize a single value into an EXACTish node, but not if it
14255                  * would require converting the pattern to UTF-8. */
14256                 op = compute_EXACTish(pRExC_state);
14257             }
14258         } /* Otherwise is a range */
14259         else if (! LOC) {   /* locale could vary these */
14260             if (prevvalue == '0') {
14261                 if (value == '9') {
14262                     arg = _CC_DIGIT;
14263                     op = POSIXA;
14264                 }
14265             }
14266         }
14267
14268         /* Here, we have changed <op> away from its initial value iff we found
14269          * an optimization */
14270         if (op != END) {
14271
14272             /* Throw away this ANYOF regnode, and emit the calculated one,
14273              * which should correspond to the beginning, not current, state of
14274              * the parse */
14275             const char * cur_parse = RExC_parse;
14276             RExC_parse = (char *)orig_parse;
14277             if ( SIZE_ONLY) {
14278                 if (! LOC) {
14279
14280                     /* To get locale nodes to not use the full ANYOF size would
14281                      * require moving the code above that writes the portions
14282                      * of it that aren't in other nodes to after this point.
14283                      * e.g.  ANYOF_POSIXL_SET */
14284                     RExC_size = orig_size;
14285                 }
14286             }
14287             else {
14288                 RExC_emit = (regnode *)orig_emit;
14289                 if (PL_regkind[op] == POSIXD) {
14290                     if (op == POSIXL) {
14291                         RExC_contains_locale = 1;
14292                     }
14293                     if (invert) {
14294                         op += NPOSIXD - POSIXD;
14295                     }
14296                 }
14297             }
14298
14299             ret = reg_node(pRExC_state, op);
14300
14301             if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14302                 if (! SIZE_ONLY) {
14303                     FLAGS(ret) = arg;
14304                 }
14305                 *flagp |= HASWIDTH|SIMPLE;
14306             }
14307             else if (PL_regkind[op] == EXACT) {
14308                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14309                                            TRUE /* downgradable to EXACT */
14310                                            );
14311             }
14312
14313             RExC_parse = (char *) cur_parse;
14314
14315             SvREFCNT_dec(posixes);
14316             SvREFCNT_dec(nposixes);
14317             SvREFCNT_dec(cp_list);
14318             SvREFCNT_dec(cp_foldable_list);
14319             return ret;
14320         }
14321     }
14322
14323     if (SIZE_ONLY)
14324         return ret;
14325     /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14326
14327     /* If folding, we calculate all characters that could fold to or from the
14328      * ones already on the list */
14329     if (cp_foldable_list) {
14330         if (FOLD) {
14331             UV start, end;      /* End points of code point ranges */
14332
14333             SV* fold_intersection = NULL;
14334             SV** use_list;
14335
14336             /* Our calculated list will be for Unicode rules.  For locale
14337              * matching, we have to keep a separate list that is consulted at
14338              * runtime only when the locale indicates Unicode rules.  For
14339              * non-locale, we just use to the general list */
14340             if (LOC) {
14341                 use_list = &only_utf8_locale_list;
14342             }
14343             else {
14344                 use_list = &cp_list;
14345             }
14346
14347             /* Only the characters in this class that participate in folds need
14348              * be checked.  Get the intersection of this class and all the
14349              * possible characters that are foldable.  This can quickly narrow
14350              * down a large class */
14351             _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14352                                   &fold_intersection);
14353
14354             /* The folds for all the Latin1 characters are hard-coded into this
14355              * program, but we have to go out to disk to get the others. */
14356             if (invlist_highest(cp_foldable_list) >= 256) {
14357
14358                 /* This is a hash that for a particular fold gives all
14359                  * characters that are involved in it */
14360                 if (! PL_utf8_foldclosures) {
14361
14362                     /* If the folds haven't been read in, call a fold function
14363                      * to force that */
14364                     if (! PL_utf8_tofold) {
14365                         U8 dummy[UTF8_MAXBYTES_CASE+1];
14366
14367                         /* This string is just a short named one above \xff */
14368                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14369                         assert(PL_utf8_tofold); /* Verify that worked */
14370                     }
14371                     PL_utf8_foldclosures
14372                                       = _swash_inversion_hash(PL_utf8_tofold);
14373                 }
14374             }
14375
14376             /* Now look at the foldable characters in this class individually */
14377             invlist_iterinit(fold_intersection);
14378             while (invlist_iternext(fold_intersection, &start, &end)) {
14379                 UV j;
14380
14381                 /* Look at every character in the range */
14382                 for (j = start; j <= end; j++) {
14383                     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14384                     STRLEN foldlen;
14385                     SV** listp;
14386
14387                     if (j < 256) {
14388
14389                         /* We have the latin1 folding rules hard-coded here so
14390                          * that an innocent-looking character class, like
14391                          * /[ks]/i won't have to go out to disk to find the
14392                          * possible matches.  XXX It would be better to
14393                          * generate these via regen, in case a new version of
14394                          * the Unicode standard adds new mappings, though that
14395                          * is not really likely, and may be caught by the
14396                          * default: case of the switch below. */
14397
14398                         if (IS_IN_SOME_FOLD_L1(j)) {
14399
14400                             /* ASCII is always matched; non-ASCII is matched
14401                              * only under Unicode rules (which could happen
14402                              * under /l if the locale is a UTF-8 one */
14403                             if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14404                                 *use_list = add_cp_to_invlist(*use_list,
14405                                                             PL_fold_latin1[j]);
14406                             }
14407                             else {
14408                                 depends_list =
14409                                  add_cp_to_invlist(depends_list,
14410                                                    PL_fold_latin1[j]);
14411                             }
14412                         }
14413
14414                         if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14415                             && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14416                         {
14417                             /* Certain Latin1 characters have matches outside
14418                             * Latin1.  To get here, <j> is one of those
14419                             * characters.   None of these matches is valid for
14420                             * ASCII characters under /aa, which is why the 'if'
14421                             * just above excludes those.  These matches only
14422                             * happen when the target string is utf8.  The code
14423                             * below adds the single fold closures for <j> to the
14424                             * inversion list. */
14425
14426                             switch (j) {
14427                                 case 'k':
14428                                 case 'K':
14429                                   *use_list =
14430                                      add_cp_to_invlist(*use_list, KELVIN_SIGN);
14431                                     break;
14432                                 case 's':
14433                                 case 'S':
14434                                   *use_list = add_cp_to_invlist(*use_list,
14435                                                     LATIN_SMALL_LETTER_LONG_S);
14436                                     break;
14437                                 case MICRO_SIGN:
14438                                   *use_list = add_cp_to_invlist(*use_list,
14439                                                       GREEK_CAPITAL_LETTER_MU);
14440                                   *use_list = add_cp_to_invlist(*use_list,
14441                                                         GREEK_SMALL_LETTER_MU);
14442                                     break;
14443                                 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14444                                 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14445                                   *use_list =
14446                                    add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
14447                                     break;
14448                                 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14449                                   *use_list = add_cp_to_invlist(*use_list,
14450                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14451                                     break;
14452                                 case LATIN_SMALL_LETTER_SHARP_S:
14453                                   *use_list = add_cp_to_invlist(*use_list,
14454                                                  LATIN_CAPITAL_LETTER_SHARP_S);
14455                                     break;
14456                                 case 'F': case 'f':
14457                                 case 'I': case 'i':
14458                                 case 'L': case 'l':
14459                                 case 'T': case 't':
14460                                 case 'A': case 'a':
14461                                 case 'H': case 'h':
14462                                 case 'J': case 'j':
14463                                 case 'N': case 'n':
14464                                 case 'W': case 'w':
14465                                 case 'Y': case 'y':
14466                                     /* These all are targets of multi-character
14467                                      * folds from code points that require UTF8
14468                                      * to express, so they can't match unless
14469                                      * the target string is in UTF-8, so no
14470                                      * action here is necessary, as regexec.c
14471                                      * properly handles the general case for
14472                                      * UTF-8 matching and multi-char folds */
14473                                     break;
14474                                 default:
14475                                     /* Use deprecated warning to increase the
14476                                     * chances of this being output */
14477                                     ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14478                                     break;
14479                             }
14480                         }
14481                         continue;
14482                     }
14483
14484                     /* Here is an above Latin1 character.  We don't have the
14485                      * rules hard-coded for it.  First, get its fold.  This is
14486                      * the simple fold, as the multi-character folds have been
14487                      * handled earlier and separated out */
14488                     _to_uni_fold_flags(j, foldbuf, &foldlen,
14489                                                         (ASCII_FOLD_RESTRICTED)
14490                                                         ? FOLD_FLAGS_NOMIX_ASCII
14491                                                         : 0);
14492
14493                     /* Single character fold of above Latin1.  Add everything in
14494                     * its fold closure to the list that this node should match.
14495                     * The fold closures data structure is a hash with the keys
14496                     * being the UTF-8 of every character that is folded to, like
14497                     * 'k', and the values each an array of all code points that
14498                     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14499                     * Multi-character folds are not included */
14500                     if ((listp = hv_fetch(PL_utf8_foldclosures,
14501                                         (char *) foldbuf, foldlen, FALSE)))
14502                     {
14503                         AV* list = (AV*) *listp;
14504                         IV k;
14505                         for (k = 0; k <= av_tindex(list); k++) {
14506                             SV** c_p = av_fetch(list, k, FALSE);
14507                             UV c;
14508                             if (c_p == NULL) {
14509                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14510                             }
14511                             c = SvUV(*c_p);
14512
14513                             /* /aa doesn't allow folds between ASCII and non- */
14514                             if ((ASCII_FOLD_RESTRICTED
14515                                 && (isASCII(c) != isASCII(j))))
14516                             {
14517                                 continue;
14518                             }
14519
14520                             /* Folds under /l which cross the 255/256 boundary
14521                              * are added to a separate list.  (These are valid
14522                              * only when the locale is UTF-8.) */
14523                             if (c < 256 && LOC) {
14524                                 *use_list = add_cp_to_invlist(*use_list, c);
14525                                 continue;
14526                             }
14527
14528                             if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14529                             {
14530                                 cp_list = add_cp_to_invlist(cp_list, c);
14531                             }
14532                             else {
14533                                 /* Similarly folds involving non-ascii Latin1
14534                                 * characters under /d are added to their list */
14535                                 depends_list = add_cp_to_invlist(depends_list,
14536                                                                  c);
14537                             }
14538                         }
14539                     }
14540                 }
14541             }
14542             SvREFCNT_dec_NN(fold_intersection);
14543         }
14544
14545         /* Now that we have finished adding all the folds, there is no reason
14546          * to keep the foldable list separate */
14547         _invlist_union(cp_list, cp_foldable_list, &cp_list);
14548         SvREFCNT_dec_NN(cp_foldable_list);
14549     }
14550
14551     /* And combine the result (if any) with any inversion list from posix
14552      * classes.  The lists are kept separate up to now because we don't want to
14553      * fold the classes (folding of those is automatically handled by the swash
14554      * fetching code) */
14555     if (posixes || nposixes) {
14556         if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14557             /* Under /a and /aa, nothing above ASCII matches these */
14558             _invlist_intersection(posixes,
14559                                   PL_XPosix_ptrs[_CC_ASCII],
14560                                   &posixes);
14561         }
14562         if (nposixes) {
14563             if (DEPENDS_SEMANTICS) {
14564                 /* Under /d, everything in the upper half of the Latin1 range
14565                  * matches these complements */
14566                 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14567             }
14568             else if (AT_LEAST_ASCII_RESTRICTED) {
14569                 /* Under /a and /aa, everything above ASCII matches these
14570                  * complements */
14571                 _invlist_union_complement_2nd(nposixes,
14572                                               PL_XPosix_ptrs[_CC_ASCII],
14573                                               &nposixes);
14574             }
14575             if (posixes) {
14576                 _invlist_union(posixes, nposixes, &posixes);
14577                 SvREFCNT_dec_NN(nposixes);
14578             }
14579             else {
14580                 posixes = nposixes;
14581             }
14582         }
14583         if (! DEPENDS_SEMANTICS) {
14584             if (cp_list) {
14585                 _invlist_union(cp_list, posixes, &cp_list);
14586                 SvREFCNT_dec_NN(posixes);
14587             }
14588             else {
14589                 cp_list = posixes;
14590             }
14591         }
14592         else {
14593             /* Under /d, we put into a separate list the Latin1 things that
14594              * match only when the target string is utf8 */
14595             SV* nonascii_but_latin1_properties = NULL;
14596             _invlist_intersection(posixes, PL_UpperLatin1,
14597                                   &nonascii_but_latin1_properties);
14598             _invlist_subtract(posixes, nonascii_but_latin1_properties,
14599                               &posixes);
14600             if (cp_list) {
14601                 _invlist_union(cp_list, posixes, &cp_list);
14602                 SvREFCNT_dec_NN(posixes);
14603             }
14604             else {
14605                 cp_list = posixes;
14606             }
14607
14608             if (depends_list) {
14609                 _invlist_union(depends_list, nonascii_but_latin1_properties,
14610                                &depends_list);
14611                 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14612             }
14613             else {
14614                 depends_list = nonascii_but_latin1_properties;
14615             }
14616         }
14617     }
14618
14619     /* And combine the result (if any) with any inversion list from properties.
14620      * The lists are kept separate up to now so that we can distinguish the two
14621      * in regards to matching above-Unicode.  A run-time warning is generated
14622      * if a Unicode property is matched against a non-Unicode code point. But,
14623      * we allow user-defined properties to match anything, without any warning,
14624      * and we also suppress the warning if there is a portion of the character
14625      * class that isn't a Unicode property, and which matches above Unicode, \W
14626      * or [\x{110000}] for example.
14627      * (Note that in this case, unlike the Posix one above, there is no
14628      * <depends_list>, because having a Unicode property forces Unicode
14629      * semantics */
14630     if (properties) {
14631         if (cp_list) {
14632
14633             /* If it matters to the final outcome, see if a non-property
14634              * component of the class matches above Unicode.  If so, the
14635              * warning gets suppressed.  This is true even if just a single
14636              * such code point is specified, as though not strictly correct if
14637              * another such code point is matched against, the fact that they
14638              * are using above-Unicode code points indicates they should know
14639              * the issues involved */
14640             if (warn_super) {
14641                 warn_super = ! (invert
14642                                ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14643             }
14644
14645             _invlist_union(properties, cp_list, &cp_list);
14646             SvREFCNT_dec_NN(properties);
14647         }
14648         else {
14649             cp_list = properties;
14650         }
14651
14652         if (warn_super) {
14653             ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14654         }
14655     }
14656
14657     /* Here, we have calculated what code points should be in the character
14658      * class.
14659      *
14660      * Now we can see about various optimizations.  Fold calculation (which we
14661      * did above) needs to take place before inversion.  Otherwise /[^k]/i
14662      * would invert to include K, which under /i would match k, which it
14663      * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14664      * folded until runtime */
14665
14666     /* If we didn't do folding, it's because some information isn't available
14667      * until runtime; set the run-time fold flag for these.  (We don't have to
14668      * worry about properties folding, as that is taken care of by the swash
14669      * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14670      * locales, or the class matches at least one 0-255 range code point */
14671     if (LOC && FOLD) {
14672         if (only_utf8_locale_list) {
14673             ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14674         }
14675         else if (cp_list) { /* Look to see if there a 0-255 code point is in
14676                                the list */
14677             UV start, end;
14678             invlist_iterinit(cp_list);
14679             if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14680                 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14681             }
14682             invlist_iterfinish(cp_list);
14683         }
14684     }
14685
14686     /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14687      * at compile time.  Besides not inverting folded locale now, we can't
14688      * invert if there are things such as \w, which aren't known until runtime
14689      * */
14690     if (cp_list
14691         && invert
14692         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14693         && ! depends_list
14694         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14695     {
14696         _invlist_invert(cp_list);
14697
14698         /* Any swash can't be used as-is, because we've inverted things */
14699         if (swash) {
14700             SvREFCNT_dec_NN(swash);
14701             swash = NULL;
14702         }
14703
14704         /* Clear the invert flag since have just done it here */
14705         invert = FALSE;
14706     }
14707
14708     if (ret_invlist) {
14709         *ret_invlist = cp_list;
14710         SvREFCNT_dec(swash);
14711
14712         /* Discard the generated node */
14713         if (SIZE_ONLY) {
14714             RExC_size = orig_size;
14715         }
14716         else {
14717             RExC_emit = orig_emit;
14718         }
14719         return orig_emit;
14720     }
14721
14722     /* Some character classes are equivalent to other nodes.  Such nodes take
14723      * up less room and generally fewer operations to execute than ANYOF nodes.
14724      * Above, we checked for and optimized into some such equivalents for
14725      * certain common classes that are easy to test.  Getting to this point in
14726      * the code means that the class didn't get optimized there.  Since this
14727      * code is only executed in Pass 2, it is too late to save space--it has
14728      * been allocated in Pass 1, and currently isn't given back.  But turning
14729      * things into an EXACTish node can allow the optimizer to join it to any
14730      * adjacent such nodes.  And if the class is equivalent to things like /./,
14731      * expensive run-time swashes can be avoided.  Now that we have more
14732      * complete information, we can find things necessarily missed by the
14733      * earlier code.  I (khw) am not sure how much to look for here.  It would
14734      * be easy, but perhaps too slow, to check any candidates against all the
14735      * node types they could possibly match using _invlistEQ(). */
14736
14737     if (cp_list
14738         && ! invert
14739         && ! depends_list
14740         && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14741         && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14742
14743            /* We don't optimize if we are supposed to make sure all non-Unicode
14744             * code points raise a warning, as only ANYOF nodes have this check.
14745             * */
14746         && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14747     {
14748         UV start, end;
14749         U8 op = END;  /* The optimzation node-type */
14750         const char * cur_parse= RExC_parse;
14751
14752         invlist_iterinit(cp_list);
14753         if (! invlist_iternext(cp_list, &start, &end)) {
14754
14755             /* Here, the list is empty.  This happens, for example, when a
14756              * Unicode property is the only thing in the character class, and
14757              * it doesn't match anything.  (perluniprops.pod notes such
14758              * properties) */
14759             op = OPFAIL;
14760             *flagp |= HASWIDTH|SIMPLE;
14761         }
14762         else if (start == end) {    /* The range is a single code point */
14763             if (! invlist_iternext(cp_list, &start, &end)
14764
14765                     /* Don't do this optimization if it would require changing
14766                      * the pattern to UTF-8 */
14767                 && (start < 256 || UTF))
14768             {
14769                 /* Here, the list contains a single code point.  Can optimize
14770                  * into an EXACTish node */
14771
14772                 value = start;
14773
14774                 if (! FOLD) {
14775                     op = EXACT;
14776                 }
14777                 else if (LOC) {
14778
14779                     /* A locale node under folding with one code point can be
14780                      * an EXACTFL, as its fold won't be calculated until
14781                      * runtime */
14782                     op = EXACTFL;
14783                 }
14784                 else {
14785
14786                     /* Here, we are generally folding, but there is only one
14787                      * code point to match.  If we have to, we use an EXACT
14788                      * node, but it would be better for joining with adjacent
14789                      * nodes in the optimization pass if we used the same
14790                      * EXACTFish node that any such are likely to be.  We can
14791                      * do this iff the code point doesn't participate in any
14792                      * folds.  For example, an EXACTF of a colon is the same as
14793                      * an EXACT one, since nothing folds to or from a colon. */
14794                     if (value < 256) {
14795                         if (IS_IN_SOME_FOLD_L1(value)) {
14796                             op = EXACT;
14797                         }
14798                     }
14799                     else {
14800                         if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14801                             op = EXACT;
14802                         }
14803                     }
14804
14805                     /* If we haven't found the node type, above, it means we
14806                      * can use the prevailing one */
14807                     if (op == END) {
14808                         op = compute_EXACTish(pRExC_state);
14809                     }
14810                 }
14811             }
14812         }
14813         else if (start == 0) {
14814             if (end == UV_MAX) {
14815                 op = SANY;
14816                 *flagp |= HASWIDTH|SIMPLE;
14817                 RExC_naughty++;
14818             }
14819             else if (end == '\n' - 1
14820                     && invlist_iternext(cp_list, &start, &end)
14821                     && start == '\n' + 1 && end == UV_MAX)
14822             {
14823                 op = REG_ANY;
14824                 *flagp |= HASWIDTH|SIMPLE;
14825                 RExC_naughty++;
14826             }
14827         }
14828         invlist_iterfinish(cp_list);
14829
14830         if (op != END) {
14831             RExC_parse = (char *)orig_parse;
14832             RExC_emit = (regnode *)orig_emit;
14833
14834             ret = reg_node(pRExC_state, op);
14835
14836             RExC_parse = (char *)cur_parse;
14837
14838             if (PL_regkind[op] == EXACT) {
14839                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14840                                            TRUE /* downgradable to EXACT */
14841                                           );
14842             }
14843
14844             SvREFCNT_dec_NN(cp_list);
14845             return ret;
14846         }
14847     }
14848
14849     /* Here, <cp_list> contains all the code points we can determine at
14850      * compile time that match under all conditions.  Go through it, and
14851      * for things that belong in the bitmap, put them there, and delete from
14852      * <cp_list>.  While we are at it, see if everything above 255 is in the
14853      * list, and if so, set a flag to speed up execution */
14854
14855     populate_ANYOF_from_invlist(ret, &cp_list);
14856
14857     if (invert) {
14858         ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14859     }
14860
14861     /* Here, the bitmap has been populated with all the Latin1 code points that
14862      * always match.  Can now add to the overall list those that match only
14863      * when the target string is UTF-8 (<depends_list>). */
14864     if (depends_list) {
14865         if (cp_list) {
14866             _invlist_union(cp_list, depends_list, &cp_list);
14867             SvREFCNT_dec_NN(depends_list);
14868         }
14869         else {
14870             cp_list = depends_list;
14871         }
14872         ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14873     }
14874
14875     /* If there is a swash and more than one element, we can't use the swash in
14876      * the optimization below. */
14877     if (swash && element_count > 1) {
14878         SvREFCNT_dec_NN(swash);
14879         swash = NULL;
14880     }
14881
14882     set_ANYOF_arg(pRExC_state, ret, cp_list,
14883                   (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14884                    ? listsv : NULL,
14885                   only_utf8_locale_list,
14886                   swash, has_user_defined_property);
14887
14888     *flagp |= HASWIDTH|SIMPLE;
14889
14890     if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14891         RExC_contains_locale = 1;
14892     }
14893
14894     return ret;
14895 }
14896
14897 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14898
14899 STATIC void
14900 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14901                 regnode* const node,
14902                 SV* const cp_list,
14903                 SV* const runtime_defns,
14904                 SV* const only_utf8_locale_list,
14905                 SV* const swash,
14906                 const bool has_user_defined_property)
14907 {
14908     /* Sets the arg field of an ANYOF-type node 'node', using information about
14909      * the node passed-in.  If there is nothing outside the node's bitmap, the
14910      * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14911      * the count returned by add_data(), having allocated and stored an array,
14912      * av, that that count references, as follows:
14913      *  av[0] stores the character class description in its textual form.
14914      *        This is used later (regexec.c:Perl_regclass_swash()) to
14915      *        initialize the appropriate swash, and is also useful for dumping
14916      *        the regnode.  This is set to &PL_sv_undef if the textual
14917      *        description is not needed at run-time (as happens if the other
14918      *        elements completely define the class)
14919      *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14920      *        computed from av[0].  But if no further computation need be done,
14921      *        the swash is stored here now (and av[0] is &PL_sv_undef).
14922      *  av[2] stores the inversion list of code points that match only if the
14923      *        current locale is UTF-8
14924      *  av[3] stores the cp_list inversion list for use in addition or instead
14925      *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14926      *        (Otherwise everything needed is already in av[0] and av[1])
14927      *  av[4] is set if any component of the class is from a user-defined
14928      *        property; used only if av[3] exists */
14929
14930     UV n;
14931
14932     PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14933
14934     if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14935         assert(! (ANYOF_FLAGS(node)
14936                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14937         ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14938     }
14939     else {
14940         AV * const av = newAV();
14941         SV *rv;
14942
14943         assert(ANYOF_FLAGS(node)
14944                     & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14945
14946         av_store(av, 0, (runtime_defns)
14947                         ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14948         if (swash) {
14949             av_store(av, 1, swash);
14950             SvREFCNT_dec_NN(cp_list);
14951         }
14952         else {
14953             av_store(av, 1, &PL_sv_undef);
14954             if (cp_list) {
14955                 av_store(av, 3, cp_list);
14956                 av_store(av, 4, newSVuv(has_user_defined_property));
14957             }
14958         }
14959
14960         if (only_utf8_locale_list) {
14961             av_store(av, 2, only_utf8_locale_list);
14962         }
14963         else {
14964             av_store(av, 2, &PL_sv_undef);
14965         }
14966
14967         rv = newRV_noinc(MUTABLE_SV(av));
14968         n = add_data(pRExC_state, STR_WITH_LEN("s"));
14969         RExC_rxi->data->data[n] = (void*)rv;
14970         ARG_SET(node, n);
14971     }
14972 }
14973
14974
14975 /* reg_skipcomment()
14976
14977    Absorbs an /x style # comments from the input stream.
14978    Returns true if there is more text remaining in the stream.
14979    Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
14980    terminates the pattern without including a newline.
14981
14982    Note its the callers responsibility to ensure that we are
14983    actually in /x mode
14984
14985 */
14986
14987 STATIC bool
14988 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14989 {
14990     bool ended = 0;
14991
14992     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14993
14994     while (RExC_parse < RExC_end)
14995         if (*RExC_parse++ == '\n') {
14996             ended = 1;
14997             break;
14998         }
14999     if (!ended) {
15000         /* we ran off the end of the pattern without ending
15001            the comment, so we have to add an \n when wrapping */
15002         RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15003         return 0;
15004     } else
15005         return 1;
15006 }
15007
15008 /* nextchar()
15009
15010    Advances the parse position, and optionally absorbs
15011    "whitespace" from the inputstream.
15012
15013    Without /x "whitespace" means (?#...) style comments only,
15014    with /x this means (?#...) and # comments and whitespace proper.
15015
15016    Returns the RExC_parse point from BEFORE the scan occurs.
15017
15018    This is the /x friendly way of saying RExC_parse++.
15019 */
15020
15021 STATIC char*
15022 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15023 {
15024     char* const retval = RExC_parse++;
15025
15026     PERL_ARGS_ASSERT_NEXTCHAR;
15027
15028     for (;;) {
15029         if (RExC_end - RExC_parse >= 3
15030             && *RExC_parse == '('
15031             && RExC_parse[1] == '?'
15032             && RExC_parse[2] == '#')
15033         {
15034             while (*RExC_parse != ')') {
15035                 if (RExC_parse == RExC_end)
15036                     FAIL("Sequence (?#... not terminated");
15037                 RExC_parse++;
15038             }
15039             RExC_parse++;
15040             continue;
15041         }
15042         if (RExC_flags & RXf_PMf_EXTENDED) {
15043             if (isSPACE(*RExC_parse)) {
15044                 RExC_parse++;
15045                 continue;
15046             }
15047             else if (*RExC_parse == '#') {
15048                 if ( reg_skipcomment( pRExC_state ) )
15049                     continue;
15050             }
15051         }
15052         return retval;
15053     }
15054 }
15055
15056 /*
15057 - reg_node - emit a node
15058 */
15059 STATIC regnode *                        /* Location. */
15060 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15061 {
15062     dVAR;
15063     regnode *ptr;
15064     regnode * const ret = RExC_emit;
15065     GET_RE_DEBUG_FLAGS_DECL;
15066
15067     PERL_ARGS_ASSERT_REG_NODE;
15068
15069     if (SIZE_ONLY) {
15070         SIZE_ALIGN(RExC_size);
15071         RExC_size += 1;
15072         return(ret);
15073     }
15074     if (RExC_emit >= RExC_emit_bound)
15075         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15076                    op, RExC_emit, RExC_emit_bound);
15077
15078     NODE_ALIGN_FILL(ret);
15079     ptr = ret;
15080     FILL_ADVANCE_NODE(ptr, op);
15081 #ifdef RE_TRACK_PATTERN_OFFSETS
15082     if (RExC_offsets) {         /* MJD */
15083         MJD_OFFSET_DEBUG(
15084               ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15085               "reg_node", __LINE__,
15086               PL_reg_name[op],
15087               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15088                 ? "Overwriting end of array!\n" : "OK",
15089               (UV)(RExC_emit - RExC_emit_start),
15090               (UV)(RExC_parse - RExC_start),
15091               (UV)RExC_offsets[0]));
15092         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15093     }
15094 #endif
15095     RExC_emit = ptr;
15096     return(ret);
15097 }
15098
15099 /*
15100 - reganode - emit a node with an argument
15101 */
15102 STATIC regnode *                        /* Location. */
15103 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15104 {
15105     dVAR;
15106     regnode *ptr;
15107     regnode * const ret = RExC_emit;
15108     GET_RE_DEBUG_FLAGS_DECL;
15109
15110     PERL_ARGS_ASSERT_REGANODE;
15111
15112     if (SIZE_ONLY) {
15113         SIZE_ALIGN(RExC_size);
15114         RExC_size += 2;
15115         /*
15116            We can't do this:
15117
15118            assert(2==regarglen[op]+1);
15119
15120            Anything larger than this has to allocate the extra amount.
15121            If we changed this to be:
15122
15123            RExC_size += (1 + regarglen[op]);
15124
15125            then it wouldn't matter. Its not clear what side effect
15126            might come from that so its not done so far.
15127            -- dmq
15128         */
15129         return(ret);
15130     }
15131     if (RExC_emit >= RExC_emit_bound)
15132         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15133                    op, RExC_emit, RExC_emit_bound);
15134
15135     NODE_ALIGN_FILL(ret);
15136     ptr = ret;
15137     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15138 #ifdef RE_TRACK_PATTERN_OFFSETS
15139     if (RExC_offsets) {         /* MJD */
15140         MJD_OFFSET_DEBUG(
15141               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15142               "reganode",
15143               __LINE__,
15144               PL_reg_name[op],
15145               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15146               "Overwriting end of array!\n" : "OK",
15147               (UV)(RExC_emit - RExC_emit_start),
15148               (UV)(RExC_parse - RExC_start),
15149               (UV)RExC_offsets[0]));
15150         Set_Cur_Node_Offset;
15151     }
15152 #endif
15153     RExC_emit = ptr;
15154     return(ret);
15155 }
15156
15157 /*
15158 - reguni - emit (if appropriate) a Unicode character
15159 */
15160 PERL_STATIC_INLINE STRLEN
15161 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15162 {
15163     dVAR;
15164
15165     PERL_ARGS_ASSERT_REGUNI;
15166
15167     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15168 }
15169
15170 /*
15171 - reginsert - insert an operator in front of already-emitted operand
15172 *
15173 * Means relocating the operand.
15174 */
15175 STATIC void
15176 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15177 {
15178     dVAR;
15179     regnode *src;
15180     regnode *dst;
15181     regnode *place;
15182     const int offset = regarglen[(U8)op];
15183     const int size = NODE_STEP_REGNODE + offset;
15184     GET_RE_DEBUG_FLAGS_DECL;
15185
15186     PERL_ARGS_ASSERT_REGINSERT;
15187     PERL_UNUSED_ARG(depth);
15188 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15189     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15190     if (SIZE_ONLY) {
15191         RExC_size += size;
15192         return;
15193     }
15194
15195     src = RExC_emit;
15196     RExC_emit += size;
15197     dst = RExC_emit;
15198     if (RExC_open_parens) {
15199         int paren;
15200         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15201         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15202             if ( RExC_open_parens[paren] >= opnd ) {
15203                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15204                 RExC_open_parens[paren] += size;
15205             } else {
15206                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15207             }
15208             if ( RExC_close_parens[paren] >= opnd ) {
15209                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15210                 RExC_close_parens[paren] += size;
15211             } else {
15212                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15213             }
15214         }
15215     }
15216
15217     while (src > opnd) {
15218         StructCopy(--src, --dst, regnode);
15219 #ifdef RE_TRACK_PATTERN_OFFSETS
15220         if (RExC_offsets) {     /* MJD 20010112 */
15221             MJD_OFFSET_DEBUG(
15222                  ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15223                   "reg_insert",
15224                   __LINE__,
15225                   PL_reg_name[op],
15226                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15227                     ? "Overwriting end of array!\n" : "OK",
15228                   (UV)(src - RExC_emit_start),
15229                   (UV)(dst - RExC_emit_start),
15230                   (UV)RExC_offsets[0]));
15231             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15232             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15233         }
15234 #endif
15235     }
15236
15237
15238     place = opnd;               /* Op node, where operand used to be. */
15239 #ifdef RE_TRACK_PATTERN_OFFSETS
15240     if (RExC_offsets) {         /* MJD */
15241         MJD_OFFSET_DEBUG(
15242               ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15243               "reginsert",
15244               __LINE__,
15245               PL_reg_name[op],
15246               (UV)(place - RExC_emit_start) > RExC_offsets[0]
15247               ? "Overwriting end of array!\n" : "OK",
15248               (UV)(place - RExC_emit_start),
15249               (UV)(RExC_parse - RExC_start),
15250               (UV)RExC_offsets[0]));
15251         Set_Node_Offset(place, RExC_parse);
15252         Set_Node_Length(place, 1);
15253     }
15254 #endif
15255     src = NEXTOPER(place);
15256     FILL_ADVANCE_NODE(place, op);
15257     Zero(src, offset, regnode);
15258 }
15259
15260 /*
15261 - regtail - set the next-pointer at the end of a node chain of p to val.
15262 - SEE ALSO: regtail_study
15263 */
15264 /* TODO: All three parms should be const */
15265 STATIC void
15266 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15267                 const regnode *val,U32 depth)
15268 {
15269     dVAR;
15270     regnode *scan;
15271     GET_RE_DEBUG_FLAGS_DECL;
15272
15273     PERL_ARGS_ASSERT_REGTAIL;
15274 #ifndef DEBUGGING
15275     PERL_UNUSED_ARG(depth);
15276 #endif
15277
15278     if (SIZE_ONLY)
15279         return;
15280
15281     /* Find last node. */
15282     scan = p;
15283     for (;;) {
15284         regnode * const temp = regnext(scan);
15285         DEBUG_PARSE_r({
15286             SV * const mysv=sv_newmortal();
15287             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15288             regprop(RExC_rx, mysv, scan, NULL);
15289             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15290                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15291                     (temp == NULL ? "->" : ""),
15292                     (temp == NULL ? PL_reg_name[OP(val)] : "")
15293             );
15294         });
15295         if (temp == NULL)
15296             break;
15297         scan = temp;
15298     }
15299
15300     if (reg_off_by_arg[OP(scan)]) {
15301         ARG_SET(scan, val - scan);
15302     }
15303     else {
15304         NEXT_OFF(scan) = val - scan;
15305     }
15306 }
15307
15308 #ifdef DEBUGGING
15309 /*
15310 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15311 - Look for optimizable sequences at the same time.
15312 - currently only looks for EXACT chains.
15313
15314 This is experimental code. The idea is to use this routine to perform
15315 in place optimizations on branches and groups as they are constructed,
15316 with the long term intention of removing optimization from study_chunk so
15317 that it is purely analytical.
15318
15319 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15320 to control which is which.
15321
15322 */
15323 /* TODO: All four parms should be const */
15324
15325 STATIC U8
15326 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15327                       const regnode *val,U32 depth)
15328 {
15329     dVAR;
15330     regnode *scan;
15331     U8 exact = PSEUDO;
15332 #ifdef EXPERIMENTAL_INPLACESCAN
15333     I32 min = 0;
15334 #endif
15335     GET_RE_DEBUG_FLAGS_DECL;
15336
15337     PERL_ARGS_ASSERT_REGTAIL_STUDY;
15338
15339
15340     if (SIZE_ONLY)
15341         return exact;
15342
15343     /* Find last node. */
15344
15345     scan = p;
15346     for (;;) {
15347         regnode * const temp = regnext(scan);
15348 #ifdef EXPERIMENTAL_INPLACESCAN
15349         if (PL_regkind[OP(scan)] == EXACT) {
15350             bool unfolded_multi_char;   /* Unexamined in this routine */
15351             if (join_exact(pRExC_state, scan, &min,
15352                            &unfolded_multi_char, 1, val, depth+1))
15353                 return EXACT;
15354         }
15355 #endif
15356         if ( exact ) {
15357             switch (OP(scan)) {
15358                 case EXACT:
15359                 case EXACTF:
15360                 case EXACTFA_NO_TRIE:
15361                 case EXACTFA:
15362                 case EXACTFU:
15363                 case EXACTFU_SS:
15364                 case EXACTFL:
15365                         if( exact == PSEUDO )
15366                             exact= OP(scan);
15367                         else if ( exact != OP(scan) )
15368                             exact= 0;
15369                 case NOTHING:
15370                     break;
15371                 default:
15372                     exact= 0;
15373             }
15374         }
15375         DEBUG_PARSE_r({
15376             SV * const mysv=sv_newmortal();
15377             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15378             regprop(RExC_rx, mysv, scan, NULL);
15379             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15380                 SvPV_nolen_const(mysv),
15381                 REG_NODE_NUM(scan),
15382                 PL_reg_name[exact]);
15383         });
15384         if (temp == NULL)
15385             break;
15386         scan = temp;
15387     }
15388     DEBUG_PARSE_r({
15389         SV * const mysv_val=sv_newmortal();
15390         DEBUG_PARSE_MSG("");
15391         regprop(RExC_rx, mysv_val, val, NULL);
15392         PerlIO_printf(Perl_debug_log,
15393                       "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15394                       SvPV_nolen_const(mysv_val),
15395                       (IV)REG_NODE_NUM(val),
15396                       (IV)(val - scan)
15397         );
15398     });
15399     if (reg_off_by_arg[OP(scan)]) {
15400         ARG_SET(scan, val - scan);
15401     }
15402     else {
15403         NEXT_OFF(scan) = val - scan;
15404     }
15405
15406     return exact;
15407 }
15408 #endif
15409
15410 /*
15411  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15412  */
15413 #ifdef DEBUGGING
15414
15415 static void
15416 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15417 {
15418     int bit;
15419     int set=0;
15420
15421     ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15422
15423     for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15424         if (flags & (1<<bit)) {
15425             if (!set++ && lead)
15426                 PerlIO_printf(Perl_debug_log, "%s",lead);
15427             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15428         }
15429     }
15430     if (lead)  {
15431         if (set)
15432             PerlIO_printf(Perl_debug_log, "\n");
15433         else
15434             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15435     }
15436 }
15437
15438 static void
15439 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15440 {
15441     int bit;
15442     int set=0;
15443     regex_charset cs;
15444
15445     ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15446
15447     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15448         if (flags & (1<<bit)) {
15449             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
15450                 continue;
15451             }
15452             if (!set++ && lead)
15453                 PerlIO_printf(Perl_debug_log, "%s",lead);
15454             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15455         }
15456     }
15457     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15458             if (!set++ && lead) {
15459                 PerlIO_printf(Perl_debug_log, "%s",lead);
15460             }
15461             switch (cs) {
15462                 case REGEX_UNICODE_CHARSET:
15463                     PerlIO_printf(Perl_debug_log, "UNICODE");
15464                     break;
15465                 case REGEX_LOCALE_CHARSET:
15466                     PerlIO_printf(Perl_debug_log, "LOCALE");
15467                     break;
15468                 case REGEX_ASCII_RESTRICTED_CHARSET:
15469                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15470                     break;
15471                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15472                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15473                     break;
15474                 default:
15475                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15476                     break;
15477             }
15478     }
15479     if (lead)  {
15480         if (set)
15481             PerlIO_printf(Perl_debug_log, "\n");
15482         else
15483             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15484     }
15485 }
15486 #endif
15487
15488 void
15489 Perl_regdump(pTHX_ const regexp *r)
15490 {
15491 #ifdef DEBUGGING
15492     dVAR;
15493     SV * const sv = sv_newmortal();
15494     SV *dsv= sv_newmortal();
15495     RXi_GET_DECL(r,ri);
15496     GET_RE_DEBUG_FLAGS_DECL;
15497
15498     PERL_ARGS_ASSERT_REGDUMP;
15499
15500     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15501
15502     /* Header fields of interest. */
15503     if (r->anchored_substr) {
15504         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15505             RE_SV_DUMPLEN(r->anchored_substr), 30);
15506         PerlIO_printf(Perl_debug_log,
15507                       "anchored %s%s at %"IVdf" ",
15508                       s, RE_SV_TAIL(r->anchored_substr),
15509                       (IV)r->anchored_offset);
15510     } else if (r->anchored_utf8) {
15511         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15512             RE_SV_DUMPLEN(r->anchored_utf8), 30);
15513         PerlIO_printf(Perl_debug_log,
15514                       "anchored utf8 %s%s at %"IVdf" ",
15515                       s, RE_SV_TAIL(r->anchored_utf8),
15516                       (IV)r->anchored_offset);
15517     }
15518     if (r->float_substr) {
15519         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15520             RE_SV_DUMPLEN(r->float_substr), 30);
15521         PerlIO_printf(Perl_debug_log,
15522                       "floating %s%s at %"IVdf"..%"UVuf" ",
15523                       s, RE_SV_TAIL(r->float_substr),
15524                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15525     } else if (r->float_utf8) {
15526         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15527             RE_SV_DUMPLEN(r->float_utf8), 30);
15528         PerlIO_printf(Perl_debug_log,
15529                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15530                       s, RE_SV_TAIL(r->float_utf8),
15531                       (IV)r->float_min_offset, (UV)r->float_max_offset);
15532     }
15533     if (r->check_substr || r->check_utf8)
15534         PerlIO_printf(Perl_debug_log,
15535                       (const char *)
15536                       (r->check_substr == r->float_substr
15537                        && r->check_utf8 == r->float_utf8
15538                        ? "(checking floating" : "(checking anchored"));
15539     if (r->intflags & PREGf_NOSCAN)
15540         PerlIO_printf(Perl_debug_log, " noscan");
15541     if (r->extflags & RXf_CHECK_ALL)
15542         PerlIO_printf(Perl_debug_log, " isall");
15543     if (r->check_substr || r->check_utf8)
15544         PerlIO_printf(Perl_debug_log, ") ");
15545
15546     if (ri->regstclass) {
15547         regprop(r, sv, ri->regstclass, NULL);
15548         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15549     }
15550     if (r->intflags & PREGf_ANCH) {
15551         PerlIO_printf(Perl_debug_log, "anchored");
15552         if (r->intflags & PREGf_ANCH_BOL)
15553             PerlIO_printf(Perl_debug_log, "(BOL)");
15554         if (r->intflags & PREGf_ANCH_MBOL)
15555             PerlIO_printf(Perl_debug_log, "(MBOL)");
15556         if (r->intflags & PREGf_ANCH_SBOL)
15557             PerlIO_printf(Perl_debug_log, "(SBOL)");
15558         if (r->intflags & PREGf_ANCH_GPOS)
15559             PerlIO_printf(Perl_debug_log, "(GPOS)");
15560         PerlIO_putc(Perl_debug_log, ' ');
15561     }
15562     if (r->intflags & PREGf_GPOS_SEEN)
15563         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15564     if (r->intflags & PREGf_SKIP)
15565         PerlIO_printf(Perl_debug_log, "plus ");
15566     if (r->intflags & PREGf_IMPLICIT)
15567         PerlIO_printf(Perl_debug_log, "implicit ");
15568     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15569     if (r->extflags & RXf_EVAL_SEEN)
15570         PerlIO_printf(Perl_debug_log, "with eval ");
15571     PerlIO_printf(Perl_debug_log, "\n");
15572     DEBUG_FLAGS_r({
15573         regdump_extflags("r->extflags: ",r->extflags);
15574         regdump_intflags("r->intflags: ",r->intflags);
15575     });
15576 #else
15577     PERL_ARGS_ASSERT_REGDUMP;
15578     PERL_UNUSED_CONTEXT;
15579     PERL_UNUSED_ARG(r);
15580 #endif  /* DEBUGGING */
15581 }
15582
15583 /*
15584 - regprop - printable representation of opcode, with run time support
15585 */
15586
15587 void
15588 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15589 {
15590 #ifdef DEBUGGING
15591     dVAR;
15592     int k;
15593
15594     /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15595     static const char * const anyofs[] = {
15596 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15597     || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15598     || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15599     || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15600     || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15601     || _CC_VERTSPACE != 16
15602   #error Need to adjust order of anyofs[]
15603 #endif
15604         "\\w",
15605         "\\W",
15606         "\\d",
15607         "\\D",
15608         "[:alpha:]",
15609         "[:^alpha:]",
15610         "[:lower:]",
15611         "[:^lower:]",
15612         "[:upper:]",
15613         "[:^upper:]",
15614         "[:punct:]",
15615         "[:^punct:]",
15616         "[:print:]",
15617         "[:^print:]",
15618         "[:alnum:]",
15619         "[:^alnum:]",
15620         "[:graph:]",
15621         "[:^graph:]",
15622         "[:cased:]",
15623         "[:^cased:]",
15624         "\\s",
15625         "\\S",
15626         "[:blank:]",
15627         "[:^blank:]",
15628         "[:xdigit:]",
15629         "[:^xdigit:]",
15630         "[:space:]",
15631         "[:^space:]",
15632         "[:cntrl:]",
15633         "[:^cntrl:]",
15634         "[:ascii:]",
15635         "[:^ascii:]",
15636         "\\v",
15637         "\\V"
15638     };
15639     RXi_GET_DECL(prog,progi);
15640     GET_RE_DEBUG_FLAGS_DECL;
15641
15642     PERL_ARGS_ASSERT_REGPROP;
15643
15644     sv_setpvs(sv, "");
15645
15646     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
15647         /* It would be nice to FAIL() here, but this may be called from
15648            regexec.c, and it would be hard to supply pRExC_state. */
15649         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15650                                               (int)OP(o), (int)REGNODE_MAX);
15651     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15652
15653     k = PL_regkind[OP(o)];
15654
15655     if (k == EXACT) {
15656         sv_catpvs(sv, " ");
15657         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15658          * is a crude hack but it may be the best for now since
15659          * we have no flag "this EXACTish node was UTF-8"
15660          * --jhi */
15661         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15662                   PERL_PV_ESCAPE_UNI_DETECT |
15663                   PERL_PV_ESCAPE_NONASCII   |
15664                   PERL_PV_PRETTY_ELLIPSES   |
15665                   PERL_PV_PRETTY_LTGT       |
15666                   PERL_PV_PRETTY_NOCLEAR
15667                   );
15668     } else if (k == TRIE) {
15669         /* print the details of the trie in dumpuntil instead, as
15670          * progi->data isn't available here */
15671         const char op = OP(o);
15672         const U32 n = ARG(o);
15673         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15674                (reg_ac_data *)progi->data->data[n] :
15675                NULL;
15676         const reg_trie_data * const trie
15677             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15678
15679         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15680         DEBUG_TRIE_COMPILE_r(
15681           Perl_sv_catpvf(aTHX_ sv,
15682             "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15683             (UV)trie->startstate,
15684             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15685             (UV)trie->wordcount,
15686             (UV)trie->minlen,
15687             (UV)trie->maxlen,
15688             (UV)TRIE_CHARCOUNT(trie),
15689             (UV)trie->uniquecharcount
15690           );
15691         );
15692         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15693             sv_catpvs(sv, "[");
15694             (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15695                                                    ? ANYOF_BITMAP(o)
15696                                                    : TRIE_BITMAP(trie));
15697             sv_catpvs(sv, "]");
15698         }
15699
15700     } else if (k == CURLY) {
15701         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15702             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15703         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15704     }
15705     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
15706         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15707     else if (k == REF || k == OPEN || k == CLOSE
15708              || k == GROUPP || OP(o)==ACCEPT)
15709     {
15710         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
15711         if ( RXp_PAREN_NAMES(prog) ) {
15712             if ( k != REF || (OP(o) < NREF)) {
15713                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15714                 SV **name= av_fetch(list, ARG(o), 0 );
15715                 if (name)
15716                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15717             }
15718             else {
15719                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15720                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15721                 I32 *nums=(I32*)SvPVX(sv_dat);
15722                 SV **name= av_fetch(list, nums[0], 0 );
15723                 I32 n;
15724                 if (name) {
15725                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
15726                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15727                                     (n ? "," : ""), (IV)nums[n]);
15728                     }
15729                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15730                 }
15731             }
15732         }
15733         if ( k == REF && reginfo) {
15734             U32 n = ARG(o);  /* which paren pair */
15735             I32 ln = prog->offs[n].start;
15736             if (prog->lastparen < n || ln == -1)
15737                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15738             else if (ln == prog->offs[n].end)
15739                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15740             else {
15741                 const char *s = reginfo->strbeg + ln;
15742                 Perl_sv_catpvf(aTHX_ sv, ": ");
15743                 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15744                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15745             }
15746         }
15747     } else if (k == GOSUB)
15748         /* Paren and offset */
15749         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15750     else if (k == VERB) {
15751         if (!o->flags)
15752             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15753                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15754     } else if (k == LOGICAL)
15755         /* 2: embedded, otherwise 1 */
15756         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15757     else if (k == ANYOF) {
15758         const U8 flags = ANYOF_FLAGS(o);
15759         int do_sep = 0;
15760
15761
15762         if (flags & ANYOF_LOCALE_FLAGS)
15763             sv_catpvs(sv, "{loc}");
15764         if (flags & ANYOF_LOC_FOLD)
15765             sv_catpvs(sv, "{i}");
15766         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15767         if (flags & ANYOF_INVERT)
15768             sv_catpvs(sv, "^");
15769
15770         /* output what the standard cp 0-255 bitmap matches */
15771         do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15772
15773         /* output any special charclass tests (used entirely under use
15774          * locale) * */
15775         if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15776             int i;
15777             for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15778                 if (ANYOF_POSIXL_TEST(o,i)) {
15779                     sv_catpv(sv, anyofs[i]);
15780                     do_sep = 1;
15781                 }
15782             }
15783         }
15784
15785         if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15786                       |ANYOF_UTF8
15787                       |ANYOF_NONBITMAP_NON_UTF8
15788                       |ANYOF_LOC_FOLD)))
15789         {
15790             if (do_sep) {
15791                 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15792                 if (flags & ANYOF_INVERT)
15793                     /*make sure the invert info is in each */
15794                     sv_catpvs(sv, "^");
15795             }
15796
15797             if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15798                 sv_catpvs(sv, "{non-utf8-latin1-all}");
15799             }
15800
15801             /* output information about the unicode matching */
15802             if (flags & ANYOF_ABOVE_LATIN1_ALL)
15803                 sv_catpvs(sv, "{unicode_all}");
15804             else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15805                 SV *lv; /* Set if there is something outside the bit map. */
15806                 bool byte_output = FALSE;   /* If something in the bitmap has
15807                                                been output */
15808                 SV *only_utf8_locale;
15809
15810                 /* Get the stuff that wasn't in the bitmap */
15811                 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15812                                                     &lv, &only_utf8_locale);
15813                 if (lv && lv != &PL_sv_undef) {
15814                     char *s = savesvpv(lv);
15815                     char * const origs = s;
15816
15817                     while (*s && *s != '\n')
15818                         s++;
15819
15820                     if (*s == '\n') {
15821                         const char * const t = ++s;
15822
15823                         if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15824                             sv_catpvs(sv, "{outside bitmap}");
15825                         }
15826                         else {
15827                             sv_catpvs(sv, "{utf8}");
15828                         }
15829
15830                         if (byte_output) {
15831                             sv_catpvs(sv, " ");
15832                         }
15833
15834                         while (*s) {
15835                             if (*s == '\n') {
15836
15837                                 /* Truncate very long output */
15838                                 if (s - origs > 256) {
15839                                     Perl_sv_catpvf(aTHX_ sv,
15840                                                 "%.*s...",
15841                                                 (int) (s - origs - 1),
15842                                                 t);
15843                                     goto out_dump;
15844                                 }
15845                                 *s = ' ';
15846                             }
15847                             else if (*s == '\t') {
15848                                 *s = '-';
15849                             }
15850                             s++;
15851                         }
15852                         if (s[-1] == ' ')
15853                             s[-1] = 0;
15854
15855                         sv_catpv(sv, t);
15856                     }
15857
15858                 out_dump:
15859
15860                     Safefree(origs);
15861                     SvREFCNT_dec_NN(lv);
15862                 }
15863
15864                 if ((flags & ANYOF_LOC_FOLD)
15865                      && only_utf8_locale
15866                      && only_utf8_locale != &PL_sv_undef)
15867                 {
15868                     UV start, end;
15869                     int max_entries = 256;
15870
15871                     sv_catpvs(sv, "{utf8 locale}");
15872                     invlist_iterinit(only_utf8_locale);
15873                     while (invlist_iternext(only_utf8_locale,
15874                                             &start, &end)) {
15875                         put_range(sv, start, end);
15876                         max_entries --;
15877                         if (max_entries < 0) {
15878                             sv_catpvs(sv, "...");
15879                             break;
15880                         }
15881                     }
15882                     invlist_iterfinish(only_utf8_locale);
15883                 }
15884             }
15885         }
15886
15887         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15888     }
15889     else if (k == POSIXD || k == NPOSIXD) {
15890         U8 index = FLAGS(o) * 2;
15891         if (index < C_ARRAY_LENGTH(anyofs)) {
15892             if (*anyofs[index] != '[')  {
15893                 sv_catpv(sv, "[");
15894             }
15895             sv_catpv(sv, anyofs[index]);
15896             if (*anyofs[index] != '[')  {
15897                 sv_catpv(sv, "]");
15898             }
15899         }
15900         else {
15901             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15902         }
15903     }
15904     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15905         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15906 #else
15907     PERL_UNUSED_CONTEXT;
15908     PERL_UNUSED_ARG(sv);
15909     PERL_UNUSED_ARG(o);
15910     PERL_UNUSED_ARG(prog);
15911     PERL_UNUSED_ARG(reginfo);
15912 #endif  /* DEBUGGING */
15913 }
15914
15915
15916
15917 SV *
15918 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15919 {                               /* Assume that RE_INTUIT is set */
15920     dVAR;
15921     struct regexp *const prog = ReANY(r);
15922     GET_RE_DEBUG_FLAGS_DECL;
15923
15924     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15925     PERL_UNUSED_CONTEXT;
15926
15927     DEBUG_COMPILE_r(
15928         {
15929             const char * const s = SvPV_nolen_const(prog->check_substr
15930                       ? prog->check_substr : prog->check_utf8);
15931
15932             if (!PL_colorset) reginitcolors();
15933             PerlIO_printf(Perl_debug_log,
15934                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15935                       PL_colors[4],
15936                       prog->check_substr ? "" : "utf8 ",
15937                       PL_colors[5],PL_colors[0],
15938                       s,
15939                       PL_colors[1],
15940                       (strlen(s) > 60 ? "..." : ""));
15941         } );
15942
15943     return prog->check_substr ? prog->check_substr : prog->check_utf8;
15944 }
15945
15946 /*
15947    pregfree()
15948
15949    handles refcounting and freeing the perl core regexp structure. When
15950    it is necessary to actually free the structure the first thing it
15951    does is call the 'free' method of the regexp_engine associated to
15952    the regexp, allowing the handling of the void *pprivate; member
15953    first. (This routine is not overridable by extensions, which is why
15954    the extensions free is called first.)
15955
15956    See regdupe and regdupe_internal if you change anything here.
15957 */
15958 #ifndef PERL_IN_XSUB_RE
15959 void
15960 Perl_pregfree(pTHX_ REGEXP *r)
15961 {
15962     SvREFCNT_dec(r);
15963 }
15964
15965 void
15966 Perl_pregfree2(pTHX_ REGEXP *rx)
15967 {
15968     dVAR;
15969     struct regexp *const r = ReANY(rx);
15970     GET_RE_DEBUG_FLAGS_DECL;
15971
15972     PERL_ARGS_ASSERT_PREGFREE2;
15973
15974     if (r->mother_re) {
15975         ReREFCNT_dec(r->mother_re);
15976     } else {
15977         CALLREGFREE_PVT(rx); /* free the private data */
15978         SvREFCNT_dec(RXp_PAREN_NAMES(r));
15979         Safefree(r->xpv_len_u.xpvlenu_pv);
15980     }
15981     if (r->substrs) {
15982         SvREFCNT_dec(r->anchored_substr);
15983         SvREFCNT_dec(r->anchored_utf8);
15984         SvREFCNT_dec(r->float_substr);
15985         SvREFCNT_dec(r->float_utf8);
15986         Safefree(r->substrs);
15987     }
15988     RX_MATCH_COPY_FREE(rx);
15989 #ifdef PERL_ANY_COW
15990     SvREFCNT_dec(r->saved_copy);
15991 #endif
15992     Safefree(r->offs);
15993     SvREFCNT_dec(r->qr_anoncv);
15994     rx->sv_u.svu_rx = 0;
15995 }
15996
15997 /*  reg_temp_copy()
15998
15999     This is a hacky workaround to the structural issue of match results
16000     being stored in the regexp structure which is in turn stored in
16001     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16002     could be PL_curpm in multiple contexts, and could require multiple
16003     result sets being associated with the pattern simultaneously, such
16004     as when doing a recursive match with (??{$qr})
16005
16006     The solution is to make a lightweight copy of the regexp structure
16007     when a qr// is returned from the code executed by (??{$qr}) this
16008     lightweight copy doesn't actually own any of its data except for
16009     the starp/end and the actual regexp structure itself.
16010
16011 */
16012
16013
16014 REGEXP *
16015 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16016 {
16017     struct regexp *ret;
16018     struct regexp *const r = ReANY(rx);
16019     const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16020
16021     PERL_ARGS_ASSERT_REG_TEMP_COPY;
16022
16023     if (!ret_x)
16024         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16025     else {
16026         SvOK_off((SV *)ret_x);
16027         if (islv) {
16028             /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16029                to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16030                made both spots point to the same regexp body.) */
16031             REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16032             assert(!SvPVX(ret_x));
16033             ret_x->sv_u.svu_rx = temp->sv_any;
16034             temp->sv_any = NULL;
16035             SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16036             SvREFCNT_dec_NN(temp);
16037             /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16038                ing below will not set it. */
16039             SvCUR_set(ret_x, SvCUR(rx));
16040         }
16041     }
16042     /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16043        sv_force_normal(sv) is called.  */
16044     SvFAKE_on(ret_x);
16045     ret = ReANY(ret_x);
16046
16047     SvFLAGS(ret_x) |= SvUTF8(rx);
16048     /* We share the same string buffer as the original regexp, on which we
16049        hold a reference count, incremented when mother_re is set below.
16050        The string pointer is copied here, being part of the regexp struct.
16051      */
16052     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16053            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16054     if (r->offs) {
16055         const I32 npar = r->nparens+1;
16056         Newx(ret->offs, npar, regexp_paren_pair);
16057         Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16058     }
16059     if (r->substrs) {
16060         Newx(ret->substrs, 1, struct reg_substr_data);
16061         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16062
16063         SvREFCNT_inc_void(ret->anchored_substr);
16064         SvREFCNT_inc_void(ret->anchored_utf8);
16065         SvREFCNT_inc_void(ret->float_substr);
16066         SvREFCNT_inc_void(ret->float_utf8);
16067
16068         /* check_substr and check_utf8, if non-NULL, point to either their
16069            anchored or float namesakes, and don't hold a second reference.  */
16070     }
16071     RX_MATCH_COPIED_off(ret_x);
16072 #ifdef PERL_ANY_COW
16073     ret->saved_copy = NULL;
16074 #endif
16075     ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16076     SvREFCNT_inc_void(ret->qr_anoncv);
16077
16078     return ret_x;
16079 }
16080 #endif
16081
16082 /* regfree_internal()
16083
16084    Free the private data in a regexp. This is overloadable by
16085    extensions. Perl takes care of the regexp structure in pregfree(),
16086    this covers the *pprivate pointer which technically perl doesn't
16087    know about, however of course we have to handle the
16088    regexp_internal structure when no extension is in use.
16089
16090    Note this is called before freeing anything in the regexp
16091    structure.
16092  */
16093
16094 void
16095 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16096 {
16097     dVAR;
16098     struct regexp *const r = ReANY(rx);
16099     RXi_GET_DECL(r,ri);
16100     GET_RE_DEBUG_FLAGS_DECL;
16101
16102     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16103
16104     DEBUG_COMPILE_r({
16105         if (!PL_colorset)
16106             reginitcolors();
16107         {
16108             SV *dsv= sv_newmortal();
16109             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16110                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16111             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16112                 PL_colors[4],PL_colors[5],s);
16113         }
16114     });
16115 #ifdef RE_TRACK_PATTERN_OFFSETS
16116     if (ri->u.offsets)
16117         Safefree(ri->u.offsets);             /* 20010421 MJD */
16118 #endif
16119     if (ri->code_blocks) {
16120         int n;
16121         for (n = 0; n < ri->num_code_blocks; n++)
16122             SvREFCNT_dec(ri->code_blocks[n].src_regex);
16123         Safefree(ri->code_blocks);
16124     }
16125
16126     if (ri->data) {
16127         int n = ri->data->count;
16128
16129         while (--n >= 0) {
16130           /* If you add a ->what type here, update the comment in regcomp.h */
16131             switch (ri->data->what[n]) {
16132             case 'a':
16133             case 'r':
16134             case 's':
16135             case 'S':
16136             case 'u':
16137                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16138                 break;
16139             case 'f':
16140                 Safefree(ri->data->data[n]);
16141                 break;
16142             case 'l':
16143             case 'L':
16144                 break;
16145             case 'T':
16146                 { /* Aho Corasick add-on structure for a trie node.
16147                      Used in stclass optimization only */
16148                     U32 refcount;
16149                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16150                     OP_REFCNT_LOCK;
16151                     refcount = --aho->refcount;
16152                     OP_REFCNT_UNLOCK;
16153                     if ( !refcount ) {
16154                         PerlMemShared_free(aho->states);
16155                         PerlMemShared_free(aho->fail);
16156                          /* do this last!!!! */
16157                         PerlMemShared_free(ri->data->data[n]);
16158                         PerlMemShared_free(ri->regstclass);
16159                     }
16160                 }
16161                 break;
16162             case 't':
16163                 {
16164                     /* trie structure. */
16165                     U32 refcount;
16166                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16167                     OP_REFCNT_LOCK;
16168                     refcount = --trie->refcount;
16169                     OP_REFCNT_UNLOCK;
16170                     if ( !refcount ) {
16171                         PerlMemShared_free(trie->charmap);
16172                         PerlMemShared_free(trie->states);
16173                         PerlMemShared_free(trie->trans);
16174                         if (trie->bitmap)
16175                             PerlMemShared_free(trie->bitmap);
16176                         if (trie->jump)
16177                             PerlMemShared_free(trie->jump);
16178                         PerlMemShared_free(trie->wordinfo);
16179                         /* do this last!!!! */
16180                         PerlMemShared_free(ri->data->data[n]);
16181                     }
16182                 }
16183                 break;
16184             default:
16185                 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16186                                                     ri->data->what[n]);
16187             }
16188         }
16189         Safefree(ri->data->what);
16190         Safefree(ri->data);
16191     }
16192
16193     Safefree(ri);
16194 }
16195
16196 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16197 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16198 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
16199
16200 /*
16201    re_dup - duplicate a regexp.
16202
16203    This routine is expected to clone a given regexp structure. It is only
16204    compiled under USE_ITHREADS.
16205
16206    After all of the core data stored in struct regexp is duplicated
16207    the regexp_engine.dupe method is used to copy any private data
16208    stored in the *pprivate pointer. This allows extensions to handle
16209    any duplication it needs to do.
16210
16211    See pregfree() and regfree_internal() if you change anything here.
16212 */
16213 #if defined(USE_ITHREADS)
16214 #ifndef PERL_IN_XSUB_RE
16215 void
16216 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16217 {
16218     dVAR;
16219     I32 npar;
16220     const struct regexp *r = ReANY(sstr);
16221     struct regexp *ret = ReANY(dstr);
16222
16223     PERL_ARGS_ASSERT_RE_DUP_GUTS;
16224
16225     npar = r->nparens+1;
16226     Newx(ret->offs, npar, regexp_paren_pair);
16227     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16228
16229     if (ret->substrs) {
16230         /* Do it this way to avoid reading from *r after the StructCopy().
16231            That way, if any of the sv_dup_inc()s dislodge *r from the L1
16232            cache, it doesn't matter.  */
16233         const bool anchored = r->check_substr
16234             ? r->check_substr == r->anchored_substr
16235             : r->check_utf8 == r->anchored_utf8;
16236         Newx(ret->substrs, 1, struct reg_substr_data);
16237         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16238
16239         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16240         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16241         ret->float_substr = sv_dup_inc(ret->float_substr, param);
16242         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16243
16244         /* check_substr and check_utf8, if non-NULL, point to either their
16245            anchored or float namesakes, and don't hold a second reference.  */
16246
16247         if (ret->check_substr) {
16248             if (anchored) {
16249                 assert(r->check_utf8 == r->anchored_utf8);
16250                 ret->check_substr = ret->anchored_substr;
16251                 ret->check_utf8 = ret->anchored_utf8;
16252             } else {
16253                 assert(r->check_substr == r->float_substr);
16254                 assert(r->check_utf8 == r->float_utf8);
16255                 ret->check_substr = ret->float_substr;
16256                 ret->check_utf8 = ret->float_utf8;
16257             }
16258         } else if (ret->check_utf8) {
16259             if (anchored) {
16260                 ret->check_utf8 = ret->anchored_utf8;
16261             } else {
16262                 ret->check_utf8 = ret->float_utf8;
16263             }
16264         }
16265     }
16266
16267     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16268     ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16269
16270     if (ret->pprivate)
16271         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16272
16273     if (RX_MATCH_COPIED(dstr))
16274         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16275     else
16276         ret->subbeg = NULL;
16277 #ifdef PERL_ANY_COW
16278     ret->saved_copy = NULL;
16279 #endif
16280
16281     /* Whether mother_re be set or no, we need to copy the string.  We
16282        cannot refrain from copying it when the storage points directly to
16283        our mother regexp, because that's
16284                1: a buffer in a different thread
16285                2: something we no longer hold a reference on
16286                so we need to copy it locally.  */
16287     RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16288     ret->mother_re   = NULL;
16289 }
16290 #endif /* PERL_IN_XSUB_RE */
16291
16292 /*
16293    regdupe_internal()
16294
16295    This is the internal complement to regdupe() which is used to copy
16296    the structure pointed to by the *pprivate pointer in the regexp.
16297    This is the core version of the extension overridable cloning hook.
16298    The regexp structure being duplicated will be copied by perl prior
16299    to this and will be provided as the regexp *r argument, however
16300    with the /old/ structures pprivate pointer value. Thus this routine
16301    may override any copying normally done by perl.
16302
16303    It returns a pointer to the new regexp_internal structure.
16304 */
16305
16306 void *
16307 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16308 {
16309     dVAR;
16310     struct regexp *const r = ReANY(rx);
16311     regexp_internal *reti;
16312     int len;
16313     RXi_GET_DECL(r,ri);
16314
16315     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16316
16317     len = ProgLen(ri);
16318
16319     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16320           char, regexp_internal);
16321     Copy(ri->program, reti->program, len+1, regnode);
16322
16323     reti->num_code_blocks = ri->num_code_blocks;
16324     if (ri->code_blocks) {
16325         int n;
16326         Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16327                 struct reg_code_block);
16328         Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16329                 struct reg_code_block);
16330         for (n = 0; n < ri->num_code_blocks; n++)
16331              reti->code_blocks[n].src_regex = (REGEXP*)
16332                     sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16333     }
16334     else
16335         reti->code_blocks = NULL;
16336
16337     reti->regstclass = NULL;
16338
16339     if (ri->data) {
16340         struct reg_data *d;
16341         const int count = ri->data->count;
16342         int i;
16343
16344         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16345                 char, struct reg_data);
16346         Newx(d->what, count, U8);
16347
16348         d->count = count;
16349         for (i = 0; i < count; i++) {
16350             d->what[i] = ri->data->what[i];
16351             switch (d->what[i]) {
16352                 /* see also regcomp.h and regfree_internal() */
16353             case 'a': /* actually an AV, but the dup function is identical.  */
16354             case 'r':
16355             case 's':
16356             case 'S':
16357             case 'u': /* actually an HV, but the dup function is identical.  */
16358                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16359                 break;
16360             case 'f':
16361                 /* This is cheating. */
16362                 Newx(d->data[i], 1, regnode_ssc);
16363                 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16364                 reti->regstclass = (regnode*)d->data[i];
16365                 break;
16366             case 'T':
16367                 /* Trie stclasses are readonly and can thus be shared
16368                  * without duplication. We free the stclass in pregfree
16369                  * when the corresponding reg_ac_data struct is freed.
16370                  */
16371                 reti->regstclass= ri->regstclass;
16372                 /* Fall through */
16373             case 't':
16374                 OP_REFCNT_LOCK;
16375                 ((reg_trie_data*)ri->data->data[i])->refcount++;
16376                 OP_REFCNT_UNLOCK;
16377                 /* Fall through */
16378             case 'l':
16379             case 'L':
16380                 d->data[i] = ri->data->data[i];
16381                 break;
16382             default:
16383                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16384                                                            ri->data->what[i]);
16385             }
16386         }
16387
16388         reti->data = d;
16389     }
16390     else
16391         reti->data = NULL;
16392
16393     reti->name_list_idx = ri->name_list_idx;
16394
16395 #ifdef RE_TRACK_PATTERN_OFFSETS
16396     if (ri->u.offsets) {
16397         Newx(reti->u.offsets, 2*len+1, U32);
16398         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16399     }
16400 #else
16401     SetProgLen(reti,len);
16402 #endif
16403
16404     return (void*)reti;
16405 }
16406
16407 #endif    /* USE_ITHREADS */
16408
16409 #ifndef PERL_IN_XSUB_RE
16410
16411 /*
16412  - regnext - dig the "next" pointer out of a node
16413  */
16414 regnode *
16415 Perl_regnext(pTHX_ regnode *p)
16416 {
16417     dVAR;
16418     I32 offset;
16419
16420     if (!p)
16421         return(NULL);
16422
16423     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
16424         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16425                                                 (int)OP(p), (int)REGNODE_MAX);
16426     }
16427
16428     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16429     if (offset == 0)
16430         return(NULL);
16431
16432     return(p+offset);
16433 }
16434 #endif
16435
16436 STATIC void
16437 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16438 {
16439     va_list args;
16440     STRLEN l1 = strlen(pat1);
16441     STRLEN l2 = strlen(pat2);
16442     char buf[512];
16443     SV *msv;
16444     const char *message;
16445
16446     PERL_ARGS_ASSERT_RE_CROAK2;
16447
16448     if (l1 > 510)
16449         l1 = 510;
16450     if (l1 + l2 > 510)
16451         l2 = 510 - l1;
16452     Copy(pat1, buf, l1 , char);
16453     Copy(pat2, buf + l1, l2 , char);
16454     buf[l1 + l2] = '\n';
16455     buf[l1 + l2 + 1] = '\0';
16456     va_start(args, pat2);
16457     msv = vmess(buf, &args);
16458     va_end(args);
16459     message = SvPV_const(msv,l1);
16460     if (l1 > 512)
16461         l1 = 512;
16462     Copy(message, buf, l1 , char);
16463     /* l1-1 to avoid \n */
16464     Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16465 }
16466
16467 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16468
16469 #ifndef PERL_IN_XSUB_RE
16470 void
16471 Perl_save_re_context(pTHX)
16472 {
16473     dVAR;
16474
16475     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16476     if (PL_curpm) {
16477         const REGEXP * const rx = PM_GETRE(PL_curpm);
16478         if (rx) {
16479             U32 i;
16480             for (i = 1; i <= RX_NPARENS(rx); i++) {
16481                 char digits[TYPE_CHARS(long)];
16482                 const STRLEN len = my_snprintf(digits, sizeof(digits),
16483                                                "%lu", (long)i);
16484                 GV *const *const gvp
16485                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16486
16487                 if (gvp) {
16488                     GV * const gv = *gvp;
16489                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16490                         save_scalar(gv);
16491                 }
16492             }
16493         }
16494     }
16495 }
16496 #endif
16497
16498 #ifdef DEBUGGING
16499
16500 STATIC void
16501 S_put_byte(pTHX_ SV *sv, int c)
16502 {
16503     PERL_ARGS_ASSERT_PUT_BYTE;
16504
16505     if (!isPRINT(c)) {
16506         switch (c) {
16507             case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16508             case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16509             case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16510             case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16511             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16512
16513             default:
16514                 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16515                 break;
16516         }
16517     }
16518     else {
16519         const char string = c;
16520         if (c == '-' || c == ']' || c == '\\' || c == '^')
16521             sv_catpvs(sv, "\\");
16522         sv_catpvn(sv, &string, 1);
16523     }
16524 }
16525
16526 STATIC void
16527 S_put_range(pTHX_ SV *sv, UV start, UV end)
16528 {
16529
16530     /* Appends to 'sv' a displayable version of the range of code points from
16531      * 'start' to 'end' */
16532
16533     assert(start <= end);
16534
16535     PERL_ARGS_ASSERT_PUT_RANGE;
16536
16537     if (end - start < 3) {  /* Individual chars in short ranges */
16538         for (; start <= end; start++)
16539             put_byte(sv, start);
16540     }
16541     else if (   end > 255
16542              || ! isALPHANUMERIC(start)
16543              || ! isALPHANUMERIC(end)
16544              || isDIGIT(start) != isDIGIT(end)
16545              || isUPPER(start) != isUPPER(end)
16546              || isLOWER(start) != isLOWER(end)
16547
16548                 /* This final test should get optimized out except on EBCDIC
16549                  * platforms, where it causes ranges that cross discontinuities
16550                  * like i/j to be shown as hex instead of the misleading,
16551                  * e.g. H-K (since that range includes more than H, I, J, K).
16552                  * */
16553              || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16554     {
16555         Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16556                        start,
16557                        (end < 256) ? end : 255);
16558     }
16559     else { /* Here, the ends of the range are both digits, or both uppercase,
16560               or both lowercase; and there's no discontinuity in the range
16561               (which could happen on EBCDIC platforms) */
16562         put_byte(sv, start);
16563         sv_catpvs(sv, "-");
16564         put_byte(sv, end);
16565     }
16566 }
16567
16568 STATIC bool
16569 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16570 {
16571     /* Appends to 'sv' a displayable version of the innards of the bracketed
16572      * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16573      * output anything */
16574
16575     int i;
16576     bool has_output_anything = FALSE;
16577
16578     PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16579
16580     for (i = 0; i < 256; i++) {
16581         if (BITMAP_TEST((U8 *) bitmap,i)) {
16582
16583             /* The character at index i should be output.  Find the next
16584              * character that should NOT be output */
16585             int j;
16586             for (j = i + 1; j < 256; j++) {
16587                 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16588                     break;
16589                 }
16590             }
16591
16592             /* Everything between them is a single range that should be output
16593              * */
16594             put_range(sv, i, j - 1);
16595             has_output_anything = TRUE;
16596             i = j;
16597         }
16598     }
16599
16600     return has_output_anything;
16601 }
16602
16603 #define CLEAR_OPTSTART \
16604     if (optstart) STMT_START {                                               \
16605         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
16606                               " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16607         optstart=NULL;                                                       \
16608     } STMT_END
16609
16610 #define DUMPUNTIL(b,e)                                                       \
16611                     CLEAR_OPTSTART;                                          \
16612                     node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16613
16614 STATIC const regnode *
16615 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16616             const regnode *last, const regnode *plast,
16617             SV* sv, I32 indent, U32 depth)
16618 {
16619     dVAR;
16620     U8 op = PSEUDO;     /* Arbitrary non-END op. */
16621     const regnode *next;
16622     const regnode *optstart= NULL;
16623
16624     RXi_GET_DECL(r,ri);
16625     GET_RE_DEBUG_FLAGS_DECL;
16626
16627     PERL_ARGS_ASSERT_DUMPUNTIL;
16628
16629 #ifdef DEBUG_DUMPUNTIL
16630     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16631         last ? last-start : 0,plast ? plast-start : 0);
16632 #endif
16633
16634     if (plast && plast < last)
16635         last= plast;
16636
16637     while (PL_regkind[op] != END && (!last || node < last)) {
16638         /* While that wasn't END last time... */
16639         NODE_ALIGN(node);
16640         op = OP(node);
16641         if (op == CLOSE || op == WHILEM)
16642             indent--;
16643         next = regnext((regnode *)node);
16644
16645         /* Where, what. */
16646         if (OP(node) == OPTIMIZED) {
16647             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16648                 optstart = node;
16649             else
16650                 goto after_print;
16651         } else
16652             CLEAR_OPTSTART;
16653
16654         regprop(r, sv, node, NULL);
16655         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16656                       (int)(2*indent + 1), "", SvPVX_const(sv));
16657
16658         if (OP(node) != OPTIMIZED) {
16659             if (next == NULL)           /* Next ptr. */
16660                 PerlIO_printf(Perl_debug_log, " (0)");
16661             else if (PL_regkind[(U8)op] == BRANCH
16662                      && PL_regkind[OP(next)] != BRANCH )
16663                 PerlIO_printf(Perl_debug_log, " (FAIL)");
16664             else
16665                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16666             (void)PerlIO_putc(Perl_debug_log, '\n');
16667         }
16668
16669       after_print:
16670         if (PL_regkind[(U8)op] == BRANCHJ) {
16671             assert(next);
16672             {
16673                 const regnode *nnode = (OP(next) == LONGJMP
16674                                        ? regnext((regnode *)next)
16675                                        : next);
16676                 if (last && nnode > last)
16677                     nnode = last;
16678                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16679             }
16680         }
16681         else if (PL_regkind[(U8)op] == BRANCH) {
16682             assert(next);
16683             DUMPUNTIL(NEXTOPER(node), next);
16684         }
16685         else if ( PL_regkind[(U8)op]  == TRIE ) {
16686             const regnode *this_trie = node;
16687             const char op = OP(node);
16688             const U32 n = ARG(node);
16689             const reg_ac_data * const ac = op>=AHOCORASICK ?
16690                (reg_ac_data *)ri->data->data[n] :
16691                NULL;
16692             const reg_trie_data * const trie =
16693                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16694 #ifdef DEBUGGING
16695             AV *const trie_words
16696                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16697 #endif
16698             const regnode *nextbranch= NULL;
16699             I32 word_idx;
16700             sv_setpvs(sv, "");
16701             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16702                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16703
16704                 PerlIO_printf(Perl_debug_log, "%*s%s ",
16705                    (int)(2*(indent+3)), "",
16706                     elem_ptr
16707                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16708                                 SvCUR(*elem_ptr), 60,
16709                                 PL_colors[0], PL_colors[1],
16710                                 (SvUTF8(*elem_ptr)
16711                                  ? PERL_PV_ESCAPE_UNI
16712                                  : 0)
16713                                 | PERL_PV_PRETTY_ELLIPSES
16714                                 | PERL_PV_PRETTY_LTGT
16715                             )
16716                     : "???"
16717                 );
16718                 if (trie->jump) {
16719                     U16 dist= trie->jump[word_idx+1];
16720                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16721                                (UV)((dist ? this_trie + dist : next) - start));
16722                     if (dist) {
16723                         if (!nextbranch)
16724                             nextbranch= this_trie + trie->jump[0];
16725                         DUMPUNTIL(this_trie + dist, nextbranch);
16726                     }
16727                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16728                         nextbranch= regnext((regnode *)nextbranch);
16729                 } else {
16730                     PerlIO_printf(Perl_debug_log, "\n");
16731                 }
16732             }
16733             if (last && next > last)
16734                 node= last;
16735             else
16736                 node= next;
16737         }
16738         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16739             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16740                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16741         }
16742         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16743             assert(next);
16744             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16745         }
16746         else if ( op == PLUS || op == STAR) {
16747             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16748         }
16749         else if (PL_regkind[(U8)op] == ANYOF) {
16750             /* arglen 1 + class block */
16751             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16752                           ? ANYOF_POSIXL_SKIP
16753                           : ANYOF_SKIP);
16754             node = NEXTOPER(node);
16755         }
16756         else if (PL_regkind[(U8)op] == EXACT) {
16757             /* Literal string, where present. */
16758             node += NODE_SZ_STR(node) - 1;
16759             node = NEXTOPER(node);
16760         }
16761         else {
16762             node = NEXTOPER(node);
16763             node += regarglen[(U8)op];
16764         }
16765         if (op == CURLYX || op == OPEN)
16766             indent++;
16767     }
16768     CLEAR_OPTSTART;
16769 #ifdef DEBUG_DUMPUNTIL
16770     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16771 #endif
16772     return node;
16773 }
16774
16775 #endif  /* DEBUGGING */
16776
16777 /*
16778  * Local variables:
16779  * c-indentation-style: bsd
16780  * c-basic-offset: 4
16781  * indent-tabs-mode: nil
16782  * End:
16783  *
16784  * ex: set ts=8 sts=4 sw=4 et:
16785  */